Files
SmollerLM2-10M-sftb/trainer_state.json
ModelHub XC bb2e296c5a 初始化项目,由ModelHub XC社区提供模型
Model: mehmetkeremturkcan/SmollerLM2-10M-sftb
Source: Original Platform
2026-04-11 12:37:01 +08:00

44440 lines
1.1 MiB

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.996920161099265,
"eval_steps": 500,
"global_step": 31650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003158809128958383,
"grad_norm": 6.2127704289584536,
"learning_rate": 6.31911532385466e-07,
"loss": 8.5382,
"step": 1
},
{
"epoch": 0.0015794045644791914,
"grad_norm": 5.441632582092334,
"learning_rate": 3.1595576619273302e-06,
"loss": 8.5222,
"step": 5
},
{
"epoch": 0.003158809128958383,
"grad_norm": 6.223853805333185,
"learning_rate": 6.3191153238546605e-06,
"loss": 8.5644,
"step": 10
},
{
"epoch": 0.004738213693437574,
"grad_norm": 4.555097857317313,
"learning_rate": 9.478672985781992e-06,
"loss": 8.4694,
"step": 15
},
{
"epoch": 0.006317618257916766,
"grad_norm": 3.7490552824171934,
"learning_rate": 1.2638230647709321e-05,
"loss": 8.384,
"step": 20
},
{
"epoch": 0.007897022822395957,
"grad_norm": 1.017431068195726,
"learning_rate": 1.579778830963665e-05,
"loss": 8.2984,
"step": 25
},
{
"epoch": 0.009476427386875147,
"grad_norm": 1.4010697485352104,
"learning_rate": 1.8957345971563984e-05,
"loss": 8.2458,
"step": 30
},
{
"epoch": 0.01105583195135434,
"grad_norm": 1.4454078805201958,
"learning_rate": 2.2116903633491313e-05,
"loss": 8.1732,
"step": 35
},
{
"epoch": 0.012635236515833531,
"grad_norm": 0.5841055333237101,
"learning_rate": 2.5276461295418642e-05,
"loss": 8.1082,
"step": 40
},
{
"epoch": 0.014214641080312722,
"grad_norm": 0.8082771952036348,
"learning_rate": 2.843601895734597e-05,
"loss": 8.0705,
"step": 45
},
{
"epoch": 0.015794045644791914,
"grad_norm": 0.48868551813287026,
"learning_rate": 3.15955766192733e-05,
"loss": 8.0232,
"step": 50
},
{
"epoch": 0.017373450209271106,
"grad_norm": 0.5452976654604186,
"learning_rate": 3.4755134281200636e-05,
"loss": 7.9518,
"step": 55
},
{
"epoch": 0.018952854773750295,
"grad_norm": 0.40190214186715384,
"learning_rate": 3.791469194312797e-05,
"loss": 7.8716,
"step": 60
},
{
"epoch": 0.020532259338229487,
"grad_norm": 0.49313779491880694,
"learning_rate": 4.1074249605055293e-05,
"loss": 7.8674,
"step": 65
},
{
"epoch": 0.02211166390270868,
"grad_norm": 0.34274541171590306,
"learning_rate": 4.4233807266982626e-05,
"loss": 7.8531,
"step": 70
},
{
"epoch": 0.02369106846718787,
"grad_norm": 0.670783515834389,
"learning_rate": 4.739336492890995e-05,
"loss": 7.7934,
"step": 75
},
{
"epoch": 0.025270473031667063,
"grad_norm": 0.3035557533772571,
"learning_rate": 5.0552922590837284e-05,
"loss": 7.7468,
"step": 80
},
{
"epoch": 0.02684987759614625,
"grad_norm": 0.26164246283361264,
"learning_rate": 5.3712480252764616e-05,
"loss": 7.7665,
"step": 85
},
{
"epoch": 0.028429282160625444,
"grad_norm": 0.25633175815508613,
"learning_rate": 5.687203791469194e-05,
"loss": 7.7145,
"step": 90
},
{
"epoch": 0.030008686725104636,
"grad_norm": 0.6842750393577121,
"learning_rate": 6.0031595576619274e-05,
"loss": 7.6732,
"step": 95
},
{
"epoch": 0.03158809128958383,
"grad_norm": 0.23328824707618634,
"learning_rate": 6.31911532385466e-05,
"loss": 7.652,
"step": 100
},
{
"epoch": 0.03316749585406302,
"grad_norm": 0.25073553627568257,
"learning_rate": 6.635071090047395e-05,
"loss": 7.6357,
"step": 105
},
{
"epoch": 0.03474690041854221,
"grad_norm": 0.20089019017365053,
"learning_rate": 6.951026856240127e-05,
"loss": 7.5837,
"step": 110
},
{
"epoch": 0.036326304983021404,
"grad_norm": 0.3824165564128397,
"learning_rate": 7.26698262243286e-05,
"loss": 7.509,
"step": 115
},
{
"epoch": 0.03790570954750059,
"grad_norm": 0.54684949274883,
"learning_rate": 7.582938388625594e-05,
"loss": 7.5753,
"step": 120
},
{
"epoch": 0.03948511411197978,
"grad_norm": 0.4828149930331908,
"learning_rate": 7.898894154818326e-05,
"loss": 7.5584,
"step": 125
},
{
"epoch": 0.04106451867645897,
"grad_norm": 0.4319280294615604,
"learning_rate": 8.214849921011059e-05,
"loss": 7.4968,
"step": 130
},
{
"epoch": 0.042643923240938165,
"grad_norm": 0.14907634526526667,
"learning_rate": 8.530805687203793e-05,
"loss": 7.469,
"step": 135
},
{
"epoch": 0.04422332780541736,
"grad_norm": 0.4441920832206972,
"learning_rate": 8.846761453396525e-05,
"loss": 7.4897,
"step": 140
},
{
"epoch": 0.04580273236989655,
"grad_norm": 0.7131383198303115,
"learning_rate": 9.162717219589258e-05,
"loss": 7.4244,
"step": 145
},
{
"epoch": 0.04738213693437574,
"grad_norm": 0.5231467236556445,
"learning_rate": 9.47867298578199e-05,
"loss": 7.4496,
"step": 150
},
{
"epoch": 0.048961541498854934,
"grad_norm": 0.4218883424696015,
"learning_rate": 9.794628751974724e-05,
"loss": 7.4454,
"step": 155
},
{
"epoch": 0.050540946063334126,
"grad_norm": 0.3258725866439942,
"learning_rate": 0.00010110584518167457,
"loss": 7.3195,
"step": 160
},
{
"epoch": 0.05212035062781331,
"grad_norm": 0.37490805186704756,
"learning_rate": 0.00010426540284360189,
"loss": 7.3362,
"step": 165
},
{
"epoch": 0.0536997551922925,
"grad_norm": 0.8090856015720164,
"learning_rate": 0.00010742496050552923,
"loss": 7.3105,
"step": 170
},
{
"epoch": 0.055279159756771695,
"grad_norm": 0.9628260997280141,
"learning_rate": 0.00011058451816745656,
"loss": 7.2631,
"step": 175
},
{
"epoch": 0.05685856432125089,
"grad_norm": 0.789911407688136,
"learning_rate": 0.00011374407582938388,
"loss": 7.2927,
"step": 180
},
{
"epoch": 0.05843796888573008,
"grad_norm": 0.43613037350148726,
"learning_rate": 0.00011690363349131122,
"loss": 7.3392,
"step": 185
},
{
"epoch": 0.06001737345020927,
"grad_norm": 0.43320105305691636,
"learning_rate": 0.00012006319115323855,
"loss": 7.3604,
"step": 190
},
{
"epoch": 0.061596778014688464,
"grad_norm": 0.4073340900692099,
"learning_rate": 0.0001232227488151659,
"loss": 7.3057,
"step": 195
},
{
"epoch": 0.06317618257916766,
"grad_norm": 1.064883590443732,
"learning_rate": 0.0001263823064770932,
"loss": 7.2429,
"step": 200
},
{
"epoch": 0.06475558714364685,
"grad_norm": 0.7109842164173309,
"learning_rate": 0.00012954186413902054,
"loss": 7.2668,
"step": 205
},
{
"epoch": 0.06633499170812604,
"grad_norm": 1.1073366600163383,
"learning_rate": 0.0001327014218009479,
"loss": 7.2269,
"step": 210
},
{
"epoch": 0.06791439627260523,
"grad_norm": 1.060947638396007,
"learning_rate": 0.00013586097946287522,
"loss": 7.2282,
"step": 215
},
{
"epoch": 0.06949380083708442,
"grad_norm": 0.8011757396098579,
"learning_rate": 0.00013902053712480254,
"loss": 7.1831,
"step": 220
},
{
"epoch": 0.07107320540156362,
"grad_norm": 0.3302248285814246,
"learning_rate": 0.00014218009478672987,
"loss": 7.1975,
"step": 225
},
{
"epoch": 0.07265260996604281,
"grad_norm": 0.3438470557506677,
"learning_rate": 0.0001453396524486572,
"loss": 7.2585,
"step": 230
},
{
"epoch": 0.07423201453052199,
"grad_norm": 0.4087012242932606,
"learning_rate": 0.00014849921011058452,
"loss": 7.1477,
"step": 235
},
{
"epoch": 0.07581141909500118,
"grad_norm": 0.36475666254042266,
"learning_rate": 0.00015165876777251187,
"loss": 7.1608,
"step": 240
},
{
"epoch": 0.07739082365948037,
"grad_norm": 0.37842710558734155,
"learning_rate": 0.0001548183254344392,
"loss": 7.1792,
"step": 245
},
{
"epoch": 0.07897022822395956,
"grad_norm": 0.28097584942692855,
"learning_rate": 0.00015797788309636652,
"loss": 7.1124,
"step": 250
},
{
"epoch": 0.08054963278843875,
"grad_norm": 0.30296611890024583,
"learning_rate": 0.00016113744075829385,
"loss": 7.0683,
"step": 255
},
{
"epoch": 0.08212903735291795,
"grad_norm": 0.26822958956440557,
"learning_rate": 0.00016429699842022117,
"loss": 7.1174,
"step": 260
},
{
"epoch": 0.08370844191739714,
"grad_norm": 0.35292758498440635,
"learning_rate": 0.0001674565560821485,
"loss": 7.1111,
"step": 265
},
{
"epoch": 0.08528784648187633,
"grad_norm": 0.5036068687898875,
"learning_rate": 0.00017061611374407585,
"loss": 7.1127,
"step": 270
},
{
"epoch": 0.08686725104635552,
"grad_norm": 0.5445505627154091,
"learning_rate": 0.00017377567140600318,
"loss": 7.1624,
"step": 275
},
{
"epoch": 0.08844665561083472,
"grad_norm": 0.46005020223145754,
"learning_rate": 0.0001769352290679305,
"loss": 7.1547,
"step": 280
},
{
"epoch": 0.09002606017531391,
"grad_norm": 0.5670663056900972,
"learning_rate": 0.00018009478672985783,
"loss": 7.1144,
"step": 285
},
{
"epoch": 0.0916054647397931,
"grad_norm": 0.15859858120807763,
"learning_rate": 0.00018325434439178515,
"loss": 7.0902,
"step": 290
},
{
"epoch": 0.09318486930427229,
"grad_norm": 1.051807672856598,
"learning_rate": 0.00018641390205371248,
"loss": 7.0528,
"step": 295
},
{
"epoch": 0.09476427386875148,
"grad_norm": 0.29508482169156447,
"learning_rate": 0.0001895734597156398,
"loss": 7.1428,
"step": 300
},
{
"epoch": 0.09634367843323068,
"grad_norm": 0.16647468794562667,
"learning_rate": 0.00019273301737756716,
"loss": 7.0804,
"step": 305
},
{
"epoch": 0.09792308299770987,
"grad_norm": 0.11022893704335947,
"learning_rate": 0.00019589257503949448,
"loss": 7.0636,
"step": 310
},
{
"epoch": 0.09950248756218906,
"grad_norm": 0.22812614469648151,
"learning_rate": 0.0001990521327014218,
"loss": 7.0417,
"step": 315
},
{
"epoch": 0.10108189212666825,
"grad_norm": 0.2046112457147709,
"learning_rate": 0.00020221169036334913,
"loss": 6.9876,
"step": 320
},
{
"epoch": 0.10266129669114744,
"grad_norm": 0.2534026181514207,
"learning_rate": 0.00020537124802527646,
"loss": 7.0649,
"step": 325
},
{
"epoch": 0.10424070125562662,
"grad_norm": 0.6621399961798852,
"learning_rate": 0.00020853080568720379,
"loss": 7.0734,
"step": 330
},
{
"epoch": 0.10582010582010581,
"grad_norm": 0.43723259742782006,
"learning_rate": 0.00021169036334913114,
"loss": 7.0412,
"step": 335
},
{
"epoch": 0.107399510384585,
"grad_norm": 1.2826871012932044,
"learning_rate": 0.00021484992101105846,
"loss": 7.0748,
"step": 340
},
{
"epoch": 0.1089789149490642,
"grad_norm": 0.849524470966563,
"learning_rate": 0.0002180094786729858,
"loss": 7.084,
"step": 345
},
{
"epoch": 0.11055831951354339,
"grad_norm": 0.40236389663315125,
"learning_rate": 0.00022116903633491312,
"loss": 7.0698,
"step": 350
},
{
"epoch": 0.11213772407802258,
"grad_norm": 0.27246238834316205,
"learning_rate": 0.00022432859399684044,
"loss": 7.1011,
"step": 355
},
{
"epoch": 0.11371712864250177,
"grad_norm": 0.30332445638169414,
"learning_rate": 0.00022748815165876777,
"loss": 7.0469,
"step": 360
},
{
"epoch": 0.11529653320698097,
"grad_norm": 0.27851972139163145,
"learning_rate": 0.00023064770932069512,
"loss": 7.0559,
"step": 365
},
{
"epoch": 0.11687593777146016,
"grad_norm": 2.470542225348509,
"learning_rate": 0.00023380726698262244,
"loss": 7.057,
"step": 370
},
{
"epoch": 0.11845534233593935,
"grad_norm": 1.1816037580172367,
"learning_rate": 0.00023696682464454977,
"loss": 7.0296,
"step": 375
},
{
"epoch": 0.12003474690041854,
"grad_norm": 0.9194065752229377,
"learning_rate": 0.0002401263823064771,
"loss": 7.0419,
"step": 380
},
{
"epoch": 0.12161415146489774,
"grad_norm": 0.785676558788233,
"learning_rate": 0.00024328593996840442,
"loss": 7.0803,
"step": 385
},
{
"epoch": 0.12319355602937693,
"grad_norm": 0.3542536042131285,
"learning_rate": 0.0002464454976303318,
"loss": 7.0036,
"step": 390
},
{
"epoch": 0.12477296059385612,
"grad_norm": 0.30928544139513303,
"learning_rate": 0.00024960505529225907,
"loss": 7.0368,
"step": 395
},
{
"epoch": 0.1263523651583353,
"grad_norm": 0.6049027454231509,
"learning_rate": 0.0002527646129541864,
"loss": 7.0539,
"step": 400
},
{
"epoch": 0.1279317697228145,
"grad_norm": 0.2633532003308791,
"learning_rate": 0.0002559241706161137,
"loss": 7.0535,
"step": 405
},
{
"epoch": 0.1295111742872937,
"grad_norm": 0.6449761311297018,
"learning_rate": 0.0002590837282780411,
"loss": 7.0584,
"step": 410
},
{
"epoch": 0.1310905788517729,
"grad_norm": 0.8176988911712082,
"learning_rate": 0.00026224328593996843,
"loss": 7.0499,
"step": 415
},
{
"epoch": 0.13266998341625208,
"grad_norm": 1.0392572109016087,
"learning_rate": 0.0002654028436018958,
"loss": 7.0109,
"step": 420
},
{
"epoch": 0.13424938798073127,
"grad_norm": 0.2934325636821357,
"learning_rate": 0.0002685624012638231,
"loss": 7.0422,
"step": 425
},
{
"epoch": 0.13582879254521046,
"grad_norm": 0.35119595809367254,
"learning_rate": 0.00027172195892575043,
"loss": 6.9462,
"step": 430
},
{
"epoch": 0.13740819710968966,
"grad_norm": 0.6074223601119809,
"learning_rate": 0.00027488151658767773,
"loss": 6.9556,
"step": 435
},
{
"epoch": 0.13898760167416885,
"grad_norm": 0.7676878495406972,
"learning_rate": 0.0002780410742496051,
"loss": 6.9101,
"step": 440
},
{
"epoch": 0.14056700623864804,
"grad_norm": 0.9692798944709154,
"learning_rate": 0.0002812006319115324,
"loss": 6.9324,
"step": 445
},
{
"epoch": 0.14214641080312723,
"grad_norm": 0.44757701778278786,
"learning_rate": 0.00028436018957345974,
"loss": 6.938,
"step": 450
},
{
"epoch": 0.14372581536760642,
"grad_norm": 0.7851047359753064,
"learning_rate": 0.00028751974723538703,
"loss": 6.9853,
"step": 455
},
{
"epoch": 0.14530521993208562,
"grad_norm": 0.39761848688780116,
"learning_rate": 0.0002906793048973144,
"loss": 6.9224,
"step": 460
},
{
"epoch": 0.1468846244965648,
"grad_norm": 0.29640850780132755,
"learning_rate": 0.0002938388625592417,
"loss": 6.9306,
"step": 465
},
{
"epoch": 0.14846402906104397,
"grad_norm": 1.0532977019610126,
"learning_rate": 0.00029699842022116904,
"loss": 6.8812,
"step": 470
},
{
"epoch": 0.15004343362552316,
"grad_norm": 1.7205774722673408,
"learning_rate": 0.0003001579778830964,
"loss": 6.8913,
"step": 475
},
{
"epoch": 0.15162283819000236,
"grad_norm": 0.5666132924088377,
"learning_rate": 0.00030331753554502374,
"loss": 6.974,
"step": 480
},
{
"epoch": 0.15320224275448155,
"grad_norm": 1.6059207113992533,
"learning_rate": 0.00030647709320695104,
"loss": 6.9365,
"step": 485
},
{
"epoch": 0.15478164731896074,
"grad_norm": 0.8633831039844193,
"learning_rate": 0.0003096366508688784,
"loss": 6.9206,
"step": 490
},
{
"epoch": 0.15636105188343993,
"grad_norm": 0.6843817707668207,
"learning_rate": 0.0003127962085308057,
"loss": 6.9343,
"step": 495
},
{
"epoch": 0.15794045644791913,
"grad_norm": 1.154498970933288,
"learning_rate": 0.00031595576619273305,
"loss": 6.8535,
"step": 500
},
{
"epoch": 0.15951986101239832,
"grad_norm": 1.044221506843542,
"learning_rate": 0.00031911532385466034,
"loss": 6.8448,
"step": 505
},
{
"epoch": 0.1610992655768775,
"grad_norm": 0.9026359138753273,
"learning_rate": 0.0003222748815165877,
"loss": 6.883,
"step": 510
},
{
"epoch": 0.1626786701413567,
"grad_norm": 0.7818431127658798,
"learning_rate": 0.000325434439178515,
"loss": 6.8274,
"step": 515
},
{
"epoch": 0.1642580747058359,
"grad_norm": 0.3038148153132765,
"learning_rate": 0.00032859399684044235,
"loss": 6.8275,
"step": 520
},
{
"epoch": 0.16583747927031509,
"grad_norm": 0.5514862335019367,
"learning_rate": 0.00033175355450236965,
"loss": 6.7956,
"step": 525
},
{
"epoch": 0.16741688383479428,
"grad_norm": 0.30623416454852226,
"learning_rate": 0.000334913112164297,
"loss": 6.7252,
"step": 530
},
{
"epoch": 0.16899628839927347,
"grad_norm": 0.2304880515847455,
"learning_rate": 0.0003380726698262243,
"loss": 6.7551,
"step": 535
},
{
"epoch": 0.17057569296375266,
"grad_norm": 0.3433961894230557,
"learning_rate": 0.0003412322274881517,
"loss": 6.7272,
"step": 540
},
{
"epoch": 0.17215509752823185,
"grad_norm": 0.2939590285931192,
"learning_rate": 0.000344391785150079,
"loss": 6.7187,
"step": 545
},
{
"epoch": 0.17373450209271105,
"grad_norm": 0.5364191748296011,
"learning_rate": 0.00034755134281200636,
"loss": 6.7198,
"step": 550
},
{
"epoch": 0.17531390665719024,
"grad_norm": 0.8371082121345393,
"learning_rate": 0.00035071090047393365,
"loss": 6.7018,
"step": 555
},
{
"epoch": 0.17689331122166943,
"grad_norm": 1.976191367544714,
"learning_rate": 0.000353870458135861,
"loss": 6.7703,
"step": 560
},
{
"epoch": 0.17847271578614862,
"grad_norm": 0.9401625695151546,
"learning_rate": 0.0003570300157977883,
"loss": 6.6485,
"step": 565
},
{
"epoch": 0.18005212035062781,
"grad_norm": 0.549662511039272,
"learning_rate": 0.00036018957345971566,
"loss": 6.5869,
"step": 570
},
{
"epoch": 0.181631524915107,
"grad_norm": 0.6162453796965656,
"learning_rate": 0.00036334913112164296,
"loss": 6.5924,
"step": 575
},
{
"epoch": 0.1832109294795862,
"grad_norm": 0.7924029515967493,
"learning_rate": 0.0003665086887835703,
"loss": 6.5347,
"step": 580
},
{
"epoch": 0.1847903340440654,
"grad_norm": 0.6839719475751052,
"learning_rate": 0.0003696682464454976,
"loss": 6.4657,
"step": 585
},
{
"epoch": 0.18636973860854458,
"grad_norm": 0.4801564500944708,
"learning_rate": 0.00037282780410742496,
"loss": 6.4492,
"step": 590
},
{
"epoch": 0.18794914317302377,
"grad_norm": 0.33876622201501777,
"learning_rate": 0.00037598736176935226,
"loss": 6.362,
"step": 595
},
{
"epoch": 0.18952854773750297,
"grad_norm": 0.9890962221430388,
"learning_rate": 0.0003791469194312796,
"loss": 6.3882,
"step": 600
},
{
"epoch": 0.19110795230198216,
"grad_norm": 1.7599819035026254,
"learning_rate": 0.00038230647709320696,
"loss": 6.4227,
"step": 605
},
{
"epoch": 0.19268735686646135,
"grad_norm": 0.5869579130945853,
"learning_rate": 0.0003854660347551343,
"loss": 6.4053,
"step": 610
},
{
"epoch": 0.19426676143094054,
"grad_norm": 0.45551072874042825,
"learning_rate": 0.0003886255924170616,
"loss": 6.2954,
"step": 615
},
{
"epoch": 0.19584616599541974,
"grad_norm": 0.309190315359063,
"learning_rate": 0.00039178515007898897,
"loss": 6.1529,
"step": 620
},
{
"epoch": 0.19742557055989893,
"grad_norm": 0.5257212216446493,
"learning_rate": 0.00039494470774091627,
"loss": 6.2696,
"step": 625
},
{
"epoch": 0.19900497512437812,
"grad_norm": 0.36743671772426867,
"learning_rate": 0.0003981042654028436,
"loss": 6.256,
"step": 630
},
{
"epoch": 0.2005843796888573,
"grad_norm": 0.4249015310170307,
"learning_rate": 0.0004012638230647709,
"loss": 6.1581,
"step": 635
},
{
"epoch": 0.2021637842533365,
"grad_norm": 0.3543328965539145,
"learning_rate": 0.00040442338072669827,
"loss": 6.1331,
"step": 640
},
{
"epoch": 0.2037431888178157,
"grad_norm": 0.3595495143147937,
"learning_rate": 0.00040758293838862557,
"loss": 6.0677,
"step": 645
},
{
"epoch": 0.2053225933822949,
"grad_norm": 0.318573179267392,
"learning_rate": 0.0004107424960505529,
"loss": 6.126,
"step": 650
},
{
"epoch": 0.20690199794677408,
"grad_norm": 1.4164004013583242,
"learning_rate": 0.0004139020537124802,
"loss": 6.0727,
"step": 655
},
{
"epoch": 0.20848140251125324,
"grad_norm": 0.8532063740196797,
"learning_rate": 0.00041706161137440757,
"loss": 6.1032,
"step": 660
},
{
"epoch": 0.21006080707573244,
"grad_norm": 0.6889050112886497,
"learning_rate": 0.0004202211690363349,
"loss": 5.9814,
"step": 665
},
{
"epoch": 0.21164021164021163,
"grad_norm": 0.6321713467797891,
"learning_rate": 0.0004233807266982623,
"loss": 6.1128,
"step": 670
},
{
"epoch": 0.21321961620469082,
"grad_norm": 0.6434857024537683,
"learning_rate": 0.0004265402843601896,
"loss": 5.93,
"step": 675
},
{
"epoch": 0.21479902076917,
"grad_norm": 0.6137569995098859,
"learning_rate": 0.00042969984202211693,
"loss": 5.8829,
"step": 680
},
{
"epoch": 0.2163784253336492,
"grad_norm": 0.38408385420051877,
"learning_rate": 0.0004328593996840442,
"loss": 5.8576,
"step": 685
},
{
"epoch": 0.2179578298981284,
"grad_norm": 0.634119528990219,
"learning_rate": 0.0004360189573459716,
"loss": 5.8313,
"step": 690
},
{
"epoch": 0.2195372344626076,
"grad_norm": 0.918482316143869,
"learning_rate": 0.0004391785150078989,
"loss": 5.8946,
"step": 695
},
{
"epoch": 0.22111663902708678,
"grad_norm": 0.7907809934880858,
"learning_rate": 0.00044233807266982623,
"loss": 5.834,
"step": 700
},
{
"epoch": 0.22269604359156597,
"grad_norm": 0.46370630571448646,
"learning_rate": 0.00044549763033175353,
"loss": 5.7829,
"step": 705
},
{
"epoch": 0.22427544815604517,
"grad_norm": 0.5823074487510479,
"learning_rate": 0.0004486571879936809,
"loss": 5.7058,
"step": 710
},
{
"epoch": 0.22585485272052436,
"grad_norm": 0.48774049732384694,
"learning_rate": 0.00045181674565560823,
"loss": 5.686,
"step": 715
},
{
"epoch": 0.22743425728500355,
"grad_norm": 0.35212117236149904,
"learning_rate": 0.00045497630331753553,
"loss": 5.7281,
"step": 720
},
{
"epoch": 0.22901366184948274,
"grad_norm": 0.43483984405404036,
"learning_rate": 0.0004581358609794629,
"loss": 5.6102,
"step": 725
},
{
"epoch": 0.23059306641396193,
"grad_norm": 0.6591526873658319,
"learning_rate": 0.00046129541864139024,
"loss": 5.739,
"step": 730
},
{
"epoch": 0.23217247097844113,
"grad_norm": 0.49546415150281264,
"learning_rate": 0.0004644549763033176,
"loss": 5.6577,
"step": 735
},
{
"epoch": 0.23375187554292032,
"grad_norm": 0.5496305910361383,
"learning_rate": 0.0004676145339652449,
"loss": 5.6056,
"step": 740
},
{
"epoch": 0.2353312801073995,
"grad_norm": 0.3473999093942096,
"learning_rate": 0.00047077409162717224,
"loss": 5.5325,
"step": 745
},
{
"epoch": 0.2369106846718787,
"grad_norm": 0.5652797885699842,
"learning_rate": 0.00047393364928909954,
"loss": 5.5865,
"step": 750
},
{
"epoch": 0.2384900892363579,
"grad_norm": 0.1869666229929509,
"learning_rate": 0.0004770932069510269,
"loss": 5.5449,
"step": 755
},
{
"epoch": 0.24006949380083709,
"grad_norm": 0.8352438525883608,
"learning_rate": 0.0004802527646129542,
"loss": 5.5177,
"step": 760
},
{
"epoch": 0.24164889836531628,
"grad_norm": 0.4945844539827825,
"learning_rate": 0.00048341232227488154,
"loss": 5.525,
"step": 765
},
{
"epoch": 0.24322830292979547,
"grad_norm": 0.44884298966390257,
"learning_rate": 0.00048657187993680884,
"loss": 5.5426,
"step": 770
},
{
"epoch": 0.24480770749427466,
"grad_norm": 0.4629116737725799,
"learning_rate": 0.0004897314375987362,
"loss": 5.5327,
"step": 775
},
{
"epoch": 0.24638711205875385,
"grad_norm": 0.46472967641352214,
"learning_rate": 0.0004928909952606635,
"loss": 5.4677,
"step": 780
},
{
"epoch": 0.24796651662323305,
"grad_norm": 0.44982438719700835,
"learning_rate": 0.0004960505529225908,
"loss": 5.4653,
"step": 785
},
{
"epoch": 0.24954592118771224,
"grad_norm": 0.3514112193707262,
"learning_rate": 0.0004992101105845181,
"loss": 5.4494,
"step": 790
},
{
"epoch": 0.25112532575219143,
"grad_norm": 0.5242266351187057,
"learning_rate": 0.0005023696682464455,
"loss": 5.4762,
"step": 795
},
{
"epoch": 0.2527047303166706,
"grad_norm": 0.5584823092530848,
"learning_rate": 0.0005055292259083729,
"loss": 5.3685,
"step": 800
},
{
"epoch": 0.2542841348811498,
"grad_norm": 0.33757365754502244,
"learning_rate": 0.0005086887835703001,
"loss": 5.3335,
"step": 805
},
{
"epoch": 0.255863539445629,
"grad_norm": 0.5451094689520501,
"learning_rate": 0.0005118483412322274,
"loss": 5.3352,
"step": 810
},
{
"epoch": 0.2574429440101082,
"grad_norm": 0.6200864058068326,
"learning_rate": 0.0005150078988941548,
"loss": 5.2385,
"step": 815
},
{
"epoch": 0.2590223485745874,
"grad_norm": 0.4764076847440562,
"learning_rate": 0.0005181674565560822,
"loss": 5.2983,
"step": 820
},
{
"epoch": 0.2606017531390666,
"grad_norm": 0.5899460488837064,
"learning_rate": 0.0005213270142180095,
"loss": 5.316,
"step": 825
},
{
"epoch": 0.2621811577035458,
"grad_norm": 0.3984879097129385,
"learning_rate": 0.0005244865718799369,
"loss": 5.2552,
"step": 830
},
{
"epoch": 0.26376056226802497,
"grad_norm": 0.35376936484079524,
"learning_rate": 0.0005276461295418642,
"loss": 5.1943,
"step": 835
},
{
"epoch": 0.26533996683250416,
"grad_norm": 0.24093631193083778,
"learning_rate": 0.0005308056872037916,
"loss": 5.2637,
"step": 840
},
{
"epoch": 0.26691937139698335,
"grad_norm": 0.49391606311149877,
"learning_rate": 0.0005339652448657188,
"loss": 5.4132,
"step": 845
},
{
"epoch": 0.26849877596146254,
"grad_norm": 0.43395300336607834,
"learning_rate": 0.0005371248025276462,
"loss": 5.2483,
"step": 850
},
{
"epoch": 0.27007818052594174,
"grad_norm": 0.46447905290775454,
"learning_rate": 0.0005402843601895735,
"loss": 5.1954,
"step": 855
},
{
"epoch": 0.2716575850904209,
"grad_norm": 0.2886589835076223,
"learning_rate": 0.0005434439178515009,
"loss": 5.2153,
"step": 860
},
{
"epoch": 0.2732369896549001,
"grad_norm": 0.36161004026508903,
"learning_rate": 0.0005466034755134281,
"loss": 5.2296,
"step": 865
},
{
"epoch": 0.2748163942193793,
"grad_norm": 0.36114207768892564,
"learning_rate": 0.0005497630331753555,
"loss": 5.2661,
"step": 870
},
{
"epoch": 0.2763957987838585,
"grad_norm": 0.4311436524552363,
"learning_rate": 0.0005529225908372828,
"loss": 5.2284,
"step": 875
},
{
"epoch": 0.2779752033483377,
"grad_norm": 0.5724727528073159,
"learning_rate": 0.0005560821484992102,
"loss": 5.279,
"step": 880
},
{
"epoch": 0.2795546079128169,
"grad_norm": 0.4022955239127195,
"learning_rate": 0.0005592417061611374,
"loss": 5.1918,
"step": 885
},
{
"epoch": 0.2811340124772961,
"grad_norm": 0.3832205002594842,
"learning_rate": 0.0005624012638230648,
"loss": 5.0748,
"step": 890
},
{
"epoch": 0.2827134170417753,
"grad_norm": 0.42672311680939823,
"learning_rate": 0.0005655608214849921,
"loss": 5.1771,
"step": 895
},
{
"epoch": 0.28429282160625446,
"grad_norm": 0.3416870055878534,
"learning_rate": 0.0005687203791469195,
"loss": 5.0845,
"step": 900
},
{
"epoch": 0.28587222617073366,
"grad_norm": 0.42324854308877585,
"learning_rate": 0.0005718799368088467,
"loss": 5.0715,
"step": 905
},
{
"epoch": 0.28745163073521285,
"grad_norm": 0.6012027315352575,
"learning_rate": 0.0005750394944707741,
"loss": 5.1826,
"step": 910
},
{
"epoch": 0.28903103529969204,
"grad_norm": 0.5218406433884802,
"learning_rate": 0.0005781990521327014,
"loss": 5.1133,
"step": 915
},
{
"epoch": 0.29061043986417123,
"grad_norm": 0.4149110767186936,
"learning_rate": 0.0005813586097946288,
"loss": 5.0224,
"step": 920
},
{
"epoch": 0.2921898444286504,
"grad_norm": 0.42354283670841264,
"learning_rate": 0.000584518167456556,
"loss": 5.1543,
"step": 925
},
{
"epoch": 0.2937692489931296,
"grad_norm": 0.5334352394067117,
"learning_rate": 0.0005876777251184834,
"loss": 5.0157,
"step": 930
},
{
"epoch": 0.2953486535576088,
"grad_norm": 0.4450401762543175,
"learning_rate": 0.0005908372827804107,
"loss": 5.0368,
"step": 935
},
{
"epoch": 0.29692805812208795,
"grad_norm": 0.45932586952217247,
"learning_rate": 0.0005939968404423381,
"loss": 5.0463,
"step": 940
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.42588394451324696,
"learning_rate": 0.0005971563981042653,
"loss": 5.0819,
"step": 945
},
{
"epoch": 0.30008686725104633,
"grad_norm": 0.6656788426154296,
"learning_rate": 0.0006003159557661928,
"loss": 5.1731,
"step": 950
},
{
"epoch": 0.3016662718155255,
"grad_norm": 0.47719556791409956,
"learning_rate": 0.0006034755134281201,
"loss": 5.0238,
"step": 955
},
{
"epoch": 0.3032456763800047,
"grad_norm": 0.5611432111600015,
"learning_rate": 0.0006066350710900475,
"loss": 4.9896,
"step": 960
},
{
"epoch": 0.3048250809444839,
"grad_norm": 0.6143519827313882,
"learning_rate": 0.0006097946287519747,
"loss": 5.0388,
"step": 965
},
{
"epoch": 0.3064044855089631,
"grad_norm": 0.4117370891557286,
"learning_rate": 0.0006129541864139021,
"loss": 4.9773,
"step": 970
},
{
"epoch": 0.3079838900734423,
"grad_norm": 0.4040255243863166,
"learning_rate": 0.0006161137440758294,
"loss": 4.9743,
"step": 975
},
{
"epoch": 0.3095632946379215,
"grad_norm": 0.3685024774727212,
"learning_rate": 0.0006192733017377568,
"loss": 4.9448,
"step": 980
},
{
"epoch": 0.3111426992024007,
"grad_norm": 0.5079257507748496,
"learning_rate": 0.000622432859399684,
"loss": 4.9968,
"step": 985
},
{
"epoch": 0.31272210376687987,
"grad_norm": 0.41938029073526095,
"learning_rate": 0.0006255924170616114,
"loss": 4.8603,
"step": 990
},
{
"epoch": 0.31430150833135906,
"grad_norm": 0.40494473054595437,
"learning_rate": 0.0006287519747235387,
"loss": 5.0043,
"step": 995
},
{
"epoch": 0.31588091289583825,
"grad_norm": 0.38506688078551043,
"learning_rate": 0.0006319115323854661,
"loss": 4.8982,
"step": 1000
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.39620636828274935,
"learning_rate": 0.0006350710900473933,
"loss": 4.9752,
"step": 1005
},
{
"epoch": 0.31903972202479663,
"grad_norm": 0.4173437016756873,
"learning_rate": 0.0006382306477093207,
"loss": 4.88,
"step": 1010
},
{
"epoch": 0.3206191265892758,
"grad_norm": 0.512317814062129,
"learning_rate": 0.000641390205371248,
"loss": 4.9377,
"step": 1015
},
{
"epoch": 0.322198531153755,
"grad_norm": 0.3942980764517883,
"learning_rate": 0.0006445497630331754,
"loss": 4.7866,
"step": 1020
},
{
"epoch": 0.3237779357182342,
"grad_norm": 0.39301676577873923,
"learning_rate": 0.0006477093206951026,
"loss": 4.8813,
"step": 1025
},
{
"epoch": 0.3253573402827134,
"grad_norm": 0.39096154480289047,
"learning_rate": 0.00065086887835703,
"loss": 4.927,
"step": 1030
},
{
"epoch": 0.3269367448471926,
"grad_norm": 0.5660222446285443,
"learning_rate": 0.0006540284360189573,
"loss": 4.9219,
"step": 1035
},
{
"epoch": 0.3285161494116718,
"grad_norm": 0.2968407231102146,
"learning_rate": 0.0006571879936808847,
"loss": 4.8572,
"step": 1040
},
{
"epoch": 0.330095553976151,
"grad_norm": 0.33852019934484984,
"learning_rate": 0.0006603475513428119,
"loss": 4.8026,
"step": 1045
},
{
"epoch": 0.33167495854063017,
"grad_norm": 0.275317165863335,
"learning_rate": 0.0006635071090047393,
"loss": 4.7798,
"step": 1050
},
{
"epoch": 0.33325436310510936,
"grad_norm": 0.36017928511621944,
"learning_rate": 0.0006666666666666666,
"loss": 4.9384,
"step": 1055
},
{
"epoch": 0.33483376766958856,
"grad_norm": 0.37632772082071475,
"learning_rate": 0.000669826224328594,
"loss": 4.7871,
"step": 1060
},
{
"epoch": 0.33641317223406775,
"grad_norm": 0.37278569781126064,
"learning_rate": 0.0006729857819905212,
"loss": 4.8438,
"step": 1065
},
{
"epoch": 0.33799257679854694,
"grad_norm": 0.48509481516036007,
"learning_rate": 0.0006761453396524486,
"loss": 4.7854,
"step": 1070
},
{
"epoch": 0.33957198136302613,
"grad_norm": 0.3689752437680978,
"learning_rate": 0.000679304897314376,
"loss": 4.7908,
"step": 1075
},
{
"epoch": 0.3411513859275053,
"grad_norm": 0.42509360378032895,
"learning_rate": 0.0006824644549763034,
"loss": 4.8053,
"step": 1080
},
{
"epoch": 0.3427307904919845,
"grad_norm": 0.3531227024868157,
"learning_rate": 0.0006856240126382308,
"loss": 4.7319,
"step": 1085
},
{
"epoch": 0.3443101950564637,
"grad_norm": 0.4319768745377021,
"learning_rate": 0.000688783570300158,
"loss": 4.771,
"step": 1090
},
{
"epoch": 0.3458895996209429,
"grad_norm": 0.5218994616666175,
"learning_rate": 0.0006919431279620854,
"loss": 4.6886,
"step": 1095
},
{
"epoch": 0.3474690041854221,
"grad_norm": 0.40081190957648,
"learning_rate": 0.0006951026856240127,
"loss": 4.8512,
"step": 1100
},
{
"epoch": 0.3490484087499013,
"grad_norm": 0.40384089733876094,
"learning_rate": 0.0006982622432859401,
"loss": 4.7655,
"step": 1105
},
{
"epoch": 0.3506278133143805,
"grad_norm": 0.3990224997671872,
"learning_rate": 0.0007014218009478673,
"loss": 4.7179,
"step": 1110
},
{
"epoch": 0.35220721787885967,
"grad_norm": 0.3671177954599768,
"learning_rate": 0.0007045813586097947,
"loss": 4.7594,
"step": 1115
},
{
"epoch": 0.35378662244333886,
"grad_norm": 0.32592358544879374,
"learning_rate": 0.000707740916271722,
"loss": 4.8065,
"step": 1120
},
{
"epoch": 0.35536602700781805,
"grad_norm": 0.27989667046997696,
"learning_rate": 0.0007109004739336494,
"loss": 4.7348,
"step": 1125
},
{
"epoch": 0.35694543157229724,
"grad_norm": 0.2726805249398657,
"learning_rate": 0.0007140600315955766,
"loss": 4.7251,
"step": 1130
},
{
"epoch": 0.35852483613677644,
"grad_norm": 0.26271169229037056,
"learning_rate": 0.000717219589257504,
"loss": 4.6697,
"step": 1135
},
{
"epoch": 0.36010424070125563,
"grad_norm": 0.24966337615323878,
"learning_rate": 0.0007203791469194313,
"loss": 4.6451,
"step": 1140
},
{
"epoch": 0.3616836452657348,
"grad_norm": 0.34651257210740116,
"learning_rate": 0.0007235387045813587,
"loss": 4.743,
"step": 1145
},
{
"epoch": 0.363263049830214,
"grad_norm": 0.30529045148203326,
"learning_rate": 0.0007266982622432859,
"loss": 4.6544,
"step": 1150
},
{
"epoch": 0.3648424543946932,
"grad_norm": 0.29516957256046145,
"learning_rate": 0.0007298578199052133,
"loss": 4.6314,
"step": 1155
},
{
"epoch": 0.3664218589591724,
"grad_norm": 0.37641801638716,
"learning_rate": 0.0007330173775671406,
"loss": 4.6799,
"step": 1160
},
{
"epoch": 0.3680012635236516,
"grad_norm": 0.581499758105289,
"learning_rate": 0.000736176935229068,
"loss": 4.6155,
"step": 1165
},
{
"epoch": 0.3695806680881308,
"grad_norm": 0.3303116674122958,
"learning_rate": 0.0007393364928909952,
"loss": 4.6535,
"step": 1170
},
{
"epoch": 0.37116007265261,
"grad_norm": 0.4152780156861754,
"learning_rate": 0.0007424960505529226,
"loss": 4.6205,
"step": 1175
},
{
"epoch": 0.37273947721708917,
"grad_norm": 0.5075025229078507,
"learning_rate": 0.0007456556082148499,
"loss": 4.7402,
"step": 1180
},
{
"epoch": 0.37431888178156836,
"grad_norm": 0.4168452318847694,
"learning_rate": 0.0007488151658767773,
"loss": 4.6322,
"step": 1185
},
{
"epoch": 0.37589828634604755,
"grad_norm": 0.35699756058795973,
"learning_rate": 0.0007519747235387045,
"loss": 4.5663,
"step": 1190
},
{
"epoch": 0.37747769091052674,
"grad_norm": 0.43478301194689534,
"learning_rate": 0.0007551342812006319,
"loss": 4.6439,
"step": 1195
},
{
"epoch": 0.37905709547500593,
"grad_norm": 0.44362147672336705,
"learning_rate": 0.0007582938388625592,
"loss": 4.6466,
"step": 1200
},
{
"epoch": 0.3806365000394851,
"grad_norm": 0.5273983527555247,
"learning_rate": 0.0007614533965244867,
"loss": 4.5934,
"step": 1205
},
{
"epoch": 0.3822159046039643,
"grad_norm": 0.4386346074087536,
"learning_rate": 0.0007646129541864139,
"loss": 4.5789,
"step": 1210
},
{
"epoch": 0.3837953091684435,
"grad_norm": 0.525664691716359,
"learning_rate": 0.0007677725118483413,
"loss": 4.6282,
"step": 1215
},
{
"epoch": 0.3853747137329227,
"grad_norm": 0.5991296460212412,
"learning_rate": 0.0007709320695102686,
"loss": 4.5133,
"step": 1220
},
{
"epoch": 0.3869541182974019,
"grad_norm": 0.3885180787223512,
"learning_rate": 0.000774091627172196,
"loss": 4.5911,
"step": 1225
},
{
"epoch": 0.3885335228618811,
"grad_norm": 0.2773505625109938,
"learning_rate": 0.0007772511848341232,
"loss": 4.543,
"step": 1230
},
{
"epoch": 0.3901129274263603,
"grad_norm": 0.27789170229758464,
"learning_rate": 0.0007804107424960506,
"loss": 4.6806,
"step": 1235
},
{
"epoch": 0.39169233199083947,
"grad_norm": 0.3647966903511207,
"learning_rate": 0.0007835703001579779,
"loss": 4.5042,
"step": 1240
},
{
"epoch": 0.39327173655531866,
"grad_norm": 0.3325733552089913,
"learning_rate": 0.0007867298578199053,
"loss": 4.6133,
"step": 1245
},
{
"epoch": 0.39485114111979785,
"grad_norm": 0.29407283332852896,
"learning_rate": 0.0007898894154818325,
"loss": 4.546,
"step": 1250
},
{
"epoch": 0.39643054568427705,
"grad_norm": 0.3224036612986526,
"learning_rate": 0.0007930489731437599,
"loss": 4.4823,
"step": 1255
},
{
"epoch": 0.39800995024875624,
"grad_norm": 0.4396548377930201,
"learning_rate": 0.0007962085308056872,
"loss": 4.4521,
"step": 1260
},
{
"epoch": 0.39958935481323543,
"grad_norm": 0.4520622366982296,
"learning_rate": 0.0007993680884676146,
"loss": 4.4983,
"step": 1265
},
{
"epoch": 0.4011687593777146,
"grad_norm": 0.5067471099692891,
"learning_rate": 0.0008025276461295418,
"loss": 4.6509,
"step": 1270
},
{
"epoch": 0.4027481639421938,
"grad_norm": 0.44983652286607784,
"learning_rate": 0.0008056872037914692,
"loss": 4.5551,
"step": 1275
},
{
"epoch": 0.404327568506673,
"grad_norm": 0.2744857789721565,
"learning_rate": 0.0008088467614533965,
"loss": 4.4722,
"step": 1280
},
{
"epoch": 0.4059069730711522,
"grad_norm": 0.3090823230246799,
"learning_rate": 0.0008120063191153239,
"loss": 4.5588,
"step": 1285
},
{
"epoch": 0.4074863776356314,
"grad_norm": 0.22056016603549802,
"learning_rate": 0.0008151658767772511,
"loss": 4.5044,
"step": 1290
},
{
"epoch": 0.4090657822001106,
"grad_norm": 0.22668335200552153,
"learning_rate": 0.0008183254344391785,
"loss": 4.4704,
"step": 1295
},
{
"epoch": 0.4106451867645898,
"grad_norm": 0.2528435325303638,
"learning_rate": 0.0008214849921011058,
"loss": 4.4797,
"step": 1300
},
{
"epoch": 0.41222459132906897,
"grad_norm": 0.3199211131575819,
"learning_rate": 0.0008246445497630332,
"loss": 4.4488,
"step": 1305
},
{
"epoch": 0.41380399589354816,
"grad_norm": 0.2841232699773147,
"learning_rate": 0.0008278041074249604,
"loss": 4.386,
"step": 1310
},
{
"epoch": 0.4153834004580273,
"grad_norm": 0.3289313223734094,
"learning_rate": 0.0008309636650868878,
"loss": 4.3735,
"step": 1315
},
{
"epoch": 0.4169628050225065,
"grad_norm": 0.5061174703186038,
"learning_rate": 0.0008341232227488151,
"loss": 4.501,
"step": 1320
},
{
"epoch": 0.4185422095869857,
"grad_norm": 0.31956130549869527,
"learning_rate": 0.0008372827804107425,
"loss": 4.4453,
"step": 1325
},
{
"epoch": 0.4201216141514649,
"grad_norm": 0.37003410488030913,
"learning_rate": 0.0008404423380726698,
"loss": 4.4463,
"step": 1330
},
{
"epoch": 0.42170101871594406,
"grad_norm": 0.3001295975880741,
"learning_rate": 0.0008436018957345972,
"loss": 4.4065,
"step": 1335
},
{
"epoch": 0.42328042328042326,
"grad_norm": 0.3309371435836036,
"learning_rate": 0.0008467614533965246,
"loss": 4.3952,
"step": 1340
},
{
"epoch": 0.42485982784490245,
"grad_norm": 0.3401308584471871,
"learning_rate": 0.0008499210110584519,
"loss": 4.3779,
"step": 1345
},
{
"epoch": 0.42643923240938164,
"grad_norm": 0.3730128820839676,
"learning_rate": 0.0008530805687203792,
"loss": 4.4364,
"step": 1350
},
{
"epoch": 0.42801863697386083,
"grad_norm": 0.40635210718596704,
"learning_rate": 0.0008562401263823065,
"loss": 4.4198,
"step": 1355
},
{
"epoch": 0.42959804153834,
"grad_norm": 0.3104368315190389,
"learning_rate": 0.0008593996840442339,
"loss": 4.442,
"step": 1360
},
{
"epoch": 0.4311774461028192,
"grad_norm": 0.2756437108872531,
"learning_rate": 0.0008625592417061612,
"loss": 4.3521,
"step": 1365
},
{
"epoch": 0.4327568506672984,
"grad_norm": 0.3209507950932325,
"learning_rate": 0.0008657187993680885,
"loss": 4.3763,
"step": 1370
},
{
"epoch": 0.4343362552317776,
"grad_norm": 0.3513469599362552,
"learning_rate": 0.0008688783570300158,
"loss": 4.4113,
"step": 1375
},
{
"epoch": 0.4359156597962568,
"grad_norm": 0.4580040169664537,
"learning_rate": 0.0008720379146919432,
"loss": 4.3868,
"step": 1380
},
{
"epoch": 0.437495064360736,
"grad_norm": 0.29184114117723914,
"learning_rate": 0.0008751974723538705,
"loss": 4.3205,
"step": 1385
},
{
"epoch": 0.4390744689252152,
"grad_norm": 0.3255059541417882,
"learning_rate": 0.0008783570300157978,
"loss": 4.4277,
"step": 1390
},
{
"epoch": 0.44065387348969437,
"grad_norm": 0.23731895358980856,
"learning_rate": 0.0008815165876777251,
"loss": 4.3787,
"step": 1395
},
{
"epoch": 0.44223327805417356,
"grad_norm": 0.25517686293566755,
"learning_rate": 0.0008846761453396525,
"loss": 4.3998,
"step": 1400
},
{
"epoch": 0.44381268261865275,
"grad_norm": 0.38653219284891066,
"learning_rate": 0.0008878357030015798,
"loss": 4.3319,
"step": 1405
},
{
"epoch": 0.44539208718313195,
"grad_norm": 0.4941047718072808,
"learning_rate": 0.0008909952606635071,
"loss": 4.3752,
"step": 1410
},
{
"epoch": 0.44697149174761114,
"grad_norm": 0.27392813184040077,
"learning_rate": 0.0008941548183254344,
"loss": 4.3326,
"step": 1415
},
{
"epoch": 0.44855089631209033,
"grad_norm": 0.5891785129930696,
"learning_rate": 0.0008973143759873618,
"loss": 4.4047,
"step": 1420
},
{
"epoch": 0.4501303008765695,
"grad_norm": 0.3774727501393342,
"learning_rate": 0.0009004739336492891,
"loss": 4.3729,
"step": 1425
},
{
"epoch": 0.4517097054410487,
"grad_norm": 0.3695157989637981,
"learning_rate": 0.0009036334913112165,
"loss": 4.3076,
"step": 1430
},
{
"epoch": 0.4532891100055279,
"grad_norm": 0.24729477197612165,
"learning_rate": 0.0009067930489731437,
"loss": 4.3055,
"step": 1435
},
{
"epoch": 0.4548685145700071,
"grad_norm": 0.2856405701879058,
"learning_rate": 0.0009099526066350711,
"loss": 4.2279,
"step": 1440
},
{
"epoch": 0.4564479191344863,
"grad_norm": 0.3946323745321831,
"learning_rate": 0.0009131121642969984,
"loss": 4.3059,
"step": 1445
},
{
"epoch": 0.4580273236989655,
"grad_norm": 0.2239737721505975,
"learning_rate": 0.0009162717219589258,
"loss": 4.3098,
"step": 1450
},
{
"epoch": 0.4596067282634447,
"grad_norm": 0.28758656187518616,
"learning_rate": 0.000919431279620853,
"loss": 4.3616,
"step": 1455
},
{
"epoch": 0.46118613282792387,
"grad_norm": 0.4034440744665382,
"learning_rate": 0.0009225908372827805,
"loss": 4.2732,
"step": 1460
},
{
"epoch": 0.46276553739240306,
"grad_norm": 0.3297059692259955,
"learning_rate": 0.0009257503949447078,
"loss": 4.3945,
"step": 1465
},
{
"epoch": 0.46434494195688225,
"grad_norm": 0.3748529466334708,
"learning_rate": 0.0009289099526066352,
"loss": 4.303,
"step": 1470
},
{
"epoch": 0.46592434652136144,
"grad_norm": 0.42040543622175475,
"learning_rate": 0.0009320695102685624,
"loss": 4.2861,
"step": 1475
},
{
"epoch": 0.46750375108584064,
"grad_norm": 0.27875315582903953,
"learning_rate": 0.0009352290679304898,
"loss": 4.2853,
"step": 1480
},
{
"epoch": 0.4690831556503198,
"grad_norm": 0.3086495849332195,
"learning_rate": 0.0009383886255924171,
"loss": 4.232,
"step": 1485
},
{
"epoch": 0.470662560214799,
"grad_norm": 0.2553168340779991,
"learning_rate": 0.0009415481832543445,
"loss": 4.2786,
"step": 1490
},
{
"epoch": 0.4722419647792782,
"grad_norm": 0.4119881398779856,
"learning_rate": 0.0009447077409162717,
"loss": 4.3109,
"step": 1495
},
{
"epoch": 0.4738213693437574,
"grad_norm": 0.24709135378680736,
"learning_rate": 0.0009478672985781991,
"loss": 4.2491,
"step": 1500
},
{
"epoch": 0.4754007739082366,
"grad_norm": 0.25124585986886755,
"learning_rate": 0.0009510268562401264,
"loss": 4.2701,
"step": 1505
},
{
"epoch": 0.4769801784727158,
"grad_norm": 0.2542704936390731,
"learning_rate": 0.0009541864139020538,
"loss": 4.2497,
"step": 1510
},
{
"epoch": 0.478559583037195,
"grad_norm": 0.5057204600813832,
"learning_rate": 0.000957345971563981,
"loss": 4.252,
"step": 1515
},
{
"epoch": 0.48013898760167417,
"grad_norm": 0.3214147081649884,
"learning_rate": 0.0009605055292259084,
"loss": 4.3211,
"step": 1520
},
{
"epoch": 0.48171839216615336,
"grad_norm": 0.3325568221215968,
"learning_rate": 0.0009636650868878357,
"loss": 4.3235,
"step": 1525
},
{
"epoch": 0.48329779673063256,
"grad_norm": 0.35194382502241867,
"learning_rate": 0.0009668246445497631,
"loss": 4.2069,
"step": 1530
},
{
"epoch": 0.48487720129511175,
"grad_norm": 0.3232562671913541,
"learning_rate": 0.0009699842022116903,
"loss": 4.315,
"step": 1535
},
{
"epoch": 0.48645660585959094,
"grad_norm": 0.3917725392029616,
"learning_rate": 0.0009731437598736177,
"loss": 4.2047,
"step": 1540
},
{
"epoch": 0.48803601042407013,
"grad_norm": 0.3564362217383263,
"learning_rate": 0.000976303317535545,
"loss": 4.1988,
"step": 1545
},
{
"epoch": 0.4896154149885493,
"grad_norm": 0.43236466336771057,
"learning_rate": 0.0009794628751974724,
"loss": 4.2329,
"step": 1550
},
{
"epoch": 0.4911948195530285,
"grad_norm": 0.2502583684842727,
"learning_rate": 0.0009826224328593996,
"loss": 4.1544,
"step": 1555
},
{
"epoch": 0.4927742241175077,
"grad_norm": 0.2500902436658623,
"learning_rate": 0.000985781990521327,
"loss": 4.3171,
"step": 1560
},
{
"epoch": 0.4943536286819869,
"grad_norm": 0.2544078332384059,
"learning_rate": 0.0009889415481832543,
"loss": 4.1461,
"step": 1565
},
{
"epoch": 0.4959330332464661,
"grad_norm": 0.3040688173532611,
"learning_rate": 0.0009921011058451816,
"loss": 4.2662,
"step": 1570
},
{
"epoch": 0.4975124378109453,
"grad_norm": 0.35895041445570003,
"learning_rate": 0.000995260663507109,
"loss": 4.2931,
"step": 1575
},
{
"epoch": 0.4990918423754245,
"grad_norm": 0.32434784743319817,
"learning_rate": 0.0009984202211690363,
"loss": 4.1844,
"step": 1580
},
{
"epoch": 0.5006712469399036,
"grad_norm": 0.24574931413980025,
"learning_rate": 0.0010015797788309638,
"loss": 4.2135,
"step": 1585
},
{
"epoch": 0.5022506515043829,
"grad_norm": 0.4906220353757956,
"learning_rate": 0.001004739336492891,
"loss": 4.1729,
"step": 1590
},
{
"epoch": 0.503830056068862,
"grad_norm": 0.292823648070113,
"learning_rate": 0.0010078988941548185,
"loss": 4.2662,
"step": 1595
},
{
"epoch": 0.5054094606333412,
"grad_norm": 0.30934621640685955,
"learning_rate": 0.0010110584518167457,
"loss": 4.1053,
"step": 1600
},
{
"epoch": 0.5069888651978204,
"grad_norm": 0.3114064964705933,
"learning_rate": 0.001014218009478673,
"loss": 4.2144,
"step": 1605
},
{
"epoch": 0.5085682697622996,
"grad_norm": 0.36860508283438886,
"learning_rate": 0.0010173775671406002,
"loss": 4.1171,
"step": 1610
},
{
"epoch": 0.5101476743267788,
"grad_norm": 0.259770472981659,
"learning_rate": 0.0010205371248025277,
"loss": 4.1222,
"step": 1615
},
{
"epoch": 0.511727078891258,
"grad_norm": 0.34669740677241034,
"learning_rate": 0.001023696682464455,
"loss": 4.1779,
"step": 1620
},
{
"epoch": 0.5133064834557372,
"grad_norm": 0.26776977534742985,
"learning_rate": 0.0010268562401263824,
"loss": 4.1824,
"step": 1625
},
{
"epoch": 0.5148858880202164,
"grad_norm": 0.33482765454958535,
"learning_rate": 0.0010300157977883096,
"loss": 4.2453,
"step": 1630
},
{
"epoch": 0.5164652925846955,
"grad_norm": 0.41188740885185754,
"learning_rate": 0.001033175355450237,
"loss": 4.1147,
"step": 1635
},
{
"epoch": 0.5180446971491748,
"grad_norm": 0.2964291035835029,
"learning_rate": 0.0010363349131121643,
"loss": 4.2193,
"step": 1640
},
{
"epoch": 0.5196241017136539,
"grad_norm": 0.2793833405669084,
"learning_rate": 0.0010394944707740915,
"loss": 4.212,
"step": 1645
},
{
"epoch": 0.5212035062781332,
"grad_norm": 0.29713774307604923,
"learning_rate": 0.001042654028436019,
"loss": 4.0438,
"step": 1650
},
{
"epoch": 0.5227829108426123,
"grad_norm": 0.26956554558522977,
"learning_rate": 0.0010458135860979463,
"loss": 4.106,
"step": 1655
},
{
"epoch": 0.5243623154070916,
"grad_norm": 0.24092181432598472,
"learning_rate": 0.0010489731437598737,
"loss": 4.1685,
"step": 1660
},
{
"epoch": 0.5259417199715707,
"grad_norm": 0.26555033371413345,
"learning_rate": 0.001052132701421801,
"loss": 4.0912,
"step": 1665
},
{
"epoch": 0.5275211245360499,
"grad_norm": 0.25132501142979297,
"learning_rate": 0.0010552922590837284,
"loss": 4.0463,
"step": 1670
},
{
"epoch": 0.5291005291005291,
"grad_norm": 0.25984688233738684,
"learning_rate": 0.0010584518167456557,
"loss": 4.0156,
"step": 1675
},
{
"epoch": 0.5306799336650083,
"grad_norm": 0.29518646888723116,
"learning_rate": 0.0010616113744075831,
"loss": 4.1824,
"step": 1680
},
{
"epoch": 0.5322593382294875,
"grad_norm": 0.28830409655868694,
"learning_rate": 0.0010647709320695102,
"loss": 4.0853,
"step": 1685
},
{
"epoch": 0.5338387427939667,
"grad_norm": 0.3629932150134809,
"learning_rate": 0.0010679304897314376,
"loss": 4.1343,
"step": 1690
},
{
"epoch": 0.5354181473584458,
"grad_norm": 0.3206407737910774,
"learning_rate": 0.0010710900473933649,
"loss": 4.0968,
"step": 1695
},
{
"epoch": 0.5369975519229251,
"grad_norm": 0.3976703032902267,
"learning_rate": 0.0010742496050552923,
"loss": 4.1574,
"step": 1700
},
{
"epoch": 0.5385769564874042,
"grad_norm": 0.3410882250123248,
"learning_rate": 0.0010774091627172196,
"loss": 3.9865,
"step": 1705
},
{
"epoch": 0.5401563610518835,
"grad_norm": 0.24914445572445618,
"learning_rate": 0.001080568720379147,
"loss": 4.1298,
"step": 1710
},
{
"epoch": 0.5417357656163626,
"grad_norm": 0.3586153475100128,
"learning_rate": 0.0010837282780410743,
"loss": 4.0494,
"step": 1715
},
{
"epoch": 0.5433151701808419,
"grad_norm": 0.21271739910302082,
"learning_rate": 0.0010868878357030017,
"loss": 4.0796,
"step": 1720
},
{
"epoch": 0.544894574745321,
"grad_norm": 0.25098095308870794,
"learning_rate": 0.0010900473933649288,
"loss": 3.9703,
"step": 1725
},
{
"epoch": 0.5464739793098002,
"grad_norm": 0.22375556358935217,
"learning_rate": 0.0010932069510268562,
"loss": 4.1399,
"step": 1730
},
{
"epoch": 0.5480533838742794,
"grad_norm": 0.24930977346518232,
"learning_rate": 0.0010963665086887835,
"loss": 4.1181,
"step": 1735
},
{
"epoch": 0.5496327884387586,
"grad_norm": 0.332755640979972,
"learning_rate": 0.001099526066350711,
"loss": 4.0582,
"step": 1740
},
{
"epoch": 0.5512121930032378,
"grad_norm": 0.25641546202357435,
"learning_rate": 0.0011026856240126382,
"loss": 4.0961,
"step": 1745
},
{
"epoch": 0.552791597567717,
"grad_norm": 0.20369911360555534,
"learning_rate": 0.0011058451816745656,
"loss": 4.0879,
"step": 1750
},
{
"epoch": 0.5543710021321961,
"grad_norm": 0.20533893742270176,
"learning_rate": 0.0011090047393364929,
"loss": 4.1274,
"step": 1755
},
{
"epoch": 0.5559504066966754,
"grad_norm": 0.25026975391684464,
"learning_rate": 0.0011121642969984203,
"loss": 4.0035,
"step": 1760
},
{
"epoch": 0.5575298112611545,
"grad_norm": 0.30671183509078215,
"learning_rate": 0.0011153238546603474,
"loss": 4.0045,
"step": 1765
},
{
"epoch": 0.5591092158256338,
"grad_norm": 0.39359685055416405,
"learning_rate": 0.0011184834123222748,
"loss": 4.0696,
"step": 1770
},
{
"epoch": 0.5606886203901129,
"grad_norm": 0.4238119417095488,
"learning_rate": 0.0011216429699842023,
"loss": 4.1661,
"step": 1775
},
{
"epoch": 0.5622680249545922,
"grad_norm": 0.3295988445367429,
"learning_rate": 0.0011248025276461295,
"loss": 3.9719,
"step": 1780
},
{
"epoch": 0.5638474295190713,
"grad_norm": 0.3060613640788937,
"learning_rate": 0.001127962085308057,
"loss": 4.1499,
"step": 1785
},
{
"epoch": 0.5654268340835505,
"grad_norm": 0.3952683637011821,
"learning_rate": 0.0011311216429699842,
"loss": 4.0395,
"step": 1790
},
{
"epoch": 0.5670062386480297,
"grad_norm": 0.3734181377848123,
"learning_rate": 0.0011342812006319117,
"loss": 4.0731,
"step": 1795
},
{
"epoch": 0.5685856432125089,
"grad_norm": 0.38077178575661774,
"learning_rate": 0.001137440758293839,
"loss": 4.155,
"step": 1800
},
{
"epoch": 0.5701650477769881,
"grad_norm": 0.2690074343662603,
"learning_rate": 0.0011406003159557664,
"loss": 4.1339,
"step": 1805
},
{
"epoch": 0.5717444523414673,
"grad_norm": 0.3244752552450406,
"learning_rate": 0.0011437598736176934,
"loss": 4.0103,
"step": 1810
},
{
"epoch": 0.5733238569059464,
"grad_norm": 0.3249793558401697,
"learning_rate": 0.0011469194312796209,
"loss": 4.0671,
"step": 1815
},
{
"epoch": 0.5749032614704257,
"grad_norm": 0.37198248417206886,
"learning_rate": 0.0011500789889415481,
"loss": 4.0761,
"step": 1820
},
{
"epoch": 0.5764826660349048,
"grad_norm": 0.23416762438568905,
"learning_rate": 0.0011532385466034756,
"loss": 3.9817,
"step": 1825
},
{
"epoch": 0.5780620705993841,
"grad_norm": 0.2718063712033073,
"learning_rate": 0.0011563981042654028,
"loss": 4.0381,
"step": 1830
},
{
"epoch": 0.5796414751638632,
"grad_norm": 0.24530966492867137,
"learning_rate": 0.0011595576619273303,
"loss": 3.9687,
"step": 1835
},
{
"epoch": 0.5812208797283425,
"grad_norm": 0.3106480588010222,
"learning_rate": 0.0011627172195892575,
"loss": 3.9962,
"step": 1840
},
{
"epoch": 0.5828002842928216,
"grad_norm": 0.31913403219710973,
"learning_rate": 0.001165876777251185,
"loss": 4.0452,
"step": 1845
},
{
"epoch": 0.5843796888573008,
"grad_norm": 0.3275477776859289,
"learning_rate": 0.001169036334913112,
"loss": 4.0008,
"step": 1850
},
{
"epoch": 0.58595909342178,
"grad_norm": 0.25119136619450627,
"learning_rate": 0.0011721958925750395,
"loss": 4.0961,
"step": 1855
},
{
"epoch": 0.5875384979862592,
"grad_norm": 0.22397156772013765,
"learning_rate": 0.0011753554502369667,
"loss": 4.0451,
"step": 1860
},
{
"epoch": 0.5891179025507384,
"grad_norm": 0.20479994245596356,
"learning_rate": 0.0011785150078988942,
"loss": 4.0457,
"step": 1865
},
{
"epoch": 0.5906973071152176,
"grad_norm": 0.394527139585928,
"learning_rate": 0.0011816745655608214,
"loss": 4.0876,
"step": 1870
},
{
"epoch": 0.5922767116796968,
"grad_norm": 0.23435702222052282,
"learning_rate": 0.001184834123222749,
"loss": 3.9747,
"step": 1875
},
{
"epoch": 0.5938561162441759,
"grad_norm": 0.28634780566431706,
"learning_rate": 0.0011879936808846761,
"loss": 4.0351,
"step": 1880
},
{
"epoch": 0.5954355208086551,
"grad_norm": 0.23259288695977196,
"learning_rate": 0.0011911532385466036,
"loss": 4.0478,
"step": 1885
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.2740802794408343,
"learning_rate": 0.0011943127962085306,
"loss": 3.965,
"step": 1890
},
{
"epoch": 0.5985943299376135,
"grad_norm": 0.2968405906789927,
"learning_rate": 0.001197472353870458,
"loss": 3.9978,
"step": 1895
},
{
"epoch": 0.6001737345020927,
"grad_norm": 0.3510632332441351,
"learning_rate": 0.0012006319115323856,
"loss": 3.8756,
"step": 1900
},
{
"epoch": 0.6017531390665719,
"grad_norm": 0.33933157997666646,
"learning_rate": 0.0012037914691943128,
"loss": 4.1102,
"step": 1905
},
{
"epoch": 0.603332543631051,
"grad_norm": 0.2662117361891833,
"learning_rate": 0.0012069510268562403,
"loss": 3.9772,
"step": 1910
},
{
"epoch": 0.6049119481955303,
"grad_norm": 0.19808953368357718,
"learning_rate": 0.0012101105845181675,
"loss": 3.9335,
"step": 1915
},
{
"epoch": 0.6064913527600094,
"grad_norm": 0.1787972286211566,
"learning_rate": 0.001213270142180095,
"loss": 3.9334,
"step": 1920
},
{
"epoch": 0.6080707573244887,
"grad_norm": 0.22756804055302363,
"learning_rate": 0.0012164296998420222,
"loss": 4.0282,
"step": 1925
},
{
"epoch": 0.6096501618889678,
"grad_norm": 0.20739934559933562,
"learning_rate": 0.0012195892575039495,
"loss": 4.0244,
"step": 1930
},
{
"epoch": 0.6112295664534471,
"grad_norm": 0.25709062488029105,
"learning_rate": 0.0012227488151658767,
"loss": 3.9838,
"step": 1935
},
{
"epoch": 0.6128089710179262,
"grad_norm": 0.2859655649994034,
"learning_rate": 0.0012259083728278042,
"loss": 4.1079,
"step": 1940
},
{
"epoch": 0.6143883755824054,
"grad_norm": 0.2728350138342544,
"learning_rate": 0.0012290679304897314,
"loss": 3.9119,
"step": 1945
},
{
"epoch": 0.6159677801468846,
"grad_norm": 0.2575910181962937,
"learning_rate": 0.0012322274881516589,
"loss": 3.8685,
"step": 1950
},
{
"epoch": 0.6175471847113638,
"grad_norm": 0.20895739009488526,
"learning_rate": 0.0012353870458135861,
"loss": 3.8788,
"step": 1955
},
{
"epoch": 0.619126589275843,
"grad_norm": 0.24459107325549784,
"learning_rate": 0.0012385466034755136,
"loss": 3.8308,
"step": 1960
},
{
"epoch": 0.6207059938403222,
"grad_norm": 0.2507588464193189,
"learning_rate": 0.0012417061611374408,
"loss": 3.9587,
"step": 1965
},
{
"epoch": 0.6222853984048013,
"grad_norm": 0.29513467922086983,
"learning_rate": 0.001244865718799368,
"loss": 3.9315,
"step": 1970
},
{
"epoch": 0.6238648029692806,
"grad_norm": 0.20497026627483445,
"learning_rate": 0.0012480252764612953,
"loss": 3.9813,
"step": 1975
},
{
"epoch": 0.6254442075337597,
"grad_norm": 0.2936777035834914,
"learning_rate": 0.0012511848341232228,
"loss": 3.9606,
"step": 1980
},
{
"epoch": 0.627023612098239,
"grad_norm": 0.3422658240468666,
"learning_rate": 0.00125434439178515,
"loss": 3.948,
"step": 1985
},
{
"epoch": 0.6286030166627181,
"grad_norm": 0.31350047253232793,
"learning_rate": 0.0012575039494470775,
"loss": 3.9702,
"step": 1990
},
{
"epoch": 0.6301824212271974,
"grad_norm": 0.23473831158349892,
"learning_rate": 0.0012606635071090047,
"loss": 3.9025,
"step": 1995
},
{
"epoch": 0.6317618257916765,
"grad_norm": 0.23122760516885937,
"learning_rate": 0.0012638230647709322,
"loss": 3.8854,
"step": 2000
},
{
"epoch": 0.6333412303561557,
"grad_norm": 0.24819213985468946,
"learning_rate": 0.0012669826224328594,
"loss": 3.8488,
"step": 2005
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.2814044069474459,
"learning_rate": 0.0012701421800947867,
"loss": 3.8787,
"step": 2010
},
{
"epoch": 0.6365000394851141,
"grad_norm": 0.22554367699900693,
"learning_rate": 0.001273301737756714,
"loss": 3.839,
"step": 2015
},
{
"epoch": 0.6380794440495933,
"grad_norm": 0.24327030764405488,
"learning_rate": 0.0012764612954186414,
"loss": 3.9206,
"step": 2020
},
{
"epoch": 0.6396588486140725,
"grad_norm": 0.16093750563445822,
"learning_rate": 0.0012796208530805686,
"loss": 3.8874,
"step": 2025
},
{
"epoch": 0.6412382531785517,
"grad_norm": 0.194489762604418,
"learning_rate": 0.001282780410742496,
"loss": 3.9038,
"step": 2030
},
{
"epoch": 0.6428176577430309,
"grad_norm": 0.20599428439287257,
"learning_rate": 0.0012859399684044235,
"loss": 3.8576,
"step": 2035
},
{
"epoch": 0.64439706230751,
"grad_norm": 0.205011134486772,
"learning_rate": 0.0012890995260663508,
"loss": 3.9374,
"step": 2040
},
{
"epoch": 0.6459764668719893,
"grad_norm": 0.2377028741469135,
"learning_rate": 0.0012922590837282782,
"loss": 3.9984,
"step": 2045
},
{
"epoch": 0.6475558714364684,
"grad_norm": 0.24304609434521304,
"learning_rate": 0.0012954186413902053,
"loss": 3.9969,
"step": 2050
},
{
"epoch": 0.6491352760009477,
"grad_norm": 0.2623718421955022,
"learning_rate": 0.0012985781990521327,
"loss": 3.819,
"step": 2055
},
{
"epoch": 0.6507146805654268,
"grad_norm": 0.2745914540804823,
"learning_rate": 0.00130173775671406,
"loss": 3.9895,
"step": 2060
},
{
"epoch": 0.652294085129906,
"grad_norm": 0.22452179168678846,
"learning_rate": 0.0013048973143759874,
"loss": 3.8138,
"step": 2065
},
{
"epoch": 0.6538734896943852,
"grad_norm": 0.24802787851682156,
"learning_rate": 0.0013080568720379147,
"loss": 3.8979,
"step": 2070
},
{
"epoch": 0.6554528942588644,
"grad_norm": 0.2641386277150738,
"learning_rate": 0.0013112164296998421,
"loss": 3.9054,
"step": 2075
},
{
"epoch": 0.6570322988233436,
"grad_norm": 0.3221795475676082,
"learning_rate": 0.0013143759873617694,
"loss": 3.8572,
"step": 2080
},
{
"epoch": 0.6586117033878228,
"grad_norm": 0.19839742765982213,
"learning_rate": 0.0013175355450236969,
"loss": 3.8053,
"step": 2085
},
{
"epoch": 0.660191107952302,
"grad_norm": 0.26138504970498594,
"learning_rate": 0.0013206951026856239,
"loss": 3.9861,
"step": 2090
},
{
"epoch": 0.6617705125167812,
"grad_norm": 0.24992309356917045,
"learning_rate": 0.0013238546603475513,
"loss": 3.9771,
"step": 2095
},
{
"epoch": 0.6633499170812603,
"grad_norm": 0.30730598918197166,
"learning_rate": 0.0013270142180094786,
"loss": 3.8441,
"step": 2100
},
{
"epoch": 0.6649293216457396,
"grad_norm": 0.34181324929065954,
"learning_rate": 0.001330173775671406,
"loss": 3.9048,
"step": 2105
},
{
"epoch": 0.6665087262102187,
"grad_norm": 0.3703962744185399,
"learning_rate": 0.0013333333333333333,
"loss": 3.9084,
"step": 2110
},
{
"epoch": 0.668088130774698,
"grad_norm": 0.21963841409456603,
"learning_rate": 0.0013364928909952607,
"loss": 3.7848,
"step": 2115
},
{
"epoch": 0.6696675353391771,
"grad_norm": 0.3456682458725276,
"learning_rate": 0.001339652448657188,
"loss": 4.0472,
"step": 2120
},
{
"epoch": 0.6712469399036564,
"grad_norm": 0.1987812841934723,
"learning_rate": 0.0013428120063191155,
"loss": 3.96,
"step": 2125
},
{
"epoch": 0.6728263444681355,
"grad_norm": 0.19326508591069674,
"learning_rate": 0.0013459715639810425,
"loss": 3.9734,
"step": 2130
},
{
"epoch": 0.6744057490326147,
"grad_norm": 0.2520202968946282,
"learning_rate": 0.00134913112164297,
"loss": 3.8678,
"step": 2135
},
{
"epoch": 0.6759851535970939,
"grad_norm": 0.1832954776535829,
"learning_rate": 0.0013522906793048972,
"loss": 3.8302,
"step": 2140
},
{
"epoch": 0.6775645581615731,
"grad_norm": 0.20427163527235054,
"learning_rate": 0.0013554502369668246,
"loss": 3.7359,
"step": 2145
},
{
"epoch": 0.6791439627260523,
"grad_norm": 0.2014346803725015,
"learning_rate": 0.001358609794628752,
"loss": 3.8759,
"step": 2150
},
{
"epoch": 0.6807233672905315,
"grad_norm": 0.2330789181691575,
"learning_rate": 0.0013617693522906794,
"loss": 3.774,
"step": 2155
},
{
"epoch": 0.6823027718550106,
"grad_norm": 0.2053101704551176,
"learning_rate": 0.0013649289099526068,
"loss": 3.8485,
"step": 2160
},
{
"epoch": 0.6838821764194899,
"grad_norm": 0.2156812272229568,
"learning_rate": 0.001368088467614534,
"loss": 3.8498,
"step": 2165
},
{
"epoch": 0.685461580983969,
"grad_norm": 0.3474900523050622,
"learning_rate": 0.0013712480252764615,
"loss": 3.8156,
"step": 2170
},
{
"epoch": 0.6870409855484483,
"grad_norm": 0.20300679624217857,
"learning_rate": 0.0013744075829383885,
"loss": 3.8208,
"step": 2175
},
{
"epoch": 0.6886203901129274,
"grad_norm": 0.22758535553370787,
"learning_rate": 0.001377567140600316,
"loss": 3.7393,
"step": 2180
},
{
"epoch": 0.6901997946774067,
"grad_norm": 0.2913296112206454,
"learning_rate": 0.0013807266982622433,
"loss": 3.8639,
"step": 2185
},
{
"epoch": 0.6917791992418858,
"grad_norm": 0.22344429229234122,
"learning_rate": 0.0013838862559241707,
"loss": 3.8483,
"step": 2190
},
{
"epoch": 0.693358603806365,
"grad_norm": 0.24781341095554865,
"learning_rate": 0.001387045813586098,
"loss": 3.7865,
"step": 2195
},
{
"epoch": 0.6949380083708442,
"grad_norm": 0.24311562658045918,
"learning_rate": 0.0013902053712480254,
"loss": 3.8346,
"step": 2200
},
{
"epoch": 0.6965174129353234,
"grad_norm": 0.2977627052415685,
"learning_rate": 0.0013933649289099527,
"loss": 3.8134,
"step": 2205
},
{
"epoch": 0.6980968174998026,
"grad_norm": 0.40561638489455504,
"learning_rate": 0.0013965244865718801,
"loss": 3.9838,
"step": 2210
},
{
"epoch": 0.6996762220642818,
"grad_norm": 0.3162312925055,
"learning_rate": 0.0013996840442338072,
"loss": 3.8218,
"step": 2215
},
{
"epoch": 0.701255626628761,
"grad_norm": 0.20741807322760966,
"learning_rate": 0.0014028436018957346,
"loss": 3.8041,
"step": 2220
},
{
"epoch": 0.7028350311932402,
"grad_norm": 0.20987061820283978,
"learning_rate": 0.0014060031595576619,
"loss": 3.8393,
"step": 2225
},
{
"epoch": 0.7044144357577193,
"grad_norm": 0.1911108057821915,
"learning_rate": 0.0014091627172195893,
"loss": 3.7089,
"step": 2230
},
{
"epoch": 0.7059938403221986,
"grad_norm": 0.1768747480315818,
"learning_rate": 0.0014123222748815166,
"loss": 3.6769,
"step": 2235
},
{
"epoch": 0.7075732448866777,
"grad_norm": 0.2004499176891643,
"learning_rate": 0.001415481832543444,
"loss": 3.7227,
"step": 2240
},
{
"epoch": 0.709152649451157,
"grad_norm": 0.16218360545834662,
"learning_rate": 0.0014186413902053713,
"loss": 3.7493,
"step": 2245
},
{
"epoch": 0.7107320540156361,
"grad_norm": 0.2051019817661303,
"learning_rate": 0.0014218009478672987,
"loss": 3.8015,
"step": 2250
},
{
"epoch": 0.7123114585801152,
"grad_norm": 0.1693775898547751,
"learning_rate": 0.0014249605055292258,
"loss": 3.7405,
"step": 2255
},
{
"epoch": 0.7138908631445945,
"grad_norm": 0.23008194178880942,
"learning_rate": 0.0014281200631911532,
"loss": 3.7961,
"step": 2260
},
{
"epoch": 0.7154702677090736,
"grad_norm": 0.22627752956761354,
"learning_rate": 0.0014312796208530805,
"loss": 3.8194,
"step": 2265
},
{
"epoch": 0.7170496722735529,
"grad_norm": 0.16456172442202183,
"learning_rate": 0.001434439178515008,
"loss": 3.7835,
"step": 2270
},
{
"epoch": 0.718629076838032,
"grad_norm": 0.25085943817371653,
"learning_rate": 0.0014375987361769352,
"loss": 3.7775,
"step": 2275
},
{
"epoch": 0.7202084814025113,
"grad_norm": 0.19390913955357955,
"learning_rate": 0.0014407582938388626,
"loss": 3.8324,
"step": 2280
},
{
"epoch": 0.7217878859669904,
"grad_norm": 0.21262144885010617,
"learning_rate": 0.00144391785150079,
"loss": 3.8632,
"step": 2285
},
{
"epoch": 0.7233672905314696,
"grad_norm": 0.20753404886485088,
"learning_rate": 0.0014470774091627173,
"loss": 3.7361,
"step": 2290
},
{
"epoch": 0.7249466950959488,
"grad_norm": 0.18330521721784593,
"learning_rate": 0.0014502369668246446,
"loss": 3.6117,
"step": 2295
},
{
"epoch": 0.726526099660428,
"grad_norm": 0.28928349085054245,
"learning_rate": 0.0014533965244865718,
"loss": 3.7541,
"step": 2300
},
{
"epoch": 0.7281055042249072,
"grad_norm": 0.16420443253956293,
"learning_rate": 0.0014565560821484993,
"loss": 3.7508,
"step": 2305
},
{
"epoch": 0.7296849087893864,
"grad_norm": 0.21614784039850257,
"learning_rate": 0.0014597156398104265,
"loss": 3.7169,
"step": 2310
},
{
"epoch": 0.7312643133538655,
"grad_norm": 0.17984849327994049,
"learning_rate": 0.001462875197472354,
"loss": 3.7626,
"step": 2315
},
{
"epoch": 0.7328437179183448,
"grad_norm": 0.29654569082302534,
"learning_rate": 0.0014660347551342812,
"loss": 3.7716,
"step": 2320
},
{
"epoch": 0.7344231224828239,
"grad_norm": 0.2423639271628589,
"learning_rate": 0.0014691943127962087,
"loss": 3.8041,
"step": 2325
},
{
"epoch": 0.7360025270473032,
"grad_norm": 0.19251162129527793,
"learning_rate": 0.001472353870458136,
"loss": 3.6773,
"step": 2330
},
{
"epoch": 0.7375819316117823,
"grad_norm": 0.17987437526330385,
"learning_rate": 0.0014755134281200632,
"loss": 3.7863,
"step": 2335
},
{
"epoch": 0.7391613361762616,
"grad_norm": 0.28475822855536115,
"learning_rate": 0.0014786729857819904,
"loss": 3.8109,
"step": 2340
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.24249112320758767,
"learning_rate": 0.0014818325434439179,
"loss": 3.7797,
"step": 2345
},
{
"epoch": 0.74232014530522,
"grad_norm": 0.2326020797307572,
"learning_rate": 0.0014849921011058451,
"loss": 3.6954,
"step": 2350
},
{
"epoch": 0.7438995498696991,
"grad_norm": 0.22051402603596704,
"learning_rate": 0.0014881516587677726,
"loss": 3.8969,
"step": 2355
},
{
"epoch": 0.7454789544341783,
"grad_norm": 0.26421968183562905,
"learning_rate": 0.0014913112164296998,
"loss": 3.7278,
"step": 2360
},
{
"epoch": 0.7470583589986575,
"grad_norm": 0.20101343478698072,
"learning_rate": 0.0014944707740916273,
"loss": 3.6197,
"step": 2365
},
{
"epoch": 0.7486377635631367,
"grad_norm": 0.2079706232832599,
"learning_rate": 0.0014976303317535545,
"loss": 3.8934,
"step": 2370
},
{
"epoch": 0.7502171681276159,
"grad_norm": 0.2049265396867882,
"learning_rate": 0.0015007898894154818,
"loss": 3.6839,
"step": 2375
},
{
"epoch": 0.7517965726920951,
"grad_norm": 0.28063519759341937,
"learning_rate": 0.001503949447077409,
"loss": 3.8475,
"step": 2380
},
{
"epoch": 0.7533759772565742,
"grad_norm": 0.255893966061907,
"learning_rate": 0.0015071090047393365,
"loss": 3.7594,
"step": 2385
},
{
"epoch": 0.7549553818210535,
"grad_norm": 0.2258992783417018,
"learning_rate": 0.0015102685624012637,
"loss": 3.6973,
"step": 2390
},
{
"epoch": 0.7565347863855326,
"grad_norm": 0.28679321830822124,
"learning_rate": 0.0015134281200631912,
"loss": 3.7164,
"step": 2395
},
{
"epoch": 0.7581141909500119,
"grad_norm": 0.20356661517477284,
"learning_rate": 0.0015165876777251184,
"loss": 3.7652,
"step": 2400
},
{
"epoch": 0.759693595514491,
"grad_norm": 0.22262758807790112,
"learning_rate": 0.001519747235387046,
"loss": 3.6552,
"step": 2405
},
{
"epoch": 0.7612730000789703,
"grad_norm": 0.2249703886837002,
"learning_rate": 0.0015229067930489734,
"loss": 3.7424,
"step": 2410
},
{
"epoch": 0.7628524046434494,
"grad_norm": 0.16232152509783948,
"learning_rate": 0.0015260663507109004,
"loss": 3.6885,
"step": 2415
},
{
"epoch": 0.7644318092079286,
"grad_norm": 0.21140514606548705,
"learning_rate": 0.0015292259083728279,
"loss": 3.7257,
"step": 2420
},
{
"epoch": 0.7660112137724078,
"grad_norm": 0.2738777752868707,
"learning_rate": 0.001532385466034755,
"loss": 3.6921,
"step": 2425
},
{
"epoch": 0.767590618336887,
"grad_norm": 0.21449459870750393,
"learning_rate": 0.0015355450236966826,
"loss": 3.6344,
"step": 2430
},
{
"epoch": 0.7691700229013662,
"grad_norm": 0.23002450286814663,
"learning_rate": 0.0015387045813586098,
"loss": 3.712,
"step": 2435
},
{
"epoch": 0.7707494274658454,
"grad_norm": 0.22351746355041202,
"learning_rate": 0.0015418641390205373,
"loss": 3.7744,
"step": 2440
},
{
"epoch": 0.7723288320303245,
"grad_norm": 0.21500062869290557,
"learning_rate": 0.0015450236966824645,
"loss": 3.6787,
"step": 2445
},
{
"epoch": 0.7739082365948038,
"grad_norm": 0.22563787125139578,
"learning_rate": 0.001548183254344392,
"loss": 3.725,
"step": 2450
},
{
"epoch": 0.7754876411592829,
"grad_norm": 0.22153051124094789,
"learning_rate": 0.001551342812006319,
"loss": 3.7059,
"step": 2455
},
{
"epoch": 0.7770670457237622,
"grad_norm": 0.18970355960004148,
"learning_rate": 0.0015545023696682465,
"loss": 3.7338,
"step": 2460
},
{
"epoch": 0.7786464502882413,
"grad_norm": 0.1742134853025178,
"learning_rate": 0.0015576619273301737,
"loss": 3.6984,
"step": 2465
},
{
"epoch": 0.7802258548527206,
"grad_norm": 0.23660512962689312,
"learning_rate": 0.0015608214849921012,
"loss": 3.6406,
"step": 2470
},
{
"epoch": 0.7818052594171997,
"grad_norm": 0.3272784229892744,
"learning_rate": 0.0015639810426540284,
"loss": 3.709,
"step": 2475
},
{
"epoch": 0.7833846639816789,
"grad_norm": 0.20833361715866924,
"learning_rate": 0.0015671406003159559,
"loss": 3.6663,
"step": 2480
},
{
"epoch": 0.7849640685461581,
"grad_norm": 0.2748114142491958,
"learning_rate": 0.0015703001579778831,
"loss": 3.6892,
"step": 2485
},
{
"epoch": 0.7865434731106373,
"grad_norm": 0.19890328555853415,
"learning_rate": 0.0015734597156398106,
"loss": 3.7667,
"step": 2490
},
{
"epoch": 0.7881228776751165,
"grad_norm": 0.22412302917861454,
"learning_rate": 0.0015766192733017378,
"loss": 3.6875,
"step": 2495
},
{
"epoch": 0.7897022822395957,
"grad_norm": 0.19236289981188603,
"learning_rate": 0.001579778830963665,
"loss": 3.6889,
"step": 2500
},
{
"epoch": 0.7912816868040748,
"grad_norm": 0.1656764441178424,
"learning_rate": 0.0015829383886255923,
"loss": 3.7048,
"step": 2505
},
{
"epoch": 0.7928610913685541,
"grad_norm": 0.22914169581464922,
"learning_rate": 0.0015860979462875198,
"loss": 3.7057,
"step": 2510
},
{
"epoch": 0.7944404959330332,
"grad_norm": 0.19880632678862692,
"learning_rate": 0.001589257503949447,
"loss": 3.6536,
"step": 2515
},
{
"epoch": 0.7960199004975125,
"grad_norm": 0.20670459783742656,
"learning_rate": 0.0015924170616113745,
"loss": 3.6888,
"step": 2520
},
{
"epoch": 0.7975993050619916,
"grad_norm": 0.20556512892047715,
"learning_rate": 0.0015955766192733017,
"loss": 3.6812,
"step": 2525
},
{
"epoch": 0.7991787096264709,
"grad_norm": 0.2561386862908798,
"learning_rate": 0.0015987361769352292,
"loss": 3.8281,
"step": 2530
},
{
"epoch": 0.80075811419095,
"grad_norm": 0.21114926453470764,
"learning_rate": 0.0016018957345971566,
"loss": 3.8423,
"step": 2535
},
{
"epoch": 0.8023375187554292,
"grad_norm": 0.17098012197547188,
"learning_rate": 0.0016050552922590837,
"loss": 3.6149,
"step": 2540
},
{
"epoch": 0.8039169233199084,
"grad_norm": 0.19508734119650264,
"learning_rate": 0.0016082148499210111,
"loss": 3.6576,
"step": 2545
},
{
"epoch": 0.8054963278843876,
"grad_norm": 0.21765802709335372,
"learning_rate": 0.0016113744075829384,
"loss": 3.7115,
"step": 2550
},
{
"epoch": 0.8070757324488668,
"grad_norm": 0.24314664406509764,
"learning_rate": 0.0016145339652448658,
"loss": 3.7214,
"step": 2555
},
{
"epoch": 0.808655137013346,
"grad_norm": 0.29035553782387197,
"learning_rate": 0.001617693522906793,
"loss": 3.6752,
"step": 2560
},
{
"epoch": 0.8102345415778252,
"grad_norm": 0.16294563361366568,
"learning_rate": 0.0016208530805687205,
"loss": 3.562,
"step": 2565
},
{
"epoch": 0.8118139461423044,
"grad_norm": 0.17538885167621077,
"learning_rate": 0.0016240126382306478,
"loss": 3.653,
"step": 2570
},
{
"epoch": 0.8133933507067835,
"grad_norm": 0.246490697783557,
"learning_rate": 0.0016271721958925752,
"loss": 3.6845,
"step": 2575
},
{
"epoch": 0.8149727552712628,
"grad_norm": 0.17459066139539578,
"learning_rate": 0.0016303317535545023,
"loss": 3.7182,
"step": 2580
},
{
"epoch": 0.8165521598357419,
"grad_norm": 0.18265041543861538,
"learning_rate": 0.0016334913112164297,
"loss": 3.5774,
"step": 2585
},
{
"epoch": 0.8181315644002212,
"grad_norm": 0.23776280417043189,
"learning_rate": 0.001636650868878357,
"loss": 3.8624,
"step": 2590
},
{
"epoch": 0.8197109689647003,
"grad_norm": 0.2207643913126606,
"learning_rate": 0.0016398104265402844,
"loss": 3.7317,
"step": 2595
},
{
"epoch": 0.8212903735291796,
"grad_norm": 0.17566153909957044,
"learning_rate": 0.0016429699842022117,
"loss": 3.6867,
"step": 2600
},
{
"epoch": 0.8228697780936587,
"grad_norm": 0.2588250019337268,
"learning_rate": 0.0016461295418641391,
"loss": 3.6366,
"step": 2605
},
{
"epoch": 0.8244491826581379,
"grad_norm": 0.17498639531811824,
"learning_rate": 0.0016492890995260664,
"loss": 3.6697,
"step": 2610
},
{
"epoch": 0.8260285872226171,
"grad_norm": 0.19205844355571372,
"learning_rate": 0.0016524486571879938,
"loss": 3.6052,
"step": 2615
},
{
"epoch": 0.8276079917870963,
"grad_norm": 0.27220693405931584,
"learning_rate": 0.0016556082148499209,
"loss": 3.635,
"step": 2620
},
{
"epoch": 0.8291873963515755,
"grad_norm": 0.19674637897684247,
"learning_rate": 0.0016587677725118483,
"loss": 3.7254,
"step": 2625
},
{
"epoch": 0.8307668009160546,
"grad_norm": 0.2186697394730908,
"learning_rate": 0.0016619273301737756,
"loss": 3.6222,
"step": 2630
},
{
"epoch": 0.8323462054805338,
"grad_norm": 0.2960355405417273,
"learning_rate": 0.001665086887835703,
"loss": 3.6387,
"step": 2635
},
{
"epoch": 0.833925610045013,
"grad_norm": 0.2596808362060048,
"learning_rate": 0.0016682464454976303,
"loss": 3.5618,
"step": 2640
},
{
"epoch": 0.8355050146094922,
"grad_norm": 0.17733754261557175,
"learning_rate": 0.0016714060031595577,
"loss": 3.5618,
"step": 2645
},
{
"epoch": 0.8370844191739714,
"grad_norm": 0.1774345542731582,
"learning_rate": 0.001674565560821485,
"loss": 3.6559,
"step": 2650
},
{
"epoch": 0.8386638237384506,
"grad_norm": 0.20816053295200482,
"learning_rate": 0.0016777251184834125,
"loss": 3.6909,
"step": 2655
},
{
"epoch": 0.8402432283029297,
"grad_norm": 0.246077125438171,
"learning_rate": 0.0016808846761453397,
"loss": 3.5316,
"step": 2660
},
{
"epoch": 0.841822632867409,
"grad_norm": 0.2318326708694067,
"learning_rate": 0.001684044233807267,
"loss": 3.5794,
"step": 2665
},
{
"epoch": 0.8434020374318881,
"grad_norm": 0.24319493923572003,
"learning_rate": 0.0016872037914691944,
"loss": 3.645,
"step": 2670
},
{
"epoch": 0.8449814419963674,
"grad_norm": 0.17052776048586668,
"learning_rate": 0.0016903633491311216,
"loss": 3.5473,
"step": 2675
},
{
"epoch": 0.8465608465608465,
"grad_norm": 0.17919509242747675,
"learning_rate": 0.0016935229067930491,
"loss": 3.6732,
"step": 2680
},
{
"epoch": 0.8481402511253258,
"grad_norm": 0.24689978205175545,
"learning_rate": 0.0016966824644549764,
"loss": 3.6452,
"step": 2685
},
{
"epoch": 0.8497196556898049,
"grad_norm": 0.1985879167972585,
"learning_rate": 0.0016998420221169038,
"loss": 3.6603,
"step": 2690
},
{
"epoch": 0.8512990602542841,
"grad_norm": 0.17505379214501765,
"learning_rate": 0.001703001579778831,
"loss": 3.5913,
"step": 2695
},
{
"epoch": 0.8528784648187633,
"grad_norm": 0.17049655856229104,
"learning_rate": 0.0017061611374407583,
"loss": 3.5962,
"step": 2700
},
{
"epoch": 0.8544578693832425,
"grad_norm": 0.20477616815014238,
"learning_rate": 0.0017093206951026855,
"loss": 3.6277,
"step": 2705
},
{
"epoch": 0.8560372739477217,
"grad_norm": 0.18238352329571159,
"learning_rate": 0.001712480252764613,
"loss": 3.5996,
"step": 2710
},
{
"epoch": 0.8576166785122009,
"grad_norm": 0.1657684244805557,
"learning_rate": 0.0017156398104265403,
"loss": 3.6448,
"step": 2715
},
{
"epoch": 0.85919608307668,
"grad_norm": 0.18352367622580032,
"learning_rate": 0.0017187993680884677,
"loss": 3.6484,
"step": 2720
},
{
"epoch": 0.8607754876411593,
"grad_norm": 0.18498777399508617,
"learning_rate": 0.001721958925750395,
"loss": 3.6914,
"step": 2725
},
{
"epoch": 0.8623548922056384,
"grad_norm": 0.2006685637489181,
"learning_rate": 0.0017251184834123224,
"loss": 3.7236,
"step": 2730
},
{
"epoch": 0.8639342967701177,
"grad_norm": 0.1532842182126188,
"learning_rate": 0.0017282780410742497,
"loss": 3.6976,
"step": 2735
},
{
"epoch": 0.8655137013345968,
"grad_norm": 0.20769688280185736,
"learning_rate": 0.001731437598736177,
"loss": 3.5407,
"step": 2740
},
{
"epoch": 0.8670931058990761,
"grad_norm": 0.17160714406064806,
"learning_rate": 0.0017345971563981042,
"loss": 3.5158,
"step": 2745
},
{
"epoch": 0.8686725104635552,
"grad_norm": 0.13808832646048677,
"learning_rate": 0.0017377567140600316,
"loss": 3.5806,
"step": 2750
},
{
"epoch": 0.8702519150280345,
"grad_norm": 0.13578753863052162,
"learning_rate": 0.0017409162717219589,
"loss": 3.5781,
"step": 2755
},
{
"epoch": 0.8718313195925136,
"grad_norm": 0.15163041772953603,
"learning_rate": 0.0017440758293838863,
"loss": 3.5626,
"step": 2760
},
{
"epoch": 0.8734107241569928,
"grad_norm": 0.23873184566352487,
"learning_rate": 0.0017472353870458136,
"loss": 3.6502,
"step": 2765
},
{
"epoch": 0.874990128721472,
"grad_norm": 0.16021501896526982,
"learning_rate": 0.001750394944707741,
"loss": 3.5902,
"step": 2770
},
{
"epoch": 0.8765695332859512,
"grad_norm": 0.18029266140185582,
"learning_rate": 0.0017535545023696683,
"loss": 3.5893,
"step": 2775
},
{
"epoch": 0.8781489378504304,
"grad_norm": 0.14301734885212128,
"learning_rate": 0.0017567140600315955,
"loss": 3.6548,
"step": 2780
},
{
"epoch": 0.8797283424149096,
"grad_norm": 0.17731407703210675,
"learning_rate": 0.0017598736176935228,
"loss": 3.559,
"step": 2785
},
{
"epoch": 0.8813077469793887,
"grad_norm": 0.1954449258645855,
"learning_rate": 0.0017630331753554502,
"loss": 3.583,
"step": 2790
},
{
"epoch": 0.882887151543868,
"grad_norm": 0.18652326225135574,
"learning_rate": 0.0017661927330173777,
"loss": 3.5913,
"step": 2795
},
{
"epoch": 0.8844665561083471,
"grad_norm": 0.18624709754027133,
"learning_rate": 0.001769352290679305,
"loss": 3.5689,
"step": 2800
},
{
"epoch": 0.8860459606728264,
"grad_norm": 0.17838643662517628,
"learning_rate": 0.0017725118483412324,
"loss": 3.5603,
"step": 2805
},
{
"epoch": 0.8876253652373055,
"grad_norm": 0.17720919106270716,
"learning_rate": 0.0017756714060031596,
"loss": 3.5922,
"step": 2810
},
{
"epoch": 0.8892047698017848,
"grad_norm": 0.15319410065129782,
"learning_rate": 0.001778830963665087,
"loss": 3.5142,
"step": 2815
},
{
"epoch": 0.8907841743662639,
"grad_norm": 0.14049804512703198,
"learning_rate": 0.0017819905213270141,
"loss": 3.5348,
"step": 2820
},
{
"epoch": 0.8923635789307431,
"grad_norm": 0.13540223274555183,
"learning_rate": 0.0017851500789889416,
"loss": 3.6603,
"step": 2825
},
{
"epoch": 0.8939429834952223,
"grad_norm": 0.16060841249757002,
"learning_rate": 0.0017883096366508688,
"loss": 3.5376,
"step": 2830
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.1509093235271957,
"learning_rate": 0.0017914691943127963,
"loss": 3.5386,
"step": 2835
},
{
"epoch": 0.8971017926241807,
"grad_norm": 0.17837636380553173,
"learning_rate": 0.0017946287519747235,
"loss": 3.5374,
"step": 2840
},
{
"epoch": 0.8986811971886599,
"grad_norm": 0.16174607673705266,
"learning_rate": 0.001797788309636651,
"loss": 3.52,
"step": 2845
},
{
"epoch": 0.900260601753139,
"grad_norm": 0.14319096673100043,
"learning_rate": 0.0018009478672985782,
"loss": 3.5615,
"step": 2850
},
{
"epoch": 0.9018400063176183,
"grad_norm": 0.16168911340122266,
"learning_rate": 0.0018041074249605057,
"loss": 3.5071,
"step": 2855
},
{
"epoch": 0.9034194108820974,
"grad_norm": 0.26405265054409266,
"learning_rate": 0.001807266982622433,
"loss": 3.5178,
"step": 2860
},
{
"epoch": 0.9049988154465767,
"grad_norm": 0.19667628698985987,
"learning_rate": 0.0018104265402843602,
"loss": 3.586,
"step": 2865
},
{
"epoch": 0.9065782200110558,
"grad_norm": 0.18103313500074777,
"learning_rate": 0.0018135860979462874,
"loss": 3.4582,
"step": 2870
},
{
"epoch": 0.9081576245755351,
"grad_norm": 0.16618866533472787,
"learning_rate": 0.0018167456556082149,
"loss": 3.5747,
"step": 2875
},
{
"epoch": 0.9097370291400142,
"grad_norm": 0.1439781651238869,
"learning_rate": 0.0018199052132701421,
"loss": 3.5228,
"step": 2880
},
{
"epoch": 0.9113164337044934,
"grad_norm": 0.22829688021990502,
"learning_rate": 0.0018230647709320696,
"loss": 3.5557,
"step": 2885
},
{
"epoch": 0.9128958382689726,
"grad_norm": 0.20775781246057518,
"learning_rate": 0.0018262243285939968,
"loss": 3.466,
"step": 2890
},
{
"epoch": 0.9144752428334518,
"grad_norm": 0.18957199697528926,
"learning_rate": 0.0018293838862559243,
"loss": 3.5275,
"step": 2895
},
{
"epoch": 0.916054647397931,
"grad_norm": 0.15566648720294615,
"learning_rate": 0.0018325434439178515,
"loss": 3.5882,
"step": 2900
},
{
"epoch": 0.9176340519624102,
"grad_norm": 0.2422542117254597,
"learning_rate": 0.0018357030015797788,
"loss": 3.7308,
"step": 2905
},
{
"epoch": 0.9192134565268893,
"grad_norm": 0.15264299874862852,
"learning_rate": 0.001838862559241706,
"loss": 3.5069,
"step": 2910
},
{
"epoch": 0.9207928610913686,
"grad_norm": 0.3010264670148029,
"learning_rate": 0.0018420221169036335,
"loss": 3.5984,
"step": 2915
},
{
"epoch": 0.9223722656558477,
"grad_norm": 0.18844393665997056,
"learning_rate": 0.001845181674565561,
"loss": 3.5524,
"step": 2920
},
{
"epoch": 0.923951670220327,
"grad_norm": 0.21577593436034362,
"learning_rate": 0.0018483412322274882,
"loss": 3.542,
"step": 2925
},
{
"epoch": 0.9255310747848061,
"grad_norm": 0.2003079822459777,
"learning_rate": 0.0018515007898894157,
"loss": 3.5052,
"step": 2930
},
{
"epoch": 0.9271104793492854,
"grad_norm": 0.21144213220190206,
"learning_rate": 0.001854660347551343,
"loss": 3.4866,
"step": 2935
},
{
"epoch": 0.9286898839137645,
"grad_norm": 0.23696618462111668,
"learning_rate": 0.0018578199052132704,
"loss": 3.5692,
"step": 2940
},
{
"epoch": 0.9302692884782437,
"grad_norm": 0.18040894099227053,
"learning_rate": 0.0018609794628751974,
"loss": 3.556,
"step": 2945
},
{
"epoch": 0.9318486930427229,
"grad_norm": 0.18777757184774668,
"learning_rate": 0.0018641390205371249,
"loss": 3.5738,
"step": 2950
},
{
"epoch": 0.9334280976072021,
"grad_norm": 0.15996203979182136,
"learning_rate": 0.001867298578199052,
"loss": 3.4611,
"step": 2955
},
{
"epoch": 0.9350075021716813,
"grad_norm": 0.12437471578621428,
"learning_rate": 0.0018704581358609796,
"loss": 3.5311,
"step": 2960
},
{
"epoch": 0.9365869067361605,
"grad_norm": 0.20779657848924238,
"learning_rate": 0.0018736176935229068,
"loss": 3.6051,
"step": 2965
},
{
"epoch": 0.9381663113006397,
"grad_norm": 0.22622371621134083,
"learning_rate": 0.0018767772511848343,
"loss": 3.5416,
"step": 2970
},
{
"epoch": 0.9397457158651189,
"grad_norm": 0.15610415052535734,
"learning_rate": 0.0018799368088467615,
"loss": 3.487,
"step": 2975
},
{
"epoch": 0.941325120429598,
"grad_norm": 0.1670956525886456,
"learning_rate": 0.001883096366508689,
"loss": 3.6928,
"step": 2980
},
{
"epoch": 0.9429045249940773,
"grad_norm": 0.14071066099400464,
"learning_rate": 0.001886255924170616,
"loss": 3.4163,
"step": 2985
},
{
"epoch": 0.9444839295585564,
"grad_norm": 0.16270580906780718,
"learning_rate": 0.0018894154818325435,
"loss": 3.5102,
"step": 2990
},
{
"epoch": 0.9460633341230357,
"grad_norm": 0.16650126568655202,
"learning_rate": 0.0018925750394944707,
"loss": 3.5833,
"step": 2995
},
{
"epoch": 0.9476427386875148,
"grad_norm": 0.16986031593446013,
"learning_rate": 0.0018957345971563982,
"loss": 3.4545,
"step": 3000
},
{
"epoch": 0.9492221432519939,
"grad_norm": 0.19076878616947124,
"learning_rate": 0.0018988941548183254,
"loss": 3.6737,
"step": 3005
},
{
"epoch": 0.9508015478164732,
"grad_norm": 0.22207278004599146,
"learning_rate": 0.0019020537124802529,
"loss": 3.5761,
"step": 3010
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.17734953021756708,
"learning_rate": 0.0019052132701421801,
"loss": 3.5675,
"step": 3015
},
{
"epoch": 0.9539603569454316,
"grad_norm": 0.14197629022383074,
"learning_rate": 0.0019083728278041076,
"loss": 3.5852,
"step": 3020
},
{
"epoch": 0.9555397615099107,
"grad_norm": 0.152062693283501,
"learning_rate": 0.0019115323854660346,
"loss": 3.5915,
"step": 3025
},
{
"epoch": 0.95711916607439,
"grad_norm": 0.12267072229882017,
"learning_rate": 0.001914691943127962,
"loss": 3.5183,
"step": 3030
},
{
"epoch": 0.9586985706388691,
"grad_norm": 0.20550673167545833,
"learning_rate": 0.0019178515007898893,
"loss": 3.5475,
"step": 3035
},
{
"epoch": 0.9602779752033483,
"grad_norm": 0.17184158826150192,
"learning_rate": 0.0019210110584518168,
"loss": 3.531,
"step": 3040
},
{
"epoch": 0.9618573797678275,
"grad_norm": 0.16610392459471085,
"learning_rate": 0.0019241706161137442,
"loss": 3.611,
"step": 3045
},
{
"epoch": 0.9634367843323067,
"grad_norm": 0.15981880011297245,
"learning_rate": 0.0019273301737756715,
"loss": 3.593,
"step": 3050
},
{
"epoch": 0.9650161888967859,
"grad_norm": 0.15164121985073623,
"learning_rate": 0.001930489731437599,
"loss": 3.4119,
"step": 3055
},
{
"epoch": 0.9665955934612651,
"grad_norm": 0.15778640574581523,
"learning_rate": 0.0019336492890995262,
"loss": 3.5711,
"step": 3060
},
{
"epoch": 0.9681749980257442,
"grad_norm": 0.147167482910992,
"learning_rate": 0.0019368088467614534,
"loss": 3.4438,
"step": 3065
},
{
"epoch": 0.9697544025902235,
"grad_norm": 0.1901184378323275,
"learning_rate": 0.0019399684044233807,
"loss": 3.4494,
"step": 3070
},
{
"epoch": 0.9713338071547026,
"grad_norm": 0.1569729727319038,
"learning_rate": 0.0019431279620853081,
"loss": 3.4788,
"step": 3075
},
{
"epoch": 0.9729132117191819,
"grad_norm": 0.16207010652889833,
"learning_rate": 0.0019462875197472354,
"loss": 3.5848,
"step": 3080
},
{
"epoch": 0.974492616283661,
"grad_norm": 0.15032096081658786,
"learning_rate": 0.0019494470774091628,
"loss": 3.5918,
"step": 3085
},
{
"epoch": 0.9760720208481403,
"grad_norm": 0.19629734856584571,
"learning_rate": 0.00195260663507109,
"loss": 3.4911,
"step": 3090
},
{
"epoch": 0.9776514254126194,
"grad_norm": 0.15469401910746663,
"learning_rate": 0.0019557661927330173,
"loss": 3.6936,
"step": 3095
},
{
"epoch": 0.9792308299770986,
"grad_norm": 0.13113406422755078,
"learning_rate": 0.0019589257503949448,
"loss": 3.5594,
"step": 3100
},
{
"epoch": 0.9808102345415778,
"grad_norm": 0.15268161125345475,
"learning_rate": 0.0019620853080568722,
"loss": 3.4451,
"step": 3105
},
{
"epoch": 0.982389639106057,
"grad_norm": 0.13078473029692392,
"learning_rate": 0.0019652448657187993,
"loss": 3.5146,
"step": 3110
},
{
"epoch": 0.9839690436705362,
"grad_norm": 0.1342097775561928,
"learning_rate": 0.0019684044233807267,
"loss": 3.4701,
"step": 3115
},
{
"epoch": 0.9855484482350154,
"grad_norm": 0.17754966693141605,
"learning_rate": 0.001971563981042654,
"loss": 3.533,
"step": 3120
},
{
"epoch": 0.9871278527994946,
"grad_norm": 0.11166476841442985,
"learning_rate": 0.0019747235387045812,
"loss": 3.42,
"step": 3125
},
{
"epoch": 0.9887072573639738,
"grad_norm": 0.19911097415765053,
"learning_rate": 0.0019778830963665087,
"loss": 3.4671,
"step": 3130
},
{
"epoch": 0.9902866619284529,
"grad_norm": 0.19387690893246035,
"learning_rate": 0.001981042654028436,
"loss": 3.4164,
"step": 3135
},
{
"epoch": 0.9918660664929322,
"grad_norm": 0.19911109695195278,
"learning_rate": 0.001984202211690363,
"loss": 3.4661,
"step": 3140
},
{
"epoch": 0.9934454710574113,
"grad_norm": 0.22858221565607198,
"learning_rate": 0.0019873617693522906,
"loss": 3.3476,
"step": 3145
},
{
"epoch": 0.9950248756218906,
"grad_norm": 0.19136657300717996,
"learning_rate": 0.001990521327014218,
"loss": 3.4011,
"step": 3150
},
{
"epoch": 0.9966042801863697,
"grad_norm": 0.13796662021891073,
"learning_rate": 0.0019936808846761456,
"loss": 3.6024,
"step": 3155
},
{
"epoch": 0.998183684750849,
"grad_norm": 0.1759449134781156,
"learning_rate": 0.0019968404423380726,
"loss": 3.4973,
"step": 3160
},
{
"epoch": 0.9997630893153281,
"grad_norm": 0.17244156032139696,
"learning_rate": 0.002,
"loss": 3.4933,
"step": 3165
},
{
"epoch": 1.0,
"eval_loss": 3.453442096710205,
"eval_runtime": 118.4568,
"eval_samples_per_second": 22.363,
"eval_steps_per_second": 5.597,
"step": 3166
},
{
"epoch": 1.0012635236515834,
"grad_norm": 0.17419869669927196,
"learning_rate": 0.0019999998479531948,
"loss": 3.4613,
"step": 3170
},
{
"epoch": 1.0028429282160625,
"grad_norm": 0.1336509815659276,
"learning_rate": 0.001999999391812825,
"loss": 3.4318,
"step": 3175
},
{
"epoch": 1.0044223327805417,
"grad_norm": 0.16614666612624818,
"learning_rate": 0.001999998631579029,
"loss": 3.3616,
"step": 3180
},
{
"epoch": 1.006001737345021,
"grad_norm": 0.18577345941657952,
"learning_rate": 0.001999997567252038,
"loss": 3.387,
"step": 3185
},
{
"epoch": 1.0075811419095002,
"grad_norm": 0.14956674408580217,
"learning_rate": 0.0019999961988321764,
"loss": 3.4243,
"step": 3190
},
{
"epoch": 1.0091605464739792,
"grad_norm": 0.15122355799473372,
"learning_rate": 0.00199999452631986,
"loss": 3.3774,
"step": 3195
},
{
"epoch": 1.0107399510384585,
"grad_norm": 0.14460949065708573,
"learning_rate": 0.001999992549715597,
"loss": 3.4325,
"step": 3200
},
{
"epoch": 1.0123193556029377,
"grad_norm": 0.14847452402585345,
"learning_rate": 0.001999990269019989,
"loss": 3.4322,
"step": 3205
},
{
"epoch": 1.013898760167417,
"grad_norm": 0.13700971578846127,
"learning_rate": 0.001999987684233729,
"loss": 3.4382,
"step": 3210
},
{
"epoch": 1.015478164731896,
"grad_norm": 0.17659161113643015,
"learning_rate": 0.001999984795357604,
"loss": 3.5208,
"step": 3215
},
{
"epoch": 1.0170575692963753,
"grad_norm": 0.257228503826398,
"learning_rate": 0.0019999816023924914,
"loss": 3.483,
"step": 3220
},
{
"epoch": 1.0186369738608545,
"grad_norm": 0.1734175654718477,
"learning_rate": 0.0019999781053393626,
"loss": 3.4846,
"step": 3225
},
{
"epoch": 1.0202163784253337,
"grad_norm": 0.19417425336656402,
"learning_rate": 0.0019999743041992806,
"loss": 3.539,
"step": 3230
},
{
"epoch": 1.0217957829898128,
"grad_norm": 0.15962020613799605,
"learning_rate": 0.001999970198973402,
"loss": 3.4282,
"step": 3235
},
{
"epoch": 1.023375187554292,
"grad_norm": 0.14722064841836466,
"learning_rate": 0.001999965789662975,
"loss": 3.5164,
"step": 3240
},
{
"epoch": 1.0249545921187713,
"grad_norm": 0.16675736592182736,
"learning_rate": 0.0019999610762693404,
"loss": 3.4058,
"step": 3245
},
{
"epoch": 1.0265339966832505,
"grad_norm": 0.19849715389813696,
"learning_rate": 0.0019999560587939313,
"loss": 3.4989,
"step": 3250
},
{
"epoch": 1.0281134012477295,
"grad_norm": 0.18020905462463116,
"learning_rate": 0.001999950737238274,
"loss": 3.4392,
"step": 3255
},
{
"epoch": 1.0296928058122088,
"grad_norm": 0.12088354646913968,
"learning_rate": 0.0019999451116039855,
"loss": 3.39,
"step": 3260
},
{
"epoch": 1.031272210376688,
"grad_norm": 0.14689232257108842,
"learning_rate": 0.0019999391818927782,
"loss": 3.4552,
"step": 3265
},
{
"epoch": 1.0328516149411673,
"grad_norm": 0.2071823895016292,
"learning_rate": 0.001999932948106454,
"loss": 3.4981,
"step": 3270
},
{
"epoch": 1.0344310195056463,
"grad_norm": 0.13533866016746368,
"learning_rate": 0.0019999264102469093,
"loss": 3.3788,
"step": 3275
},
{
"epoch": 1.0360104240701256,
"grad_norm": 0.15821544540898988,
"learning_rate": 0.0019999195683161317,
"loss": 3.399,
"step": 3280
},
{
"epoch": 1.0375898286346048,
"grad_norm": 0.1299490903409468,
"learning_rate": 0.0019999124223162024,
"loss": 3.4135,
"step": 3285
},
{
"epoch": 1.039169233199084,
"grad_norm": 0.12444550525295087,
"learning_rate": 0.0019999049722492935,
"loss": 3.4348,
"step": 3290
},
{
"epoch": 1.040748637763563,
"grad_norm": 0.13764088108526956,
"learning_rate": 0.0019998972181176715,
"loss": 3.3488,
"step": 3295
},
{
"epoch": 1.0423280423280423,
"grad_norm": 0.1044556743209048,
"learning_rate": 0.001999889159923694,
"loss": 3.3857,
"step": 3300
},
{
"epoch": 1.0439074468925216,
"grad_norm": 0.11802247972167725,
"learning_rate": 0.001999880797669811,
"loss": 3.3374,
"step": 3305
},
{
"epoch": 1.0454868514570008,
"grad_norm": 0.13697976635150552,
"learning_rate": 0.0019998721313585666,
"loss": 3.4261,
"step": 3310
},
{
"epoch": 1.0470662560214798,
"grad_norm": 0.14888758048381356,
"learning_rate": 0.001999863160992595,
"loss": 3.4024,
"step": 3315
},
{
"epoch": 1.048645660585959,
"grad_norm": 0.14427856736666778,
"learning_rate": 0.0019998538865746243,
"loss": 3.4188,
"step": 3320
},
{
"epoch": 1.0502250651504383,
"grad_norm": 0.1808211556860934,
"learning_rate": 0.0019998443081074755,
"loss": 3.5465,
"step": 3325
},
{
"epoch": 1.0518044697149174,
"grad_norm": 0.13795293901986216,
"learning_rate": 0.0019998344255940602,
"loss": 3.4072,
"step": 3330
},
{
"epoch": 1.0533838742793966,
"grad_norm": 0.14265201579643738,
"learning_rate": 0.0019998242390373844,
"loss": 3.3855,
"step": 3335
},
{
"epoch": 1.0549632788438759,
"grad_norm": 0.12748239513513887,
"learning_rate": 0.0019998137484405457,
"loss": 3.3582,
"step": 3340
},
{
"epoch": 1.056542683408355,
"grad_norm": 0.16161872794834853,
"learning_rate": 0.001999802953806734,
"loss": 3.3484,
"step": 3345
},
{
"epoch": 1.0581220879728341,
"grad_norm": 0.1892452477657128,
"learning_rate": 0.001999791855139232,
"loss": 3.4889,
"step": 3350
},
{
"epoch": 1.0597014925373134,
"grad_norm": 0.17436112643159674,
"learning_rate": 0.0019997804524414147,
"loss": 3.4394,
"step": 3355
},
{
"epoch": 1.0612808971017926,
"grad_norm": 0.14612103988932162,
"learning_rate": 0.001999768745716749,
"loss": 3.4021,
"step": 3360
},
{
"epoch": 1.0628603016662719,
"grad_norm": 0.15180816088108712,
"learning_rate": 0.001999756734968796,
"loss": 3.31,
"step": 3365
},
{
"epoch": 1.064439706230751,
"grad_norm": 0.12107289850375573,
"learning_rate": 0.0019997444202012075,
"loss": 3.3731,
"step": 3370
},
{
"epoch": 1.0660191107952302,
"grad_norm": 0.11237567710696524,
"learning_rate": 0.0019997318014177284,
"loss": 3.3806,
"step": 3375
},
{
"epoch": 1.0675985153597094,
"grad_norm": 0.14292253124066845,
"learning_rate": 0.001999718878622196,
"loss": 3.3619,
"step": 3380
},
{
"epoch": 1.0691779199241886,
"grad_norm": 0.14054299799410475,
"learning_rate": 0.0019997056518185397,
"loss": 3.4553,
"step": 3385
},
{
"epoch": 1.0707573244886677,
"grad_norm": 0.13396428088016363,
"learning_rate": 0.001999692121010782,
"loss": 3.3671,
"step": 3390
},
{
"epoch": 1.072336729053147,
"grad_norm": 0.16443477530304862,
"learning_rate": 0.001999678286203038,
"loss": 3.4458,
"step": 3395
},
{
"epoch": 1.0739161336176262,
"grad_norm": 0.1550207567605643,
"learning_rate": 0.0019996641473995136,
"loss": 3.4284,
"step": 3400
},
{
"epoch": 1.0754955381821054,
"grad_norm": 0.1642689908902813,
"learning_rate": 0.0019996497046045093,
"loss": 3.4561,
"step": 3405
},
{
"epoch": 1.0770749427465844,
"grad_norm": 0.1362356053231775,
"learning_rate": 0.001999634957822417,
"loss": 3.3014,
"step": 3410
},
{
"epoch": 1.0786543473110637,
"grad_norm": 0.1331477986694049,
"learning_rate": 0.0019996199070577204,
"loss": 3.3593,
"step": 3415
},
{
"epoch": 1.080233751875543,
"grad_norm": 0.10846175011905092,
"learning_rate": 0.0019996045523149974,
"loss": 3.3759,
"step": 3420
},
{
"epoch": 1.0818131564400222,
"grad_norm": 0.15117569072212125,
"learning_rate": 0.0019995888935989163,
"loss": 3.38,
"step": 3425
},
{
"epoch": 1.0833925610045012,
"grad_norm": 0.21982954923773612,
"learning_rate": 0.0019995729309142396,
"loss": 3.459,
"step": 3430
},
{
"epoch": 1.0849719655689805,
"grad_norm": 0.1277491243359439,
"learning_rate": 0.0019995566642658203,
"loss": 3.3356,
"step": 3435
},
{
"epoch": 1.0865513701334597,
"grad_norm": 0.10597539270733282,
"learning_rate": 0.001999540093658606,
"loss": 3.3145,
"step": 3440
},
{
"epoch": 1.088130774697939,
"grad_norm": 0.14743827645774274,
"learning_rate": 0.001999523219097636,
"loss": 3.4258,
"step": 3445
},
{
"epoch": 1.089710179262418,
"grad_norm": 0.1279665344640135,
"learning_rate": 0.001999506040588041,
"loss": 3.5508,
"step": 3450
},
{
"epoch": 1.0912895838268972,
"grad_norm": 0.16499800131667175,
"learning_rate": 0.001999488558135045,
"loss": 3.4194,
"step": 3455
},
{
"epoch": 1.0928689883913765,
"grad_norm": 0.15359201841946143,
"learning_rate": 0.001999470771743964,
"loss": 3.2986,
"step": 3460
},
{
"epoch": 1.0944483929558557,
"grad_norm": 0.13265460036428217,
"learning_rate": 0.0019994526814202077,
"loss": 3.3454,
"step": 3465
},
{
"epoch": 1.0960277975203347,
"grad_norm": 0.1620045489947208,
"learning_rate": 0.0019994342871692762,
"loss": 3.3475,
"step": 3470
},
{
"epoch": 1.097607202084814,
"grad_norm": 0.20594824306793155,
"learning_rate": 0.0019994155889967637,
"loss": 3.2884,
"step": 3475
},
{
"epoch": 1.0991866066492932,
"grad_norm": 0.21057978013237635,
"learning_rate": 0.001999396586908356,
"loss": 3.4672,
"step": 3480
},
{
"epoch": 1.1007660112137725,
"grad_norm": 0.18108584882641593,
"learning_rate": 0.001999377280909832,
"loss": 3.3923,
"step": 3485
},
{
"epoch": 1.1023454157782515,
"grad_norm": 0.17909932454295607,
"learning_rate": 0.0019993576710070613,
"loss": 3.4725,
"step": 3490
},
{
"epoch": 1.1039248203427308,
"grad_norm": 0.14020488452965366,
"learning_rate": 0.0019993377572060083,
"loss": 3.4036,
"step": 3495
},
{
"epoch": 1.10550422490721,
"grad_norm": 0.13843283876669563,
"learning_rate": 0.0019993175395127284,
"loss": 3.4219,
"step": 3500
},
{
"epoch": 1.1070836294716893,
"grad_norm": 0.13070152265216842,
"learning_rate": 0.0019992970179333693,
"loss": 3.2739,
"step": 3505
},
{
"epoch": 1.1086630340361683,
"grad_norm": 0.11474143922408123,
"learning_rate": 0.001999276192474172,
"loss": 3.4454,
"step": 3510
},
{
"epoch": 1.1102424386006475,
"grad_norm": 0.12351976907394624,
"learning_rate": 0.0019992550631414687,
"loss": 3.3727,
"step": 3515
},
{
"epoch": 1.1118218431651268,
"grad_norm": 0.1236748122633322,
"learning_rate": 0.0019992336299416856,
"loss": 3.3862,
"step": 3520
},
{
"epoch": 1.113401247729606,
"grad_norm": 0.1248956099278761,
"learning_rate": 0.00199921189288134,
"loss": 3.3038,
"step": 3525
},
{
"epoch": 1.114980652294085,
"grad_norm": 0.12812904370726583,
"learning_rate": 0.0019991898519670414,
"loss": 3.4662,
"step": 3530
},
{
"epoch": 1.1165600568585643,
"grad_norm": 0.12789536088777056,
"learning_rate": 0.001999167507205493,
"loss": 3.3488,
"step": 3535
},
{
"epoch": 1.1181394614230435,
"grad_norm": 0.1279786622030309,
"learning_rate": 0.0019991448586034895,
"loss": 3.3606,
"step": 3540
},
{
"epoch": 1.1197188659875228,
"grad_norm": 0.13026742250652543,
"learning_rate": 0.001999121906167918,
"loss": 3.2312,
"step": 3545
},
{
"epoch": 1.1212982705520018,
"grad_norm": 0.11605126778184269,
"learning_rate": 0.001999098649905759,
"loss": 3.1845,
"step": 3550
},
{
"epoch": 1.122877675116481,
"grad_norm": 0.1579503573003752,
"learning_rate": 0.001999075089824084,
"loss": 3.3747,
"step": 3555
},
{
"epoch": 1.1244570796809603,
"grad_norm": 0.11793767104036577,
"learning_rate": 0.0019990512259300567,
"loss": 3.3676,
"step": 3560
},
{
"epoch": 1.1260364842454396,
"grad_norm": 0.10291434075214628,
"learning_rate": 0.0019990270582309353,
"loss": 3.3204,
"step": 3565
},
{
"epoch": 1.1276158888099186,
"grad_norm": 0.1277837851461023,
"learning_rate": 0.001999002586734068,
"loss": 3.4178,
"step": 3570
},
{
"epoch": 1.1291952933743978,
"grad_norm": 0.11724537011509982,
"learning_rate": 0.0019989778114468974,
"loss": 3.2551,
"step": 3575
},
{
"epoch": 1.130774697938877,
"grad_norm": 0.13588269554424826,
"learning_rate": 0.0019989527323769564,
"loss": 3.3492,
"step": 3580
},
{
"epoch": 1.1323541025033563,
"grad_norm": 0.14909668675411505,
"learning_rate": 0.0019989273495318724,
"loss": 3.3557,
"step": 3585
},
{
"epoch": 1.1339335070678354,
"grad_norm": 0.26584167932220104,
"learning_rate": 0.001998901662919364,
"loss": 3.381,
"step": 3590
},
{
"epoch": 1.1355129116323146,
"grad_norm": 0.2140440086183858,
"learning_rate": 0.0019988756725472416,
"loss": 3.3428,
"step": 3595
},
{
"epoch": 1.1370923161967939,
"grad_norm": 0.1938956532318173,
"learning_rate": 0.001998849378423409,
"loss": 3.2431,
"step": 3600
},
{
"epoch": 1.1386717207612729,
"grad_norm": 0.13215716093456611,
"learning_rate": 0.001998822780555863,
"loss": 3.3249,
"step": 3605
},
{
"epoch": 1.1402511253257521,
"grad_norm": 0.1332396552503134,
"learning_rate": 0.00199879587895269,
"loss": 3.2683,
"step": 3610
},
{
"epoch": 1.1418305298902314,
"grad_norm": 0.1376765381345977,
"learning_rate": 0.0019987686736220723,
"loss": 3.4012,
"step": 3615
},
{
"epoch": 1.1434099344547106,
"grad_norm": 0.10991614005842894,
"learning_rate": 0.0019987411645722825,
"loss": 3.262,
"step": 3620
},
{
"epoch": 1.1449893390191899,
"grad_norm": 0.20587342578391718,
"learning_rate": 0.0019987133518116857,
"loss": 3.3281,
"step": 3625
},
{
"epoch": 1.146568743583669,
"grad_norm": 0.16029495756721407,
"learning_rate": 0.0019986852353487392,
"loss": 3.179,
"step": 3630
},
{
"epoch": 1.1481481481481481,
"grad_norm": 0.17119484920176706,
"learning_rate": 0.0019986568151919935,
"loss": 3.3625,
"step": 3635
},
{
"epoch": 1.1497275527126274,
"grad_norm": 0.18007771763834388,
"learning_rate": 0.001998628091350091,
"loss": 3.2947,
"step": 3640
},
{
"epoch": 1.1513069572771064,
"grad_norm": 0.133112890117228,
"learning_rate": 0.001998599063831766,
"loss": 3.3428,
"step": 3645
},
{
"epoch": 1.1528863618415857,
"grad_norm": 0.11548842626474974,
"learning_rate": 0.0019985697326458463,
"loss": 3.2769,
"step": 3650
},
{
"epoch": 1.154465766406065,
"grad_norm": 0.14524782223820165,
"learning_rate": 0.0019985400978012506,
"loss": 3.3381,
"step": 3655
},
{
"epoch": 1.1560451709705442,
"grad_norm": 0.17275599053916738,
"learning_rate": 0.001998510159306991,
"loss": 3.3258,
"step": 3660
},
{
"epoch": 1.1576245755350234,
"grad_norm": 0.15188261477993478,
"learning_rate": 0.001998479917172172,
"loss": 3.2279,
"step": 3665
},
{
"epoch": 1.1592039800995024,
"grad_norm": 0.11594055773246185,
"learning_rate": 0.0019984493714059895,
"loss": 3.2907,
"step": 3670
},
{
"epoch": 1.1607833846639817,
"grad_norm": 0.16472747694603998,
"learning_rate": 0.0019984185220177325,
"loss": 3.3045,
"step": 3675
},
{
"epoch": 1.162362789228461,
"grad_norm": 0.12695428028045702,
"learning_rate": 0.001998387369016782,
"loss": 3.2843,
"step": 3680
},
{
"epoch": 1.16394219379294,
"grad_norm": 0.1383779290181551,
"learning_rate": 0.0019983559124126114,
"loss": 3.3356,
"step": 3685
},
{
"epoch": 1.1655215983574192,
"grad_norm": 0.1178710544115228,
"learning_rate": 0.0019983241522147865,
"loss": 3.1765,
"step": 3690
},
{
"epoch": 1.1671010029218984,
"grad_norm": 0.1074282742115371,
"learning_rate": 0.0019982920884329654,
"loss": 3.4055,
"step": 3695
},
{
"epoch": 1.1686804074863777,
"grad_norm": 0.1536495034869951,
"learning_rate": 0.0019982597210768983,
"loss": 3.2441,
"step": 3700
},
{
"epoch": 1.170259812050857,
"grad_norm": 0.1255113710506922,
"learning_rate": 0.0019982270501564285,
"loss": 3.2343,
"step": 3705
},
{
"epoch": 1.171839216615336,
"grad_norm": 0.14783407849501978,
"learning_rate": 0.00199819407568149,
"loss": 3.3616,
"step": 3710
},
{
"epoch": 1.1734186211798152,
"grad_norm": 0.14370786416564313,
"learning_rate": 0.0019981607976621114,
"loss": 3.1636,
"step": 3715
},
{
"epoch": 1.1749980257442945,
"grad_norm": 0.1645304933045862,
"learning_rate": 0.0019981272161084113,
"loss": 3.2503,
"step": 3720
},
{
"epoch": 1.1765774303087735,
"grad_norm": 0.11846710353311589,
"learning_rate": 0.001998093331030602,
"loss": 3.2984,
"step": 3725
},
{
"epoch": 1.1781568348732527,
"grad_norm": 0.34162094556522477,
"learning_rate": 0.0019980591424389876,
"loss": 3.3242,
"step": 3730
},
{
"epoch": 1.179736239437732,
"grad_norm": 0.18324615960578552,
"learning_rate": 0.001998024650343965,
"loss": 3.3071,
"step": 3735
},
{
"epoch": 1.1813156440022112,
"grad_norm": 0.12879840752783278,
"learning_rate": 0.001997989854756023,
"loss": 3.3383,
"step": 3740
},
{
"epoch": 1.1828950485666905,
"grad_norm": 0.17522825779942647,
"learning_rate": 0.001997954755685742,
"loss": 3.2078,
"step": 3745
},
{
"epoch": 1.1844744531311695,
"grad_norm": 0.14932656564575525,
"learning_rate": 0.0019979193531437962,
"loss": 3.2501,
"step": 3750
},
{
"epoch": 1.1860538576956487,
"grad_norm": 0.11865123535977866,
"learning_rate": 0.0019978836471409504,
"loss": 3.372,
"step": 3755
},
{
"epoch": 1.187633262260128,
"grad_norm": 0.11106273613599128,
"learning_rate": 0.001997847637688064,
"loss": 3.3333,
"step": 3760
},
{
"epoch": 1.189212666824607,
"grad_norm": 0.11783759839988478,
"learning_rate": 0.0019978113247960862,
"loss": 3.2367,
"step": 3765
},
{
"epoch": 1.1907920713890863,
"grad_norm": 0.12384146060700155,
"learning_rate": 0.0019977747084760594,
"loss": 3.1183,
"step": 3770
},
{
"epoch": 1.1923714759535655,
"grad_norm": 0.12503161294868623,
"learning_rate": 0.001997737788739119,
"loss": 3.3296,
"step": 3775
},
{
"epoch": 1.1939508805180448,
"grad_norm": 0.11667544977862357,
"learning_rate": 0.0019977005655964913,
"loss": 3.2352,
"step": 3780
},
{
"epoch": 1.1955302850825238,
"grad_norm": 0.17936254357422665,
"learning_rate": 0.0019976630390594967,
"loss": 3.236,
"step": 3785
},
{
"epoch": 1.197109689647003,
"grad_norm": 0.12000465722482047,
"learning_rate": 0.001997625209139546,
"loss": 3.1179,
"step": 3790
},
{
"epoch": 1.1986890942114823,
"grad_norm": 0.14110759430170178,
"learning_rate": 0.0019975870758481428,
"loss": 3.2982,
"step": 3795
},
{
"epoch": 1.2002684987759615,
"grad_norm": 0.11537924004492343,
"learning_rate": 0.001997548639196884,
"loss": 3.2035,
"step": 3800
},
{
"epoch": 1.2018479033404406,
"grad_norm": 0.15342351191916725,
"learning_rate": 0.0019975098991974576,
"loss": 3.2441,
"step": 3805
},
{
"epoch": 1.2034273079049198,
"grad_norm": 0.1517303082175663,
"learning_rate": 0.0019974708558616436,
"loss": 3.1546,
"step": 3810
},
{
"epoch": 1.205006712469399,
"grad_norm": 0.14179565122989465,
"learning_rate": 0.001997431509201316,
"loss": 3.3172,
"step": 3815
},
{
"epoch": 1.2065861170338783,
"grad_norm": 0.13118393817648813,
"learning_rate": 0.001997391859228439,
"loss": 3.1891,
"step": 3820
},
{
"epoch": 1.2081655215983573,
"grad_norm": 0.1341617483608506,
"learning_rate": 0.0019973519059550697,
"loss": 3.109,
"step": 3825
},
{
"epoch": 1.2097449261628366,
"grad_norm": 0.13620070052167957,
"learning_rate": 0.0019973116493933584,
"loss": 3.1534,
"step": 3830
},
{
"epoch": 1.2113243307273158,
"grad_norm": 0.12939284342206667,
"learning_rate": 0.0019972710895555467,
"loss": 3.1814,
"step": 3835
},
{
"epoch": 1.212903735291795,
"grad_norm": 0.13498974475198405,
"learning_rate": 0.0019972302264539684,
"loss": 3.1666,
"step": 3840
},
{
"epoch": 1.214483139856274,
"grad_norm": 0.13367835010491091,
"learning_rate": 0.0019971890601010495,
"loss": 3.289,
"step": 3845
},
{
"epoch": 1.2160625444207533,
"grad_norm": 0.13877130837928153,
"learning_rate": 0.0019971475905093084,
"loss": 3.2157,
"step": 3850
},
{
"epoch": 1.2176419489852326,
"grad_norm": 0.1384877772812798,
"learning_rate": 0.001997105817691357,
"loss": 3.2791,
"step": 3855
},
{
"epoch": 1.2192213535497118,
"grad_norm": 0.12456354108198894,
"learning_rate": 0.001997063741659896,
"loss": 3.2563,
"step": 3860
},
{
"epoch": 1.2208007581141909,
"grad_norm": 0.13081867644203868,
"learning_rate": 0.001997021362427722,
"loss": 3.119,
"step": 3865
},
{
"epoch": 1.2223801626786701,
"grad_norm": 0.12928400341888488,
"learning_rate": 0.0019969786800077215,
"loss": 3.2369,
"step": 3870
},
{
"epoch": 1.2239595672431494,
"grad_norm": 0.10358618908421555,
"learning_rate": 0.001996935694412875,
"loss": 3.2927,
"step": 3875
},
{
"epoch": 1.2255389718076286,
"grad_norm": 0.0966325015324056,
"learning_rate": 0.001996892405656253,
"loss": 3.2167,
"step": 3880
},
{
"epoch": 1.2271183763721076,
"grad_norm": 0.10711532367579371,
"learning_rate": 0.0019968488137510195,
"loss": 3.252,
"step": 3885
},
{
"epoch": 1.2286977809365869,
"grad_norm": 0.10983836977588884,
"learning_rate": 0.0019968049187104315,
"loss": 3.0567,
"step": 3890
},
{
"epoch": 1.2302771855010661,
"grad_norm": 0.10952845946022985,
"learning_rate": 0.0019967607205478356,
"loss": 3.0549,
"step": 3895
},
{
"epoch": 1.2318565900655454,
"grad_norm": 0.1238381423941252,
"learning_rate": 0.0019967162192766736,
"loss": 3.1814,
"step": 3900
},
{
"epoch": 1.2334359946300244,
"grad_norm": 0.13420597612755072,
"learning_rate": 0.0019966714149104777,
"loss": 3.1823,
"step": 3905
},
{
"epoch": 1.2350153991945036,
"grad_norm": 0.10329111557583345,
"learning_rate": 0.001996626307462872,
"loss": 3.187,
"step": 3910
},
{
"epoch": 1.236594803758983,
"grad_norm": 0.10633064709263741,
"learning_rate": 0.001996580896947574,
"loss": 3.0926,
"step": 3915
},
{
"epoch": 1.2381742083234621,
"grad_norm": 0.1404412639659265,
"learning_rate": 0.0019965351833783926,
"loss": 3.1486,
"step": 3920
},
{
"epoch": 1.2397536128879412,
"grad_norm": 0.08542198459750817,
"learning_rate": 0.0019964891667692292,
"loss": 3.1266,
"step": 3925
},
{
"epoch": 1.2413330174524204,
"grad_norm": 0.09777778835577029,
"learning_rate": 0.001996442847134076,
"loss": 3.1231,
"step": 3930
},
{
"epoch": 1.2429124220168997,
"grad_norm": 0.11792785325465897,
"learning_rate": 0.0019963962244870202,
"loss": 3.1933,
"step": 3935
},
{
"epoch": 1.244491826581379,
"grad_norm": 0.1097854930910212,
"learning_rate": 0.001996349298842239,
"loss": 3.1978,
"step": 3940
},
{
"epoch": 1.246071231145858,
"grad_norm": 0.13829225491817454,
"learning_rate": 0.0019963020702140014,
"loss": 3.1931,
"step": 3945
},
{
"epoch": 1.2476506357103372,
"grad_norm": 0.09611777306619898,
"learning_rate": 0.0019962545386166698,
"loss": 3.1986,
"step": 3950
},
{
"epoch": 1.2492300402748164,
"grad_norm": 0.11954156274404722,
"learning_rate": 0.0019962067040646984,
"loss": 3.1796,
"step": 3955
},
{
"epoch": 1.2508094448392955,
"grad_norm": 0.11600798040388846,
"learning_rate": 0.001996158566572633,
"loss": 3.1458,
"step": 3960
},
{
"epoch": 1.2523888494037747,
"grad_norm": 0.09983825050457865,
"learning_rate": 0.0019961101261551126,
"loss": 3.0764,
"step": 3965
},
{
"epoch": 1.253968253968254,
"grad_norm": 0.12089312291063067,
"learning_rate": 0.001996061382826867,
"loss": 3.1573,
"step": 3970
},
{
"epoch": 1.2555476585327332,
"grad_norm": 0.1365955307509055,
"learning_rate": 0.0019960123366027185,
"loss": 3.2639,
"step": 3975
},
{
"epoch": 1.2571270630972124,
"grad_norm": 0.15924451338159068,
"learning_rate": 0.0019959629874975824,
"loss": 3.2309,
"step": 3980
},
{
"epoch": 1.2587064676616915,
"grad_norm": 0.1485695640107557,
"learning_rate": 0.0019959133355264653,
"loss": 3.0889,
"step": 3985
},
{
"epoch": 1.2602858722261707,
"grad_norm": 0.12865335238484993,
"learning_rate": 0.0019958633807044654,
"loss": 3.1314,
"step": 3990
},
{
"epoch": 1.26186527679065,
"grad_norm": 0.12968582736289083,
"learning_rate": 0.0019958131230467745,
"loss": 3.0844,
"step": 3995
},
{
"epoch": 1.263444681355129,
"grad_norm": 0.10831091445134017,
"learning_rate": 0.0019957625625686756,
"loss": 3.1432,
"step": 4000
},
{
"epoch": 1.2650240859196082,
"grad_norm": 0.10875440072652662,
"learning_rate": 0.0019957116992855434,
"loss": 3.0823,
"step": 4005
},
{
"epoch": 1.2666034904840875,
"grad_norm": 0.1391997700234881,
"learning_rate": 0.001995660533212845,
"loss": 3.1656,
"step": 4010
},
{
"epoch": 1.2681828950485667,
"grad_norm": 0.12772909277929304,
"learning_rate": 0.0019956090643661398,
"loss": 3.08,
"step": 4015
},
{
"epoch": 1.269762299613046,
"grad_norm": 0.1267454766007706,
"learning_rate": 0.0019955572927610795,
"loss": 3.1432,
"step": 4020
},
{
"epoch": 1.271341704177525,
"grad_norm": 0.11164960171247067,
"learning_rate": 0.0019955052184134074,
"loss": 3.1631,
"step": 4025
},
{
"epoch": 1.2729211087420043,
"grad_norm": 0.10158383886480729,
"learning_rate": 0.0019954528413389586,
"loss": 3.0912,
"step": 4030
},
{
"epoch": 1.2745005133064835,
"grad_norm": 0.1072604836685963,
"learning_rate": 0.001995400161553661,
"loss": 3.1724,
"step": 4035
},
{
"epoch": 1.2760799178709625,
"grad_norm": 0.10241192413829599,
"learning_rate": 0.0019953471790735344,
"loss": 3.2245,
"step": 4040
},
{
"epoch": 1.2776593224354418,
"grad_norm": 0.12048005165040691,
"learning_rate": 0.0019952938939146896,
"loss": 3.165,
"step": 4045
},
{
"epoch": 1.279238726999921,
"grad_norm": 0.13777460457101334,
"learning_rate": 0.001995240306093331,
"loss": 3.1549,
"step": 4050
},
{
"epoch": 1.2808181315644003,
"grad_norm": 0.14104721610668222,
"learning_rate": 0.001995186415625754,
"loss": 3.0806,
"step": 4055
},
{
"epoch": 1.2823975361288795,
"grad_norm": 0.15189782154441522,
"learning_rate": 0.001995132222528346,
"loss": 3.1992,
"step": 4060
},
{
"epoch": 1.2839769406933585,
"grad_norm": 0.09726047026710698,
"learning_rate": 0.0019950777268175875,
"loss": 3.0581,
"step": 4065
},
{
"epoch": 1.2855563452578378,
"grad_norm": 0.11930065052202582,
"learning_rate": 0.0019950229285100505,
"loss": 3.049,
"step": 4070
},
{
"epoch": 1.287135749822317,
"grad_norm": 0.15432859275743474,
"learning_rate": 0.0019949678276223975,
"loss": 3.1533,
"step": 4075
},
{
"epoch": 1.288715154386796,
"grad_norm": 0.13762216779820105,
"learning_rate": 0.0019949124241713857,
"loss": 3.063,
"step": 4080
},
{
"epoch": 1.2902945589512753,
"grad_norm": 0.11435516872439676,
"learning_rate": 0.0019948567181738625,
"loss": 3.1914,
"step": 4085
},
{
"epoch": 1.2918739635157546,
"grad_norm": 0.11364780853900598,
"learning_rate": 0.0019948007096467673,
"loss": 3.0757,
"step": 4090
},
{
"epoch": 1.2934533680802338,
"grad_norm": 0.13025643217942512,
"learning_rate": 0.0019947443986071327,
"loss": 3.1264,
"step": 4095
},
{
"epoch": 1.295032772644713,
"grad_norm": 0.10444186688171346,
"learning_rate": 0.0019946877850720813,
"loss": 3.0088,
"step": 4100
},
{
"epoch": 1.296612177209192,
"grad_norm": 0.10515226058000508,
"learning_rate": 0.0019946308690588304,
"loss": 3.1422,
"step": 4105
},
{
"epoch": 1.2981915817736713,
"grad_norm": 0.09955106249820891,
"learning_rate": 0.0019945736505846867,
"loss": 3.0543,
"step": 4110
},
{
"epoch": 1.2997709863381506,
"grad_norm": 0.11936337099156746,
"learning_rate": 0.0019945161296670505,
"loss": 3.1021,
"step": 4115
},
{
"epoch": 1.3013503909026296,
"grad_norm": 0.11200407436331895,
"learning_rate": 0.001994458306323413,
"loss": 3.1028,
"step": 4120
},
{
"epoch": 1.3029297954671089,
"grad_norm": 0.11340792686905848,
"learning_rate": 0.001994400180571359,
"loss": 3.0485,
"step": 4125
},
{
"epoch": 1.304509200031588,
"grad_norm": 0.10922500132746676,
"learning_rate": 0.0019943417524285628,
"loss": 3.1952,
"step": 4130
},
{
"epoch": 1.3060886045960673,
"grad_norm": 0.17111466112095622,
"learning_rate": 0.0019942830219127935,
"loss": 3.0573,
"step": 4135
},
{
"epoch": 1.3076680091605466,
"grad_norm": 0.12758696103189704,
"learning_rate": 0.0019942239890419094,
"loss": 3.0893,
"step": 4140
},
{
"epoch": 1.3092474137250256,
"grad_norm": 0.1225924821208543,
"learning_rate": 0.0019941646538338626,
"loss": 3.1334,
"step": 4145
},
{
"epoch": 1.3108268182895049,
"grad_norm": 0.09912067745879768,
"learning_rate": 0.0019941050163066964,
"loss": 3.1031,
"step": 4150
},
{
"epoch": 1.3124062228539841,
"grad_norm": 0.13833974443279878,
"learning_rate": 0.0019940450764785464,
"loss": 3.102,
"step": 4155
},
{
"epoch": 1.3139856274184631,
"grad_norm": 0.10815876254157507,
"learning_rate": 0.0019939848343676395,
"loss": 3.118,
"step": 4160
},
{
"epoch": 1.3155650319829424,
"grad_norm": 0.11804236405004581,
"learning_rate": 0.001993924289992295,
"loss": 3.1192,
"step": 4165
},
{
"epoch": 1.3171444365474216,
"grad_norm": 0.13162961380042495,
"learning_rate": 0.0019938634433709253,
"loss": 3.1547,
"step": 4170
},
{
"epoch": 1.3187238411119009,
"grad_norm": 0.12597257223968147,
"learning_rate": 0.0019938022945220316,
"loss": 3.1191,
"step": 4175
},
{
"epoch": 1.3203032456763801,
"grad_norm": 0.14620891462000044,
"learning_rate": 0.00199374084346421,
"loss": 3.1462,
"step": 4180
},
{
"epoch": 1.3218826502408592,
"grad_norm": 0.14662006556760473,
"learning_rate": 0.001993679090216147,
"loss": 3.0915,
"step": 4185
},
{
"epoch": 1.3234620548053384,
"grad_norm": 0.1355029139391511,
"learning_rate": 0.0019936170347966214,
"loss": 3.0743,
"step": 4190
},
{
"epoch": 1.3250414593698177,
"grad_norm": 0.09561133746027113,
"learning_rate": 0.001993554677224504,
"loss": 3.0205,
"step": 4195
},
{
"epoch": 1.3266208639342967,
"grad_norm": 0.12680139091132397,
"learning_rate": 0.001993492017518757,
"loss": 3.0777,
"step": 4200
},
{
"epoch": 1.328200268498776,
"grad_norm": 0.10167135455161361,
"learning_rate": 0.0019934290556984356,
"loss": 3.0413,
"step": 4205
},
{
"epoch": 1.3297796730632552,
"grad_norm": 0.09893384227573258,
"learning_rate": 0.001993365791782685,
"loss": 3.0541,
"step": 4210
},
{
"epoch": 1.3313590776277344,
"grad_norm": 0.12806959547374103,
"learning_rate": 0.0019933022257907444,
"loss": 3.1291,
"step": 4215
},
{
"epoch": 1.3329384821922137,
"grad_norm": 0.10631455656766883,
"learning_rate": 0.001993238357741943,
"loss": 3.066,
"step": 4220
},
{
"epoch": 1.3345178867566927,
"grad_norm": 0.09020847358227843,
"learning_rate": 0.0019931741876557034,
"loss": 2.9794,
"step": 4225
},
{
"epoch": 1.336097291321172,
"grad_norm": 0.11716990851001001,
"learning_rate": 0.0019931097155515384,
"loss": 3.087,
"step": 4230
},
{
"epoch": 1.3376766958856512,
"grad_norm": 0.1559309966590921,
"learning_rate": 0.001993044941449054,
"loss": 3.0382,
"step": 4235
},
{
"epoch": 1.3392561004501302,
"grad_norm": 0.18501164183590774,
"learning_rate": 0.001992979865367948,
"loss": 3.1347,
"step": 4240
},
{
"epoch": 1.3408355050146095,
"grad_norm": 0.13204163189060986,
"learning_rate": 0.001992914487328009,
"loss": 3.1074,
"step": 4245
},
{
"epoch": 1.3424149095790887,
"grad_norm": 0.13187411270146468,
"learning_rate": 0.0019928488073491187,
"loss": 3.0863,
"step": 4250
},
{
"epoch": 1.343994314143568,
"grad_norm": 0.14665195514889143,
"learning_rate": 0.0019927828254512493,
"loss": 3.083,
"step": 4255
},
{
"epoch": 1.345573718708047,
"grad_norm": 0.11771818576434624,
"learning_rate": 0.0019927165416544655,
"loss": 2.9603,
"step": 4260
},
{
"epoch": 1.3471531232725262,
"grad_norm": 0.0999785437643928,
"learning_rate": 0.0019926499559789245,
"loss": 3.0477,
"step": 4265
},
{
"epoch": 1.3487325278370055,
"grad_norm": 0.09901228786007872,
"learning_rate": 0.001992583068444874,
"loss": 3.0167,
"step": 4270
},
{
"epoch": 1.3503119324014845,
"grad_norm": 0.08862562160808428,
"learning_rate": 0.001992515879072654,
"loss": 3.0456,
"step": 4275
},
{
"epoch": 1.3518913369659638,
"grad_norm": 0.10845634985741569,
"learning_rate": 0.0019924483878826964,
"loss": 2.9824,
"step": 4280
},
{
"epoch": 1.353470741530443,
"grad_norm": 0.0995199133515051,
"learning_rate": 0.001992380594895525,
"loss": 3.084,
"step": 4285
},
{
"epoch": 1.3550501460949222,
"grad_norm": 0.1329601121160114,
"learning_rate": 0.001992312500131756,
"loss": 3.0172,
"step": 4290
},
{
"epoch": 1.3566295506594015,
"grad_norm": 0.1140598576669714,
"learning_rate": 0.001992244103612095,
"loss": 3.0689,
"step": 4295
},
{
"epoch": 1.3582089552238805,
"grad_norm": 0.12755509737439705,
"learning_rate": 0.0019921754053573416,
"loss": 3.0067,
"step": 4300
},
{
"epoch": 1.3597883597883598,
"grad_norm": 0.09952094527330028,
"learning_rate": 0.001992106405388387,
"loss": 3.0083,
"step": 4305
},
{
"epoch": 1.361367764352839,
"grad_norm": 0.11698793022306847,
"learning_rate": 0.001992037103726213,
"loss": 3.0387,
"step": 4310
},
{
"epoch": 1.362947168917318,
"grad_norm": 0.1363797347516853,
"learning_rate": 0.001991967500391894,
"loss": 3.0957,
"step": 4315
},
{
"epoch": 1.3645265734817973,
"grad_norm": 0.10233191377919058,
"learning_rate": 0.0019918975954065963,
"loss": 3.0116,
"step": 4320
},
{
"epoch": 1.3661059780462765,
"grad_norm": 0.1308041636276131,
"learning_rate": 0.0019918273887915773,
"loss": 3.1729,
"step": 4325
},
{
"epoch": 1.3676853826107558,
"grad_norm": 0.11065995577696967,
"learning_rate": 0.001991756880568186,
"loss": 3.118,
"step": 4330
},
{
"epoch": 1.369264787175235,
"grad_norm": 0.13714608654227864,
"learning_rate": 0.0019916860707578643,
"loss": 3.0521,
"step": 4335
},
{
"epoch": 1.370844191739714,
"grad_norm": 0.11194789482141944,
"learning_rate": 0.001991614959382144,
"loss": 2.9706,
"step": 4340
},
{
"epoch": 1.3724235963041933,
"grad_norm": 0.1637143852040583,
"learning_rate": 0.0019915435464626504,
"loss": 2.9616,
"step": 4345
},
{
"epoch": 1.3740030008686726,
"grad_norm": 0.12488036224199506,
"learning_rate": 0.0019914718320210995,
"loss": 3.0638,
"step": 4350
},
{
"epoch": 1.3755824054331516,
"grad_norm": 0.11984278163621372,
"learning_rate": 0.001991399816079299,
"loss": 3.0649,
"step": 4355
},
{
"epoch": 1.3771618099976308,
"grad_norm": 0.10466430162726809,
"learning_rate": 0.001991327498659149,
"loss": 2.9393,
"step": 4360
},
{
"epoch": 1.37874121456211,
"grad_norm": 0.1185557598443159,
"learning_rate": 0.00199125487978264,
"loss": 3.0532,
"step": 4365
},
{
"epoch": 1.3803206191265893,
"grad_norm": 0.10203033131461912,
"learning_rate": 0.0019911819594718556,
"loss": 2.9857,
"step": 4370
},
{
"epoch": 1.3819000236910686,
"grad_norm": 0.0987878808276104,
"learning_rate": 0.00199110873774897,
"loss": 3.0392,
"step": 4375
},
{
"epoch": 1.3834794282555476,
"grad_norm": 0.11449697953789201,
"learning_rate": 0.0019910352146362497,
"loss": 3.059,
"step": 4380
},
{
"epoch": 1.3850588328200268,
"grad_norm": 0.1545129904912003,
"learning_rate": 0.0019909613901560527,
"loss": 3.0935,
"step": 4385
},
{
"epoch": 1.386638237384506,
"grad_norm": 0.17034858880784884,
"learning_rate": 0.0019908872643308283,
"loss": 3.0646,
"step": 4390
},
{
"epoch": 1.3882176419489851,
"grad_norm": 0.11078244673665372,
"learning_rate": 0.0019908128371831178,
"loss": 3.1438,
"step": 4395
},
{
"epoch": 1.3897970465134644,
"grad_norm": 0.13774166764383755,
"learning_rate": 0.0019907381087355537,
"loss": 3.0062,
"step": 4400
},
{
"epoch": 1.3913764510779436,
"grad_norm": 0.10786798709984459,
"learning_rate": 0.001990663079010861,
"loss": 3.0725,
"step": 4405
},
{
"epoch": 1.3929558556424229,
"grad_norm": 0.12162781102145018,
"learning_rate": 0.0019905877480318555,
"loss": 3.0076,
"step": 4410
},
{
"epoch": 1.394535260206902,
"grad_norm": 0.10185279989897872,
"learning_rate": 0.0019905121158214447,
"loss": 3.0459,
"step": 4415
},
{
"epoch": 1.3961146647713811,
"grad_norm": 0.1437979177483079,
"learning_rate": 0.001990436182402628,
"loss": 3.0439,
"step": 4420
},
{
"epoch": 1.3976940693358604,
"grad_norm": 0.13528480927930733,
"learning_rate": 0.001990359947798497,
"loss": 2.9768,
"step": 4425
},
{
"epoch": 1.3992734739003396,
"grad_norm": 0.10152004278416521,
"learning_rate": 0.001990283412032233,
"loss": 3.0177,
"step": 4430
},
{
"epoch": 1.4008528784648187,
"grad_norm": 0.10698464459247188,
"learning_rate": 0.00199020657512711,
"loss": 2.9564,
"step": 4435
},
{
"epoch": 1.402432283029298,
"grad_norm": 0.12627145665399941,
"learning_rate": 0.0019901294371064944,
"loss": 2.9423,
"step": 4440
},
{
"epoch": 1.4040116875937771,
"grad_norm": 0.13409019326545243,
"learning_rate": 0.0019900519979938434,
"loss": 3.0,
"step": 4445
},
{
"epoch": 1.4055910921582564,
"grad_norm": 0.10552324075089604,
"learning_rate": 0.001989974257812705,
"loss": 2.9409,
"step": 4450
},
{
"epoch": 1.4071704967227356,
"grad_norm": 0.122826319671673,
"learning_rate": 0.0019898962165867205,
"loss": 3.0135,
"step": 4455
},
{
"epoch": 1.4087499012872147,
"grad_norm": 0.11354791157697926,
"learning_rate": 0.0019898178743396207,
"loss": 3.0774,
"step": 4460
},
{
"epoch": 1.410329305851694,
"grad_norm": 0.15224792619213673,
"learning_rate": 0.0019897392310952292,
"loss": 3.0452,
"step": 4465
},
{
"epoch": 1.4119087104161732,
"grad_norm": 0.10087355471090065,
"learning_rate": 0.0019896602868774618,
"loss": 2.9939,
"step": 4470
},
{
"epoch": 1.4134881149806522,
"grad_norm": 0.08272242215342274,
"learning_rate": 0.001989581041710324,
"loss": 3.0465,
"step": 4475
},
{
"epoch": 1.4150675195451314,
"grad_norm": 0.11210873003897295,
"learning_rate": 0.001989501495617914,
"loss": 3.1138,
"step": 4480
},
{
"epoch": 1.4166469241096107,
"grad_norm": 0.09534871752668661,
"learning_rate": 0.001989421648624421,
"loss": 2.9726,
"step": 4485
},
{
"epoch": 1.41822632867409,
"grad_norm": 0.10770938407123477,
"learning_rate": 0.0019893415007541265,
"loss": 3.1972,
"step": 4490
},
{
"epoch": 1.4198057332385692,
"grad_norm": 0.09610267990196253,
"learning_rate": 0.001989261052031403,
"loss": 3.13,
"step": 4495
},
{
"epoch": 1.4213851378030482,
"grad_norm": 0.13605706757511912,
"learning_rate": 0.0019891803024807138,
"loss": 3.068,
"step": 4500
},
{
"epoch": 1.4229645423675275,
"grad_norm": 0.12325106578155333,
"learning_rate": 0.0019890992521266145,
"loss": 2.9774,
"step": 4505
},
{
"epoch": 1.4245439469320067,
"grad_norm": 0.14455830288755464,
"learning_rate": 0.0019890179009937527,
"loss": 3.0241,
"step": 4510
},
{
"epoch": 1.4261233514964857,
"grad_norm": 0.1052669832362689,
"learning_rate": 0.0019889362491068655,
"loss": 2.9926,
"step": 4515
},
{
"epoch": 1.427702756060965,
"grad_norm": 0.1642757068083475,
"learning_rate": 0.001988854296490784,
"loss": 3.0745,
"step": 4520
},
{
"epoch": 1.4292821606254442,
"grad_norm": 0.20946325027308604,
"learning_rate": 0.001988772043170429,
"loss": 3.0698,
"step": 4525
},
{
"epoch": 1.4308615651899235,
"grad_norm": 0.13936163585211253,
"learning_rate": 0.001988689489170813,
"loss": 2.999,
"step": 4530
},
{
"epoch": 1.4324409697544027,
"grad_norm": 0.13050964476816038,
"learning_rate": 0.0019886066345170396,
"loss": 3.004,
"step": 4535
},
{
"epoch": 1.4340203743188817,
"grad_norm": 0.125075763003334,
"learning_rate": 0.0019885234792343057,
"loss": 3.0632,
"step": 4540
},
{
"epoch": 1.435599778883361,
"grad_norm": 0.08896244058788337,
"learning_rate": 0.0019884400233478976,
"loss": 2.9419,
"step": 4545
},
{
"epoch": 1.4371791834478402,
"grad_norm": 0.11862506407549697,
"learning_rate": 0.001988356266883193,
"loss": 2.9654,
"step": 4550
},
{
"epoch": 1.4387585880123193,
"grad_norm": 0.12431900571134885,
"learning_rate": 0.001988272209865663,
"loss": 2.8973,
"step": 4555
},
{
"epoch": 1.4403379925767985,
"grad_norm": 0.1088210197989654,
"learning_rate": 0.0019881878523208686,
"loss": 3.0581,
"step": 4560
},
{
"epoch": 1.4419173971412778,
"grad_norm": 0.10081325955650519,
"learning_rate": 0.0019881031942744617,
"loss": 2.9722,
"step": 4565
},
{
"epoch": 1.443496801705757,
"grad_norm": 0.09884952322328694,
"learning_rate": 0.0019880182357521867,
"loss": 3.1218,
"step": 4570
},
{
"epoch": 1.4450762062702363,
"grad_norm": 0.08478558540659335,
"learning_rate": 0.0019879329767798787,
"loss": 2.9096,
"step": 4575
},
{
"epoch": 1.4466556108347153,
"grad_norm": 0.10555755370610553,
"learning_rate": 0.001987847417383464,
"loss": 2.9623,
"step": 4580
},
{
"epoch": 1.4482350153991945,
"grad_norm": 0.08622649112731266,
"learning_rate": 0.001987761557588962,
"loss": 3.0071,
"step": 4585
},
{
"epoch": 1.4498144199636738,
"grad_norm": 0.09445114381187443,
"learning_rate": 0.001987675397422481,
"loss": 2.9472,
"step": 4590
},
{
"epoch": 1.4513938245281528,
"grad_norm": 0.08645520160432053,
"learning_rate": 0.001987588936910222,
"loss": 3.0327,
"step": 4595
},
{
"epoch": 1.452973229092632,
"grad_norm": 0.09266557898694067,
"learning_rate": 0.0019875021760784773,
"loss": 2.9679,
"step": 4600
},
{
"epoch": 1.4545526336571113,
"grad_norm": 0.09510456931243706,
"learning_rate": 0.00198741511495363,
"loss": 3.0081,
"step": 4605
},
{
"epoch": 1.4561320382215905,
"grad_norm": 0.10882985029674294,
"learning_rate": 0.0019873277535621555,
"loss": 2.9785,
"step": 4610
},
{
"epoch": 1.4577114427860698,
"grad_norm": 0.09723739191160814,
"learning_rate": 0.001987240091930619,
"loss": 2.858,
"step": 4615
},
{
"epoch": 1.4592908473505488,
"grad_norm": 0.08482317791765791,
"learning_rate": 0.001987152130085678,
"loss": 2.9225,
"step": 4620
},
{
"epoch": 1.460870251915028,
"grad_norm": 0.09570859232600319,
"learning_rate": 0.0019870638680540816,
"loss": 3.0148,
"step": 4625
},
{
"epoch": 1.462449656479507,
"grad_norm": 0.09277192471107024,
"learning_rate": 0.0019869753058626696,
"loss": 3.0133,
"step": 4630
},
{
"epoch": 1.4640290610439863,
"grad_norm": 0.11133410369142042,
"learning_rate": 0.0019868864435383725,
"loss": 2.8989,
"step": 4635
},
{
"epoch": 1.4656084656084656,
"grad_norm": 0.07727152770646367,
"learning_rate": 0.0019867972811082137,
"loss": 2.9846,
"step": 4640
},
{
"epoch": 1.4671878701729448,
"grad_norm": 0.097514775027411,
"learning_rate": 0.0019867078185993067,
"loss": 2.9711,
"step": 4645
},
{
"epoch": 1.468767274737424,
"grad_norm": 0.09621032301747534,
"learning_rate": 0.0019866180560388557,
"loss": 3.0579,
"step": 4650
},
{
"epoch": 1.470346679301903,
"grad_norm": 0.09648128346327675,
"learning_rate": 0.0019865279934541584,
"loss": 3.0346,
"step": 4655
},
{
"epoch": 1.4719260838663824,
"grad_norm": 0.11365142360663955,
"learning_rate": 0.0019864376308726004,
"loss": 2.9421,
"step": 4660
},
{
"epoch": 1.4735054884308616,
"grad_norm": 0.091741612081307,
"learning_rate": 0.0019863469683216624,
"loss": 2.9495,
"step": 4665
},
{
"epoch": 1.4750848929953406,
"grad_norm": 0.10035727620230114,
"learning_rate": 0.0019862560058289125,
"loss": 2.9744,
"step": 4670
},
{
"epoch": 1.4766642975598199,
"grad_norm": 0.09735171737874418,
"learning_rate": 0.001986164743422013,
"loss": 2.967,
"step": 4675
},
{
"epoch": 1.4782437021242991,
"grad_norm": 0.11441091058772467,
"learning_rate": 0.0019860731811287154,
"loss": 3.024,
"step": 4680
},
{
"epoch": 1.4798231066887784,
"grad_norm": 0.09619007273414427,
"learning_rate": 0.0019859813189768644,
"loss": 2.9589,
"step": 4685
},
{
"epoch": 1.4814025112532576,
"grad_norm": 0.1296771374184954,
"learning_rate": 0.0019858891569943934,
"loss": 2.9977,
"step": 4690
},
{
"epoch": 1.4829819158177366,
"grad_norm": 0.11256784358801805,
"learning_rate": 0.0019857966952093286,
"loss": 2.9332,
"step": 4695
},
{
"epoch": 1.4845613203822159,
"grad_norm": 0.11178265985854806,
"learning_rate": 0.0019857039336497874,
"loss": 2.9517,
"step": 4700
},
{
"epoch": 1.4861407249466951,
"grad_norm": 0.11370303447286977,
"learning_rate": 0.001985610872343978,
"loss": 2.9207,
"step": 4705
},
{
"epoch": 1.4877201295111742,
"grad_norm": 0.10863653182760975,
"learning_rate": 0.0019855175113201993,
"loss": 2.8895,
"step": 4710
},
{
"epoch": 1.4892995340756534,
"grad_norm": 0.09663215015277192,
"learning_rate": 0.001985423850606842,
"loss": 2.9217,
"step": 4715
},
{
"epoch": 1.4908789386401327,
"grad_norm": 0.13809403189713054,
"learning_rate": 0.001985329890232388,
"loss": 3.0305,
"step": 4720
},
{
"epoch": 1.492458343204612,
"grad_norm": 0.11821789782684203,
"learning_rate": 0.0019852356302254097,
"loss": 2.9642,
"step": 4725
},
{
"epoch": 1.4940377477690912,
"grad_norm": 0.11090497998614943,
"learning_rate": 0.001985141070614571,
"loss": 2.9144,
"step": 4730
},
{
"epoch": 1.4956171523335702,
"grad_norm": 0.08742000966752303,
"learning_rate": 0.001985046211428627,
"loss": 2.9174,
"step": 4735
},
{
"epoch": 1.4971965568980494,
"grad_norm": 0.09589758068562293,
"learning_rate": 0.001984951052696424,
"loss": 3.0086,
"step": 4740
},
{
"epoch": 1.4987759614625287,
"grad_norm": 0.09592446840177818,
"learning_rate": 0.001984855594446899,
"loss": 3.017,
"step": 4745
},
{
"epoch": 1.5003553660270077,
"grad_norm": 0.10689358313731816,
"learning_rate": 0.0019847598367090796,
"loss": 2.9822,
"step": 4750
},
{
"epoch": 1.501934770591487,
"grad_norm": 0.09503783883425908,
"learning_rate": 0.0019846637795120857,
"loss": 2.9189,
"step": 4755
},
{
"epoch": 1.5035141751559662,
"grad_norm": 0.09938516992928695,
"learning_rate": 0.001984567422885128,
"loss": 3.0234,
"step": 4760
},
{
"epoch": 1.5050935797204454,
"grad_norm": 0.0883607481998521,
"learning_rate": 0.0019844707668575075,
"loss": 2.9138,
"step": 4765
},
{
"epoch": 1.5066729842849247,
"grad_norm": 0.1508418282651358,
"learning_rate": 0.001984373811458617,
"loss": 2.9186,
"step": 4770
},
{
"epoch": 1.508252388849404,
"grad_norm": 0.09830299547741146,
"learning_rate": 0.001984276556717939,
"loss": 2.9431,
"step": 4775
},
{
"epoch": 1.509831793413883,
"grad_norm": 0.10920442240888395,
"learning_rate": 0.0019841790026650496,
"loss": 3.0,
"step": 4780
},
{
"epoch": 1.5114111979783622,
"grad_norm": 0.12541008593529593,
"learning_rate": 0.0019840811493296133,
"loss": 2.9093,
"step": 4785
},
{
"epoch": 1.5129906025428412,
"grad_norm": 0.12553700549411517,
"learning_rate": 0.001983982996741387,
"loss": 2.94,
"step": 4790
},
{
"epoch": 1.5145700071073205,
"grad_norm": 0.10866989638304686,
"learning_rate": 0.001983884544930218,
"loss": 2.8466,
"step": 4795
},
{
"epoch": 1.5161494116717997,
"grad_norm": 0.09874987150780433,
"learning_rate": 0.0019837857939260456,
"loss": 2.8555,
"step": 4800
},
{
"epoch": 1.517728816236279,
"grad_norm": 0.09699900376639088,
"learning_rate": 0.0019836867437588988,
"loss": 2.9514,
"step": 4805
},
{
"epoch": 1.5193082208007582,
"grad_norm": 0.09793401970835895,
"learning_rate": 0.0019835873944588976,
"loss": 2.9401,
"step": 4810
},
{
"epoch": 1.5208876253652373,
"grad_norm": 0.08558765289450597,
"learning_rate": 0.0019834877460562545,
"loss": 2.9612,
"step": 4815
},
{
"epoch": 1.5224670299297165,
"grad_norm": 0.11862913842448683,
"learning_rate": 0.0019833877985812715,
"loss": 2.9126,
"step": 4820
},
{
"epoch": 1.5240464344941955,
"grad_norm": 0.0985856306095801,
"learning_rate": 0.0019832875520643415,
"loss": 2.966,
"step": 4825
},
{
"epoch": 1.5256258390586748,
"grad_norm": 0.08714163509947663,
"learning_rate": 0.0019831870065359497,
"loss": 3.0231,
"step": 4830
},
{
"epoch": 1.527205243623154,
"grad_norm": 0.08236307357637031,
"learning_rate": 0.001983086162026671,
"loss": 2.8827,
"step": 4835
},
{
"epoch": 1.5287846481876333,
"grad_norm": 0.07833304673352745,
"learning_rate": 0.0019829850185671717,
"loss": 2.9736,
"step": 4840
},
{
"epoch": 1.5303640527521125,
"grad_norm": 0.0953787909476679,
"learning_rate": 0.0019828835761882086,
"loss": 2.9241,
"step": 4845
},
{
"epoch": 1.5319434573165918,
"grad_norm": 0.08773467195247037,
"learning_rate": 0.0019827818349206295,
"loss": 2.996,
"step": 4850
},
{
"epoch": 1.5335228618810708,
"grad_norm": 0.10556485908717649,
"learning_rate": 0.001982679794795374,
"loss": 2.9428,
"step": 4855
},
{
"epoch": 1.53510226644555,
"grad_norm": 0.11706965366371103,
"learning_rate": 0.001982577455843471,
"loss": 3.0093,
"step": 4860
},
{
"epoch": 1.536681671010029,
"grad_norm": 0.11575180262595218,
"learning_rate": 0.0019824748180960416,
"loss": 2.9668,
"step": 4865
},
{
"epoch": 1.5382610755745083,
"grad_norm": 0.09629159206346939,
"learning_rate": 0.0019823718815842974,
"loss": 2.9568,
"step": 4870
},
{
"epoch": 1.5398404801389876,
"grad_norm": 0.13118955864125037,
"learning_rate": 0.0019822686463395406,
"loss": 2.9814,
"step": 4875
},
{
"epoch": 1.5414198847034668,
"grad_norm": 0.11129128119414668,
"learning_rate": 0.001982165112393164,
"loss": 2.8452,
"step": 4880
},
{
"epoch": 1.542999289267946,
"grad_norm": 0.11644279357587702,
"learning_rate": 0.0019820612797766526,
"loss": 2.9676,
"step": 4885
},
{
"epoch": 1.5445786938324253,
"grad_norm": 0.09994823334524525,
"learning_rate": 0.00198195714852158,
"loss": 2.8952,
"step": 4890
},
{
"epoch": 1.5461580983969043,
"grad_norm": 0.10139353655854862,
"learning_rate": 0.0019818527186596124,
"loss": 2.9744,
"step": 4895
},
{
"epoch": 1.5477375029613836,
"grad_norm": 0.1039446792162674,
"learning_rate": 0.0019817479902225067,
"loss": 2.9435,
"step": 4900
},
{
"epoch": 1.5493169075258626,
"grad_norm": 0.0796025536388445,
"learning_rate": 0.0019816429632421094,
"loss": 2.8935,
"step": 4905
},
{
"epoch": 1.5508963120903418,
"grad_norm": 0.10209172127058755,
"learning_rate": 0.0019815376377503593,
"loss": 2.9229,
"step": 4910
},
{
"epoch": 1.552475716654821,
"grad_norm": 0.10910718583932719,
"learning_rate": 0.001981432013779284,
"loss": 3.0488,
"step": 4915
},
{
"epoch": 1.5540551212193003,
"grad_norm": 0.1521828551806453,
"learning_rate": 0.0019813260913610045,
"loss": 2.9689,
"step": 4920
},
{
"epoch": 1.5556345257837796,
"grad_norm": 0.10994423634599962,
"learning_rate": 0.0019812198705277304,
"loss": 2.9151,
"step": 4925
},
{
"epoch": 1.5572139303482588,
"grad_norm": 0.09818489043815783,
"learning_rate": 0.0019811133513117627,
"loss": 2.8832,
"step": 4930
},
{
"epoch": 1.5587933349127379,
"grad_norm": 0.11140774818652234,
"learning_rate": 0.0019810065337454935,
"loss": 3.035,
"step": 4935
},
{
"epoch": 1.560372739477217,
"grad_norm": 0.1280699733503442,
"learning_rate": 0.001980899417861405,
"loss": 2.9458,
"step": 4940
},
{
"epoch": 1.5619521440416961,
"grad_norm": 0.1331934725393578,
"learning_rate": 0.001980792003692071,
"loss": 3.0324,
"step": 4945
},
{
"epoch": 1.5635315486061754,
"grad_norm": 0.11489890170242362,
"learning_rate": 0.001980684291270155,
"loss": 2.8935,
"step": 4950
},
{
"epoch": 1.5651109531706546,
"grad_norm": 0.09146807750801558,
"learning_rate": 0.001980576280628412,
"loss": 2.9042,
"step": 4955
},
{
"epoch": 1.5666903577351339,
"grad_norm": 0.10620129749069569,
"learning_rate": 0.0019804679717996864,
"loss": 3.0244,
"step": 4960
},
{
"epoch": 1.5682697622996131,
"grad_norm": 0.11571456540243834,
"learning_rate": 0.001980359364816916,
"loss": 2.8789,
"step": 4965
},
{
"epoch": 1.5698491668640924,
"grad_norm": 0.10809737964437593,
"learning_rate": 0.001980250459713126,
"loss": 2.9525,
"step": 4970
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.09186481444865727,
"learning_rate": 0.001980141256521434,
"loss": 2.9055,
"step": 4975
},
{
"epoch": 1.5730079759930506,
"grad_norm": 0.10354895162682684,
"learning_rate": 0.001980031755275048,
"loss": 2.9214,
"step": 4980
},
{
"epoch": 1.5745873805575297,
"grad_norm": 0.10228779853102214,
"learning_rate": 0.001979921956007267,
"loss": 2.9981,
"step": 4985
},
{
"epoch": 1.576166785122009,
"grad_norm": 0.11682083874165497,
"learning_rate": 0.0019798118587514802,
"loss": 3.0015,
"step": 4990
},
{
"epoch": 1.5777461896864882,
"grad_norm": 0.12403310979186538,
"learning_rate": 0.0019797014635411676,
"loss": 2.9444,
"step": 4995
},
{
"epoch": 1.5793255942509674,
"grad_norm": 0.11390491651356927,
"learning_rate": 0.001979590770409899,
"loss": 2.9445,
"step": 5000
},
{
"epoch": 1.5809049988154467,
"grad_norm": 0.10930893827752045,
"learning_rate": 0.001979479779391336,
"loss": 2.8585,
"step": 5005
},
{
"epoch": 1.582484403379926,
"grad_norm": 0.11160804717514534,
"learning_rate": 0.0019793684905192303,
"loss": 2.9065,
"step": 5010
},
{
"epoch": 1.584063807944405,
"grad_norm": 0.10915732890128317,
"learning_rate": 0.001979256903827424,
"loss": 2.993,
"step": 5015
},
{
"epoch": 1.5856432125088842,
"grad_norm": 0.07314647716296047,
"learning_rate": 0.0019791450193498497,
"loss": 2.849,
"step": 5020
},
{
"epoch": 1.5872226170733632,
"grad_norm": 0.07912808490967632,
"learning_rate": 0.001979032837120531,
"loss": 2.9493,
"step": 5025
},
{
"epoch": 1.5888020216378425,
"grad_norm": 0.14762568424607497,
"learning_rate": 0.001978920357173582,
"loss": 2.9197,
"step": 5030
},
{
"epoch": 1.5903814262023217,
"grad_norm": 0.15722179817288173,
"learning_rate": 0.0019788075795432064,
"loss": 2.9489,
"step": 5035
},
{
"epoch": 1.591960830766801,
"grad_norm": 0.1389110492503963,
"learning_rate": 0.0019786945042637,
"loss": 2.9015,
"step": 5040
},
{
"epoch": 1.5935402353312802,
"grad_norm": 0.11722287293596434,
"learning_rate": 0.0019785811313694475,
"loss": 2.9693,
"step": 5045
},
{
"epoch": 1.5951196398957594,
"grad_norm": 0.1176910685832534,
"learning_rate": 0.0019784674608949258,
"loss": 2.9582,
"step": 5050
},
{
"epoch": 1.5966990444602385,
"grad_norm": 0.1076042621246041,
"learning_rate": 0.0019783534928747007,
"loss": 3.05,
"step": 5055
},
{
"epoch": 1.5982784490247177,
"grad_norm": 0.08073816464558037,
"learning_rate": 0.001978239227343429,
"loss": 2.8257,
"step": 5060
},
{
"epoch": 1.5998578535891967,
"grad_norm": 0.09398261646040879,
"learning_rate": 0.0019781246643358584,
"loss": 2.9379,
"step": 5065
},
{
"epoch": 1.601437258153676,
"grad_norm": 0.15458833787235945,
"learning_rate": 0.001978009803886827,
"loss": 2.8705,
"step": 5070
},
{
"epoch": 1.6030166627181552,
"grad_norm": 0.09455723991841981,
"learning_rate": 0.001977894646031263,
"loss": 2.9589,
"step": 5075
},
{
"epoch": 1.6045960672826345,
"grad_norm": 0.12283382903989729,
"learning_rate": 0.001977779190804185,
"loss": 2.9334,
"step": 5080
},
{
"epoch": 1.6061754718471137,
"grad_norm": 0.09996430272852855,
"learning_rate": 0.0019776634382407026,
"loss": 2.9807,
"step": 5085
},
{
"epoch": 1.607754876411593,
"grad_norm": 0.1110137840520642,
"learning_rate": 0.0019775473883760146,
"loss": 2.8746,
"step": 5090
},
{
"epoch": 1.609334280976072,
"grad_norm": 0.09264037749590708,
"learning_rate": 0.0019774310412454116,
"loss": 2.9106,
"step": 5095
},
{
"epoch": 1.6109136855405513,
"grad_norm": 0.09037511086608532,
"learning_rate": 0.001977314396884274,
"loss": 2.9531,
"step": 5100
},
{
"epoch": 1.6124930901050303,
"grad_norm": 0.0975055802153603,
"learning_rate": 0.0019771974553280725,
"loss": 2.8963,
"step": 5105
},
{
"epoch": 1.6140724946695095,
"grad_norm": 0.11115116771807802,
"learning_rate": 0.0019770802166123687,
"loss": 2.8541,
"step": 5110
},
{
"epoch": 1.6156518992339888,
"grad_norm": 0.10695337184066091,
"learning_rate": 0.001976962680772813,
"loss": 2.9809,
"step": 5115
},
{
"epoch": 1.617231303798468,
"grad_norm": 0.104160743986913,
"learning_rate": 0.001976844847845149,
"loss": 3.0279,
"step": 5120
},
{
"epoch": 1.6188107083629473,
"grad_norm": 0.12343849835439115,
"learning_rate": 0.0019767267178652076,
"loss": 2.8222,
"step": 5125
},
{
"epoch": 1.6203901129274265,
"grad_norm": 0.0985309741194177,
"learning_rate": 0.0019766082908689118,
"loss": 2.9535,
"step": 5130
},
{
"epoch": 1.6219695174919055,
"grad_norm": 0.1007765759478301,
"learning_rate": 0.001976489566892274,
"loss": 2.9793,
"step": 5135
},
{
"epoch": 1.6235489220563848,
"grad_norm": 0.12202324187509646,
"learning_rate": 0.0019763705459713986,
"loss": 2.9642,
"step": 5140
},
{
"epoch": 1.6251283266208638,
"grad_norm": 0.09713677698438208,
"learning_rate": 0.0019762512281424776,
"loss": 2.9162,
"step": 5145
},
{
"epoch": 1.626707731185343,
"grad_norm": 0.08602069645790571,
"learning_rate": 0.001976131613441796,
"loss": 2.9203,
"step": 5150
},
{
"epoch": 1.6282871357498223,
"grad_norm": 0.1145716985651162,
"learning_rate": 0.0019760117019057277,
"loss": 2.9748,
"step": 5155
},
{
"epoch": 1.6298665403143016,
"grad_norm": 0.10787362431996148,
"learning_rate": 0.001975891493570737,
"loss": 3.0134,
"step": 5160
},
{
"epoch": 1.6314459448787808,
"grad_norm": 0.11064743389167223,
"learning_rate": 0.0019757709884733773,
"loss": 2.9397,
"step": 5165
},
{
"epoch": 1.63302534944326,
"grad_norm": 0.08614236606541124,
"learning_rate": 0.001975650186650295,
"loss": 2.9192,
"step": 5170
},
{
"epoch": 1.634604754007739,
"grad_norm": 0.09366852847388188,
"learning_rate": 0.0019755290881382243,
"loss": 2.9402,
"step": 5175
},
{
"epoch": 1.6361841585722183,
"grad_norm": 0.09730626420222427,
"learning_rate": 0.0019754076929739905,
"loss": 2.8907,
"step": 5180
},
{
"epoch": 1.6377635631366974,
"grad_norm": 0.10083090191566357,
"learning_rate": 0.00197528600119451,
"loss": 2.9226,
"step": 5185
},
{
"epoch": 1.6393429677011766,
"grad_norm": 0.1050001786747672,
"learning_rate": 0.0019751640128367874,
"loss": 2.9151,
"step": 5190
},
{
"epoch": 1.6409223722656558,
"grad_norm": 0.1020005922841891,
"learning_rate": 0.001975041727937919,
"loss": 2.8825,
"step": 5195
},
{
"epoch": 1.642501776830135,
"grad_norm": 0.09371317149531022,
"learning_rate": 0.001974919146535091,
"loss": 2.9128,
"step": 5200
},
{
"epoch": 1.6440811813946143,
"grad_norm": 0.10562421951491806,
"learning_rate": 0.001974796268665579,
"loss": 2.8204,
"step": 5205
},
{
"epoch": 1.6456605859590934,
"grad_norm": 0.11761558427266451,
"learning_rate": 0.0019746730943667502,
"loss": 2.9857,
"step": 5210
},
{
"epoch": 1.6472399905235726,
"grad_norm": 0.10865044339281003,
"learning_rate": 0.001974549623676061,
"loss": 2.8921,
"step": 5215
},
{
"epoch": 1.6488193950880516,
"grad_norm": 0.11581069329966616,
"learning_rate": 0.0019744258566310575,
"loss": 2.878,
"step": 5220
},
{
"epoch": 1.650398799652531,
"grad_norm": 0.1170453726056059,
"learning_rate": 0.0019743017932693763,
"loss": 2.8566,
"step": 5225
},
{
"epoch": 1.6519782042170101,
"grad_norm": 0.10845987251851028,
"learning_rate": 0.0019741774336287455,
"loss": 2.9,
"step": 5230
},
{
"epoch": 1.6535576087814894,
"grad_norm": 0.1024077422432529,
"learning_rate": 0.001974052777746981,
"loss": 2.9584,
"step": 5235
},
{
"epoch": 1.6551370133459686,
"grad_norm": 0.09824210313823284,
"learning_rate": 0.0019739278256619905,
"loss": 2.8304,
"step": 5240
},
{
"epoch": 1.6567164179104479,
"grad_norm": 0.09766997021888332,
"learning_rate": 0.0019738025774117705,
"loss": 2.9197,
"step": 5245
},
{
"epoch": 1.658295822474927,
"grad_norm": 0.11670891039271215,
"learning_rate": 0.0019736770330344086,
"loss": 2.829,
"step": 5250
},
{
"epoch": 1.6598752270394062,
"grad_norm": 0.10093571661443894,
"learning_rate": 0.001973551192568082,
"loss": 2.8732,
"step": 5255
},
{
"epoch": 1.6614546316038852,
"grad_norm": 0.10502890462616174,
"learning_rate": 0.001973425056051058,
"loss": 2.8835,
"step": 5260
},
{
"epoch": 1.6630340361683644,
"grad_norm": 0.08830167834623308,
"learning_rate": 0.0019732986235216935,
"loss": 2.7945,
"step": 5265
},
{
"epoch": 1.6646134407328437,
"grad_norm": 0.08185027792051866,
"learning_rate": 0.0019731718950184367,
"loss": 2.8968,
"step": 5270
},
{
"epoch": 1.666192845297323,
"grad_norm": 0.110446561845335,
"learning_rate": 0.0019730448705798237,
"loss": 2.8783,
"step": 5275
},
{
"epoch": 1.6677722498618022,
"grad_norm": 0.08209502578334943,
"learning_rate": 0.001972917550244483,
"loss": 2.9148,
"step": 5280
},
{
"epoch": 1.6693516544262814,
"grad_norm": 0.11105492789083406,
"learning_rate": 0.001972789934051131,
"loss": 2.8242,
"step": 5285
},
{
"epoch": 1.6709310589907604,
"grad_norm": 0.10894083448820191,
"learning_rate": 0.001972662022038576,
"loss": 2.8603,
"step": 5290
},
{
"epoch": 1.6725104635552397,
"grad_norm": 0.11154559870326458,
"learning_rate": 0.0019725338142457145,
"loss": 2.8131,
"step": 5295
},
{
"epoch": 1.6740898681197187,
"grad_norm": 0.10610140342224161,
"learning_rate": 0.0019724053107115338,
"loss": 2.8424,
"step": 5300
},
{
"epoch": 1.675669272684198,
"grad_norm": 0.10471075913756912,
"learning_rate": 0.0019722765114751103,
"loss": 2.9517,
"step": 5305
},
{
"epoch": 1.6772486772486772,
"grad_norm": 0.10961610964173045,
"learning_rate": 0.001972147416575612,
"loss": 2.9874,
"step": 5310
},
{
"epoch": 1.6788280818131565,
"grad_norm": 0.11590908074547357,
"learning_rate": 0.001972018026052296,
"loss": 2.9816,
"step": 5315
},
{
"epoch": 1.6804074863776357,
"grad_norm": 0.09014877820325726,
"learning_rate": 0.0019718883399445085,
"loss": 2.8988,
"step": 5320
},
{
"epoch": 1.681986890942115,
"grad_norm": 0.08843261743895112,
"learning_rate": 0.001971758358291686,
"loss": 2.8956,
"step": 5325
},
{
"epoch": 1.683566295506594,
"grad_norm": 0.13524407413650683,
"learning_rate": 0.001971628081133356,
"loss": 2.8855,
"step": 5330
},
{
"epoch": 1.6851457000710732,
"grad_norm": 0.125877431534522,
"learning_rate": 0.001971497508509134,
"loss": 2.8863,
"step": 5335
},
{
"epoch": 1.6867251046355523,
"grad_norm": 0.08065617976051762,
"learning_rate": 0.0019713666404587273,
"loss": 2.8977,
"step": 5340
},
{
"epoch": 1.6883045092000315,
"grad_norm": 0.08766136947767611,
"learning_rate": 0.001971235477021931,
"loss": 2.9427,
"step": 5345
},
{
"epoch": 1.6898839137645107,
"grad_norm": 0.09373146559188211,
"learning_rate": 0.0019711040182386315,
"loss": 2.9298,
"step": 5350
},
{
"epoch": 1.69146331832899,
"grad_norm": 0.09378089597874405,
"learning_rate": 0.001970972264148805,
"loss": 2.8447,
"step": 5355
},
{
"epoch": 1.6930427228934692,
"grad_norm": 0.08863466546773448,
"learning_rate": 0.001970840214792516,
"loss": 2.9024,
"step": 5360
},
{
"epoch": 1.6946221274579485,
"grad_norm": 0.09604705858875463,
"learning_rate": 0.001970707870209921,
"loss": 2.8971,
"step": 5365
},
{
"epoch": 1.6962015320224275,
"grad_norm": 0.08691993851387995,
"learning_rate": 0.0019705752304412646,
"loss": 2.8304,
"step": 5370
},
{
"epoch": 1.6977809365869068,
"grad_norm": 0.08637512290496353,
"learning_rate": 0.001970442295526882,
"loss": 2.863,
"step": 5375
},
{
"epoch": 1.6993603411513858,
"grad_norm": 0.11610905002376368,
"learning_rate": 0.0019703090655071977,
"loss": 2.9165,
"step": 5380
},
{
"epoch": 1.700939745715865,
"grad_norm": 0.0974746503636226,
"learning_rate": 0.001970175540422726,
"loss": 2.8237,
"step": 5385
},
{
"epoch": 1.7025191502803443,
"grad_norm": 0.08653648143193664,
"learning_rate": 0.0019700417203140706,
"loss": 2.8013,
"step": 5390
},
{
"epoch": 1.7040985548448235,
"grad_norm": 0.10331561155385749,
"learning_rate": 0.0019699076052219263,
"loss": 2.8358,
"step": 5395
},
{
"epoch": 1.7056779594093028,
"grad_norm": 0.10734105048757829,
"learning_rate": 0.001969773195187076,
"loss": 2.8952,
"step": 5400
},
{
"epoch": 1.707257363973782,
"grad_norm": 0.10187960767446501,
"learning_rate": 0.001969638490250393,
"loss": 2.8426,
"step": 5405
},
{
"epoch": 1.708836768538261,
"grad_norm": 0.10952454351464175,
"learning_rate": 0.0019695034904528407,
"loss": 2.8591,
"step": 5410
},
{
"epoch": 1.7104161731027403,
"grad_norm": 0.11794184071738911,
"learning_rate": 0.0019693681958354707,
"loss": 3.0401,
"step": 5415
},
{
"epoch": 1.7119955776672193,
"grad_norm": 0.11416879924650811,
"learning_rate": 0.0019692326064394265,
"loss": 2.8389,
"step": 5420
},
{
"epoch": 1.7135749822316986,
"grad_norm": 0.11238489422970098,
"learning_rate": 0.0019690967223059386,
"loss": 2.8062,
"step": 5425
},
{
"epoch": 1.7151543867961778,
"grad_norm": 0.11617189166521032,
"learning_rate": 0.00196896054347633,
"loss": 2.851,
"step": 5430
},
{
"epoch": 1.716733791360657,
"grad_norm": 0.11380891130531372,
"learning_rate": 0.00196882406999201,
"loss": 2.8799,
"step": 5435
},
{
"epoch": 1.7183131959251363,
"grad_norm": 0.0892821522527821,
"learning_rate": 0.001968687301894481,
"loss": 2.826,
"step": 5440
},
{
"epoch": 1.7198926004896156,
"grad_norm": 0.129963031390663,
"learning_rate": 0.0019685502392253326,
"loss": 2.7768,
"step": 5445
},
{
"epoch": 1.7214720050540946,
"grad_norm": 0.14752416451669453,
"learning_rate": 0.0019684128820262443,
"loss": 2.9128,
"step": 5450
},
{
"epoch": 1.7230514096185738,
"grad_norm": 0.10287349342814446,
"learning_rate": 0.001968275230338986,
"loss": 2.9499,
"step": 5455
},
{
"epoch": 1.7246308141830529,
"grad_norm": 0.09967919811875595,
"learning_rate": 0.001968137284205417,
"loss": 2.777,
"step": 5460
},
{
"epoch": 1.726210218747532,
"grad_norm": 0.10413546356378635,
"learning_rate": 0.001967999043667485,
"loss": 2.8979,
"step": 5465
},
{
"epoch": 1.7277896233120114,
"grad_norm": 0.1072432492489738,
"learning_rate": 0.001967860508767229,
"loss": 2.8283,
"step": 5470
},
{
"epoch": 1.7293690278764906,
"grad_norm": 0.09655611693814051,
"learning_rate": 0.001967721679546776,
"loss": 2.7948,
"step": 5475
},
{
"epoch": 1.7309484324409699,
"grad_norm": 0.08712375046387344,
"learning_rate": 0.001967582556048343,
"loss": 2.7723,
"step": 5480
},
{
"epoch": 1.732527837005449,
"grad_norm": 0.09225677250022102,
"learning_rate": 0.001967443138314237,
"loss": 2.8686,
"step": 5485
},
{
"epoch": 1.7341072415699281,
"grad_norm": 0.10393698580518194,
"learning_rate": 0.001967303426386853,
"loss": 2.9854,
"step": 5490
},
{
"epoch": 1.7356866461344074,
"grad_norm": 0.09691227894220021,
"learning_rate": 0.0019671634203086786,
"loss": 2.8644,
"step": 5495
},
{
"epoch": 1.7372660506988864,
"grad_norm": 0.08534322592873014,
"learning_rate": 0.0019670231201222867,
"loss": 2.814,
"step": 5500
},
{
"epoch": 1.7388454552633656,
"grad_norm": 0.11043464541805863,
"learning_rate": 0.0019668825258703426,
"loss": 2.9104,
"step": 5505
},
{
"epoch": 1.740424859827845,
"grad_norm": 0.11923335025448265,
"learning_rate": 0.0019667416375955997,
"loss": 2.8873,
"step": 5510
},
{
"epoch": 1.7420042643923241,
"grad_norm": 0.0944140055846587,
"learning_rate": 0.001966600455340902,
"loss": 2.8354,
"step": 5515
},
{
"epoch": 1.7435836689568034,
"grad_norm": 0.09289305803071748,
"learning_rate": 0.0019664589791491814,
"loss": 2.8526,
"step": 5520
},
{
"epoch": 1.7451630735212826,
"grad_norm": 0.10652799839126612,
"learning_rate": 0.00196631720906346,
"loss": 2.7941,
"step": 5525
},
{
"epoch": 1.7467424780857617,
"grad_norm": 0.08877425547680012,
"learning_rate": 0.0019661751451268495,
"loss": 2.7967,
"step": 5530
},
{
"epoch": 1.748321882650241,
"grad_norm": 0.09920541060889011,
"learning_rate": 0.00196603278738255,
"loss": 2.8083,
"step": 5535
},
{
"epoch": 1.74990128721472,
"grad_norm": 0.11134150673030374,
"learning_rate": 0.001965890135873852,
"loss": 2.7912,
"step": 5540
},
{
"epoch": 1.7514806917791992,
"grad_norm": 0.08671812361927851,
"learning_rate": 0.0019657471906441354,
"loss": 2.8215,
"step": 5545
},
{
"epoch": 1.7530600963436784,
"grad_norm": 0.09189414759023765,
"learning_rate": 0.0019656039517368684,
"loss": 2.848,
"step": 5550
},
{
"epoch": 1.7546395009081577,
"grad_norm": 0.08457665537333102,
"learning_rate": 0.0019654604191956093,
"loss": 2.8594,
"step": 5555
},
{
"epoch": 1.756218905472637,
"grad_norm": 0.08088442370962645,
"learning_rate": 0.0019653165930640045,
"loss": 2.858,
"step": 5560
},
{
"epoch": 1.757798310037116,
"grad_norm": 0.08221707893249937,
"learning_rate": 0.0019651724733857918,
"loss": 2.7932,
"step": 5565
},
{
"epoch": 1.7593777146015952,
"grad_norm": 0.10413482648930877,
"learning_rate": 0.0019650280602047966,
"loss": 2.8853,
"step": 5570
},
{
"epoch": 1.7609571191660742,
"grad_norm": 0.09223354313728577,
"learning_rate": 0.001964883353564934,
"loss": 2.8474,
"step": 5575
},
{
"epoch": 1.7625365237305535,
"grad_norm": 0.11170715489075345,
"learning_rate": 0.0019647383535102082,
"loss": 2.7924,
"step": 5580
},
{
"epoch": 1.7641159282950327,
"grad_norm": 0.10085002954995868,
"learning_rate": 0.0019645930600847134,
"loss": 2.8615,
"step": 5585
},
{
"epoch": 1.765695332859512,
"grad_norm": 0.13033848574238788,
"learning_rate": 0.0019644474733326316,
"loss": 2.8704,
"step": 5590
},
{
"epoch": 1.7672747374239912,
"grad_norm": 0.10626223537711284,
"learning_rate": 0.0019643015932982355,
"loss": 2.8016,
"step": 5595
},
{
"epoch": 1.7688541419884705,
"grad_norm": 0.09109802005589106,
"learning_rate": 0.0019641554200258856,
"loss": 2.8567,
"step": 5600
},
{
"epoch": 1.7704335465529495,
"grad_norm": 0.105574279290575,
"learning_rate": 0.0019640089535600327,
"loss": 2.8931,
"step": 5605
},
{
"epoch": 1.7720129511174287,
"grad_norm": 0.09271871358737761,
"learning_rate": 0.0019638621939452165,
"loss": 2.8455,
"step": 5610
},
{
"epoch": 1.7735923556819078,
"grad_norm": 0.08940779528973208,
"learning_rate": 0.001963715141226065,
"loss": 2.8497,
"step": 5615
},
{
"epoch": 1.775171760246387,
"grad_norm": 0.09151765966803481,
"learning_rate": 0.001963567795447297,
"loss": 2.7759,
"step": 5620
},
{
"epoch": 1.7767511648108663,
"grad_norm": 0.12790289749531383,
"learning_rate": 0.0019634201566537182,
"loss": 2.8067,
"step": 5625
},
{
"epoch": 1.7783305693753455,
"grad_norm": 0.09309013128028407,
"learning_rate": 0.0019632722248902256,
"loss": 2.8082,
"step": 5630
},
{
"epoch": 1.7799099739398248,
"grad_norm": 0.08581911812029219,
"learning_rate": 0.0019631240002018035,
"loss": 2.8794,
"step": 5635
},
{
"epoch": 1.781489378504304,
"grad_norm": 0.09431687919055587,
"learning_rate": 0.001962975482633527,
"loss": 2.8182,
"step": 5640
},
{
"epoch": 1.783068783068783,
"grad_norm": 0.10147779341824549,
"learning_rate": 0.001962826672230559,
"loss": 2.823,
"step": 5645
},
{
"epoch": 1.7846481876332623,
"grad_norm": 0.10002615650609308,
"learning_rate": 0.001962677569038151,
"loss": 2.8715,
"step": 5650
},
{
"epoch": 1.7862275921977413,
"grad_norm": 0.07803919999016076,
"learning_rate": 0.0019625281731016453,
"loss": 2.8076,
"step": 5655
},
{
"epoch": 1.7878069967622205,
"grad_norm": 0.09534400601107142,
"learning_rate": 0.001962378484466472,
"loss": 2.7778,
"step": 5660
},
{
"epoch": 1.7893864013266998,
"grad_norm": 0.0671038402986734,
"learning_rate": 0.0019622285031781505,
"loss": 2.8857,
"step": 5665
},
{
"epoch": 1.790965805891179,
"grad_norm": 0.07972258446393975,
"learning_rate": 0.001962078229282289,
"loss": 2.8196,
"step": 5670
},
{
"epoch": 1.7925452104556583,
"grad_norm": 0.09681179345298509,
"learning_rate": 0.0019619276628245843,
"loss": 2.825,
"step": 5675
},
{
"epoch": 1.7941246150201375,
"grad_norm": 0.10801872450890859,
"learning_rate": 0.0019617768038508237,
"loss": 2.903,
"step": 5680
},
{
"epoch": 1.7957040195846166,
"grad_norm": 0.09544098038024276,
"learning_rate": 0.0019616256524068823,
"loss": 2.8387,
"step": 5685
},
{
"epoch": 1.7972834241490958,
"grad_norm": 0.1320662539458633,
"learning_rate": 0.001961474208538723,
"loss": 2.9244,
"step": 5690
},
{
"epoch": 1.7988628287135748,
"grad_norm": 0.12796798547022245,
"learning_rate": 0.0019613224722924007,
"loss": 2.9441,
"step": 5695
},
{
"epoch": 1.800442233278054,
"grad_norm": 0.14013468102503435,
"learning_rate": 0.0019611704437140567,
"loss": 2.9045,
"step": 5700
},
{
"epoch": 1.8020216378425333,
"grad_norm": 0.10693907084566522,
"learning_rate": 0.0019610181228499218,
"loss": 2.8957,
"step": 5705
},
{
"epoch": 1.8036010424070126,
"grad_norm": 0.10501285453729153,
"learning_rate": 0.0019608655097463155,
"loss": 2.8295,
"step": 5710
},
{
"epoch": 1.8051804469714918,
"grad_norm": 0.08292288367824334,
"learning_rate": 0.0019607126044496473,
"loss": 2.8416,
"step": 5715
},
{
"epoch": 1.806759851535971,
"grad_norm": 0.08426166408478367,
"learning_rate": 0.001960559407006414,
"loss": 2.8757,
"step": 5720
},
{
"epoch": 1.80833925610045,
"grad_norm": 0.12618186204150836,
"learning_rate": 0.001960405917463202,
"loss": 2.8091,
"step": 5725
},
{
"epoch": 1.8099186606649293,
"grad_norm": 0.0979784528685163,
"learning_rate": 0.001960252135866687,
"loss": 2.7894,
"step": 5730
},
{
"epoch": 1.8114980652294084,
"grad_norm": 0.09225963447119595,
"learning_rate": 0.0019600980622636326,
"loss": 2.8949,
"step": 5735
},
{
"epoch": 1.8130774697938876,
"grad_norm": 0.0957347564504516,
"learning_rate": 0.001959943696700892,
"loss": 2.792,
"step": 5740
},
{
"epoch": 1.8146568743583669,
"grad_norm": 0.09816516235276534,
"learning_rate": 0.001959789039225406,
"loss": 2.7762,
"step": 5745
},
{
"epoch": 1.8162362789228461,
"grad_norm": 0.14832825727107027,
"learning_rate": 0.0019596340898842056,
"loss": 2.8905,
"step": 5750
},
{
"epoch": 1.8178156834873254,
"grad_norm": 0.10127269862435555,
"learning_rate": 0.00195947884872441,
"loss": 2.9231,
"step": 5755
},
{
"epoch": 1.8193950880518046,
"grad_norm": 0.10596226828499104,
"learning_rate": 0.0019593233157932264,
"loss": 2.8561,
"step": 5760
},
{
"epoch": 1.8209744926162836,
"grad_norm": 0.08437118924922375,
"learning_rate": 0.001959167491137952,
"loss": 2.934,
"step": 5765
},
{
"epoch": 1.8225538971807629,
"grad_norm": 0.084182086109871,
"learning_rate": 0.0019590113748059715,
"loss": 2.8071,
"step": 5770
},
{
"epoch": 1.824133301745242,
"grad_norm": 0.08334964947133998,
"learning_rate": 0.0019588549668447595,
"loss": 2.8028,
"step": 5775
},
{
"epoch": 1.8257127063097212,
"grad_norm": 0.10502427527644426,
"learning_rate": 0.0019586982673018786,
"loss": 2.8621,
"step": 5780
},
{
"epoch": 1.8272921108742004,
"grad_norm": 0.08792831410856203,
"learning_rate": 0.00195854127622498,
"loss": 2.7531,
"step": 5785
},
{
"epoch": 1.8288715154386797,
"grad_norm": 0.0879256838351816,
"learning_rate": 0.0019583839936618028,
"loss": 2.848,
"step": 5790
},
{
"epoch": 1.830450920003159,
"grad_norm": 0.08223569631478067,
"learning_rate": 0.001958226419660177,
"loss": 2.7847,
"step": 5795
},
{
"epoch": 1.8320303245676381,
"grad_norm": 0.08954888379371354,
"learning_rate": 0.001958068554268019,
"loss": 2.8953,
"step": 5800
},
{
"epoch": 1.8336097291321172,
"grad_norm": 0.08809234715104736,
"learning_rate": 0.001957910397533335,
"loss": 2.9025,
"step": 5805
},
{
"epoch": 1.8351891336965964,
"grad_norm": 0.09182459314591035,
"learning_rate": 0.0019577519495042194,
"loss": 2.8099,
"step": 5810
},
{
"epoch": 1.8367685382610754,
"grad_norm": 0.0802234998272935,
"learning_rate": 0.0019575932102288553,
"loss": 2.8088,
"step": 5815
},
{
"epoch": 1.8383479428255547,
"grad_norm": 0.08500552989609395,
"learning_rate": 0.0019574341797555144,
"loss": 2.8726,
"step": 5820
},
{
"epoch": 1.839927347390034,
"grad_norm": 0.08692276443581538,
"learning_rate": 0.001957274858132556,
"loss": 2.8033,
"step": 5825
},
{
"epoch": 1.8415067519545132,
"grad_norm": 0.08476049437607958,
"learning_rate": 0.00195711524540843,
"loss": 2.8139,
"step": 5830
},
{
"epoch": 1.8430861565189924,
"grad_norm": 0.09529065856387889,
"learning_rate": 0.0019569553416316724,
"loss": 2.7915,
"step": 5835
},
{
"epoch": 1.8446655610834717,
"grad_norm": 0.10095333087758838,
"learning_rate": 0.0019567951468509102,
"loss": 2.8123,
"step": 5840
},
{
"epoch": 1.8462449656479507,
"grad_norm": 0.10951937044106012,
"learning_rate": 0.001956634661114857,
"loss": 2.7474,
"step": 5845
},
{
"epoch": 1.84782437021243,
"grad_norm": 0.12812453264923507,
"learning_rate": 0.001956473884472315,
"loss": 2.8927,
"step": 5850
},
{
"epoch": 1.849403774776909,
"grad_norm": 0.08917260558020511,
"learning_rate": 0.001956312816972176,
"loss": 2.767,
"step": 5855
},
{
"epoch": 1.8509831793413882,
"grad_norm": 0.09192768008047074,
"learning_rate": 0.00195615145866342,
"loss": 2.8303,
"step": 5860
},
{
"epoch": 1.8525625839058675,
"grad_norm": 0.08903746231326805,
"learning_rate": 0.0019559898095951136,
"loss": 2.7969,
"step": 5865
},
{
"epoch": 1.8541419884703467,
"grad_norm": 0.08198358367157664,
"learning_rate": 0.0019558278698164145,
"loss": 2.8563,
"step": 5870
},
{
"epoch": 1.855721393034826,
"grad_norm": 0.09826949164597297,
"learning_rate": 0.001955665639376567,
"loss": 2.9417,
"step": 5875
},
{
"epoch": 1.8573007975993052,
"grad_norm": 0.097023198766505,
"learning_rate": 0.0019555031183249045,
"loss": 2.9257,
"step": 5880
},
{
"epoch": 1.8588802021637842,
"grad_norm": 0.09128928830765581,
"learning_rate": 0.001955340306710849,
"loss": 2.8324,
"step": 5885
},
{
"epoch": 1.8604596067282635,
"grad_norm": 0.08445857191549251,
"learning_rate": 0.0019551772045839095,
"loss": 2.6883,
"step": 5890
},
{
"epoch": 1.8620390112927425,
"grad_norm": 0.07992959206633901,
"learning_rate": 0.0019550138119936848,
"loss": 2.876,
"step": 5895
},
{
"epoch": 1.8636184158572218,
"grad_norm": 0.0718896788646997,
"learning_rate": 0.001954850128989862,
"loss": 2.7931,
"step": 5900
},
{
"epoch": 1.865197820421701,
"grad_norm": 0.06519170978053945,
"learning_rate": 0.001954686155622216,
"loss": 2.8115,
"step": 5905
},
{
"epoch": 1.8667772249861803,
"grad_norm": 0.09720596950849911,
"learning_rate": 0.0019545218919406093,
"loss": 2.7797,
"step": 5910
},
{
"epoch": 1.8683566295506595,
"grad_norm": 0.12678939451280705,
"learning_rate": 0.001954357337994994,
"loss": 2.9284,
"step": 5915
},
{
"epoch": 1.8699360341151388,
"grad_norm": 0.12164911465261,
"learning_rate": 0.0019541924938354096,
"loss": 2.7888,
"step": 5920
},
{
"epoch": 1.8715154386796178,
"grad_norm": 0.12786708775383082,
"learning_rate": 0.001954027359511984,
"loss": 2.8952,
"step": 5925
},
{
"epoch": 1.873094843244097,
"grad_norm": 0.08646118211188386,
"learning_rate": 0.0019538619350749345,
"loss": 2.7687,
"step": 5930
},
{
"epoch": 1.874674247808576,
"grad_norm": 0.08618728690802949,
"learning_rate": 0.0019536962205745647,
"loss": 2.7424,
"step": 5935
},
{
"epoch": 1.8762536523730553,
"grad_norm": 0.07380398698166094,
"learning_rate": 0.001953530216061267,
"loss": 2.7674,
"step": 5940
},
{
"epoch": 1.8778330569375346,
"grad_norm": 0.13229423018200095,
"learning_rate": 0.0019533639215855237,
"loss": 2.7864,
"step": 5945
},
{
"epoch": 1.8794124615020138,
"grad_norm": 0.11409026363699672,
"learning_rate": 0.0019531973371979027,
"loss": 2.9098,
"step": 5950
},
{
"epoch": 1.880991866066493,
"grad_norm": 0.10927764439517655,
"learning_rate": 0.0019530304629490618,
"loss": 2.784,
"step": 5955
},
{
"epoch": 1.882571270630972,
"grad_norm": 0.08758344666216031,
"learning_rate": 0.0019528632988897458,
"loss": 2.7787,
"step": 5960
},
{
"epoch": 1.8841506751954513,
"grad_norm": 0.08989382431986725,
"learning_rate": 0.001952695845070789,
"loss": 2.8563,
"step": 5965
},
{
"epoch": 1.8857300797599303,
"grad_norm": 0.11095653749941818,
"learning_rate": 0.0019525281015431127,
"loss": 2.8349,
"step": 5970
},
{
"epoch": 1.8873094843244096,
"grad_norm": 0.09356354029199736,
"learning_rate": 0.0019523600683577264,
"loss": 2.772,
"step": 5975
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.09802457615928181,
"learning_rate": 0.001952191745565728,
"loss": 2.8127,
"step": 5980
},
{
"epoch": 1.890468293453368,
"grad_norm": 0.09130936925810829,
"learning_rate": 0.0019520231332183036,
"loss": 2.8453,
"step": 5985
},
{
"epoch": 1.8920476980178473,
"grad_norm": 0.08809660646950714,
"learning_rate": 0.001951854231366727,
"loss": 2.8222,
"step": 5990
},
{
"epoch": 1.8936271025823266,
"grad_norm": 0.11070882861672758,
"learning_rate": 0.0019516850400623604,
"loss": 2.7529,
"step": 5995
},
{
"epoch": 1.8952065071468056,
"grad_norm": 0.14618290547433194,
"learning_rate": 0.0019515155593566535,
"loss": 2.7643,
"step": 6000
},
{
"epoch": 1.8967859117112849,
"grad_norm": 0.1279958619359713,
"learning_rate": 0.0019513457893011444,
"loss": 2.7829,
"step": 6005
},
{
"epoch": 1.8983653162757639,
"grad_norm": 0.10863296125619862,
"learning_rate": 0.0019511757299474591,
"loss": 2.861,
"step": 6010
},
{
"epoch": 1.8999447208402431,
"grad_norm": 0.07358786598532395,
"learning_rate": 0.0019510053813473114,
"loss": 2.8414,
"step": 6015
},
{
"epoch": 1.9015241254047224,
"grad_norm": 0.07582738144151509,
"learning_rate": 0.0019508347435525037,
"loss": 2.8013,
"step": 6020
},
{
"epoch": 1.9031035299692016,
"grad_norm": 0.09336565960832502,
"learning_rate": 0.001950663816614925,
"loss": 2.9326,
"step": 6025
},
{
"epoch": 1.9046829345336809,
"grad_norm": 0.10824071803830503,
"learning_rate": 0.001950492600586554,
"loss": 2.8029,
"step": 6030
},
{
"epoch": 1.9062623390981601,
"grad_norm": 0.08575271033474953,
"learning_rate": 0.001950321095519456,
"loss": 2.8342,
"step": 6035
},
{
"epoch": 1.9078417436626391,
"grad_norm": 0.1018277175039707,
"learning_rate": 0.0019501493014657846,
"loss": 2.8839,
"step": 6040
},
{
"epoch": 1.9094211482271184,
"grad_norm": 0.09328347992445929,
"learning_rate": 0.0019499772184777813,
"loss": 2.8144,
"step": 6045
},
{
"epoch": 1.9110005527915974,
"grad_norm": 0.10485085743520843,
"learning_rate": 0.0019498048466077753,
"loss": 2.86,
"step": 6050
},
{
"epoch": 1.9125799573560767,
"grad_norm": 0.0955680047571786,
"learning_rate": 0.0019496321859081842,
"loss": 2.8175,
"step": 6055
},
{
"epoch": 1.914159361920556,
"grad_norm": 0.09273907882866253,
"learning_rate": 0.0019494592364315126,
"loss": 2.8755,
"step": 6060
},
{
"epoch": 1.9157387664850352,
"grad_norm": 0.07407064110854321,
"learning_rate": 0.0019492859982303532,
"loss": 2.9163,
"step": 6065
},
{
"epoch": 1.9173181710495144,
"grad_norm": 0.0772395616577009,
"learning_rate": 0.0019491124713573874,
"loss": 2.7888,
"step": 6070
},
{
"epoch": 1.9188975756139937,
"grad_norm": 0.07922545919001302,
"learning_rate": 0.0019489386558653827,
"loss": 2.7501,
"step": 6075
},
{
"epoch": 1.9204769801784727,
"grad_norm": 0.08135294434296457,
"learning_rate": 0.0019487645518071958,
"loss": 2.8935,
"step": 6080
},
{
"epoch": 1.922056384742952,
"grad_norm": 0.08030592271812116,
"learning_rate": 0.0019485901592357707,
"loss": 2.8497,
"step": 6085
},
{
"epoch": 1.923635789307431,
"grad_norm": 0.08200182929395594,
"learning_rate": 0.0019484154782041388,
"loss": 2.6893,
"step": 6090
},
{
"epoch": 1.9252151938719102,
"grad_norm": 0.08130310335267442,
"learning_rate": 0.0019482405087654193,
"loss": 2.8332,
"step": 6095
},
{
"epoch": 1.9267945984363894,
"grad_norm": 0.07548347105029962,
"learning_rate": 0.00194806525097282,
"loss": 2.7958,
"step": 6100
},
{
"epoch": 1.9283740030008687,
"grad_norm": 0.09933339001977781,
"learning_rate": 0.0019478897048796349,
"loss": 2.8233,
"step": 6105
},
{
"epoch": 1.929953407565348,
"grad_norm": 0.08574779037700413,
"learning_rate": 0.0019477138705392468,
"loss": 2.7458,
"step": 6110
},
{
"epoch": 1.9315328121298272,
"grad_norm": 0.08066906047062622,
"learning_rate": 0.001947537748005126,
"loss": 2.8612,
"step": 6115
},
{
"epoch": 1.9331122166943062,
"grad_norm": 0.09598145548735948,
"learning_rate": 0.0019473613373308298,
"loss": 2.7525,
"step": 6120
},
{
"epoch": 1.9346916212587855,
"grad_norm": 0.07458977511454752,
"learning_rate": 0.001947184638570004,
"loss": 2.8079,
"step": 6125
},
{
"epoch": 1.9362710258232645,
"grad_norm": 0.09282378084023518,
"learning_rate": 0.001947007651776381,
"loss": 2.7308,
"step": 6130
},
{
"epoch": 1.9378504303877437,
"grad_norm": 0.08151088064882638,
"learning_rate": 0.001946830377003782,
"loss": 2.7715,
"step": 6135
},
{
"epoch": 1.939429834952223,
"grad_norm": 0.08967279639477138,
"learning_rate": 0.0019466528143061148,
"loss": 2.8798,
"step": 6140
},
{
"epoch": 1.9410092395167022,
"grad_norm": 0.08014834943720231,
"learning_rate": 0.0019464749637373752,
"loss": 2.8762,
"step": 6145
},
{
"epoch": 1.9425886440811815,
"grad_norm": 0.1015957088000604,
"learning_rate": 0.0019462968253516459,
"loss": 2.7706,
"step": 6150
},
{
"epoch": 1.9441680486456607,
"grad_norm": 0.08424020119718752,
"learning_rate": 0.0019461183992030985,
"loss": 2.8594,
"step": 6155
},
{
"epoch": 1.9457474532101398,
"grad_norm": 0.07337585037431,
"learning_rate": 0.0019459396853459905,
"loss": 2.8301,
"step": 6160
},
{
"epoch": 1.947326857774619,
"grad_norm": 0.0832795576496703,
"learning_rate": 0.001945760683834668,
"loss": 2.818,
"step": 6165
},
{
"epoch": 1.948906262339098,
"grad_norm": 0.0857588105421658,
"learning_rate": 0.0019455813947235644,
"loss": 2.8158,
"step": 6170
},
{
"epoch": 1.9504856669035773,
"grad_norm": 0.0784707318258361,
"learning_rate": 0.0019454018180672002,
"loss": 2.8325,
"step": 6175
},
{
"epoch": 1.9520650714680565,
"grad_norm": 0.08506714452247786,
"learning_rate": 0.0019452219539201829,
"loss": 2.8065,
"step": 6180
},
{
"epoch": 1.9536444760325358,
"grad_norm": 0.1189752375053285,
"learning_rate": 0.0019450418023372093,
"loss": 2.8073,
"step": 6185
},
{
"epoch": 1.955223880597015,
"grad_norm": 0.0997349870460688,
"learning_rate": 0.0019448613633730614,
"loss": 2.7286,
"step": 6190
},
{
"epoch": 1.9568032851614943,
"grad_norm": 0.09809781879864145,
"learning_rate": 0.0019446806370826098,
"loss": 2.8428,
"step": 6195
},
{
"epoch": 1.9583826897259733,
"grad_norm": 0.09740558269055588,
"learning_rate": 0.001944499623520812,
"loss": 2.7367,
"step": 6200
},
{
"epoch": 1.9599620942904525,
"grad_norm": 0.09927752259372388,
"learning_rate": 0.0019443183227427134,
"loss": 2.8624,
"step": 6205
},
{
"epoch": 1.9615414988549316,
"grad_norm": 0.09389302280465255,
"learning_rate": 0.0019441367348034461,
"loss": 2.8001,
"step": 6210
},
{
"epoch": 1.9631209034194108,
"grad_norm": 0.1420223065796786,
"learning_rate": 0.0019439548597582302,
"loss": 2.8459,
"step": 6215
},
{
"epoch": 1.96470030798389,
"grad_norm": 0.09125430607116715,
"learning_rate": 0.0019437726976623726,
"loss": 2.7651,
"step": 6220
},
{
"epoch": 1.9662797125483693,
"grad_norm": 0.09225743396647654,
"learning_rate": 0.0019435902485712676,
"loss": 2.7648,
"step": 6225
},
{
"epoch": 1.9678591171128486,
"grad_norm": 0.10098194753518079,
"learning_rate": 0.0019434075125403965,
"loss": 2.8095,
"step": 6230
},
{
"epoch": 1.9694385216773278,
"grad_norm": 0.08996164948245026,
"learning_rate": 0.0019432244896253287,
"loss": 2.8509,
"step": 6235
},
{
"epoch": 1.9710179262418068,
"grad_norm": 0.08739058653092374,
"learning_rate": 0.0019430411798817197,
"loss": 2.733,
"step": 6240
},
{
"epoch": 1.972597330806286,
"grad_norm": 0.08466777415280602,
"learning_rate": 0.0019428575833653134,
"loss": 2.7618,
"step": 6245
},
{
"epoch": 1.974176735370765,
"grad_norm": 0.0879874293503665,
"learning_rate": 0.00194267370013194,
"loss": 2.8032,
"step": 6250
},
{
"epoch": 1.9757561399352443,
"grad_norm": 0.08491973521692751,
"learning_rate": 0.0019424895302375177,
"loss": 2.8203,
"step": 6255
},
{
"epoch": 1.9773355444997236,
"grad_norm": 0.09233997552168374,
"learning_rate": 0.0019423050737380505,
"loss": 2.7658,
"step": 6260
},
{
"epoch": 1.9789149490642028,
"grad_norm": 0.08341466289063551,
"learning_rate": 0.0019421203306896311,
"loss": 2.8405,
"step": 6265
},
{
"epoch": 1.980494353628682,
"grad_norm": 0.09675894067841306,
"learning_rate": 0.0019419353011484385,
"loss": 2.8052,
"step": 6270
},
{
"epoch": 1.9820737581931613,
"grad_norm": 0.13064086882653447,
"learning_rate": 0.001941749985170739,
"loss": 2.8245,
"step": 6275
},
{
"epoch": 1.9836531627576404,
"grad_norm": 0.09444034799577457,
"learning_rate": 0.001941564382812886,
"loss": 2.8968,
"step": 6280
},
{
"epoch": 1.9852325673221196,
"grad_norm": 0.0874316229946107,
"learning_rate": 0.0019413784941313202,
"loss": 2.7382,
"step": 6285
},
{
"epoch": 1.9868119718865986,
"grad_norm": 0.09345391189400902,
"learning_rate": 0.0019411923191825686,
"loss": 2.829,
"step": 6290
},
{
"epoch": 1.9883913764510779,
"grad_norm": 0.09191899691273726,
"learning_rate": 0.0019410058580232464,
"loss": 2.8228,
"step": 6295
},
{
"epoch": 1.9899707810155571,
"grad_norm": 0.08165945666933225,
"learning_rate": 0.0019408191107100552,
"loss": 2.726,
"step": 6300
},
{
"epoch": 1.9915501855800364,
"grad_norm": 0.0881495686780152,
"learning_rate": 0.0019406320772997832,
"loss": 2.7863,
"step": 6305
},
{
"epoch": 1.9931295901445156,
"grad_norm": 0.07429023971824186,
"learning_rate": 0.0019404447578493062,
"loss": 2.7083,
"step": 6310
},
{
"epoch": 1.9947089947089947,
"grad_norm": 0.08992601670792755,
"learning_rate": 0.0019402571524155877,
"loss": 2.7003,
"step": 6315
},
{
"epoch": 1.996288399273474,
"grad_norm": 0.09666646843109974,
"learning_rate": 0.001940069261055676,
"loss": 2.8079,
"step": 6320
},
{
"epoch": 1.997867803837953,
"grad_norm": 0.08191896132889992,
"learning_rate": 0.0019398810838267084,
"loss": 2.8366,
"step": 6325
},
{
"epoch": 1.9994472084024322,
"grad_norm": 0.08391348675660519,
"learning_rate": 0.0019396926207859084,
"loss": 2.825,
"step": 6330
},
{
"epoch": 2.0,
"eval_loss": 2.780498743057251,
"eval_runtime": 118.3809,
"eval_samples_per_second": 22.377,
"eval_steps_per_second": 5.601,
"step": 6332
},
{
"epoch": 2.0009476427386876,
"grad_norm": 0.09575742921571473,
"learning_rate": 0.0019395038719905863,
"loss": 2.8089,
"step": 6335
},
{
"epoch": 2.002527047303167,
"grad_norm": 0.09475021327498588,
"learning_rate": 0.0019393148374981393,
"loss": 2.7499,
"step": 6340
},
{
"epoch": 2.0041064518676457,
"grad_norm": 0.09332000534038609,
"learning_rate": 0.0019391255173660516,
"loss": 2.8389,
"step": 6345
},
{
"epoch": 2.005685856432125,
"grad_norm": 0.10298028032627798,
"learning_rate": 0.0019389359116518943,
"loss": 2.748,
"step": 6350
},
{
"epoch": 2.007265260996604,
"grad_norm": 0.08145562501910214,
"learning_rate": 0.0019387460204133254,
"loss": 2.6825,
"step": 6355
},
{
"epoch": 2.0088446655610834,
"grad_norm": 0.07447178719663891,
"learning_rate": 0.0019385558437080897,
"loss": 2.778,
"step": 6360
},
{
"epoch": 2.0104240701255627,
"grad_norm": 0.0927632838786091,
"learning_rate": 0.0019383653815940184,
"loss": 2.7132,
"step": 6365
},
{
"epoch": 2.012003474690042,
"grad_norm": 0.09299068690281939,
"learning_rate": 0.0019381746341290299,
"loss": 2.7509,
"step": 6370
},
{
"epoch": 2.013582879254521,
"grad_norm": 0.07410954144318185,
"learning_rate": 0.001937983601371129,
"loss": 2.7203,
"step": 6375
},
{
"epoch": 2.0151622838190004,
"grad_norm": 0.08505430947007388,
"learning_rate": 0.0019377922833784082,
"loss": 2.885,
"step": 6380
},
{
"epoch": 2.0167416883834792,
"grad_norm": 0.07506113878222667,
"learning_rate": 0.0019376006802090458,
"loss": 2.7818,
"step": 6385
},
{
"epoch": 2.0183210929479585,
"grad_norm": 0.07410981032228242,
"learning_rate": 0.0019374087919213068,
"loss": 2.8087,
"step": 6390
},
{
"epoch": 2.0199004975124377,
"grad_norm": 0.07533333285267724,
"learning_rate": 0.0019372166185735436,
"loss": 2.9084,
"step": 6395
},
{
"epoch": 2.021479902076917,
"grad_norm": 0.08729901810896008,
"learning_rate": 0.0019370241602241949,
"loss": 2.7977,
"step": 6400
},
{
"epoch": 2.023059306641396,
"grad_norm": 0.06778399060340229,
"learning_rate": 0.0019368314169317855,
"loss": 2.7203,
"step": 6405
},
{
"epoch": 2.0246387112058755,
"grad_norm": 0.07716673460749471,
"learning_rate": 0.001936638388754928,
"loss": 2.7924,
"step": 6410
},
{
"epoch": 2.0262181157703547,
"grad_norm": 0.08371038858709097,
"learning_rate": 0.0019364450757523208,
"loss": 2.7855,
"step": 6415
},
{
"epoch": 2.027797520334834,
"grad_norm": 0.10523104303606545,
"learning_rate": 0.0019362514779827495,
"loss": 2.7961,
"step": 6420
},
{
"epoch": 2.0293769248993128,
"grad_norm": 0.09169356093760239,
"learning_rate": 0.0019360575955050853,
"loss": 2.7656,
"step": 6425
},
{
"epoch": 2.030956329463792,
"grad_norm": 0.08391763073861802,
"learning_rate": 0.0019358634283782867,
"loss": 2.7048,
"step": 6430
},
{
"epoch": 2.0325357340282713,
"grad_norm": 0.0739510810514508,
"learning_rate": 0.0019356689766613993,
"loss": 2.7977,
"step": 6435
},
{
"epoch": 2.0341151385927505,
"grad_norm": 0.10007494348550869,
"learning_rate": 0.001935474240413554,
"loss": 2.8089,
"step": 6440
},
{
"epoch": 2.0356945431572298,
"grad_norm": 0.09409379273580709,
"learning_rate": 0.0019352792196939694,
"loss": 2.7812,
"step": 6445
},
{
"epoch": 2.037273947721709,
"grad_norm": 0.08934297438992823,
"learning_rate": 0.0019350839145619496,
"loss": 2.8069,
"step": 6450
},
{
"epoch": 2.0388533522861882,
"grad_norm": 0.08802671161638972,
"learning_rate": 0.0019348883250768858,
"loss": 2.7853,
"step": 6455
},
{
"epoch": 2.0404327568506675,
"grad_norm": 0.0865003277963274,
"learning_rate": 0.0019346924512982555,
"loss": 2.7491,
"step": 6460
},
{
"epoch": 2.0420121614151463,
"grad_norm": 0.07928593456326218,
"learning_rate": 0.0019344962932856227,
"loss": 2.7655,
"step": 6465
},
{
"epoch": 2.0435915659796255,
"grad_norm": 0.0830289002304728,
"learning_rate": 0.0019342998510986377,
"loss": 2.7275,
"step": 6470
},
{
"epoch": 2.045170970544105,
"grad_norm": 0.07527174943423653,
"learning_rate": 0.0019341031247970375,
"loss": 2.7265,
"step": 6475
},
{
"epoch": 2.046750375108584,
"grad_norm": 0.07589622127882759,
"learning_rate": 0.0019339061144406453,
"loss": 2.863,
"step": 6480
},
{
"epoch": 2.0483297796730633,
"grad_norm": 0.08128329962885099,
"learning_rate": 0.0019337088200893705,
"loss": 2.7377,
"step": 6485
},
{
"epoch": 2.0499091842375425,
"grad_norm": 0.09961372451168636,
"learning_rate": 0.0019335112418032091,
"loss": 2.8205,
"step": 6490
},
{
"epoch": 2.051488588802022,
"grad_norm": 0.06696727578829288,
"learning_rate": 0.0019333133796422435,
"loss": 2.7783,
"step": 6495
},
{
"epoch": 2.053067993366501,
"grad_norm": 0.08682450943383599,
"learning_rate": 0.001933115233666642,
"loss": 2.9212,
"step": 6500
},
{
"epoch": 2.05464739793098,
"grad_norm": 0.09446946892027293,
"learning_rate": 0.00193291680393666,
"loss": 2.7808,
"step": 6505
},
{
"epoch": 2.056226802495459,
"grad_norm": 0.10711069375479783,
"learning_rate": 0.0019327180905126386,
"loss": 2.7313,
"step": 6510
},
{
"epoch": 2.0578062070599383,
"grad_norm": 0.09875559420662403,
"learning_rate": 0.0019325190934550047,
"loss": 2.7998,
"step": 6515
},
{
"epoch": 2.0593856116244176,
"grad_norm": 0.10038130097656887,
"learning_rate": 0.001932319812824273,
"loss": 2.7825,
"step": 6520
},
{
"epoch": 2.060965016188897,
"grad_norm": 0.08287967698391464,
"learning_rate": 0.0019321202486810428,
"loss": 2.7155,
"step": 6525
},
{
"epoch": 2.062544420753376,
"grad_norm": 0.07994162592288757,
"learning_rate": 0.0019319204010860005,
"loss": 2.7948,
"step": 6530
},
{
"epoch": 2.0641238253178553,
"grad_norm": 0.08453083899950711,
"learning_rate": 0.0019317202700999184,
"loss": 2.8841,
"step": 6535
},
{
"epoch": 2.0657032298823346,
"grad_norm": 0.10125131581986914,
"learning_rate": 0.0019315198557836553,
"loss": 2.8616,
"step": 6540
},
{
"epoch": 2.0672826344468134,
"grad_norm": 0.08479114783412135,
"learning_rate": 0.0019313191581981552,
"loss": 2.8251,
"step": 6545
},
{
"epoch": 2.0688620390112926,
"grad_norm": 0.0843902017175401,
"learning_rate": 0.00193111817740445,
"loss": 2.7163,
"step": 6550
},
{
"epoch": 2.070441443575772,
"grad_norm": 0.08435827895842664,
"learning_rate": 0.0019309169134636558,
"loss": 2.7285,
"step": 6555
},
{
"epoch": 2.072020848140251,
"grad_norm": 0.08523910283562491,
"learning_rate": 0.0019307153664369762,
"loss": 2.8544,
"step": 6560
},
{
"epoch": 2.0736002527047304,
"grad_norm": 0.09297632793559693,
"learning_rate": 0.0019305135363857,
"loss": 2.807,
"step": 6565
},
{
"epoch": 2.0751796572692096,
"grad_norm": 0.08500085833146576,
"learning_rate": 0.0019303114233712028,
"loss": 2.8476,
"step": 6570
},
{
"epoch": 2.076759061833689,
"grad_norm": 0.08830857153484742,
"learning_rate": 0.0019301090274549454,
"loss": 2.7331,
"step": 6575
},
{
"epoch": 2.078338466398168,
"grad_norm": 0.09386122618388641,
"learning_rate": 0.0019299063486984756,
"loss": 2.811,
"step": 6580
},
{
"epoch": 2.079917870962647,
"grad_norm": 0.08350414162315721,
"learning_rate": 0.0019297033871634264,
"loss": 2.857,
"step": 6585
},
{
"epoch": 2.081497275527126,
"grad_norm": 0.08295216253089162,
"learning_rate": 0.0019295001429115173,
"loss": 2.7419,
"step": 6590
},
{
"epoch": 2.0830766800916054,
"grad_norm": 0.0869171272650609,
"learning_rate": 0.0019292966160045536,
"loss": 2.7898,
"step": 6595
},
{
"epoch": 2.0846560846560847,
"grad_norm": 0.09241253732508625,
"learning_rate": 0.001929092806504426,
"loss": 2.6644,
"step": 6600
},
{
"epoch": 2.086235489220564,
"grad_norm": 0.08196169328886187,
"learning_rate": 0.0019288887144731125,
"loss": 2.7326,
"step": 6605
},
{
"epoch": 2.087814893785043,
"grad_norm": 0.09264153484001544,
"learning_rate": 0.0019286843399726754,
"loss": 2.7017,
"step": 6610
},
{
"epoch": 2.0893942983495224,
"grad_norm": 0.08433629884904234,
"learning_rate": 0.0019284796830652642,
"loss": 2.7395,
"step": 6615
},
{
"epoch": 2.0909737029140016,
"grad_norm": 0.08668778961859433,
"learning_rate": 0.0019282747438131135,
"loss": 2.7928,
"step": 6620
},
{
"epoch": 2.0925531074784804,
"grad_norm": 0.08001395369320938,
"learning_rate": 0.0019280695222785443,
"loss": 2.7409,
"step": 6625
},
{
"epoch": 2.0941325120429597,
"grad_norm": 0.11753845536111973,
"learning_rate": 0.0019278640185239628,
"loss": 2.7326,
"step": 6630
},
{
"epoch": 2.095711916607439,
"grad_norm": 0.11852435357562266,
"learning_rate": 0.001927658232611862,
"loss": 2.8578,
"step": 6635
},
{
"epoch": 2.097291321171918,
"grad_norm": 0.09027187736266508,
"learning_rate": 0.001927452164604819,
"loss": 2.7912,
"step": 6640
},
{
"epoch": 2.0988707257363974,
"grad_norm": 0.09429557482319734,
"learning_rate": 0.0019272458145654988,
"loss": 2.7154,
"step": 6645
},
{
"epoch": 2.1004501303008767,
"grad_norm": 0.09542397456612339,
"learning_rate": 0.0019270391825566508,
"loss": 2.7675,
"step": 6650
},
{
"epoch": 2.102029534865356,
"grad_norm": 0.1113955713106657,
"learning_rate": 0.0019268322686411099,
"loss": 2.7859,
"step": 6655
},
{
"epoch": 2.1036089394298347,
"grad_norm": 0.11053959264725198,
"learning_rate": 0.0019266250728817984,
"loss": 2.8418,
"step": 6660
},
{
"epoch": 2.105188343994314,
"grad_norm": 0.10237308436161732,
"learning_rate": 0.0019264175953417222,
"loss": 2.8229,
"step": 6665
},
{
"epoch": 2.1067677485587932,
"grad_norm": 0.11394409500405601,
"learning_rate": 0.0019262098360839745,
"loss": 2.8163,
"step": 6670
},
{
"epoch": 2.1083471531232725,
"grad_norm": 0.08096007436211594,
"learning_rate": 0.0019260017951717332,
"loss": 2.7214,
"step": 6675
},
{
"epoch": 2.1099265576877517,
"grad_norm": 0.07284434295650646,
"learning_rate": 0.0019257934726682627,
"loss": 2.8227,
"step": 6680
},
{
"epoch": 2.111505962252231,
"grad_norm": 0.08167810722420875,
"learning_rate": 0.001925584868636912,
"loss": 2.6956,
"step": 6685
},
{
"epoch": 2.11308536681671,
"grad_norm": 0.09417870744998465,
"learning_rate": 0.0019253759831411165,
"loss": 2.7186,
"step": 6690
},
{
"epoch": 2.1146647713811895,
"grad_norm": 0.11213015574372565,
"learning_rate": 0.001925166816244397,
"loss": 2.7526,
"step": 6695
},
{
"epoch": 2.1162441759456683,
"grad_norm": 0.10756824240366811,
"learning_rate": 0.0019249573680103595,
"loss": 2.8488,
"step": 6700
},
{
"epoch": 2.1178235805101475,
"grad_norm": 0.08711495340656947,
"learning_rate": 0.0019247476385026961,
"loss": 2.7033,
"step": 6705
},
{
"epoch": 2.1194029850746268,
"grad_norm": 0.07179842865878266,
"learning_rate": 0.0019245376277851846,
"loss": 2.7082,
"step": 6710
},
{
"epoch": 2.120982389639106,
"grad_norm": 0.09467353597491755,
"learning_rate": 0.0019243273359216872,
"loss": 2.695,
"step": 6715
},
{
"epoch": 2.1225617942035853,
"grad_norm": 0.08034724022561057,
"learning_rate": 0.0019241167629761528,
"loss": 2.7169,
"step": 6720
},
{
"epoch": 2.1241411987680645,
"grad_norm": 0.08242354846761851,
"learning_rate": 0.001923905909012615,
"loss": 2.8581,
"step": 6725
},
{
"epoch": 2.1257206033325438,
"grad_norm": 0.07521641516870188,
"learning_rate": 0.0019236947740951932,
"loss": 2.7248,
"step": 6730
},
{
"epoch": 2.127300007897023,
"grad_norm": 0.09292490715429318,
"learning_rate": 0.0019234833582880923,
"loss": 2.761,
"step": 6735
},
{
"epoch": 2.128879412461502,
"grad_norm": 0.08571654691757924,
"learning_rate": 0.0019232716616556025,
"loss": 2.7784,
"step": 6740
},
{
"epoch": 2.130458817025981,
"grad_norm": 0.09267628020872272,
"learning_rate": 0.0019230596842620994,
"loss": 2.6894,
"step": 6745
},
{
"epoch": 2.1320382215904603,
"grad_norm": 0.09786426971579432,
"learning_rate": 0.001922847426172044,
"loss": 2.8413,
"step": 6750
},
{
"epoch": 2.1336176261549396,
"grad_norm": 0.07894207352301423,
"learning_rate": 0.001922634887449982,
"loss": 2.767,
"step": 6755
},
{
"epoch": 2.135197030719419,
"grad_norm": 0.07560782993568409,
"learning_rate": 0.0019224220681605462,
"loss": 2.7604,
"step": 6760
},
{
"epoch": 2.136776435283898,
"grad_norm": 0.08052217048798144,
"learning_rate": 0.0019222089683684528,
"loss": 2.742,
"step": 6765
},
{
"epoch": 2.1383558398483773,
"grad_norm": 0.0766530719173359,
"learning_rate": 0.001921995588138504,
"loss": 2.785,
"step": 6770
},
{
"epoch": 2.1399352444128565,
"grad_norm": 0.08226433057004057,
"learning_rate": 0.001921781927535588,
"loss": 2.8312,
"step": 6775
},
{
"epoch": 2.1415146489773353,
"grad_norm": 0.08154192567298578,
"learning_rate": 0.001921567986624677,
"loss": 2.6801,
"step": 6780
},
{
"epoch": 2.1430940535418146,
"grad_norm": 0.09302973546737621,
"learning_rate": 0.0019213537654708297,
"loss": 2.7382,
"step": 6785
},
{
"epoch": 2.144673458106294,
"grad_norm": 0.09581085650749972,
"learning_rate": 0.001921139264139189,
"loss": 2.7129,
"step": 6790
},
{
"epoch": 2.146252862670773,
"grad_norm": 0.07855147333701644,
"learning_rate": 0.001920924482694983,
"loss": 2.7476,
"step": 6795
},
{
"epoch": 2.1478322672352523,
"grad_norm": 0.07584347975072561,
"learning_rate": 0.0019207094212035259,
"loss": 2.7737,
"step": 6800
},
{
"epoch": 2.1494116717997316,
"grad_norm": 0.07390332512105707,
"learning_rate": 0.0019204940797302164,
"loss": 2.6535,
"step": 6805
},
{
"epoch": 2.150991076364211,
"grad_norm": 0.08542053475529204,
"learning_rate": 0.0019202784583405386,
"loss": 2.79,
"step": 6810
},
{
"epoch": 2.15257048092869,
"grad_norm": 0.0924217776712588,
"learning_rate": 0.0019200625571000613,
"loss": 2.7868,
"step": 6815
},
{
"epoch": 2.154149885493169,
"grad_norm": 0.0744042476621044,
"learning_rate": 0.001919846376074439,
"loss": 2.7307,
"step": 6820
},
{
"epoch": 2.155729290057648,
"grad_norm": 0.10510057195379835,
"learning_rate": 0.0019196299153294105,
"loss": 2.7757,
"step": 6825
},
{
"epoch": 2.1573086946221274,
"grad_norm": 0.08019805135179159,
"learning_rate": 0.0019194131749308006,
"loss": 2.7172,
"step": 6830
},
{
"epoch": 2.1588880991866066,
"grad_norm": 0.0941289115865038,
"learning_rate": 0.0019191961549445186,
"loss": 2.7177,
"step": 6835
},
{
"epoch": 2.160467503751086,
"grad_norm": 0.09756375773672207,
"learning_rate": 0.0019189788554365586,
"loss": 2.7795,
"step": 6840
},
{
"epoch": 2.162046908315565,
"grad_norm": 0.10057056244741966,
"learning_rate": 0.0019187612764730003,
"loss": 2.742,
"step": 6845
},
{
"epoch": 2.1636263128800444,
"grad_norm": 0.08929203686667446,
"learning_rate": 0.0019185434181200078,
"loss": 2.6888,
"step": 6850
},
{
"epoch": 2.1652057174445236,
"grad_norm": 0.0809877539288576,
"learning_rate": 0.0019183252804438307,
"loss": 2.7447,
"step": 6855
},
{
"epoch": 2.1667851220090024,
"grad_norm": 0.08171483838903393,
"learning_rate": 0.0019181068635108032,
"loss": 2.7844,
"step": 6860
},
{
"epoch": 2.1683645265734817,
"grad_norm": 0.08029439443597056,
"learning_rate": 0.0019178881673873444,
"loss": 2.8377,
"step": 6865
},
{
"epoch": 2.169943931137961,
"grad_norm": 0.0869907621724929,
"learning_rate": 0.0019176691921399586,
"loss": 2.8235,
"step": 6870
},
{
"epoch": 2.17152333570244,
"grad_norm": 0.09465182406104862,
"learning_rate": 0.0019174499378352343,
"loss": 2.6915,
"step": 6875
},
{
"epoch": 2.1731027402669194,
"grad_norm": 0.08379838935165305,
"learning_rate": 0.0019172304045398459,
"loss": 2.8486,
"step": 6880
},
{
"epoch": 2.1746821448313987,
"grad_norm": 0.11732839131285723,
"learning_rate": 0.0019170105923205516,
"loss": 2.8215,
"step": 6885
},
{
"epoch": 2.176261549395878,
"grad_norm": 0.09292157119205179,
"learning_rate": 0.0019167905012441953,
"loss": 2.7212,
"step": 6890
},
{
"epoch": 2.177840953960357,
"grad_norm": 0.09627096033681211,
"learning_rate": 0.0019165701313777054,
"loss": 2.7765,
"step": 6895
},
{
"epoch": 2.179420358524836,
"grad_norm": 0.0696154891978351,
"learning_rate": 0.0019163494827880944,
"loss": 2.7463,
"step": 6900
},
{
"epoch": 2.180999763089315,
"grad_norm": 0.09306287488799425,
"learning_rate": 0.0019161285555424601,
"loss": 2.8591,
"step": 6905
},
{
"epoch": 2.1825791676537944,
"grad_norm": 0.09150579519712532,
"learning_rate": 0.0019159073497079856,
"loss": 2.7563,
"step": 6910
},
{
"epoch": 2.1841585722182737,
"grad_norm": 0.09795962987423949,
"learning_rate": 0.001915685865351938,
"loss": 2.7914,
"step": 6915
},
{
"epoch": 2.185737976782753,
"grad_norm": 0.1017505867967586,
"learning_rate": 0.0019154641025416694,
"loss": 2.7037,
"step": 6920
},
{
"epoch": 2.187317381347232,
"grad_norm": 0.10253439906939013,
"learning_rate": 0.001915242061344616,
"loss": 2.6436,
"step": 6925
},
{
"epoch": 2.1888967859117114,
"grad_norm": 0.0802442919313515,
"learning_rate": 0.0019150197418282993,
"loss": 2.6696,
"step": 6930
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.09520857188664072,
"learning_rate": 0.0019147971440603255,
"loss": 2.7685,
"step": 6935
},
{
"epoch": 2.1920555950406695,
"grad_norm": 0.08125953302885813,
"learning_rate": 0.0019145742681083852,
"loss": 2.7746,
"step": 6940
},
{
"epoch": 2.1936349996051487,
"grad_norm": 0.08665968783848398,
"learning_rate": 0.0019143511140402533,
"loss": 2.8095,
"step": 6945
},
{
"epoch": 2.195214404169628,
"grad_norm": 0.08960842367347455,
"learning_rate": 0.0019141276819237892,
"loss": 2.7944,
"step": 6950
},
{
"epoch": 2.1967938087341072,
"grad_norm": 0.08881407853420388,
"learning_rate": 0.0019139039718269377,
"loss": 2.7121,
"step": 6955
},
{
"epoch": 2.1983732132985865,
"grad_norm": 0.0731255303402879,
"learning_rate": 0.0019136799838177277,
"loss": 2.719,
"step": 6960
},
{
"epoch": 2.1999526178630657,
"grad_norm": 0.0848000016657035,
"learning_rate": 0.001913455717964272,
"loss": 2.719,
"step": 6965
},
{
"epoch": 2.201532022427545,
"grad_norm": 0.12587471348602663,
"learning_rate": 0.001913231174334769,
"loss": 2.7345,
"step": 6970
},
{
"epoch": 2.203111426992024,
"grad_norm": 0.10317234672447549,
"learning_rate": 0.0019130063529975005,
"loss": 2.8799,
"step": 6975
},
{
"epoch": 2.204690831556503,
"grad_norm": 0.09287084065190662,
"learning_rate": 0.0019127812540208331,
"loss": 2.6778,
"step": 6980
},
{
"epoch": 2.2062702361209823,
"grad_norm": 0.10572359558412954,
"learning_rate": 0.001912555877473219,
"loss": 2.9066,
"step": 6985
},
{
"epoch": 2.2078496406854615,
"grad_norm": 0.09676134765470186,
"learning_rate": 0.0019123302234231923,
"loss": 2.6953,
"step": 6990
},
{
"epoch": 2.2094290452499408,
"grad_norm": 0.09201142855993154,
"learning_rate": 0.0019121042919393741,
"loss": 2.8742,
"step": 6995
},
{
"epoch": 2.21100844981442,
"grad_norm": 0.11246187851139017,
"learning_rate": 0.001911878083090468,
"loss": 2.679,
"step": 7000
},
{
"epoch": 2.2125878543788993,
"grad_norm": 0.09040985497435061,
"learning_rate": 0.0019116515969452635,
"loss": 2.9079,
"step": 7005
},
{
"epoch": 2.2141672589433785,
"grad_norm": 0.11929576764954936,
"learning_rate": 0.0019114248335726327,
"loss": 2.7648,
"step": 7010
},
{
"epoch": 2.2157466635078578,
"grad_norm": 0.09500460087605361,
"learning_rate": 0.0019111977930415334,
"loss": 2.6747,
"step": 7015
},
{
"epoch": 2.2173260680723366,
"grad_norm": 0.07765834823398761,
"learning_rate": 0.001910970475421007,
"loss": 2.7504,
"step": 7020
},
{
"epoch": 2.218905472636816,
"grad_norm": 0.09131955370795612,
"learning_rate": 0.0019107428807801795,
"loss": 2.6785,
"step": 7025
},
{
"epoch": 2.220484877201295,
"grad_norm": 0.08552501793151021,
"learning_rate": 0.0019105150091882606,
"loss": 2.7142,
"step": 7030
},
{
"epoch": 2.2220642817657743,
"grad_norm": 0.07197777044252249,
"learning_rate": 0.001910286860714545,
"loss": 2.707,
"step": 7035
},
{
"epoch": 2.2236436863302536,
"grad_norm": 0.08719897653729854,
"learning_rate": 0.001910058435428411,
"loss": 2.7893,
"step": 7040
},
{
"epoch": 2.225223090894733,
"grad_norm": 0.08806121479772976,
"learning_rate": 0.0019098297333993213,
"loss": 2.7701,
"step": 7045
},
{
"epoch": 2.226802495459212,
"grad_norm": 0.09407117990776794,
"learning_rate": 0.0019096007546968228,
"loss": 2.7902,
"step": 7050
},
{
"epoch": 2.228381900023691,
"grad_norm": 0.1111393281263788,
"learning_rate": 0.0019093714993905465,
"loss": 2.722,
"step": 7055
},
{
"epoch": 2.22996130458817,
"grad_norm": 0.08640788224102358,
"learning_rate": 0.001909141967550207,
"loss": 2.7669,
"step": 7060
},
{
"epoch": 2.2315407091526493,
"grad_norm": 0.07853172068323665,
"learning_rate": 0.0019089121592456041,
"loss": 2.6776,
"step": 7065
},
{
"epoch": 2.2331201137171286,
"grad_norm": 0.09115129421219734,
"learning_rate": 0.0019086820745466207,
"loss": 2.7785,
"step": 7070
},
{
"epoch": 2.234699518281608,
"grad_norm": 0.07666777840195774,
"learning_rate": 0.0019084517135232245,
"loss": 2.822,
"step": 7075
},
{
"epoch": 2.236278922846087,
"grad_norm": 0.09210271949346142,
"learning_rate": 0.001908221076245466,
"loss": 2.8014,
"step": 7080
},
{
"epoch": 2.2378583274105663,
"grad_norm": 0.06827990396702065,
"learning_rate": 0.0019079901627834812,
"loss": 2.8159,
"step": 7085
},
{
"epoch": 2.2394377319750456,
"grad_norm": 0.0839761250511822,
"learning_rate": 0.001907758973207489,
"loss": 2.7452,
"step": 7090
},
{
"epoch": 2.241017136539525,
"grad_norm": 0.09989425447281891,
"learning_rate": 0.0019075275075877932,
"loss": 2.7432,
"step": 7095
},
{
"epoch": 2.2425965411040036,
"grad_norm": 0.07219217006340295,
"learning_rate": 0.0019072957659947804,
"loss": 2.739,
"step": 7100
},
{
"epoch": 2.244175945668483,
"grad_norm": 0.0820885686734087,
"learning_rate": 0.0019070637484989224,
"loss": 2.7551,
"step": 7105
},
{
"epoch": 2.245755350232962,
"grad_norm": 0.09874155721156277,
"learning_rate": 0.0019068314551707736,
"loss": 2.7175,
"step": 7110
},
{
"epoch": 2.2473347547974414,
"grad_norm": 0.11703533790943776,
"learning_rate": 0.0019065988860809734,
"loss": 2.7516,
"step": 7115
},
{
"epoch": 2.2489141593619206,
"grad_norm": 0.08032580616927876,
"learning_rate": 0.001906366041300244,
"loss": 2.7831,
"step": 7120
},
{
"epoch": 2.2504935639264,
"grad_norm": 0.09921541096208267,
"learning_rate": 0.0019061329208993928,
"loss": 2.8153,
"step": 7125
},
{
"epoch": 2.252072968490879,
"grad_norm": 0.091975480333108,
"learning_rate": 0.0019058995249493097,
"loss": 2.7338,
"step": 7130
},
{
"epoch": 2.253652373055358,
"grad_norm": 0.08328090621779681,
"learning_rate": 0.0019056658535209687,
"loss": 2.7909,
"step": 7135
},
{
"epoch": 2.255231777619837,
"grad_norm": 0.09282748988959821,
"learning_rate": 0.0019054319066854283,
"loss": 2.7228,
"step": 7140
},
{
"epoch": 2.2568111821843164,
"grad_norm": 0.10177668917275927,
"learning_rate": 0.0019051976845138301,
"loss": 2.7497,
"step": 7145
},
{
"epoch": 2.2583905867487957,
"grad_norm": 0.0724734455676322,
"learning_rate": 0.0019049631870773993,
"loss": 2.7286,
"step": 7150
},
{
"epoch": 2.259969991313275,
"grad_norm": 0.10467079801447612,
"learning_rate": 0.0019047284144474456,
"loss": 2.6547,
"step": 7155
},
{
"epoch": 2.261549395877754,
"grad_norm": 0.08612703666372605,
"learning_rate": 0.0019044933666953615,
"loss": 2.7056,
"step": 7160
},
{
"epoch": 2.2631288004422334,
"grad_norm": 0.09418909454140281,
"learning_rate": 0.0019042580438926233,
"loss": 2.7448,
"step": 7165
},
{
"epoch": 2.2647082050067127,
"grad_norm": 0.07778155554833255,
"learning_rate": 0.0019040224461107915,
"loss": 2.608,
"step": 7170
},
{
"epoch": 2.266287609571192,
"grad_norm": 0.09163630244228571,
"learning_rate": 0.0019037865734215101,
"loss": 2.6925,
"step": 7175
},
{
"epoch": 2.2678670141356707,
"grad_norm": 0.09184719746656586,
"learning_rate": 0.0019035504258965057,
"loss": 2.7412,
"step": 7180
},
{
"epoch": 2.26944641870015,
"grad_norm": 0.07598524804116423,
"learning_rate": 0.00190331400360759,
"loss": 2.7431,
"step": 7185
},
{
"epoch": 2.271025823264629,
"grad_norm": 0.07840403354867737,
"learning_rate": 0.0019030773066266572,
"loss": 2.6449,
"step": 7190
},
{
"epoch": 2.2726052278291085,
"grad_norm": 0.0703405315681963,
"learning_rate": 0.0019028403350256854,
"loss": 2.7094,
"step": 7195
},
{
"epoch": 2.2741846323935877,
"grad_norm": 0.08088970583186511,
"learning_rate": 0.0019026030888767364,
"loss": 2.7412,
"step": 7200
},
{
"epoch": 2.275764036958067,
"grad_norm": 0.09736156750218407,
"learning_rate": 0.0019023655682519544,
"loss": 2.7812,
"step": 7205
},
{
"epoch": 2.2773434415225458,
"grad_norm": 0.09153884777790493,
"learning_rate": 0.0019021277732235687,
"loss": 2.7142,
"step": 7210
},
{
"epoch": 2.278922846087025,
"grad_norm": 0.09744944882721748,
"learning_rate": 0.001901889703863891,
"loss": 2.763,
"step": 7215
},
{
"epoch": 2.2805022506515042,
"grad_norm": 0.08586046564916572,
"learning_rate": 0.001901651360245317,
"loss": 2.6857,
"step": 7220
},
{
"epoch": 2.2820816552159835,
"grad_norm": 0.09087609130878502,
"learning_rate": 0.0019014127424403246,
"loss": 2.6869,
"step": 7225
},
{
"epoch": 2.2836610597804627,
"grad_norm": 0.08836849990028749,
"learning_rate": 0.0019011738505214767,
"loss": 2.7596,
"step": 7230
},
{
"epoch": 2.285240464344942,
"grad_norm": 0.08747104858505658,
"learning_rate": 0.001900934684561419,
"loss": 2.7931,
"step": 7235
},
{
"epoch": 2.2868198689094212,
"grad_norm": 0.08755416213412862,
"learning_rate": 0.0019006952446328795,
"loss": 2.7607,
"step": 7240
},
{
"epoch": 2.2883992734739005,
"grad_norm": 0.07536655725479138,
"learning_rate": 0.001900455530808671,
"loss": 2.8365,
"step": 7245
},
{
"epoch": 2.2899786780383797,
"grad_norm": 0.07865625517548534,
"learning_rate": 0.0019002155431616888,
"loss": 2.7188,
"step": 7250
},
{
"epoch": 2.2915580826028585,
"grad_norm": 0.07071226651998248,
"learning_rate": 0.0018999752817649115,
"loss": 2.6903,
"step": 7255
},
{
"epoch": 2.293137487167338,
"grad_norm": 0.07225608791148679,
"learning_rate": 0.0018997347466914011,
"loss": 2.7235,
"step": 7260
},
{
"epoch": 2.294716891731817,
"grad_norm": 0.07348591764373122,
"learning_rate": 0.0018994939380143029,
"loss": 2.7162,
"step": 7265
},
{
"epoch": 2.2962962962962963,
"grad_norm": 0.08593372411968381,
"learning_rate": 0.0018992528558068452,
"loss": 2.7526,
"step": 7270
},
{
"epoch": 2.2978757008607755,
"grad_norm": 0.09704319575792537,
"learning_rate": 0.0018990115001423394,
"loss": 2.8124,
"step": 7275
},
{
"epoch": 2.2994551054252548,
"grad_norm": 0.08275708015065528,
"learning_rate": 0.00189876987109418,
"loss": 2.6462,
"step": 7280
},
{
"epoch": 2.301034509989734,
"grad_norm": 0.10017074009024984,
"learning_rate": 0.0018985279687358458,
"loss": 2.6671,
"step": 7285
},
{
"epoch": 2.302613914554213,
"grad_norm": 0.09120856152822093,
"learning_rate": 0.001898285793140897,
"loss": 2.7268,
"step": 7290
},
{
"epoch": 2.304193319118692,
"grad_norm": 0.07112627860712405,
"learning_rate": 0.0018980433443829777,
"loss": 2.6372,
"step": 7295
},
{
"epoch": 2.3057727236831713,
"grad_norm": 0.07453676740025032,
"learning_rate": 0.001897800622535815,
"loss": 2.7637,
"step": 7300
},
{
"epoch": 2.3073521282476506,
"grad_norm": 0.09143889265627127,
"learning_rate": 0.0018975576276732196,
"loss": 2.7296,
"step": 7305
},
{
"epoch": 2.30893153281213,
"grad_norm": 0.07443627857352321,
"learning_rate": 0.0018973143598690842,
"loss": 2.8072,
"step": 7310
},
{
"epoch": 2.310510937376609,
"grad_norm": 0.08305278027683252,
"learning_rate": 0.0018970708191973847,
"loss": 2.6825,
"step": 7315
},
{
"epoch": 2.3120903419410883,
"grad_norm": 0.09507987362363093,
"learning_rate": 0.0018968270057321808,
"loss": 2.6737,
"step": 7320
},
{
"epoch": 2.3136697465055676,
"grad_norm": 0.10136716170746057,
"learning_rate": 0.0018965829195476144,
"loss": 2.8396,
"step": 7325
},
{
"epoch": 2.315249151070047,
"grad_norm": 0.07136962176276128,
"learning_rate": 0.001896338560717911,
"loss": 2.6985,
"step": 7330
},
{
"epoch": 2.3168285556345256,
"grad_norm": 0.09289629626468898,
"learning_rate": 0.0018960939293173776,
"loss": 2.7599,
"step": 7335
},
{
"epoch": 2.318407960199005,
"grad_norm": 0.066454845519932,
"learning_rate": 0.001895849025420406,
"loss": 2.6878,
"step": 7340
},
{
"epoch": 2.319987364763484,
"grad_norm": 0.08965211928543629,
"learning_rate": 0.001895603849101469,
"loss": 2.7131,
"step": 7345
},
{
"epoch": 2.3215667693279634,
"grad_norm": 0.06976491835977368,
"learning_rate": 0.001895358400435124,
"loss": 2.7597,
"step": 7350
},
{
"epoch": 2.3231461738924426,
"grad_norm": 0.08732959121801767,
"learning_rate": 0.0018951126794960103,
"loss": 2.6749,
"step": 7355
},
{
"epoch": 2.324725578456922,
"grad_norm": 0.07146480125302472,
"learning_rate": 0.0018948666863588494,
"loss": 2.7254,
"step": 7360
},
{
"epoch": 2.326304983021401,
"grad_norm": 0.08655105891480916,
"learning_rate": 0.0018946204210984468,
"loss": 2.7201,
"step": 7365
},
{
"epoch": 2.32788438758588,
"grad_norm": 0.09835250777514457,
"learning_rate": 0.00189437388378969,
"loss": 2.8441,
"step": 7370
},
{
"epoch": 2.329463792150359,
"grad_norm": 0.06363496418764451,
"learning_rate": 0.0018941270745075497,
"loss": 2.6174,
"step": 7375
},
{
"epoch": 2.3310431967148384,
"grad_norm": 0.07929593580818732,
"learning_rate": 0.0018938799933270784,
"loss": 2.6466,
"step": 7380
},
{
"epoch": 2.3326226012793176,
"grad_norm": 0.07900756004679405,
"learning_rate": 0.0018936326403234123,
"loss": 2.76,
"step": 7385
},
{
"epoch": 2.334202005843797,
"grad_norm": 0.09717551544912385,
"learning_rate": 0.00189338501557177,
"loss": 2.6163,
"step": 7390
},
{
"epoch": 2.335781410408276,
"grad_norm": 0.08719728186567895,
"learning_rate": 0.0018931371191474524,
"loss": 2.6595,
"step": 7395
},
{
"epoch": 2.3373608149727554,
"grad_norm": 0.07114676790454187,
"learning_rate": 0.0018928889511258431,
"loss": 2.6956,
"step": 7400
},
{
"epoch": 2.3389402195372346,
"grad_norm": 0.07869140529911982,
"learning_rate": 0.001892640511582409,
"loss": 2.6961,
"step": 7405
},
{
"epoch": 2.340519624101714,
"grad_norm": 0.07186879629277657,
"learning_rate": 0.0018923918005926983,
"loss": 2.6958,
"step": 7410
},
{
"epoch": 2.3420990286661927,
"grad_norm": 0.0718933979053246,
"learning_rate": 0.0018921428182323429,
"loss": 2.6829,
"step": 7415
},
{
"epoch": 2.343678433230672,
"grad_norm": 0.07859991062129688,
"learning_rate": 0.0018918935645770563,
"loss": 2.7591,
"step": 7420
},
{
"epoch": 2.345257837795151,
"grad_norm": 0.07024473445387007,
"learning_rate": 0.0018916440397026353,
"loss": 2.9005,
"step": 7425
},
{
"epoch": 2.3468372423596304,
"grad_norm": 0.06626110340336555,
"learning_rate": 0.0018913942436849587,
"loss": 2.734,
"step": 7430
},
{
"epoch": 2.3484166469241097,
"grad_norm": 0.07754610033325414,
"learning_rate": 0.0018911441765999877,
"loss": 2.7525,
"step": 7435
},
{
"epoch": 2.349996051488589,
"grad_norm": 0.08855254824958801,
"learning_rate": 0.0018908938385237665,
"loss": 2.7487,
"step": 7440
},
{
"epoch": 2.351575456053068,
"grad_norm": 0.07971196533610837,
"learning_rate": 0.0018906432295324209,
"loss": 2.6746,
"step": 7445
},
{
"epoch": 2.353154860617547,
"grad_norm": 0.09318710563334719,
"learning_rate": 0.00189039234970216,
"loss": 2.7336,
"step": 7450
},
{
"epoch": 2.354734265182026,
"grad_norm": 0.08034857917011158,
"learning_rate": 0.0018901411991092741,
"loss": 2.6857,
"step": 7455
},
{
"epoch": 2.3563136697465055,
"grad_norm": 0.08624688466455536,
"learning_rate": 0.001889889777830137,
"loss": 2.6624,
"step": 7460
},
{
"epoch": 2.3578930743109847,
"grad_norm": 0.07256184186888504,
"learning_rate": 0.001889638085941204,
"loss": 2.679,
"step": 7465
},
{
"epoch": 2.359472478875464,
"grad_norm": 0.0735825503294827,
"learning_rate": 0.001889386123519013,
"loss": 2.7518,
"step": 7470
},
{
"epoch": 2.361051883439943,
"grad_norm": 0.09243129527436082,
"learning_rate": 0.0018891338906401845,
"loss": 2.6232,
"step": 7475
},
{
"epoch": 2.3626312880044225,
"grad_norm": 0.07493880585978271,
"learning_rate": 0.0018888813873814208,
"loss": 2.7287,
"step": 7480
},
{
"epoch": 2.3642106925689017,
"grad_norm": 0.08015136572695773,
"learning_rate": 0.0018886286138195061,
"loss": 2.6657,
"step": 7485
},
{
"epoch": 2.365790097133381,
"grad_norm": 0.08648966776424792,
"learning_rate": 0.0018883755700313078,
"loss": 2.6228,
"step": 7490
},
{
"epoch": 2.3673695016978598,
"grad_norm": 0.07390479939905129,
"learning_rate": 0.0018881222560937745,
"loss": 2.6354,
"step": 7495
},
{
"epoch": 2.368948906262339,
"grad_norm": 0.07441542053721882,
"learning_rate": 0.0018878686720839376,
"loss": 2.6607,
"step": 7500
},
{
"epoch": 2.3705283108268183,
"grad_norm": 0.07216972032624433,
"learning_rate": 0.00188761481807891,
"loss": 2.6969,
"step": 7505
},
{
"epoch": 2.3721077153912975,
"grad_norm": 0.06369129438154124,
"learning_rate": 0.0018873606941558875,
"loss": 2.7562,
"step": 7510
},
{
"epoch": 2.3736871199557767,
"grad_norm": 0.0777730619367833,
"learning_rate": 0.0018871063003921477,
"loss": 2.7014,
"step": 7515
},
{
"epoch": 2.375266524520256,
"grad_norm": 0.08640642863482138,
"learning_rate": 0.0018868516368650498,
"loss": 2.7399,
"step": 7520
},
{
"epoch": 2.3768459290847352,
"grad_norm": 0.0846546261392458,
"learning_rate": 0.0018865967036520348,
"loss": 2.7098,
"step": 7525
},
{
"epoch": 2.378425333649214,
"grad_norm": 0.07491805680460785,
"learning_rate": 0.0018863415008306276,
"loss": 2.6934,
"step": 7530
},
{
"epoch": 2.3800047382136933,
"grad_norm": 0.07523148122995321,
"learning_rate": 0.0018860860284784322,
"loss": 2.6807,
"step": 7535
},
{
"epoch": 2.3815841427781725,
"grad_norm": 0.10958219801763429,
"learning_rate": 0.0018858302866731375,
"loss": 2.7444,
"step": 7540
},
{
"epoch": 2.383163547342652,
"grad_norm": 0.08805880564158441,
"learning_rate": 0.001885574275492512,
"loss": 2.7311,
"step": 7545
},
{
"epoch": 2.384742951907131,
"grad_norm": 0.07842191253583962,
"learning_rate": 0.0018853179950144077,
"loss": 2.6944,
"step": 7550
},
{
"epoch": 2.3863223564716103,
"grad_norm": 0.08057227108054174,
"learning_rate": 0.0018850614453167576,
"loss": 2.7204,
"step": 7555
},
{
"epoch": 2.3879017610360895,
"grad_norm": 0.07942781307655891,
"learning_rate": 0.0018848046264775765,
"loss": 2.689,
"step": 7560
},
{
"epoch": 2.389481165600569,
"grad_norm": 0.07497938925568602,
"learning_rate": 0.001884547538574962,
"loss": 2.7696,
"step": 7565
},
{
"epoch": 2.3910605701650476,
"grad_norm": 0.08612690456510577,
"learning_rate": 0.001884290181687092,
"loss": 2.8199,
"step": 7570
},
{
"epoch": 2.392639974729527,
"grad_norm": 0.08797114556231153,
"learning_rate": 0.0018840325558922282,
"loss": 2.7724,
"step": 7575
},
{
"epoch": 2.394219379294006,
"grad_norm": 0.09420516265260621,
"learning_rate": 0.001883774661268712,
"loss": 2.7078,
"step": 7580
},
{
"epoch": 2.3957987838584853,
"grad_norm": 0.08453869336872923,
"learning_rate": 0.001883516497894968,
"loss": 2.8102,
"step": 7585
},
{
"epoch": 2.3973781884229646,
"grad_norm": 0.07032148456996877,
"learning_rate": 0.0018832580658495024,
"loss": 2.7734,
"step": 7590
},
{
"epoch": 2.398957592987444,
"grad_norm": 0.09374154890695309,
"learning_rate": 0.0018829993652109019,
"loss": 2.7003,
"step": 7595
},
{
"epoch": 2.400536997551923,
"grad_norm": 0.08401160942707464,
"learning_rate": 0.001882740396057836,
"loss": 2.7927,
"step": 7600
},
{
"epoch": 2.402116402116402,
"grad_norm": 0.06927986287686265,
"learning_rate": 0.0018824811584690555,
"loss": 2.6737,
"step": 7605
},
{
"epoch": 2.403695806680881,
"grad_norm": 0.07529877322546603,
"learning_rate": 0.0018822216525233935,
"loss": 2.7634,
"step": 7610
},
{
"epoch": 2.4052752112453604,
"grad_norm": 0.07469206529645643,
"learning_rate": 0.0018819618782997631,
"loss": 2.7092,
"step": 7615
},
{
"epoch": 2.4068546158098396,
"grad_norm": 0.07470263412071848,
"learning_rate": 0.0018817018358771608,
"loss": 2.6249,
"step": 7620
},
{
"epoch": 2.408434020374319,
"grad_norm": 0.0800266080961645,
"learning_rate": 0.0018814415253346638,
"loss": 2.7841,
"step": 7625
},
{
"epoch": 2.410013424938798,
"grad_norm": 0.07779178979769348,
"learning_rate": 0.0018811809467514302,
"loss": 2.7671,
"step": 7630
},
{
"epoch": 2.4115928295032774,
"grad_norm": 0.0688955490712083,
"learning_rate": 0.001880920100206701,
"loss": 2.7498,
"step": 7635
},
{
"epoch": 2.4131722340677566,
"grad_norm": 0.08367516993109311,
"learning_rate": 0.0018806589857797977,
"loss": 2.6605,
"step": 7640
},
{
"epoch": 2.414751638632236,
"grad_norm": 0.08742889444589272,
"learning_rate": 0.0018803976035501233,
"loss": 2.6678,
"step": 7645
},
{
"epoch": 2.4163310431967147,
"grad_norm": 0.0790738567939181,
"learning_rate": 0.0018801359535971626,
"loss": 2.669,
"step": 7650
},
{
"epoch": 2.417910447761194,
"grad_norm": 0.07121469436260072,
"learning_rate": 0.0018798740360004822,
"loss": 2.6622,
"step": 7655
},
{
"epoch": 2.419489852325673,
"grad_norm": 0.06891514623432604,
"learning_rate": 0.0018796118508397287,
"loss": 2.7473,
"step": 7660
},
{
"epoch": 2.4210692568901524,
"grad_norm": 0.08026374467155266,
"learning_rate": 0.0018793493981946318,
"loss": 2.6967,
"step": 7665
},
{
"epoch": 2.4226486614546316,
"grad_norm": 0.07066077070523862,
"learning_rate": 0.0018790866781450007,
"loss": 2.6993,
"step": 7670
},
{
"epoch": 2.424228066019111,
"grad_norm": 0.07868215840609977,
"learning_rate": 0.001878823690770728,
"loss": 2.739,
"step": 7675
},
{
"epoch": 2.42580747058359,
"grad_norm": 0.06467538125762497,
"learning_rate": 0.001878560436151785,
"loss": 2.6918,
"step": 7680
},
{
"epoch": 2.427386875148069,
"grad_norm": 0.09734857618605057,
"learning_rate": 0.0018782969143682276,
"loss": 2.7678,
"step": 7685
},
{
"epoch": 2.428966279712548,
"grad_norm": 0.07731364571259443,
"learning_rate": 0.0018780331255001898,
"loss": 2.677,
"step": 7690
},
{
"epoch": 2.4305456842770274,
"grad_norm": 0.07767700316172481,
"learning_rate": 0.0018777690696278881,
"loss": 2.7618,
"step": 7695
},
{
"epoch": 2.4321250888415067,
"grad_norm": 0.1037623342084868,
"learning_rate": 0.0018775047468316212,
"loss": 2.7872,
"step": 7700
},
{
"epoch": 2.433704493405986,
"grad_norm": 0.1065049396723975,
"learning_rate": 0.0018772401571917668,
"loss": 2.7372,
"step": 7705
},
{
"epoch": 2.435283897970465,
"grad_norm": 0.07938619491090596,
"learning_rate": 0.0018769753007887855,
"loss": 2.5384,
"step": 7710
},
{
"epoch": 2.4368633025349444,
"grad_norm": 0.08631045408098625,
"learning_rate": 0.0018767101777032184,
"loss": 2.6442,
"step": 7715
},
{
"epoch": 2.4384427070994237,
"grad_norm": 0.08323949226651153,
"learning_rate": 0.0018764447880156878,
"loss": 2.6652,
"step": 7720
},
{
"epoch": 2.440022111663903,
"grad_norm": 0.0847747509987669,
"learning_rate": 0.001876179131806897,
"loss": 2.653,
"step": 7725
},
{
"epoch": 2.4416015162283817,
"grad_norm": 0.08543114070870507,
"learning_rate": 0.0018759132091576301,
"loss": 2.6623,
"step": 7730
},
{
"epoch": 2.443180920792861,
"grad_norm": 0.08083946579402632,
"learning_rate": 0.0018756470201487527,
"loss": 2.6318,
"step": 7735
},
{
"epoch": 2.4447603253573402,
"grad_norm": 0.0725618021471989,
"learning_rate": 0.0018753805648612115,
"loss": 2.6657,
"step": 7740
},
{
"epoch": 2.4463397299218195,
"grad_norm": 0.08341020006084542,
"learning_rate": 0.001875113843376033,
"loss": 2.678,
"step": 7745
},
{
"epoch": 2.4479191344862987,
"grad_norm": 0.07192478514855673,
"learning_rate": 0.0018748468557743263,
"loss": 2.6607,
"step": 7750
},
{
"epoch": 2.449498539050778,
"grad_norm": 0.07975965263261633,
"learning_rate": 0.00187457960213728,
"loss": 2.7002,
"step": 7755
},
{
"epoch": 2.451077943615257,
"grad_norm": 0.0721191524169921,
"learning_rate": 0.0018743120825461647,
"loss": 2.7017,
"step": 7760
},
{
"epoch": 2.452657348179736,
"grad_norm": 0.08858243755028575,
"learning_rate": 0.0018740442970823312,
"loss": 2.697,
"step": 7765
},
{
"epoch": 2.4542367527442153,
"grad_norm": 0.08699423256790373,
"learning_rate": 0.0018737762458272114,
"loss": 2.7567,
"step": 7770
},
{
"epoch": 2.4558161573086945,
"grad_norm": 0.07730088922028397,
"learning_rate": 0.0018735079288623182,
"loss": 2.7256,
"step": 7775
},
{
"epoch": 2.4573955618731738,
"grad_norm": 0.08158156654649575,
"learning_rate": 0.0018732393462692445,
"loss": 2.7248,
"step": 7780
},
{
"epoch": 2.458974966437653,
"grad_norm": 0.08136571604368042,
"learning_rate": 0.0018729704981296652,
"loss": 2.6473,
"step": 7785
},
{
"epoch": 2.4605543710021323,
"grad_norm": 0.08344350173638025,
"learning_rate": 0.0018727013845253344,
"loss": 2.678,
"step": 7790
},
{
"epoch": 2.4621337755666115,
"grad_norm": 0.08222325140799036,
"learning_rate": 0.001872432005538089,
"loss": 2.6582,
"step": 7795
},
{
"epoch": 2.4637131801310908,
"grad_norm": 0.07101265264728349,
"learning_rate": 0.0018721623612498446,
"loss": 2.6721,
"step": 7800
},
{
"epoch": 2.46529258469557,
"grad_norm": 0.0719783248403732,
"learning_rate": 0.0018718924517425986,
"loss": 2.669,
"step": 7805
},
{
"epoch": 2.466871989260049,
"grad_norm": 0.08021611335702594,
"learning_rate": 0.0018716222770984285,
"loss": 2.634,
"step": 7810
},
{
"epoch": 2.468451393824528,
"grad_norm": 0.0935973449220456,
"learning_rate": 0.0018713518373994931,
"loss": 2.6396,
"step": 7815
},
{
"epoch": 2.4700307983890073,
"grad_norm": 0.0873982163416268,
"learning_rate": 0.0018710811327280312,
"loss": 2.5957,
"step": 7820
},
{
"epoch": 2.4716102029534865,
"grad_norm": 0.06000719379053246,
"learning_rate": 0.0018708101631663622,
"loss": 2.7188,
"step": 7825
},
{
"epoch": 2.473189607517966,
"grad_norm": 0.0882888251028582,
"learning_rate": 0.0018705389287968863,
"loss": 2.6632,
"step": 7830
},
{
"epoch": 2.474769012082445,
"grad_norm": 0.09869515337928537,
"learning_rate": 0.0018702674297020844,
"loss": 2.6711,
"step": 7835
},
{
"epoch": 2.4763484166469243,
"grad_norm": 0.08240705547146975,
"learning_rate": 0.0018699956659645172,
"loss": 2.7613,
"step": 7840
},
{
"epoch": 2.477927821211403,
"grad_norm": 0.07248905164425486,
"learning_rate": 0.0018697236376668267,
"loss": 2.696,
"step": 7845
},
{
"epoch": 2.4795072257758823,
"grad_norm": 0.0737655102966124,
"learning_rate": 0.0018694513448917348,
"loss": 2.7168,
"step": 7850
},
{
"epoch": 2.4810866303403616,
"grad_norm": 0.07535024042252804,
"learning_rate": 0.0018691787877220438,
"loss": 2.7605,
"step": 7855
},
{
"epoch": 2.482666034904841,
"grad_norm": 0.0771969438222813,
"learning_rate": 0.0018689059662406371,
"loss": 2.6679,
"step": 7860
},
{
"epoch": 2.48424543946932,
"grad_norm": 0.07142066911663333,
"learning_rate": 0.0018686328805304774,
"loss": 2.7337,
"step": 7865
},
{
"epoch": 2.4858248440337993,
"grad_norm": 0.07640651552592222,
"learning_rate": 0.0018683595306746086,
"loss": 2.6871,
"step": 7870
},
{
"epoch": 2.4874042485982786,
"grad_norm": 0.07816469379115588,
"learning_rate": 0.0018680859167561547,
"loss": 2.712,
"step": 7875
},
{
"epoch": 2.488983653162758,
"grad_norm": 0.07092137707362926,
"learning_rate": 0.00186781203885832,
"loss": 2.6838,
"step": 7880
},
{
"epoch": 2.490563057727237,
"grad_norm": 0.08452392581027952,
"learning_rate": 0.0018675378970643885,
"loss": 2.7528,
"step": 7885
},
{
"epoch": 2.492142462291716,
"grad_norm": 0.07700920122144246,
"learning_rate": 0.0018672634914577257,
"loss": 2.7425,
"step": 7890
},
{
"epoch": 2.493721866856195,
"grad_norm": 0.0725194849975088,
"learning_rate": 0.001866988822121776,
"loss": 2.72,
"step": 7895
},
{
"epoch": 2.4953012714206744,
"grad_norm": 0.08647124238367777,
"learning_rate": 0.0018667138891400653,
"loss": 2.7387,
"step": 7900
},
{
"epoch": 2.4968806759851536,
"grad_norm": 0.09775103044838994,
"learning_rate": 0.001866438692596198,
"loss": 2.7273,
"step": 7905
},
{
"epoch": 2.498460080549633,
"grad_norm": 0.06664858654174687,
"learning_rate": 0.0018661632325738605,
"loss": 2.7252,
"step": 7910
},
{
"epoch": 2.500039485114112,
"grad_norm": 0.08104867842322877,
"learning_rate": 0.0018658875091568177,
"loss": 2.648,
"step": 7915
},
{
"epoch": 2.501618889678591,
"grad_norm": 0.06850407967810379,
"learning_rate": 0.0018656115224289158,
"loss": 2.6029,
"step": 7920
},
{
"epoch": 2.50319829424307,
"grad_norm": 0.07774145402202129,
"learning_rate": 0.0018653352724740807,
"loss": 2.7816,
"step": 7925
},
{
"epoch": 2.5047776988075494,
"grad_norm": 0.067705446229846,
"learning_rate": 0.0018650587593763179,
"loss": 2.6936,
"step": 7930
},
{
"epoch": 2.5063571033720287,
"grad_norm": 0.0730342806916298,
"learning_rate": 0.0018647819832197131,
"loss": 2.6904,
"step": 7935
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.07501734056288688,
"learning_rate": 0.0018645049440884325,
"loss": 2.6693,
"step": 7940
},
{
"epoch": 2.509515912500987,
"grad_norm": 0.07247920381952802,
"learning_rate": 0.001864227642066722,
"loss": 2.6361,
"step": 7945
},
{
"epoch": 2.5110953170654664,
"grad_norm": 0.07729876869468007,
"learning_rate": 0.0018639500772389074,
"loss": 2.7373,
"step": 7950
},
{
"epoch": 2.5126747216299457,
"grad_norm": 0.05958962188110759,
"learning_rate": 0.0018636722496893942,
"loss": 2.6341,
"step": 7955
},
{
"epoch": 2.514254126194425,
"grad_norm": 0.08290918057113816,
"learning_rate": 0.001863394159502668,
"loss": 2.662,
"step": 7960
},
{
"epoch": 2.515833530758904,
"grad_norm": 0.07605813821249847,
"learning_rate": 0.001863115806763294,
"loss": 2.7292,
"step": 7965
},
{
"epoch": 2.517412935323383,
"grad_norm": 0.08339138369725956,
"learning_rate": 0.001862837191555918,
"loss": 2.7135,
"step": 7970
},
{
"epoch": 2.518992339887862,
"grad_norm": 0.08347322719724491,
"learning_rate": 0.0018625583139652649,
"loss": 2.6136,
"step": 7975
},
{
"epoch": 2.5205717444523414,
"grad_norm": 0.08893567182034597,
"learning_rate": 0.0018622791740761395,
"loss": 2.8399,
"step": 7980
},
{
"epoch": 2.5221511490168207,
"grad_norm": 0.09214698048473759,
"learning_rate": 0.0018619997719734266,
"loss": 2.6203,
"step": 7985
},
{
"epoch": 2.5237305535813,
"grad_norm": 0.07526186136361122,
"learning_rate": 0.0018617201077420905,
"loss": 2.6775,
"step": 7990
},
{
"epoch": 2.525309958145779,
"grad_norm": 0.09091886358113357,
"learning_rate": 0.001861440181467175,
"loss": 2.7433,
"step": 7995
},
{
"epoch": 2.526889362710258,
"grad_norm": 0.0796869768273692,
"learning_rate": 0.0018611599932338045,
"loss": 2.6444,
"step": 8000
},
{
"epoch": 2.5284687672747372,
"grad_norm": 0.07939729630626526,
"learning_rate": 0.001860879543127182,
"loss": 2.6763,
"step": 8005
},
{
"epoch": 2.5300481718392165,
"grad_norm": 0.08612245528516328,
"learning_rate": 0.0018605988312325912,
"loss": 2.5781,
"step": 8010
},
{
"epoch": 2.5316275764036957,
"grad_norm": 0.05992673706296889,
"learning_rate": 0.0018603178576353941,
"loss": 2.6827,
"step": 8015
},
{
"epoch": 2.533206980968175,
"grad_norm": 0.08097528211699864,
"learning_rate": 0.001860036622421033,
"loss": 2.6819,
"step": 8020
},
{
"epoch": 2.5347863855326542,
"grad_norm": 0.08482703413986586,
"learning_rate": 0.00185975512567503,
"loss": 2.7946,
"step": 8025
},
{
"epoch": 2.5363657900971335,
"grad_norm": 0.08872520065295834,
"learning_rate": 0.0018594733674829867,
"loss": 2.7496,
"step": 8030
},
{
"epoch": 2.5379451946616127,
"grad_norm": 0.09288288886907606,
"learning_rate": 0.0018591913479305833,
"loss": 2.6849,
"step": 8035
},
{
"epoch": 2.539524599226092,
"grad_norm": 0.08531654651171618,
"learning_rate": 0.0018589090671035807,
"loss": 2.7099,
"step": 8040
},
{
"epoch": 2.541104003790571,
"grad_norm": 0.0916481767628531,
"learning_rate": 0.0018586265250878184,
"loss": 2.6628,
"step": 8045
},
{
"epoch": 2.54268340835505,
"grad_norm": 0.07175547103052027,
"learning_rate": 0.0018583437219692161,
"loss": 2.6703,
"step": 8050
},
{
"epoch": 2.5442628129195293,
"grad_norm": 0.06774903006142224,
"learning_rate": 0.0018580606578337715,
"loss": 2.6817,
"step": 8055
},
{
"epoch": 2.5458422174840085,
"grad_norm": 0.0826462540723309,
"learning_rate": 0.0018577773327675638,
"loss": 2.7343,
"step": 8060
},
{
"epoch": 2.5474216220484878,
"grad_norm": 0.08344610696981357,
"learning_rate": 0.0018574937468567492,
"loss": 2.6898,
"step": 8065
},
{
"epoch": 2.549001026612967,
"grad_norm": 0.0772110804875115,
"learning_rate": 0.0018572099001875652,
"loss": 2.6706,
"step": 8070
},
{
"epoch": 2.5505804311774463,
"grad_norm": 0.07610845646960257,
"learning_rate": 0.001856925792846327,
"loss": 2.7425,
"step": 8075
},
{
"epoch": 2.552159835741925,
"grad_norm": 0.06543900145379539,
"learning_rate": 0.0018566414249194306,
"loss": 2.6781,
"step": 8080
},
{
"epoch": 2.5537392403064043,
"grad_norm": 0.07079232505612955,
"learning_rate": 0.0018563567964933498,
"loss": 2.5946,
"step": 8085
},
{
"epoch": 2.5553186448708836,
"grad_norm": 0.10262069972175505,
"learning_rate": 0.0018560719076546389,
"loss": 2.7899,
"step": 8090
},
{
"epoch": 2.556898049435363,
"grad_norm": 0.08666390464029669,
"learning_rate": 0.0018557867584899305,
"loss": 2.6093,
"step": 8095
},
{
"epoch": 2.558477453999842,
"grad_norm": 0.07262218443994439,
"learning_rate": 0.0018555013490859364,
"loss": 2.6317,
"step": 8100
},
{
"epoch": 2.5600568585643213,
"grad_norm": 0.08203991908098497,
"learning_rate": 0.0018552156795294482,
"loss": 2.5987,
"step": 8105
},
{
"epoch": 2.5616362631288006,
"grad_norm": 0.07864166237147685,
"learning_rate": 0.0018549297499073356,
"loss": 2.7561,
"step": 8110
},
{
"epoch": 2.56321566769328,
"grad_norm": 0.0802474496306534,
"learning_rate": 0.0018546435603065486,
"loss": 2.7016,
"step": 8115
},
{
"epoch": 2.564795072257759,
"grad_norm": 0.07907611783306082,
"learning_rate": 0.0018543571108141155,
"loss": 2.6933,
"step": 8120
},
{
"epoch": 2.5663744768222383,
"grad_norm": 0.07617577072615397,
"learning_rate": 0.0018540704015171437,
"loss": 2.7774,
"step": 8125
},
{
"epoch": 2.567953881386717,
"grad_norm": 0.07745229149181031,
"learning_rate": 0.0018537834325028193,
"loss": 2.709,
"step": 8130
},
{
"epoch": 2.5695332859511963,
"grad_norm": 0.05920274025904176,
"learning_rate": 0.0018534962038584083,
"loss": 2.6502,
"step": 8135
},
{
"epoch": 2.5711126905156756,
"grad_norm": 0.07804375423712556,
"learning_rate": 0.0018532087156712547,
"loss": 2.7016,
"step": 8140
},
{
"epoch": 2.572692095080155,
"grad_norm": 0.07157365496401365,
"learning_rate": 0.001852920968028782,
"loss": 2.7592,
"step": 8145
},
{
"epoch": 2.574271499644634,
"grad_norm": 0.07040408906894956,
"learning_rate": 0.001852632961018492,
"loss": 2.7072,
"step": 8150
},
{
"epoch": 2.575850904209113,
"grad_norm": 0.08606791875324905,
"learning_rate": 0.0018523446947279667,
"loss": 2.7338,
"step": 8155
},
{
"epoch": 2.577430308773592,
"grad_norm": 0.06450312373755046,
"learning_rate": 0.0018520561692448654,
"loss": 2.6513,
"step": 8160
},
{
"epoch": 2.5790097133380714,
"grad_norm": 0.08074620572550516,
"learning_rate": 0.001851767384656927,
"loss": 2.6686,
"step": 8165
},
{
"epoch": 2.5805891179025506,
"grad_norm": 0.07786908704366824,
"learning_rate": 0.0018514783410519692,
"loss": 2.6336,
"step": 8170
},
{
"epoch": 2.58216852246703,
"grad_norm": 0.07287148075741615,
"learning_rate": 0.0018511890385178877,
"loss": 2.7318,
"step": 8175
},
{
"epoch": 2.583747927031509,
"grad_norm": 0.07682255875376442,
"learning_rate": 0.0018508994771426583,
"loss": 2.7095,
"step": 8180
},
{
"epoch": 2.5853273315959884,
"grad_norm": 0.06738640096293755,
"learning_rate": 0.0018506096570143342,
"loss": 2.7102,
"step": 8185
},
{
"epoch": 2.5869067361604676,
"grad_norm": 0.08277153940086601,
"learning_rate": 0.0018503195782210483,
"loss": 2.6777,
"step": 8190
},
{
"epoch": 2.588486140724947,
"grad_norm": 0.07075781475437158,
"learning_rate": 0.0018500292408510112,
"loss": 2.6732,
"step": 8195
},
{
"epoch": 2.590065545289426,
"grad_norm": 0.0891710854698283,
"learning_rate": 0.0018497386449925135,
"loss": 2.6126,
"step": 8200
},
{
"epoch": 2.591644949853905,
"grad_norm": 0.07448667313538068,
"learning_rate": 0.0018494477907339225,
"loss": 2.6267,
"step": 8205
},
{
"epoch": 2.593224354418384,
"grad_norm": 0.07955598196227132,
"learning_rate": 0.001849156678163686,
"loss": 2.6357,
"step": 8210
},
{
"epoch": 2.5948037589828634,
"grad_norm": 0.057716126963121936,
"learning_rate": 0.0018488653073703287,
"loss": 2.6709,
"step": 8215
},
{
"epoch": 2.5963831635473427,
"grad_norm": 0.24803273908784967,
"learning_rate": 0.0018485736784424553,
"loss": 2.7145,
"step": 8220
},
{
"epoch": 2.597962568111822,
"grad_norm": 0.13060863770251266,
"learning_rate": 0.0018482817914687478,
"loss": 2.7761,
"step": 8225
},
{
"epoch": 2.599541972676301,
"grad_norm": 0.09447300673293828,
"learning_rate": 0.0018479896465379672,
"loss": 2.6685,
"step": 8230
},
{
"epoch": 2.60112137724078,
"grad_norm": 0.07751070583791715,
"learning_rate": 0.0018476972437389532,
"loss": 2.7205,
"step": 8235
},
{
"epoch": 2.602700781805259,
"grad_norm": 0.08781007878460902,
"learning_rate": 0.0018474045831606235,
"loss": 2.6261,
"step": 8240
},
{
"epoch": 2.6042801863697385,
"grad_norm": 0.08697823868726591,
"learning_rate": 0.0018471116648919744,
"loss": 2.7048,
"step": 8245
},
{
"epoch": 2.6058595909342177,
"grad_norm": 0.09962612079922767,
"learning_rate": 0.00184681848902208,
"loss": 2.7425,
"step": 8250
},
{
"epoch": 2.607438995498697,
"grad_norm": 0.09497362891723997,
"learning_rate": 0.0018465250556400936,
"loss": 2.7101,
"step": 8255
},
{
"epoch": 2.609018400063176,
"grad_norm": 0.1095961760999583,
"learning_rate": 0.001846231364835247,
"loss": 2.749,
"step": 8260
},
{
"epoch": 2.6105978046276554,
"grad_norm": 0.07792276572908069,
"learning_rate": 0.0018459374166968484,
"loss": 2.6432,
"step": 8265
},
{
"epoch": 2.6121772091921347,
"grad_norm": 0.07934151624028277,
"learning_rate": 0.0018456432113142865,
"loss": 2.7084,
"step": 8270
},
{
"epoch": 2.613756613756614,
"grad_norm": 0.09765015908766372,
"learning_rate": 0.0018453487487770268,
"loss": 2.6978,
"step": 8275
},
{
"epoch": 2.615336018321093,
"grad_norm": 0.07424207466262638,
"learning_rate": 0.001845054029174614,
"loss": 2.7314,
"step": 8280
},
{
"epoch": 2.616915422885572,
"grad_norm": 0.06654401533251812,
"learning_rate": 0.0018447590525966697,
"loss": 2.6611,
"step": 8285
},
{
"epoch": 2.6184948274500512,
"grad_norm": 0.07343882122515961,
"learning_rate": 0.0018444638191328952,
"loss": 2.679,
"step": 8290
},
{
"epoch": 2.6200742320145305,
"grad_norm": 0.07141272681091779,
"learning_rate": 0.0018441683288730687,
"loss": 2.7138,
"step": 8295
},
{
"epoch": 2.6216536365790097,
"grad_norm": 0.08065369323647667,
"learning_rate": 0.0018438725819070467,
"loss": 2.6659,
"step": 8300
},
{
"epoch": 2.623233041143489,
"grad_norm": 0.06394475792669799,
"learning_rate": 0.0018435765783247641,
"loss": 2.5876,
"step": 8305
},
{
"epoch": 2.6248124457079682,
"grad_norm": 0.07213582091096707,
"learning_rate": 0.0018432803182162343,
"loss": 2.6576,
"step": 8310
},
{
"epoch": 2.626391850272447,
"grad_norm": 0.06569845411375552,
"learning_rate": 0.0018429838016715471,
"loss": 2.6349,
"step": 8315
},
{
"epoch": 2.6279712548369263,
"grad_norm": 0.062004370999434975,
"learning_rate": 0.0018426870287808722,
"loss": 2.4996,
"step": 8320
},
{
"epoch": 2.6295506594014055,
"grad_norm": 0.07777343451046,
"learning_rate": 0.0018423899996344558,
"loss": 2.7423,
"step": 8325
},
{
"epoch": 2.631130063965885,
"grad_norm": 0.07268488245099335,
"learning_rate": 0.0018420927143226226,
"loss": 2.7066,
"step": 8330
},
{
"epoch": 2.632709468530364,
"grad_norm": 0.09713553113440274,
"learning_rate": 0.001841795172935775,
"loss": 2.6784,
"step": 8335
},
{
"epoch": 2.6342888730948433,
"grad_norm": 0.07579561403529118,
"learning_rate": 0.0018414973755643941,
"loss": 2.6956,
"step": 8340
},
{
"epoch": 2.6358682776593225,
"grad_norm": 0.07593637072975637,
"learning_rate": 0.0018411993222990377,
"loss": 2.5793,
"step": 8345
},
{
"epoch": 2.6374476822238018,
"grad_norm": 0.07902721671937,
"learning_rate": 0.0018409010132303418,
"loss": 2.6508,
"step": 8350
},
{
"epoch": 2.639027086788281,
"grad_norm": 0.06976273290655018,
"learning_rate": 0.0018406024484490207,
"loss": 2.7403,
"step": 8355
},
{
"epoch": 2.6406064913527603,
"grad_norm": 0.06631988590855742,
"learning_rate": 0.0018403036280458657,
"loss": 2.5342,
"step": 8360
},
{
"epoch": 2.642185895917239,
"grad_norm": 0.07970795771258071,
"learning_rate": 0.0018400045521117462,
"loss": 2.6565,
"step": 8365
},
{
"epoch": 2.6437653004817183,
"grad_norm": 0.09089034026053808,
"learning_rate": 0.001839705220737609,
"loss": 2.6178,
"step": 8370
},
{
"epoch": 2.6453447050461976,
"grad_norm": 0.0873756080233504,
"learning_rate": 0.0018394056340144795,
"loss": 2.6376,
"step": 8375
},
{
"epoch": 2.646924109610677,
"grad_norm": 0.07068153933211149,
"learning_rate": 0.00183910579203346,
"loss": 2.6346,
"step": 8380
},
{
"epoch": 2.648503514175156,
"grad_norm": 0.08184173420274206,
"learning_rate": 0.0018388056948857301,
"loss": 2.7169,
"step": 8385
},
{
"epoch": 2.6500829187396353,
"grad_norm": 0.09004242599016954,
"learning_rate": 0.0018385053426625477,
"loss": 2.6163,
"step": 8390
},
{
"epoch": 2.651662323304114,
"grad_norm": 0.07791768928388704,
"learning_rate": 0.001838204735455248,
"loss": 2.6018,
"step": 8395
},
{
"epoch": 2.6532417278685934,
"grad_norm": 0.06776848053010266,
"learning_rate": 0.0018379038733552435,
"loss": 2.7123,
"step": 8400
},
{
"epoch": 2.6548211324330726,
"grad_norm": 0.08293504619586714,
"learning_rate": 0.0018376027564540249,
"loss": 2.7125,
"step": 8405
},
{
"epoch": 2.656400536997552,
"grad_norm": 0.08305143011589787,
"learning_rate": 0.0018373013848431597,
"loss": 2.6436,
"step": 8410
},
{
"epoch": 2.657979941562031,
"grad_norm": 0.1003820323837237,
"learning_rate": 0.0018369997586142929,
"loss": 2.6058,
"step": 8415
},
{
"epoch": 2.6595593461265103,
"grad_norm": 0.07876546071996927,
"learning_rate": 0.0018366978778591471,
"loss": 2.7217,
"step": 8420
},
{
"epoch": 2.6611387506909896,
"grad_norm": 0.0737061663083282,
"learning_rate": 0.0018363957426695227,
"loss": 2.6446,
"step": 8425
},
{
"epoch": 2.662718155255469,
"grad_norm": 0.06658529844569622,
"learning_rate": 0.0018360933531372968,
"loss": 2.6969,
"step": 8430
},
{
"epoch": 2.664297559819948,
"grad_norm": 0.07403289532509451,
"learning_rate": 0.0018357907093544238,
"loss": 2.5625,
"step": 8435
},
{
"epoch": 2.6658769643844273,
"grad_norm": 0.06490303091705928,
"learning_rate": 0.0018354878114129364,
"loss": 2.6345,
"step": 8440
},
{
"epoch": 2.667456368948906,
"grad_norm": 0.0690962464048344,
"learning_rate": 0.0018351846594049437,
"loss": 2.5917,
"step": 8445
},
{
"epoch": 2.6690357735133854,
"grad_norm": 0.07191291298335026,
"learning_rate": 0.001834881253422632,
"loss": 2.6261,
"step": 8450
},
{
"epoch": 2.6706151780778646,
"grad_norm": 0.09168957592632122,
"learning_rate": 0.0018345775935582657,
"loss": 2.6488,
"step": 8455
},
{
"epoch": 2.672194582642344,
"grad_norm": 0.08380802121900667,
"learning_rate": 0.001834273679904185,
"loss": 2.6766,
"step": 8460
},
{
"epoch": 2.673773987206823,
"grad_norm": 0.07923024055346786,
"learning_rate": 0.0018339695125528088,
"loss": 2.6511,
"step": 8465
},
{
"epoch": 2.6753533917713024,
"grad_norm": 0.07735194440863055,
"learning_rate": 0.0018336650915966324,
"loss": 2.6581,
"step": 8470
},
{
"epoch": 2.676932796335781,
"grad_norm": 0.07714028781489839,
"learning_rate": 0.0018333604171282278,
"loss": 2.6848,
"step": 8475
},
{
"epoch": 2.6785122009002604,
"grad_norm": 0.08332223079564574,
"learning_rate": 0.001833055489240245,
"loss": 2.668,
"step": 8480
},
{
"epoch": 2.6800916054647397,
"grad_norm": 0.07460803289578619,
"learning_rate": 0.0018327503080254105,
"loss": 2.8177,
"step": 8485
},
{
"epoch": 2.681671010029219,
"grad_norm": 0.08270742539127242,
"learning_rate": 0.0018324448735765277,
"loss": 2.6497,
"step": 8490
},
{
"epoch": 2.683250414593698,
"grad_norm": 0.059781586839295804,
"learning_rate": 0.0018321391859864775,
"loss": 2.7406,
"step": 8495
},
{
"epoch": 2.6848298191581774,
"grad_norm": 0.08694086473733448,
"learning_rate": 0.0018318332453482176,
"loss": 2.6342,
"step": 8500
},
{
"epoch": 2.6864092237226567,
"grad_norm": 0.08834612488705565,
"learning_rate": 0.0018315270517547826,
"loss": 2.6524,
"step": 8505
},
{
"epoch": 2.687988628287136,
"grad_norm": 0.08160955341364691,
"learning_rate": 0.0018312206052992837,
"loss": 2.6655,
"step": 8510
},
{
"epoch": 2.689568032851615,
"grad_norm": 0.08101212710873497,
"learning_rate": 0.0018309139060749097,
"loss": 2.7838,
"step": 8515
},
{
"epoch": 2.691147437416094,
"grad_norm": 0.08096363371817464,
"learning_rate": 0.0018306069541749257,
"loss": 2.7462,
"step": 8520
},
{
"epoch": 2.692726841980573,
"grad_norm": 0.06736595463146286,
"learning_rate": 0.001830299749692674,
"loss": 2.6882,
"step": 8525
},
{
"epoch": 2.6943062465450525,
"grad_norm": 0.07318768228212218,
"learning_rate": 0.001829992292721573,
"loss": 2.6515,
"step": 8530
},
{
"epoch": 2.6958856511095317,
"grad_norm": 0.0841948725662524,
"learning_rate": 0.0018296845833551192,
"loss": 2.7602,
"step": 8535
},
{
"epoch": 2.697465055674011,
"grad_norm": 0.07570074708905208,
"learning_rate": 0.0018293766216868842,
"loss": 2.758,
"step": 8540
},
{
"epoch": 2.69904446023849,
"grad_norm": 0.08026041820741821,
"learning_rate": 0.0018290684078105177,
"loss": 2.5792,
"step": 8545
},
{
"epoch": 2.700623864802969,
"grad_norm": 0.07746096583186972,
"learning_rate": 0.0018287599418197456,
"loss": 2.6286,
"step": 8550
},
{
"epoch": 2.7022032693674483,
"grad_norm": 0.07472450813156946,
"learning_rate": 0.0018284512238083703,
"loss": 2.7215,
"step": 8555
},
{
"epoch": 2.7037826739319275,
"grad_norm": 0.07941333908553108,
"learning_rate": 0.0018281422538702708,
"loss": 2.6878,
"step": 8560
},
{
"epoch": 2.7053620784964068,
"grad_norm": 0.07397615560057773,
"learning_rate": 0.0018278330320994033,
"loss": 2.6282,
"step": 8565
},
{
"epoch": 2.706941483060886,
"grad_norm": 0.08761091382284485,
"learning_rate": 0.0018275235585897996,
"loss": 2.767,
"step": 8570
},
{
"epoch": 2.7085208876253652,
"grad_norm": 0.07762415547466953,
"learning_rate": 0.0018272138334355689,
"loss": 2.6113,
"step": 8575
},
{
"epoch": 2.7101002921898445,
"grad_norm": 0.06892300386844659,
"learning_rate": 0.0018269038567308967,
"loss": 2.6718,
"step": 8580
},
{
"epoch": 2.7116796967543237,
"grad_norm": 0.08692744814530534,
"learning_rate": 0.001826593628570045,
"loss": 2.748,
"step": 8585
},
{
"epoch": 2.713259101318803,
"grad_norm": 0.06708209311079495,
"learning_rate": 0.001826283149047352,
"loss": 2.5921,
"step": 8590
},
{
"epoch": 2.7148385058832822,
"grad_norm": 0.0638740190394845,
"learning_rate": 0.001825972418257233,
"loss": 2.6677,
"step": 8595
},
{
"epoch": 2.716417910447761,
"grad_norm": 0.07573045477360121,
"learning_rate": 0.0018256614362941786,
"loss": 2.7767,
"step": 8600
},
{
"epoch": 2.7179973150122403,
"grad_norm": 0.07837415095835595,
"learning_rate": 0.0018253502032527567,
"loss": 2.6454,
"step": 8605
},
{
"epoch": 2.7195767195767195,
"grad_norm": 0.07259376544303658,
"learning_rate": 0.0018250387192276115,
"loss": 2.6446,
"step": 8610
},
{
"epoch": 2.721156124141199,
"grad_norm": 0.09368952879952992,
"learning_rate": 0.0018247269843134628,
"loss": 2.6191,
"step": 8615
},
{
"epoch": 2.722735528705678,
"grad_norm": 0.08013335502408653,
"learning_rate": 0.0018244149986051076,
"loss": 2.614,
"step": 8620
},
{
"epoch": 2.7243149332701573,
"grad_norm": 0.07343312294788908,
"learning_rate": 0.0018241027621974189,
"loss": 2.6262,
"step": 8625
},
{
"epoch": 2.725894337834636,
"grad_norm": 0.08713602436595895,
"learning_rate": 0.0018237902751853453,
"loss": 2.6984,
"step": 8630
},
{
"epoch": 2.7274737423991153,
"grad_norm": 0.07572530467226961,
"learning_rate": 0.0018234775376639125,
"loss": 2.5924,
"step": 8635
},
{
"epoch": 2.7290531469635946,
"grad_norm": 0.0651715649995805,
"learning_rate": 0.0018231645497282217,
"loss": 2.6392,
"step": 8640
},
{
"epoch": 2.730632551528074,
"grad_norm": 0.07046431160450398,
"learning_rate": 0.0018228513114734507,
"loss": 2.6397,
"step": 8645
},
{
"epoch": 2.732211956092553,
"grad_norm": 0.07222286406847223,
"learning_rate": 0.0018225378229948532,
"loss": 2.687,
"step": 8650
},
{
"epoch": 2.7337913606570323,
"grad_norm": 0.06309111384469673,
"learning_rate": 0.0018222240843877593,
"loss": 2.5812,
"step": 8655
},
{
"epoch": 2.7353707652215116,
"grad_norm": 0.07938025900094192,
"learning_rate": 0.0018219100957475745,
"loss": 2.5991,
"step": 8660
},
{
"epoch": 2.736950169785991,
"grad_norm": 0.07870940461239452,
"learning_rate": 0.0018215958571697808,
"loss": 2.6641,
"step": 8665
},
{
"epoch": 2.73852957435047,
"grad_norm": 0.06813435830119051,
"learning_rate": 0.0018212813687499363,
"loss": 2.7464,
"step": 8670
},
{
"epoch": 2.7401089789149493,
"grad_norm": 0.07370358186967162,
"learning_rate": 0.001820966630583675,
"loss": 2.5753,
"step": 8675
},
{
"epoch": 2.741688383479428,
"grad_norm": 0.09256574504652948,
"learning_rate": 0.0018206516427667068,
"loss": 2.7527,
"step": 8680
},
{
"epoch": 2.7432677880439074,
"grad_norm": 0.09791825478747279,
"learning_rate": 0.001820336405394817,
"loss": 2.6537,
"step": 8685
},
{
"epoch": 2.7448471926083866,
"grad_norm": 0.0711977245700122,
"learning_rate": 0.0018200209185638676,
"loss": 2.7593,
"step": 8690
},
{
"epoch": 2.746426597172866,
"grad_norm": 0.06319159971095922,
"learning_rate": 0.0018197051823697964,
"loss": 2.6357,
"step": 8695
},
{
"epoch": 2.748006001737345,
"grad_norm": 0.07053879924636965,
"learning_rate": 0.0018193891969086162,
"loss": 2.7059,
"step": 8700
},
{
"epoch": 2.7495854063018244,
"grad_norm": 0.0728205579126769,
"learning_rate": 0.0018190729622764167,
"loss": 2.6518,
"step": 8705
},
{
"epoch": 2.751164810866303,
"grad_norm": 0.057084351505856216,
"learning_rate": 0.0018187564785693625,
"loss": 2.5939,
"step": 8710
},
{
"epoch": 2.7527442154307824,
"grad_norm": 0.06874940758099113,
"learning_rate": 0.001818439745883694,
"loss": 2.6319,
"step": 8715
},
{
"epoch": 2.7543236199952617,
"grad_norm": 0.07274610037668357,
"learning_rate": 0.0018181227643157283,
"loss": 2.5699,
"step": 8720
},
{
"epoch": 2.755903024559741,
"grad_norm": 0.06424607912088136,
"learning_rate": 0.001817805533961857,
"loss": 2.6361,
"step": 8725
},
{
"epoch": 2.75748242912422,
"grad_norm": 0.07664431543647424,
"learning_rate": 0.001817488054918548,
"loss": 2.6116,
"step": 8730
},
{
"epoch": 2.7590618336886994,
"grad_norm": 0.07680904439386883,
"learning_rate": 0.0018171703272823444,
"loss": 2.6897,
"step": 8735
},
{
"epoch": 2.7606412382531786,
"grad_norm": 0.08070368351555539,
"learning_rate": 0.0018168523511498656,
"loss": 2.5728,
"step": 8740
},
{
"epoch": 2.762220642817658,
"grad_norm": 0.10305902984751368,
"learning_rate": 0.0018165341266178055,
"loss": 2.6039,
"step": 8745
},
{
"epoch": 2.763800047382137,
"grad_norm": 0.08180323592745962,
"learning_rate": 0.0018162156537829346,
"loss": 2.6779,
"step": 8750
},
{
"epoch": 2.7653794519466164,
"grad_norm": 0.08138935595180714,
"learning_rate": 0.0018158969327420984,
"loss": 2.7306,
"step": 8755
},
{
"epoch": 2.766958856511095,
"grad_norm": 0.07257660977943017,
"learning_rate": 0.0018155779635922178,
"loss": 2.642,
"step": 8760
},
{
"epoch": 2.7685382610755744,
"grad_norm": 0.07180881891544696,
"learning_rate": 0.0018152587464302897,
"loss": 2.7033,
"step": 8765
},
{
"epoch": 2.7701176656400537,
"grad_norm": 0.06433718516880173,
"learning_rate": 0.0018149392813533853,
"loss": 2.6953,
"step": 8770
},
{
"epoch": 2.771697070204533,
"grad_norm": 0.07729242651671715,
"learning_rate": 0.001814619568458652,
"loss": 2.7412,
"step": 8775
},
{
"epoch": 2.773276474769012,
"grad_norm": 0.07921295884974441,
"learning_rate": 0.0018142996078433131,
"loss": 2.6721,
"step": 8780
},
{
"epoch": 2.7748558793334914,
"grad_norm": 0.07152778542092869,
"learning_rate": 0.001813979399604666,
"loss": 2.7206,
"step": 8785
},
{
"epoch": 2.7764352838979702,
"grad_norm": 0.0733619109588905,
"learning_rate": 0.001813658943840084,
"loss": 2.6843,
"step": 8790
},
{
"epoch": 2.7780146884624495,
"grad_norm": 0.07316169248525446,
"learning_rate": 0.001813338240647016,
"loss": 2.7021,
"step": 8795
},
{
"epoch": 2.7795940930269287,
"grad_norm": 0.08060596316716877,
"learning_rate": 0.0018130172901229856,
"loss": 2.7007,
"step": 8800
},
{
"epoch": 2.781173497591408,
"grad_norm": 0.11422921262794959,
"learning_rate": 0.0018126960923655914,
"loss": 2.6103,
"step": 8805
},
{
"epoch": 2.782752902155887,
"grad_norm": 0.06882021473600086,
"learning_rate": 0.0018123746474725084,
"loss": 2.6624,
"step": 8810
},
{
"epoch": 2.7843323067203665,
"grad_norm": 0.08117965393169584,
"learning_rate": 0.0018120529555414855,
"loss": 2.6246,
"step": 8815
},
{
"epoch": 2.7859117112848457,
"grad_norm": 0.06802721481144403,
"learning_rate": 0.001811731016670347,
"loss": 2.7133,
"step": 8820
},
{
"epoch": 2.787491115849325,
"grad_norm": 0.0712690638293664,
"learning_rate": 0.0018114088309569927,
"loss": 2.6747,
"step": 8825
},
{
"epoch": 2.789070520413804,
"grad_norm": 0.09285584313170489,
"learning_rate": 0.001811086398499397,
"loss": 2.6563,
"step": 8830
},
{
"epoch": 2.7906499249782835,
"grad_norm": 0.07566449189012191,
"learning_rate": 0.0018107637193956099,
"loss": 2.6572,
"step": 8835
},
{
"epoch": 2.7922293295427623,
"grad_norm": 0.07798799454761576,
"learning_rate": 0.0018104407937437558,
"loss": 2.7,
"step": 8840
},
{
"epoch": 2.7938087341072415,
"grad_norm": 0.07913274316285812,
"learning_rate": 0.0018101176216420343,
"loss": 2.7162,
"step": 8845
},
{
"epoch": 2.7953881386717208,
"grad_norm": 0.08250947208597523,
"learning_rate": 0.0018097942031887197,
"loss": 2.5854,
"step": 8850
},
{
"epoch": 2.7969675432362,
"grad_norm": 0.0725336504766197,
"learning_rate": 0.0018094705384821626,
"loss": 2.6304,
"step": 8855
},
{
"epoch": 2.7985469478006793,
"grad_norm": 0.07030194627786095,
"learning_rate": 0.0018091466276207863,
"loss": 2.6804,
"step": 8860
},
{
"epoch": 2.8001263523651585,
"grad_norm": 0.07505722120062171,
"learning_rate": 0.00180882247070309,
"loss": 2.626,
"step": 8865
},
{
"epoch": 2.8017057569296373,
"grad_norm": 0.06820303064495715,
"learning_rate": 0.0018084980678276482,
"loss": 2.7694,
"step": 8870
},
{
"epoch": 2.8032851614941166,
"grad_norm": 0.056961786091470915,
"learning_rate": 0.0018081734190931096,
"loss": 2.5769,
"step": 8875
},
{
"epoch": 2.804864566058596,
"grad_norm": 0.06909140931913867,
"learning_rate": 0.001807848524598198,
"loss": 2.6962,
"step": 8880
},
{
"epoch": 2.806443970623075,
"grad_norm": 0.0648444583829478,
"learning_rate": 0.0018075233844417117,
"loss": 2.7129,
"step": 8885
},
{
"epoch": 2.8080233751875543,
"grad_norm": 0.07674247498206403,
"learning_rate": 0.001807197998722523,
"loss": 2.6899,
"step": 8890
},
{
"epoch": 2.8096027797520335,
"grad_norm": 0.08273112738453625,
"learning_rate": 0.0018068723675395807,
"loss": 2.6684,
"step": 8895
},
{
"epoch": 2.811182184316513,
"grad_norm": 0.08260356529408745,
"learning_rate": 0.0018065464909919067,
"loss": 2.692,
"step": 8900
},
{
"epoch": 2.812761588880992,
"grad_norm": 0.07402764354032675,
"learning_rate": 0.0018062203691785977,
"loss": 2.6813,
"step": 8905
},
{
"epoch": 2.8143409934454713,
"grad_norm": 0.07010201292401957,
"learning_rate": 0.0018058940021988257,
"loss": 2.6113,
"step": 8910
},
{
"epoch": 2.81592039800995,
"grad_norm": 0.06834416411554649,
"learning_rate": 0.0018055673901518365,
"loss": 2.6313,
"step": 8915
},
{
"epoch": 2.8174998025744293,
"grad_norm": 0.0805976545965805,
"learning_rate": 0.001805240533136951,
"loss": 2.5651,
"step": 8920
},
{
"epoch": 2.8190792071389086,
"grad_norm": 0.08638320793892658,
"learning_rate": 0.001804913431253564,
"loss": 2.5595,
"step": 8925
},
{
"epoch": 2.820658611703388,
"grad_norm": 0.08947587288989334,
"learning_rate": 0.0018045860846011455,
"loss": 2.7886,
"step": 8930
},
{
"epoch": 2.822238016267867,
"grad_norm": 0.07867181893362114,
"learning_rate": 0.0018042584932792393,
"loss": 2.6616,
"step": 8935
},
{
"epoch": 2.8238174208323463,
"grad_norm": 0.08242063971099742,
"learning_rate": 0.001803930657387464,
"loss": 2.6616,
"step": 8940
},
{
"epoch": 2.825396825396825,
"grad_norm": 0.06935681434474747,
"learning_rate": 0.0018036025770255119,
"loss": 2.6493,
"step": 8945
},
{
"epoch": 2.8269762299613044,
"grad_norm": 0.07899255108632972,
"learning_rate": 0.0018032742522931505,
"loss": 2.704,
"step": 8950
},
{
"epoch": 2.8285556345257836,
"grad_norm": 0.08103020399055681,
"learning_rate": 0.0018029456832902213,
"loss": 2.5642,
"step": 8955
},
{
"epoch": 2.830135039090263,
"grad_norm": 0.06181469755310813,
"learning_rate": 0.0018026168701166401,
"loss": 2.6621,
"step": 8960
},
{
"epoch": 2.831714443654742,
"grad_norm": 0.06706026030234262,
"learning_rate": 0.0018022878128723966,
"loss": 2.5971,
"step": 8965
},
{
"epoch": 2.8332938482192214,
"grad_norm": 0.06883385907004572,
"learning_rate": 0.0018019585116575555,
"loss": 2.5211,
"step": 8970
},
{
"epoch": 2.8348732527837006,
"grad_norm": 0.10706360603552771,
"learning_rate": 0.0018016289665722543,
"loss": 2.6044,
"step": 8975
},
{
"epoch": 2.83645265734818,
"grad_norm": 0.08541241786364799,
"learning_rate": 0.0018012991777167065,
"loss": 2.6751,
"step": 8980
},
{
"epoch": 2.838032061912659,
"grad_norm": 0.08674184408950136,
"learning_rate": 0.0018009691451911984,
"loss": 2.5867,
"step": 8985
},
{
"epoch": 2.8396114664771384,
"grad_norm": 0.07121169031771728,
"learning_rate": 0.001800638869096091,
"loss": 2.6856,
"step": 8990
},
{
"epoch": 2.841190871041617,
"grad_norm": 0.08691352314313025,
"learning_rate": 0.0018003083495318184,
"loss": 2.6154,
"step": 8995
},
{
"epoch": 2.8427702756060964,
"grad_norm": 0.0695792691516324,
"learning_rate": 0.00179997758659889,
"loss": 2.6422,
"step": 9000
},
{
"epoch": 2.8443496801705757,
"grad_norm": 0.06266454082852482,
"learning_rate": 0.0017996465803978893,
"loss": 2.5924,
"step": 9005
},
{
"epoch": 2.845929084735055,
"grad_norm": 0.06347254201154383,
"learning_rate": 0.0017993153310294722,
"loss": 2.6595,
"step": 9010
},
{
"epoch": 2.847508489299534,
"grad_norm": 0.07694282442243464,
"learning_rate": 0.0017989838385943698,
"loss": 2.6099,
"step": 9015
},
{
"epoch": 2.8490878938640134,
"grad_norm": 0.0721336693151254,
"learning_rate": 0.0017986521031933874,
"loss": 2.5332,
"step": 9020
},
{
"epoch": 2.850667298428492,
"grad_norm": 0.08170449455876193,
"learning_rate": 0.0017983201249274026,
"loss": 2.6325,
"step": 9025
},
{
"epoch": 2.8522467029929714,
"grad_norm": 0.08908686952938166,
"learning_rate": 0.0017979879038973687,
"loss": 2.7075,
"step": 9030
},
{
"epoch": 2.8538261075574507,
"grad_norm": 0.06949885801379575,
"learning_rate": 0.0017976554402043116,
"loss": 2.7515,
"step": 9035
},
{
"epoch": 2.85540551212193,
"grad_norm": 0.07438198238236672,
"learning_rate": 0.0017973227339493317,
"loss": 2.6265,
"step": 9040
},
{
"epoch": 2.856984916686409,
"grad_norm": 0.07955118978605713,
"learning_rate": 0.0017969897852336027,
"loss": 2.6064,
"step": 9045
},
{
"epoch": 2.8585643212508884,
"grad_norm": 0.07190618121590747,
"learning_rate": 0.0017966565941583716,
"loss": 2.6345,
"step": 9050
},
{
"epoch": 2.8601437258153677,
"grad_norm": 0.08231651950371446,
"learning_rate": 0.0017963231608249606,
"loss": 2.5874,
"step": 9055
},
{
"epoch": 2.861723130379847,
"grad_norm": 0.06309285425461614,
"learning_rate": 0.0017959894853347641,
"loss": 2.7133,
"step": 9060
},
{
"epoch": 2.863302534944326,
"grad_norm": 0.07904453688390618,
"learning_rate": 0.001795655567789251,
"loss": 2.6427,
"step": 9065
},
{
"epoch": 2.8648819395088054,
"grad_norm": 0.07721599679986656,
"learning_rate": 0.0017953214082899631,
"loss": 2.6347,
"step": 9070
},
{
"epoch": 2.8664613440732842,
"grad_norm": 0.06904436553479418,
"learning_rate": 0.0017949870069385167,
"loss": 2.5924,
"step": 9075
},
{
"epoch": 2.8680407486377635,
"grad_norm": 0.08396225233471166,
"learning_rate": 0.0017946523638366005,
"loss": 2.655,
"step": 9080
},
{
"epoch": 2.8696201532022427,
"grad_norm": 0.07898738662235477,
"learning_rate": 0.0017943174790859778,
"loss": 2.6799,
"step": 9085
},
{
"epoch": 2.871199557766722,
"grad_norm": 0.08239123848848463,
"learning_rate": 0.0017939823527884844,
"loss": 2.5772,
"step": 9090
},
{
"epoch": 2.8727789623312012,
"grad_norm": 0.07204567013237982,
"learning_rate": 0.001793646985046031,
"loss": 2.6602,
"step": 9095
},
{
"epoch": 2.8743583668956805,
"grad_norm": 0.0704621218942999,
"learning_rate": 0.0017933113759605996,
"loss": 2.781,
"step": 9100
},
{
"epoch": 2.8759377714601593,
"grad_norm": 0.07728139646563548,
"learning_rate": 0.001792975525634248,
"loss": 2.6204,
"step": 9105
},
{
"epoch": 2.8775171760246385,
"grad_norm": 0.07943593620001566,
"learning_rate": 0.001792639434169105,
"loss": 2.5765,
"step": 9110
},
{
"epoch": 2.8790965805891178,
"grad_norm": 0.09213001247403269,
"learning_rate": 0.0017923031016673745,
"loss": 2.5809,
"step": 9115
},
{
"epoch": 2.880675985153597,
"grad_norm": 0.08413854923337333,
"learning_rate": 0.0017919665282313333,
"loss": 2.5438,
"step": 9120
},
{
"epoch": 2.8822553897180763,
"grad_norm": 0.07453017380711009,
"learning_rate": 0.0017916297139633304,
"loss": 2.6411,
"step": 9125
},
{
"epoch": 2.8838347942825555,
"grad_norm": 0.07573358551384778,
"learning_rate": 0.0017912926589657896,
"loss": 2.6865,
"step": 9130
},
{
"epoch": 2.8854141988470348,
"grad_norm": 0.08215839866502005,
"learning_rate": 0.0017909553633412068,
"loss": 2.6374,
"step": 9135
},
{
"epoch": 2.886993603411514,
"grad_norm": 0.06722700205323821,
"learning_rate": 0.0017906178271921518,
"loss": 2.6867,
"step": 9140
},
{
"epoch": 2.8885730079759933,
"grad_norm": 0.08422569175308589,
"learning_rate": 0.0017902800506212667,
"loss": 2.6698,
"step": 9145
},
{
"epoch": 2.8901524125404725,
"grad_norm": 0.07714249389406545,
"learning_rate": 0.0017899420337312674,
"loss": 2.6391,
"step": 9150
},
{
"epoch": 2.8917318171049513,
"grad_norm": 0.07899679303828389,
"learning_rate": 0.0017896037766249428,
"loss": 2.6281,
"step": 9155
},
{
"epoch": 2.8933112216694306,
"grad_norm": 0.065691455655413,
"learning_rate": 0.0017892652794051548,
"loss": 2.5724,
"step": 9160
},
{
"epoch": 2.89489062623391,
"grad_norm": 0.07336744027738036,
"learning_rate": 0.001788926542174838,
"loss": 2.6424,
"step": 9165
},
{
"epoch": 2.896470030798389,
"grad_norm": 0.0744732310866987,
"learning_rate": 0.0017885875650370002,
"loss": 2.6277,
"step": 9170
},
{
"epoch": 2.8980494353628683,
"grad_norm": 0.06972848194547832,
"learning_rate": 0.0017882483480947224,
"loss": 2.5853,
"step": 9175
},
{
"epoch": 2.8996288399273475,
"grad_norm": 0.08866138334123327,
"learning_rate": 0.001787908891451158,
"loss": 2.631,
"step": 9180
},
{
"epoch": 2.9012082444918263,
"grad_norm": 0.06284868660280536,
"learning_rate": 0.0017875691952095342,
"loss": 2.5575,
"step": 9185
},
{
"epoch": 2.9027876490563056,
"grad_norm": 0.06891932325044023,
"learning_rate": 0.0017872292594731498,
"loss": 2.6453,
"step": 9190
},
{
"epoch": 2.904367053620785,
"grad_norm": 0.09066742526387073,
"learning_rate": 0.0017868890843453773,
"loss": 2.6731,
"step": 9195
},
{
"epoch": 2.905946458185264,
"grad_norm": 0.06442962526556177,
"learning_rate": 0.0017865486699296623,
"loss": 2.5985,
"step": 9200
},
{
"epoch": 2.9075258627497433,
"grad_norm": 0.08971001588641436,
"learning_rate": 0.0017862080163295216,
"loss": 2.6238,
"step": 9205
},
{
"epoch": 2.9091052673142226,
"grad_norm": 0.08715457828992919,
"learning_rate": 0.0017858671236485467,
"loss": 2.6166,
"step": 9210
},
{
"epoch": 2.910684671878702,
"grad_norm": 0.07841888929423384,
"learning_rate": 0.0017855259919904002,
"loss": 2.6162,
"step": 9215
},
{
"epoch": 2.912264076443181,
"grad_norm": 0.09556274750492062,
"learning_rate": 0.0017851846214588189,
"loss": 2.6934,
"step": 9220
},
{
"epoch": 2.9138434810076603,
"grad_norm": 0.08687540070929045,
"learning_rate": 0.0017848430121576101,
"loss": 2.6846,
"step": 9225
},
{
"epoch": 2.9154228855721396,
"grad_norm": 0.0794664687643129,
"learning_rate": 0.0017845011641906563,
"loss": 2.7429,
"step": 9230
},
{
"epoch": 2.9170022901366184,
"grad_norm": 0.0564704948366244,
"learning_rate": 0.001784159077661911,
"loss": 2.6262,
"step": 9235
},
{
"epoch": 2.9185816947010976,
"grad_norm": 0.06430940252174853,
"learning_rate": 0.0017838167526754,
"loss": 2.694,
"step": 9240
},
{
"epoch": 2.920161099265577,
"grad_norm": 0.0739805659056966,
"learning_rate": 0.0017834741893352226,
"loss": 2.7582,
"step": 9245
},
{
"epoch": 2.921740503830056,
"grad_norm": 0.07103193701377211,
"learning_rate": 0.00178313138774555,
"loss": 2.651,
"step": 9250
},
{
"epoch": 2.9233199083945354,
"grad_norm": 0.07525436923618634,
"learning_rate": 0.0017827883480106257,
"loss": 2.8208,
"step": 9255
},
{
"epoch": 2.924899312959014,
"grad_norm": 0.06882539320170787,
"learning_rate": 0.0017824450702347663,
"loss": 2.6875,
"step": 9260
},
{
"epoch": 2.9264787175234934,
"grad_norm": 0.0605989026210221,
"learning_rate": 0.0017821015545223604,
"loss": 2.7123,
"step": 9265
},
{
"epoch": 2.9280581220879727,
"grad_norm": 0.06663742358876884,
"learning_rate": 0.0017817578009778686,
"loss": 2.6781,
"step": 9270
},
{
"epoch": 2.929637526652452,
"grad_norm": 0.09360076561750917,
"learning_rate": 0.0017814138097058244,
"loss": 2.6964,
"step": 9275
},
{
"epoch": 2.931216931216931,
"grad_norm": 0.07721356789273802,
"learning_rate": 0.001781069580810833,
"loss": 2.7239,
"step": 9280
},
{
"epoch": 2.9327963357814104,
"grad_norm": 0.0791584728486138,
"learning_rate": 0.0017807251143975727,
"loss": 2.6406,
"step": 9285
},
{
"epoch": 2.9343757403458897,
"grad_norm": 0.08301309547051534,
"learning_rate": 0.0017803804105707933,
"loss": 2.6048,
"step": 9290
},
{
"epoch": 2.935955144910369,
"grad_norm": 0.08859445122320836,
"learning_rate": 0.0017800354694353167,
"loss": 2.5487,
"step": 9295
},
{
"epoch": 2.937534549474848,
"grad_norm": 0.0708367658010825,
"learning_rate": 0.001779690291096038,
"loss": 2.7331,
"step": 9300
},
{
"epoch": 2.9391139540393274,
"grad_norm": 0.08041620724686906,
"learning_rate": 0.001779344875657923,
"loss": 2.7053,
"step": 9305
},
{
"epoch": 2.940693358603806,
"grad_norm": 0.07920350812926849,
"learning_rate": 0.0017789992232260113,
"loss": 2.5801,
"step": 9310
},
{
"epoch": 2.9422727631682855,
"grad_norm": 0.0679019390245231,
"learning_rate": 0.0017786533339054125,
"loss": 2.637,
"step": 9315
},
{
"epoch": 2.9438521677327647,
"grad_norm": 0.08024512137613557,
"learning_rate": 0.00177830720780131,
"loss": 2.717,
"step": 9320
},
{
"epoch": 2.945431572297244,
"grad_norm": 0.06745817346604058,
"learning_rate": 0.001777960845018958,
"loss": 2.6298,
"step": 9325
},
{
"epoch": 2.947010976861723,
"grad_norm": 0.07363382024958783,
"learning_rate": 0.0017776142456636843,
"loss": 2.7999,
"step": 9330
},
{
"epoch": 2.9485903814262024,
"grad_norm": 0.08057562652277658,
"learning_rate": 0.0017772674098408864,
"loss": 2.6378,
"step": 9335
},
{
"epoch": 2.9501697859906812,
"grad_norm": 0.0824127445696666,
"learning_rate": 0.0017769203376560353,
"loss": 2.5429,
"step": 9340
},
{
"epoch": 2.9517491905551605,
"grad_norm": 0.0678009430109042,
"learning_rate": 0.0017765730292146728,
"loss": 2.6524,
"step": 9345
},
{
"epoch": 2.9533285951196397,
"grad_norm": 0.06532565401449761,
"learning_rate": 0.0017762254846224144,
"loss": 2.5821,
"step": 9350
},
{
"epoch": 2.954907999684119,
"grad_norm": 0.10240606684967388,
"learning_rate": 0.0017758777039849456,
"loss": 2.6531,
"step": 9355
},
{
"epoch": 2.9564874042485982,
"grad_norm": 0.06945713608409096,
"learning_rate": 0.001775529687408024,
"loss": 2.5384,
"step": 9360
},
{
"epoch": 2.9580668088130775,
"grad_norm": 0.06926524796204718,
"learning_rate": 0.0017751814349974797,
"loss": 2.6369,
"step": 9365
},
{
"epoch": 2.9596462133775567,
"grad_norm": 0.07910183159512352,
"learning_rate": 0.0017748329468592137,
"loss": 2.6389,
"step": 9370
},
{
"epoch": 2.961225617942036,
"grad_norm": 0.07519712410902045,
"learning_rate": 0.001774484223099199,
"loss": 2.6558,
"step": 9375
},
{
"epoch": 2.9628050225065152,
"grad_norm": 0.09726346801188548,
"learning_rate": 0.0017741352638234807,
"loss": 2.6349,
"step": 9380
},
{
"epoch": 2.9643844270709945,
"grad_norm": 0.07907053145441649,
"learning_rate": 0.0017737860691381742,
"loss": 2.5493,
"step": 9385
},
{
"epoch": 2.9659638316354733,
"grad_norm": 0.08620631358894117,
"learning_rate": 0.0017734366391494684,
"loss": 2.5902,
"step": 9390
},
{
"epoch": 2.9675432361999525,
"grad_norm": 0.09907659889237273,
"learning_rate": 0.0017730869739636219,
"loss": 2.5682,
"step": 9395
},
{
"epoch": 2.9691226407644318,
"grad_norm": 0.10191989416827558,
"learning_rate": 0.0017727370736869662,
"loss": 2.6995,
"step": 9400
},
{
"epoch": 2.970702045328911,
"grad_norm": 0.08278402092617577,
"learning_rate": 0.0017723869384259038,
"loss": 2.6782,
"step": 9405
},
{
"epoch": 2.9722814498933903,
"grad_norm": 0.062115181425159635,
"learning_rate": 0.0017720365682869078,
"loss": 2.631,
"step": 9410
},
{
"epoch": 2.9738608544578695,
"grad_norm": 0.07399639042405928,
"learning_rate": 0.0017716859633765244,
"loss": 2.5604,
"step": 9415
},
{
"epoch": 2.9754402590223483,
"grad_norm": 0.07863036800465888,
"learning_rate": 0.00177133512380137,
"loss": 2.6059,
"step": 9420
},
{
"epoch": 2.9770196635868276,
"grad_norm": 0.08014900090250153,
"learning_rate": 0.0017709840496681324,
"loss": 2.5692,
"step": 9425
},
{
"epoch": 2.978599068151307,
"grad_norm": 0.07315791918630918,
"learning_rate": 0.0017706327410835713,
"loss": 2.5941,
"step": 9430
},
{
"epoch": 2.980178472715786,
"grad_norm": 0.08420232912319141,
"learning_rate": 0.0017702811981545174,
"loss": 2.6569,
"step": 9435
},
{
"epoch": 2.9817578772802653,
"grad_norm": 0.07675128706356427,
"learning_rate": 0.001769929420987873,
"loss": 2.6167,
"step": 9440
},
{
"epoch": 2.9833372818447446,
"grad_norm": 0.07447243945199063,
"learning_rate": 0.0017695774096906103,
"loss": 2.6945,
"step": 9445
},
{
"epoch": 2.984916686409224,
"grad_norm": 0.06591328173763196,
"learning_rate": 0.0017692251643697747,
"loss": 2.6763,
"step": 9450
},
{
"epoch": 2.986496090973703,
"grad_norm": 0.06352381774469144,
"learning_rate": 0.0017688726851324812,
"loss": 2.5987,
"step": 9455
},
{
"epoch": 2.9880754955381823,
"grad_norm": 0.0725897617533654,
"learning_rate": 0.0017685199720859166,
"loss": 2.7126,
"step": 9460
},
{
"epoch": 2.9896549001026615,
"grad_norm": 0.06655104738152724,
"learning_rate": 0.0017681670253373385,
"loss": 2.6709,
"step": 9465
},
{
"epoch": 2.9912343046671404,
"grad_norm": 0.06338658405519165,
"learning_rate": 0.0017678138449940765,
"loss": 2.6405,
"step": 9470
},
{
"epoch": 2.9928137092316196,
"grad_norm": 0.07272869654997406,
"learning_rate": 0.0017674604311635294,
"loss": 2.691,
"step": 9475
},
{
"epoch": 2.994393113796099,
"grad_norm": 0.06869160712622453,
"learning_rate": 0.0017671067839531687,
"loss": 2.6887,
"step": 9480
},
{
"epoch": 2.995972518360578,
"grad_norm": 0.08164852981872348,
"learning_rate": 0.0017667529034705364,
"loss": 2.6771,
"step": 9485
},
{
"epoch": 2.9975519229250573,
"grad_norm": 0.08074893728400574,
"learning_rate": 0.0017663987898232448,
"loss": 2.6561,
"step": 9490
},
{
"epoch": 2.9991313274895366,
"grad_norm": 0.062375815361901135,
"learning_rate": 0.001766044443118978,
"loss": 2.5912,
"step": 9495
},
{
"epoch": 3.0,
"eval_loss": 2.620439052581787,
"eval_runtime": 118.3599,
"eval_samples_per_second": 22.381,
"eval_steps_per_second": 5.602,
"step": 9498
},
{
"epoch": 3.0006317618257916,
"grad_norm": 0.06853007086830248,
"learning_rate": 0.0017656898634654905,
"loss": 2.5767,
"step": 9500
},
{
"epoch": 3.002211166390271,
"grad_norm": 0.07075166572388598,
"learning_rate": 0.0017653350509706075,
"loss": 2.7542,
"step": 9505
},
{
"epoch": 3.00379057095475,
"grad_norm": 0.07412174784052934,
"learning_rate": 0.0017649800057422257,
"loss": 2.6384,
"step": 9510
},
{
"epoch": 3.0053699755192294,
"grad_norm": 0.07560914597403712,
"learning_rate": 0.0017646247278883115,
"loss": 2.5841,
"step": 9515
},
{
"epoch": 3.0069493800837086,
"grad_norm": 0.060978546234931104,
"learning_rate": 0.0017642692175169029,
"loss": 2.5974,
"step": 9520
},
{
"epoch": 3.008528784648188,
"grad_norm": 0.06719904776452261,
"learning_rate": 0.0017639134747361083,
"loss": 2.604,
"step": 9525
},
{
"epoch": 3.0101081892126667,
"grad_norm": 0.06949796822599907,
"learning_rate": 0.0017635574996541065,
"loss": 2.5313,
"step": 9530
},
{
"epoch": 3.011687593777146,
"grad_norm": 0.09536273280100076,
"learning_rate": 0.001763201292379148,
"loss": 2.5393,
"step": 9535
},
{
"epoch": 3.013266998341625,
"grad_norm": 0.07227009941124116,
"learning_rate": 0.0017628448530195527,
"loss": 2.6122,
"step": 9540
},
{
"epoch": 3.0148464029061044,
"grad_norm": 0.07996626556678565,
"learning_rate": 0.0017624881816837115,
"loss": 2.5837,
"step": 9545
},
{
"epoch": 3.0164258074705836,
"grad_norm": 0.07043989340248183,
"learning_rate": 0.001762131278480086,
"loss": 2.6142,
"step": 9550
},
{
"epoch": 3.018005212035063,
"grad_norm": 0.07238655665106472,
"learning_rate": 0.001761774143517208,
"loss": 2.6761,
"step": 9555
},
{
"epoch": 3.019584616599542,
"grad_norm": 0.07953742761959369,
"learning_rate": 0.0017614167769036797,
"loss": 2.6247,
"step": 9560
},
{
"epoch": 3.0211640211640214,
"grad_norm": 0.07777915965850513,
"learning_rate": 0.0017610591787481748,
"loss": 2.5844,
"step": 9565
},
{
"epoch": 3.0227434257285,
"grad_norm": 0.07908379772586491,
"learning_rate": 0.001760701349159436,
"loss": 2.6966,
"step": 9570
},
{
"epoch": 3.0243228302929794,
"grad_norm": 0.08185851953480823,
"learning_rate": 0.0017603432882462773,
"loss": 2.5849,
"step": 9575
},
{
"epoch": 3.0259022348574587,
"grad_norm": 0.0770538911908877,
"learning_rate": 0.0017599849961175825,
"loss": 2.572,
"step": 9580
},
{
"epoch": 3.027481639421938,
"grad_norm": 0.08170033315514708,
"learning_rate": 0.0017596264728823063,
"loss": 2.635,
"step": 9585
},
{
"epoch": 3.029061043986417,
"grad_norm": 0.07094356873902435,
"learning_rate": 0.0017592677186494727,
"loss": 2.6195,
"step": 9590
},
{
"epoch": 3.0306404485508964,
"grad_norm": 0.07146743501629951,
"learning_rate": 0.0017589087335281772,
"loss": 2.5883,
"step": 9595
},
{
"epoch": 3.0322198531153757,
"grad_norm": 0.06500767908445167,
"learning_rate": 0.0017585495176275848,
"loss": 2.6091,
"step": 9600
},
{
"epoch": 3.0337992576798545,
"grad_norm": 0.08646040532750637,
"learning_rate": 0.00175819007105693,
"loss": 2.6357,
"step": 9605
},
{
"epoch": 3.0353786622443337,
"grad_norm": 0.07956158839233514,
"learning_rate": 0.0017578303939255195,
"loss": 2.6634,
"step": 9610
},
{
"epoch": 3.036958066808813,
"grad_norm": 0.077271325539974,
"learning_rate": 0.0017574704863427277,
"loss": 2.5939,
"step": 9615
},
{
"epoch": 3.038537471373292,
"grad_norm": 0.06878782348149351,
"learning_rate": 0.0017571103484180007,
"loss": 2.6799,
"step": 9620
},
{
"epoch": 3.0401168759377715,
"grad_norm": 0.07725957018593155,
"learning_rate": 0.0017567499802608542,
"loss": 2.6557,
"step": 9625
},
{
"epoch": 3.0416962805022507,
"grad_norm": 0.06834187968092709,
"learning_rate": 0.0017563893819808737,
"loss": 2.6065,
"step": 9630
},
{
"epoch": 3.04327568506673,
"grad_norm": 0.06522637545595603,
"learning_rate": 0.0017560285536877148,
"loss": 2.649,
"step": 9635
},
{
"epoch": 3.044855089631209,
"grad_norm": 0.07050536765462488,
"learning_rate": 0.001755667495491103,
"loss": 2.6291,
"step": 9640
},
{
"epoch": 3.046434494195688,
"grad_norm": 0.06027223804064083,
"learning_rate": 0.0017553062075008339,
"loss": 2.5298,
"step": 9645
},
{
"epoch": 3.0480138987601673,
"grad_norm": 0.06690320540753218,
"learning_rate": 0.0017549446898267732,
"loss": 2.6128,
"step": 9650
},
{
"epoch": 3.0495933033246465,
"grad_norm": 0.06605666511918341,
"learning_rate": 0.0017545829425788554,
"loss": 2.6637,
"step": 9655
},
{
"epoch": 3.0511727078891258,
"grad_norm": 0.08318070058406675,
"learning_rate": 0.001754220965867086,
"loss": 2.5553,
"step": 9660
},
{
"epoch": 3.052752112453605,
"grad_norm": 0.062722653781602,
"learning_rate": 0.0017538587598015401,
"loss": 2.5263,
"step": 9665
},
{
"epoch": 3.0543315170180843,
"grad_norm": 0.07799851012916846,
"learning_rate": 0.0017534963244923616,
"loss": 2.6617,
"step": 9670
},
{
"epoch": 3.0559109215825635,
"grad_norm": 0.08255384317691386,
"learning_rate": 0.0017531336600497647,
"loss": 2.6013,
"step": 9675
},
{
"epoch": 3.0574903261470427,
"grad_norm": 0.08425274192174091,
"learning_rate": 0.0017527707665840344,
"loss": 2.5305,
"step": 9680
},
{
"epoch": 3.0590697307115216,
"grad_norm": 0.07106324766354837,
"learning_rate": 0.001752407644205523,
"loss": 2.5293,
"step": 9685
},
{
"epoch": 3.060649135276001,
"grad_norm": 0.07181770959936766,
"learning_rate": 0.0017520442930246546,
"loss": 2.7556,
"step": 9690
},
{
"epoch": 3.06222853984048,
"grad_norm": 0.07549862141267126,
"learning_rate": 0.0017516807131519214,
"loss": 2.5865,
"step": 9695
},
{
"epoch": 3.0638079444049593,
"grad_norm": 0.07924017400028194,
"learning_rate": 0.001751316904697886,
"loss": 2.5501,
"step": 9700
},
{
"epoch": 3.0653873489694385,
"grad_norm": 0.07144201821865072,
"learning_rate": 0.0017509528677731802,
"loss": 2.6066,
"step": 9705
},
{
"epoch": 3.066966753533918,
"grad_norm": 0.08845565636076341,
"learning_rate": 0.0017505886024885055,
"loss": 2.6793,
"step": 9710
},
{
"epoch": 3.068546158098397,
"grad_norm": 0.07287890828079492,
"learning_rate": 0.0017502241089546323,
"loss": 2.7102,
"step": 9715
},
{
"epoch": 3.0701255626628763,
"grad_norm": 0.06424521906054119,
"learning_rate": 0.0017498593872824007,
"loss": 2.5869,
"step": 9720
},
{
"epoch": 3.071704967227355,
"grad_norm": 0.07195688620873095,
"learning_rate": 0.0017494944375827206,
"loss": 2.5623,
"step": 9725
},
{
"epoch": 3.0732843717918343,
"grad_norm": 0.08059766034889969,
"learning_rate": 0.0017491292599665705,
"loss": 2.5604,
"step": 9730
},
{
"epoch": 3.0748637763563136,
"grad_norm": 0.06236888181816163,
"learning_rate": 0.0017487638545449992,
"loss": 2.6125,
"step": 9735
},
{
"epoch": 3.076443180920793,
"grad_norm": 0.0673023550989496,
"learning_rate": 0.0017483982214291233,
"loss": 2.6745,
"step": 9740
},
{
"epoch": 3.078022585485272,
"grad_norm": 0.07204762804360407,
"learning_rate": 0.00174803236073013,
"loss": 2.5546,
"step": 9745
},
{
"epoch": 3.0796019900497513,
"grad_norm": 0.06487939861273825,
"learning_rate": 0.001747666272559275,
"loss": 2.5708,
"step": 9750
},
{
"epoch": 3.0811813946142306,
"grad_norm": 0.06301181261958937,
"learning_rate": 0.0017472999570278835,
"loss": 2.502,
"step": 9755
},
{
"epoch": 3.08276079917871,
"grad_norm": 0.07313450032879039,
"learning_rate": 0.0017469334142473502,
"loss": 2.5558,
"step": 9760
},
{
"epoch": 3.0843402037431886,
"grad_norm": 0.06526967263517186,
"learning_rate": 0.0017465666443291373,
"loss": 2.6304,
"step": 9765
},
{
"epoch": 3.085919608307668,
"grad_norm": 0.09200477440986868,
"learning_rate": 0.001746199647384778,
"loss": 2.6975,
"step": 9770
},
{
"epoch": 3.087499012872147,
"grad_norm": 0.06902130842790342,
"learning_rate": 0.0017458324235258736,
"loss": 2.6248,
"step": 9775
},
{
"epoch": 3.0890784174366264,
"grad_norm": 0.09976036882145513,
"learning_rate": 0.0017454649728640944,
"loss": 2.7803,
"step": 9780
},
{
"epoch": 3.0906578220011056,
"grad_norm": 0.07615180829761749,
"learning_rate": 0.00174509729551118,
"loss": 2.5904,
"step": 9785
},
{
"epoch": 3.092237226565585,
"grad_norm": 0.07064698067596042,
"learning_rate": 0.0017447293915789385,
"loss": 2.7008,
"step": 9790
},
{
"epoch": 3.093816631130064,
"grad_norm": 0.06806186891626341,
"learning_rate": 0.0017443612611792471,
"loss": 2.6445,
"step": 9795
},
{
"epoch": 3.0953960356945434,
"grad_norm": 0.07944195857211085,
"learning_rate": 0.0017439929044240521,
"loss": 2.5441,
"step": 9800
},
{
"epoch": 3.096975440259022,
"grad_norm": 0.06674703110757672,
"learning_rate": 0.0017436243214253686,
"loss": 2.6004,
"step": 9805
},
{
"epoch": 3.0985548448235014,
"grad_norm": 0.06689002583984435,
"learning_rate": 0.0017432555122952797,
"loss": 2.5689,
"step": 9810
},
{
"epoch": 3.1001342493879807,
"grad_norm": 0.06592068441572836,
"learning_rate": 0.0017428864771459388,
"loss": 2.6173,
"step": 9815
},
{
"epoch": 3.10171365395246,
"grad_norm": 0.07615257679874533,
"learning_rate": 0.0017425172160895662,
"loss": 2.639,
"step": 9820
},
{
"epoch": 3.103293058516939,
"grad_norm": 0.06105335137406268,
"learning_rate": 0.0017421477292384525,
"loss": 2.6122,
"step": 9825
},
{
"epoch": 3.1048724630814184,
"grad_norm": 0.09219718184998302,
"learning_rate": 0.001741778016704956,
"loss": 2.6857,
"step": 9830
},
{
"epoch": 3.1064518676458976,
"grad_norm": 0.07152092051204469,
"learning_rate": 0.0017414080786015038,
"loss": 2.5573,
"step": 9835
},
{
"epoch": 3.108031272210377,
"grad_norm": 0.09511116323096488,
"learning_rate": 0.001741037915040592,
"loss": 2.5658,
"step": 9840
},
{
"epoch": 3.1096106767748557,
"grad_norm": 0.06827500668300775,
"learning_rate": 0.0017406675261347848,
"loss": 2.5764,
"step": 9845
},
{
"epoch": 3.111190081339335,
"grad_norm": 0.06580983086010492,
"learning_rate": 0.0017402969119967155,
"loss": 2.6376,
"step": 9850
},
{
"epoch": 3.112769485903814,
"grad_norm": 0.06724198233210324,
"learning_rate": 0.0017399260727390847,
"loss": 2.6765,
"step": 9855
},
{
"epoch": 3.1143488904682934,
"grad_norm": 0.08960721837846888,
"learning_rate": 0.0017395550084746629,
"loss": 2.5794,
"step": 9860
},
{
"epoch": 3.1159282950327727,
"grad_norm": 0.0702518321175971,
"learning_rate": 0.0017391837193162882,
"loss": 2.5468,
"step": 9865
},
{
"epoch": 3.117507699597252,
"grad_norm": 0.060236678991715406,
"learning_rate": 0.0017388122053768674,
"loss": 2.6592,
"step": 9870
},
{
"epoch": 3.119087104161731,
"grad_norm": 0.07673494445082678,
"learning_rate": 0.001738440466769375,
"loss": 2.6642,
"step": 9875
},
{
"epoch": 3.1206665087262104,
"grad_norm": 0.0920573903652007,
"learning_rate": 0.001738068503606855,
"loss": 2.614,
"step": 9880
},
{
"epoch": 3.1222459132906892,
"grad_norm": 0.08801492578726929,
"learning_rate": 0.0017376963160024184,
"loss": 2.595,
"step": 9885
},
{
"epoch": 3.1238253178551685,
"grad_norm": 0.07984682108598687,
"learning_rate": 0.0017373239040692455,
"loss": 2.6862,
"step": 9890
},
{
"epoch": 3.1254047224196477,
"grad_norm": 0.06869423990760569,
"learning_rate": 0.0017369512679205844,
"loss": 2.6215,
"step": 9895
},
{
"epoch": 3.126984126984127,
"grad_norm": 0.07429979311671929,
"learning_rate": 0.0017365784076697512,
"loss": 2.6688,
"step": 9900
},
{
"epoch": 3.1285635315486062,
"grad_norm": 0.06488201514452846,
"learning_rate": 0.00173620532343013,
"loss": 2.5465,
"step": 9905
},
{
"epoch": 3.1301429361130855,
"grad_norm": 0.06599819169160161,
"learning_rate": 0.001735832015315174,
"loss": 2.5547,
"step": 9910
},
{
"epoch": 3.1317223406775647,
"grad_norm": 0.07495722665220762,
"learning_rate": 0.0017354584834384035,
"loss": 2.5515,
"step": 9915
},
{
"epoch": 3.1333017452420435,
"grad_norm": 0.10720551807822579,
"learning_rate": 0.001735084727913407,
"loss": 2.6747,
"step": 9920
},
{
"epoch": 3.1348811498065228,
"grad_norm": 0.07199510657860425,
"learning_rate": 0.0017347107488538413,
"loss": 2.6041,
"step": 9925
},
{
"epoch": 3.136460554371002,
"grad_norm": 0.07082328358473082,
"learning_rate": 0.0017343365463734313,
"loss": 2.6154,
"step": 9930
},
{
"epoch": 3.1380399589354813,
"grad_norm": 0.06337705531192898,
"learning_rate": 0.0017339621205859693,
"loss": 2.6622,
"step": 9935
},
{
"epoch": 3.1396193634999605,
"grad_norm": 0.07163483374589369,
"learning_rate": 0.0017335874716053158,
"loss": 2.6712,
"step": 9940
},
{
"epoch": 3.1411987680644398,
"grad_norm": 0.0688206120903354,
"learning_rate": 0.001733212599545399,
"loss": 2.5605,
"step": 9945
},
{
"epoch": 3.142778172628919,
"grad_norm": 0.07385893720045704,
"learning_rate": 0.0017328375045202158,
"loss": 2.6157,
"step": 9950
},
{
"epoch": 3.1443575771933983,
"grad_norm": 0.07979459656789621,
"learning_rate": 0.0017324621866438294,
"loss": 2.6269,
"step": 9955
},
{
"epoch": 3.1459369817578775,
"grad_norm": 0.06655834052180862,
"learning_rate": 0.0017320866460303719,
"loss": 2.532,
"step": 9960
},
{
"epoch": 3.1475163863223563,
"grad_norm": 0.08016158513606657,
"learning_rate": 0.001731710882794043,
"loss": 2.6182,
"step": 9965
},
{
"epoch": 3.1490957908868356,
"grad_norm": 0.07361139147390075,
"learning_rate": 0.0017313348970491092,
"loss": 2.5775,
"step": 9970
},
{
"epoch": 3.150675195451315,
"grad_norm": 0.07320547421945703,
"learning_rate": 0.0017309586889099062,
"loss": 2.6028,
"step": 9975
},
{
"epoch": 3.152254600015794,
"grad_norm": 0.06761582611132382,
"learning_rate": 0.001730582258490836,
"loss": 2.6096,
"step": 9980
},
{
"epoch": 3.1538340045802733,
"grad_norm": 0.061185146073342085,
"learning_rate": 0.001730205605906369,
"loss": 2.6569,
"step": 9985
},
{
"epoch": 3.1554134091447525,
"grad_norm": 0.07883684609795155,
"learning_rate": 0.0017298287312710423,
"loss": 2.628,
"step": 9990
},
{
"epoch": 3.156992813709232,
"grad_norm": 0.06254523129730534,
"learning_rate": 0.0017294516346994615,
"loss": 2.5979,
"step": 9995
},
{
"epoch": 3.1585722182737106,
"grad_norm": 0.06917664553913802,
"learning_rate": 0.0017290743163062994,
"loss": 2.6045,
"step": 10000
},
{
"epoch": 3.16015162283819,
"grad_norm": 0.07158404060336618,
"learning_rate": 0.0017286967762062957,
"loss": 2.5623,
"step": 10005
},
{
"epoch": 3.161731027402669,
"grad_norm": 0.06991889347493886,
"learning_rate": 0.0017283190145142581,
"loss": 2.6436,
"step": 10010
},
{
"epoch": 3.1633104319671483,
"grad_norm": 0.05855646545913392,
"learning_rate": 0.001727941031345062,
"loss": 2.6896,
"step": 10015
},
{
"epoch": 3.1648898365316276,
"grad_norm": 0.05698565849999702,
"learning_rate": 0.0017275628268136486,
"loss": 2.603,
"step": 10020
},
{
"epoch": 3.166469241096107,
"grad_norm": 0.0766093223960529,
"learning_rate": 0.0017271844010350286,
"loss": 2.5193,
"step": 10025
},
{
"epoch": 3.168048645660586,
"grad_norm": 0.08252746996753374,
"learning_rate": 0.0017268057541242779,
"loss": 2.6378,
"step": 10030
},
{
"epoch": 3.1696280502250653,
"grad_norm": 0.07373629477286828,
"learning_rate": 0.0017264268861965414,
"loss": 2.566,
"step": 10035
},
{
"epoch": 3.1712074547895446,
"grad_norm": 0.060221071508953124,
"learning_rate": 0.0017260477973670301,
"loss": 2.5316,
"step": 10040
},
{
"epoch": 3.1727868593540234,
"grad_norm": 0.07124905911273843,
"learning_rate": 0.001725668487751022,
"loss": 2.651,
"step": 10045
},
{
"epoch": 3.1743662639185026,
"grad_norm": 0.05897673333679736,
"learning_rate": 0.0017252889574638638,
"loss": 2.6663,
"step": 10050
},
{
"epoch": 3.175945668482982,
"grad_norm": 0.06628163393976025,
"learning_rate": 0.0017249092066209672,
"loss": 2.4943,
"step": 10055
},
{
"epoch": 3.177525073047461,
"grad_norm": 0.07978408642895472,
"learning_rate": 0.0017245292353378129,
"loss": 2.5568,
"step": 10060
},
{
"epoch": 3.1791044776119404,
"grad_norm": 0.08663555240063688,
"learning_rate": 0.0017241490437299467,
"loss": 2.6161,
"step": 10065
},
{
"epoch": 3.1806838821764196,
"grad_norm": 0.08172943902726323,
"learning_rate": 0.0017237686319129834,
"loss": 2.5907,
"step": 10070
},
{
"epoch": 3.182263286740899,
"grad_norm": 0.08483170153181023,
"learning_rate": 0.0017233880000026031,
"loss": 2.5362,
"step": 10075
},
{
"epoch": 3.1838426913053777,
"grad_norm": 0.07807691290483945,
"learning_rate": 0.0017230071481145544,
"loss": 2.5229,
"step": 10080
},
{
"epoch": 3.185422095869857,
"grad_norm": 0.07373477776597755,
"learning_rate": 0.001722626076364651,
"loss": 2.599,
"step": 10085
},
{
"epoch": 3.187001500434336,
"grad_norm": 0.0776405119123419,
"learning_rate": 0.0017222447848687747,
"loss": 2.5785,
"step": 10090
},
{
"epoch": 3.1885809049988154,
"grad_norm": 0.06815474505145369,
"learning_rate": 0.0017218632737428742,
"loss": 2.6602,
"step": 10095
},
{
"epoch": 3.1901603095632947,
"grad_norm": 0.07314480378904138,
"learning_rate": 0.0017214815431029638,
"loss": 2.6064,
"step": 10100
},
{
"epoch": 3.191739714127774,
"grad_norm": 0.09008509637089876,
"learning_rate": 0.0017210995930651261,
"loss": 2.528,
"step": 10105
},
{
"epoch": 3.193319118692253,
"grad_norm": 0.07379312421095183,
"learning_rate": 0.0017207174237455095,
"loss": 2.5804,
"step": 10110
},
{
"epoch": 3.1948985232567324,
"grad_norm": 0.0639290039333819,
"learning_rate": 0.0017203350352603289,
"loss": 2.5763,
"step": 10115
},
{
"epoch": 3.196477927821211,
"grad_norm": 0.07728896943521313,
"learning_rate": 0.0017199524277258665,
"loss": 2.5719,
"step": 10120
},
{
"epoch": 3.1980573323856905,
"grad_norm": 0.06501244129816812,
"learning_rate": 0.0017195696012584707,
"loss": 2.6149,
"step": 10125
},
{
"epoch": 3.1996367369501697,
"grad_norm": 0.07450112316148066,
"learning_rate": 0.0017191865559745567,
"loss": 2.5726,
"step": 10130
},
{
"epoch": 3.201216141514649,
"grad_norm": 0.06735309656942795,
"learning_rate": 0.001718803291990606,
"loss": 2.5708,
"step": 10135
},
{
"epoch": 3.202795546079128,
"grad_norm": 0.06066504577737473,
"learning_rate": 0.0017184198094231666,
"loss": 2.552,
"step": 10140
},
{
"epoch": 3.2043749506436074,
"grad_norm": 0.07540293640624653,
"learning_rate": 0.0017180361083888537,
"loss": 2.5849,
"step": 10145
},
{
"epoch": 3.2059543552080867,
"grad_norm": 0.08359737734603907,
"learning_rate": 0.0017176521890043474,
"loss": 2.6413,
"step": 10150
},
{
"epoch": 3.207533759772566,
"grad_norm": 0.07467973947266096,
"learning_rate": 0.0017172680513863959,
"loss": 2.516,
"step": 10155
},
{
"epoch": 3.2091131643370447,
"grad_norm": 0.07418893020745522,
"learning_rate": 0.0017168836956518128,
"loss": 2.6784,
"step": 10160
},
{
"epoch": 3.210692568901524,
"grad_norm": 0.06447644926826153,
"learning_rate": 0.001716499121917478,
"loss": 2.5746,
"step": 10165
},
{
"epoch": 3.2122719734660032,
"grad_norm": 0.07266055401505396,
"learning_rate": 0.0017161143303003382,
"loss": 2.5416,
"step": 10170
},
{
"epoch": 3.2138513780304825,
"grad_norm": 0.059584364082995236,
"learning_rate": 0.0017157293209174055,
"loss": 2.6751,
"step": 10175
},
{
"epoch": 3.2154307825949617,
"grad_norm": 0.077773574299169,
"learning_rate": 0.0017153440938857598,
"loss": 2.7405,
"step": 10180
},
{
"epoch": 3.217010187159441,
"grad_norm": 0.07093017068348874,
"learning_rate": 0.0017149586493225453,
"loss": 2.6437,
"step": 10185
},
{
"epoch": 3.2185895917239202,
"grad_norm": 0.06809763786519872,
"learning_rate": 0.0017145729873449737,
"loss": 2.5746,
"step": 10190
},
{
"epoch": 3.2201689962883995,
"grad_norm": 0.07279951057709282,
"learning_rate": 0.0017141871080703223,
"loss": 2.621,
"step": 10195
},
{
"epoch": 3.2217484008528783,
"grad_norm": 0.06777881865146256,
"learning_rate": 0.0017138010116159342,
"loss": 2.6139,
"step": 10200
},
{
"epoch": 3.2233278054173575,
"grad_norm": 0.08188752813516113,
"learning_rate": 0.001713414698099219,
"loss": 2.5855,
"step": 10205
},
{
"epoch": 3.2249072099818368,
"grad_norm": 0.07265247848277875,
"learning_rate": 0.0017130281676376521,
"loss": 2.6521,
"step": 10210
},
{
"epoch": 3.226486614546316,
"grad_norm": 0.08157050567560481,
"learning_rate": 0.0017126414203487755,
"loss": 2.6133,
"step": 10215
},
{
"epoch": 3.2280660191107953,
"grad_norm": 0.09223512463456594,
"learning_rate": 0.001712254456350196,
"loss": 2.7141,
"step": 10220
},
{
"epoch": 3.2296454236752745,
"grad_norm": 0.09052288762989194,
"learning_rate": 0.001711867275759587,
"loss": 2.6932,
"step": 10225
},
{
"epoch": 3.2312248282397538,
"grad_norm": 0.09452848600741288,
"learning_rate": 0.0017114798786946874,
"loss": 2.5789,
"step": 10230
},
{
"epoch": 3.2328042328042326,
"grad_norm": 0.07348911010047247,
"learning_rate": 0.0017110922652733027,
"loss": 2.6713,
"step": 10235
},
{
"epoch": 3.234383637368712,
"grad_norm": 0.07595309187595187,
"learning_rate": 0.0017107044356133036,
"loss": 2.6233,
"step": 10240
},
{
"epoch": 3.235963041933191,
"grad_norm": 0.07837850464064633,
"learning_rate": 0.0017103163898326264,
"loss": 2.5859,
"step": 10245
},
{
"epoch": 3.2375424464976703,
"grad_norm": 0.05919589780375687,
"learning_rate": 0.0017099281280492733,
"loss": 2.6239,
"step": 10250
},
{
"epoch": 3.2391218510621496,
"grad_norm": 0.06421315218708729,
"learning_rate": 0.0017095396503813123,
"loss": 2.563,
"step": 10255
},
{
"epoch": 3.240701255626629,
"grad_norm": 0.06286673187959461,
"learning_rate": 0.001709150956946877,
"loss": 2.5648,
"step": 10260
},
{
"epoch": 3.242280660191108,
"grad_norm": 0.06614146390357328,
"learning_rate": 0.0017087620478641668,
"loss": 2.6656,
"step": 10265
},
{
"epoch": 3.2438600647555873,
"grad_norm": 0.06331427324863376,
"learning_rate": 0.001708372923251446,
"loss": 2.6365,
"step": 10270
},
{
"epoch": 3.2454394693200666,
"grad_norm": 0.058382455520564966,
"learning_rate": 0.0017079835832270454,
"loss": 2.6919,
"step": 10275
},
{
"epoch": 3.2470188738845454,
"grad_norm": 0.059389669999683835,
"learning_rate": 0.00170759402790936,
"loss": 2.4577,
"step": 10280
},
{
"epoch": 3.2485982784490246,
"grad_norm": 0.062319863197590405,
"learning_rate": 0.0017072042574168523,
"loss": 2.6714,
"step": 10285
},
{
"epoch": 3.250177683013504,
"grad_norm": 0.07209810097155507,
"learning_rate": 0.0017068142718680481,
"loss": 2.5454,
"step": 10290
},
{
"epoch": 3.251757087577983,
"grad_norm": 0.06465098796584488,
"learning_rate": 0.00170642407138154,
"loss": 2.5554,
"step": 10295
},
{
"epoch": 3.2533364921424623,
"grad_norm": 0.07919298086200059,
"learning_rate": 0.0017060336560759848,
"loss": 2.6169,
"step": 10300
},
{
"epoch": 3.2549158967069416,
"grad_norm": 0.07458149730812416,
"learning_rate": 0.001705643026070106,
"loss": 2.5148,
"step": 10305
},
{
"epoch": 3.256495301271421,
"grad_norm": 0.06209413237348719,
"learning_rate": 0.0017052521814826913,
"loss": 2.4766,
"step": 10310
},
{
"epoch": 3.2580747058358996,
"grad_norm": 0.06263559867605274,
"learning_rate": 0.0017048611224325945,
"loss": 2.6141,
"step": 10315
},
{
"epoch": 3.259654110400379,
"grad_norm": 0.08859222068557153,
"learning_rate": 0.001704469849038734,
"loss": 2.5519,
"step": 10320
},
{
"epoch": 3.261233514964858,
"grad_norm": 0.13700798939206166,
"learning_rate": 0.001704078361420093,
"loss": 2.5719,
"step": 10325
},
{
"epoch": 3.2628129195293374,
"grad_norm": 0.08242422054130628,
"learning_rate": 0.0017036866596957208,
"loss": 2.543,
"step": 10330
},
{
"epoch": 3.2643923240938166,
"grad_norm": 0.07118911228005752,
"learning_rate": 0.0017032947439847314,
"loss": 2.6253,
"step": 10335
},
{
"epoch": 3.265971728658296,
"grad_norm": 0.06521260116723082,
"learning_rate": 0.001702902614406304,
"loss": 2.6389,
"step": 10340
},
{
"epoch": 3.267551133222775,
"grad_norm": 0.06794795762837166,
"learning_rate": 0.0017025102710796825,
"loss": 2.7081,
"step": 10345
},
{
"epoch": 3.2691305377872544,
"grad_norm": 0.06462409310767205,
"learning_rate": 0.0017021177141241758,
"loss": 2.5127,
"step": 10350
},
{
"epoch": 3.2707099423517336,
"grad_norm": 0.0642066617491224,
"learning_rate": 0.0017017249436591584,
"loss": 2.5866,
"step": 10355
},
{
"epoch": 3.2722893469162124,
"grad_norm": 0.07839870321314671,
"learning_rate": 0.0017013319598040688,
"loss": 2.5108,
"step": 10360
},
{
"epoch": 3.2738687514806917,
"grad_norm": 0.0798891891470443,
"learning_rate": 0.0017009387626784117,
"loss": 2.6293,
"step": 10365
},
{
"epoch": 3.275448156045171,
"grad_norm": 0.08280526365973821,
"learning_rate": 0.0017005453524017548,
"loss": 2.6433,
"step": 10370
},
{
"epoch": 3.27702756060965,
"grad_norm": 0.0722954043793966,
"learning_rate": 0.0017001517290937322,
"loss": 2.5975,
"step": 10375
},
{
"epoch": 3.2786069651741294,
"grad_norm": 0.06287903361816885,
"learning_rate": 0.0016997578928740422,
"loss": 2.5392,
"step": 10380
},
{
"epoch": 3.2801863697386087,
"grad_norm": 0.06314306170320469,
"learning_rate": 0.0016993638438624484,
"loss": 2.6261,
"step": 10385
},
{
"epoch": 3.281765774303088,
"grad_norm": 0.0874231602460201,
"learning_rate": 0.0016989695821787772,
"loss": 2.5358,
"step": 10390
},
{
"epoch": 3.2833451788675667,
"grad_norm": 0.07048809674755248,
"learning_rate": 0.0016985751079429223,
"loss": 2.7399,
"step": 10395
},
{
"epoch": 3.284924583432046,
"grad_norm": 0.08750781590069255,
"learning_rate": 0.0016981804212748404,
"loss": 2.5259,
"step": 10400
},
{
"epoch": 3.286503987996525,
"grad_norm": 0.07232745531335175,
"learning_rate": 0.0016977855222945531,
"loss": 2.6078,
"step": 10405
},
{
"epoch": 3.2880833925610045,
"grad_norm": 0.07351994308267305,
"learning_rate": 0.001697390411122147,
"loss": 2.527,
"step": 10410
},
{
"epoch": 3.2896627971254837,
"grad_norm": 0.06933416120054373,
"learning_rate": 0.0016969950878777723,
"loss": 2.5771,
"step": 10415
},
{
"epoch": 3.291242201689963,
"grad_norm": 0.06861156254584327,
"learning_rate": 0.0016965995526816446,
"loss": 2.5522,
"step": 10420
},
{
"epoch": 3.292821606254442,
"grad_norm": 0.08451993948302478,
"learning_rate": 0.0016962038056540438,
"loss": 2.5522,
"step": 10425
},
{
"epoch": 3.2944010108189214,
"grad_norm": 0.07272459006293112,
"learning_rate": 0.001695807846915314,
"loss": 2.5529,
"step": 10430
},
{
"epoch": 3.2959804153834007,
"grad_norm": 0.06768282259573886,
"learning_rate": 0.0016954116765858635,
"loss": 2.6157,
"step": 10435
},
{
"epoch": 3.2975598199478795,
"grad_norm": 0.053463358594411925,
"learning_rate": 0.001695015294786165,
"loss": 2.649,
"step": 10440
},
{
"epoch": 3.2991392245123587,
"grad_norm": 0.09274796837286527,
"learning_rate": 0.001694618701636756,
"loss": 2.6481,
"step": 10445
},
{
"epoch": 3.300718629076838,
"grad_norm": 0.0772108488909983,
"learning_rate": 0.001694221897258238,
"loss": 2.5709,
"step": 10450
},
{
"epoch": 3.3022980336413172,
"grad_norm": 0.08472081158875007,
"learning_rate": 0.0016938248817712767,
"loss": 2.5986,
"step": 10455
},
{
"epoch": 3.3038774382057965,
"grad_norm": 0.07749119429930267,
"learning_rate": 0.0016934276552966017,
"loss": 2.6093,
"step": 10460
},
{
"epoch": 3.3054568427702757,
"grad_norm": 0.07045324083306663,
"learning_rate": 0.001693030217955007,
"loss": 2.547,
"step": 10465
},
{
"epoch": 3.307036247334755,
"grad_norm": 0.06514536706521994,
"learning_rate": 0.0016926325698673511,
"loss": 2.5417,
"step": 10470
},
{
"epoch": 3.308615651899234,
"grad_norm": 0.07034685763804152,
"learning_rate": 0.0016922347111545557,
"loss": 2.6036,
"step": 10475
},
{
"epoch": 3.310195056463713,
"grad_norm": 0.09796385162305873,
"learning_rate": 0.0016918366419376078,
"loss": 2.5858,
"step": 10480
},
{
"epoch": 3.3117744610281923,
"grad_norm": 0.0742290614688897,
"learning_rate": 0.0016914383623375575,
"loss": 2.745,
"step": 10485
},
{
"epoch": 3.3133538655926715,
"grad_norm": 0.06899769885657452,
"learning_rate": 0.0016910398724755186,
"loss": 2.6352,
"step": 10490
},
{
"epoch": 3.314933270157151,
"grad_norm": 0.06790020533599436,
"learning_rate": 0.0016906411724726697,
"loss": 2.5656,
"step": 10495
},
{
"epoch": 3.31651267472163,
"grad_norm": 0.05885065461528016,
"learning_rate": 0.0016902422624502532,
"loss": 2.5416,
"step": 10500
},
{
"epoch": 3.3180920792861093,
"grad_norm": 0.07103136246491495,
"learning_rate": 0.0016898431425295744,
"loss": 2.6551,
"step": 10505
},
{
"epoch": 3.3196714838505885,
"grad_norm": 0.08179324963044199,
"learning_rate": 0.0016894438128320039,
"loss": 2.5598,
"step": 10510
},
{
"epoch": 3.3212508884150673,
"grad_norm": 0.06249717252547698,
"learning_rate": 0.0016890442734789743,
"loss": 2.5719,
"step": 10515
},
{
"epoch": 3.3228302929795466,
"grad_norm": 0.07490536043214126,
"learning_rate": 0.0016886445245919838,
"loss": 2.5096,
"step": 10520
},
{
"epoch": 3.324409697544026,
"grad_norm": 0.09144476189116073,
"learning_rate": 0.0016882445662925933,
"loss": 2.5459,
"step": 10525
},
{
"epoch": 3.325989102108505,
"grad_norm": 0.08519644228840297,
"learning_rate": 0.0016878443987024276,
"loss": 2.6242,
"step": 10530
},
{
"epoch": 3.3275685066729843,
"grad_norm": 0.07492565924347942,
"learning_rate": 0.001687444021943175,
"loss": 2.6675,
"step": 10535
},
{
"epoch": 3.3291479112374636,
"grad_norm": 0.07682330886863895,
"learning_rate": 0.0016870434361365874,
"loss": 2.6064,
"step": 10540
},
{
"epoch": 3.330727315801943,
"grad_norm": 0.06508504218064855,
"learning_rate": 0.0016866426414044807,
"loss": 2.4953,
"step": 10545
},
{
"epoch": 3.3323067203664216,
"grad_norm": 0.07424031145732456,
"learning_rate": 0.0016862416378687337,
"loss": 2.601,
"step": 10550
},
{
"epoch": 3.333886124930901,
"grad_norm": 0.07137816057188225,
"learning_rate": 0.001685840425651289,
"loss": 2.6436,
"step": 10555
},
{
"epoch": 3.33546552949538,
"grad_norm": 0.07731783109421575,
"learning_rate": 0.0016854390048741531,
"loss": 2.6154,
"step": 10560
},
{
"epoch": 3.3370449340598594,
"grad_norm": 0.08805264397542543,
"learning_rate": 0.001685037375659395,
"loss": 2.5765,
"step": 10565
},
{
"epoch": 3.3386243386243386,
"grad_norm": 0.07101122837198678,
"learning_rate": 0.001684635538129148,
"loss": 2.6236,
"step": 10570
},
{
"epoch": 3.340203743188818,
"grad_norm": 0.061219029182904686,
"learning_rate": 0.0016842334924056079,
"loss": 2.6292,
"step": 10575
},
{
"epoch": 3.341783147753297,
"grad_norm": 0.0739208555181634,
"learning_rate": 0.0016838312386110346,
"loss": 2.5738,
"step": 10580
},
{
"epoch": 3.3433625523177763,
"grad_norm": 0.06174029187954557,
"learning_rate": 0.0016834287768677505,
"loss": 2.6446,
"step": 10585
},
{
"epoch": 3.3449419568822556,
"grad_norm": 0.06501510056615152,
"learning_rate": 0.0016830261072981422,
"loss": 2.621,
"step": 10590
},
{
"epoch": 3.3465213614467344,
"grad_norm": 0.07129318306001188,
"learning_rate": 0.0016826232300246585,
"loss": 2.5013,
"step": 10595
},
{
"epoch": 3.3481007660112136,
"grad_norm": 0.060346700412604246,
"learning_rate": 0.001682220145169812,
"loss": 2.5445,
"step": 10600
},
{
"epoch": 3.349680170575693,
"grad_norm": 0.08314925435533438,
"learning_rate": 0.001681816852856178,
"loss": 2.7018,
"step": 10605
},
{
"epoch": 3.351259575140172,
"grad_norm": 0.07371136983426413,
"learning_rate": 0.0016814133532063956,
"loss": 2.5518,
"step": 10610
},
{
"epoch": 3.3528389797046514,
"grad_norm": 0.07034160049593934,
"learning_rate": 0.001681009646343166,
"loss": 2.4817,
"step": 10615
},
{
"epoch": 3.3544183842691306,
"grad_norm": 0.07883534743806464,
"learning_rate": 0.001680605732389254,
"loss": 2.6884,
"step": 10620
},
{
"epoch": 3.35599778883361,
"grad_norm": 0.06775784286490502,
"learning_rate": 0.0016802016114674874,
"loss": 2.5764,
"step": 10625
},
{
"epoch": 3.3575771933980887,
"grad_norm": 0.08366321506658525,
"learning_rate": 0.0016797972837007567,
"loss": 2.6764,
"step": 10630
},
{
"epoch": 3.359156597962568,
"grad_norm": 0.06664698592119082,
"learning_rate": 0.0016793927492120152,
"loss": 2.6409,
"step": 10635
},
{
"epoch": 3.360736002527047,
"grad_norm": 0.07823045449252466,
"learning_rate": 0.0016789880081242794,
"loss": 2.6773,
"step": 10640
},
{
"epoch": 3.3623154070915264,
"grad_norm": 0.06796072751134481,
"learning_rate": 0.0016785830605606288,
"loss": 2.5749,
"step": 10645
},
{
"epoch": 3.3638948116560057,
"grad_norm": 0.07521011042377342,
"learning_rate": 0.001678177906644205,
"loss": 2.603,
"step": 10650
},
{
"epoch": 3.365474216220485,
"grad_norm": 0.07736604538769924,
"learning_rate": 0.0016777725464982125,
"loss": 2.6039,
"step": 10655
},
{
"epoch": 3.367053620784964,
"grad_norm": 0.05241740355470881,
"learning_rate": 0.0016773669802459192,
"loss": 2.5906,
"step": 10660
},
{
"epoch": 3.3686330253494434,
"grad_norm": 0.06374080163158191,
"learning_rate": 0.0016769612080106554,
"loss": 2.6656,
"step": 10665
},
{
"epoch": 3.3702124299139227,
"grad_norm": 0.0721017802522469,
"learning_rate": 0.0016765552299158127,
"loss": 2.7058,
"step": 10670
},
{
"epoch": 3.3717918344784015,
"grad_norm": 0.06657094661342346,
"learning_rate": 0.0016761490460848476,
"loss": 2.4888,
"step": 10675
},
{
"epoch": 3.3733712390428807,
"grad_norm": 0.07017677258309017,
"learning_rate": 0.0016757426566412776,
"loss": 2.6002,
"step": 10680
},
{
"epoch": 3.37495064360736,
"grad_norm": 0.07685429307755572,
"learning_rate": 0.0016753360617086832,
"loss": 2.673,
"step": 10685
},
{
"epoch": 3.376530048171839,
"grad_norm": 0.07435876185097813,
"learning_rate": 0.0016749292614107074,
"loss": 2.5722,
"step": 10690
},
{
"epoch": 3.3781094527363185,
"grad_norm": 0.07399145117339155,
"learning_rate": 0.0016745222558710554,
"loss": 2.5617,
"step": 10695
},
{
"epoch": 3.3796888573007977,
"grad_norm": 0.08075039423060379,
"learning_rate": 0.0016741150452134947,
"loss": 2.5131,
"step": 10700
},
{
"epoch": 3.381268261865277,
"grad_norm": 0.06768543805520227,
"learning_rate": 0.0016737076295618564,
"loss": 2.6348,
"step": 10705
},
{
"epoch": 3.3828476664297558,
"grad_norm": 0.06413509406418867,
"learning_rate": 0.001673300009040032,
"loss": 2.5227,
"step": 10710
},
{
"epoch": 3.384427070994235,
"grad_norm": 0.09965748601411166,
"learning_rate": 0.0016728921837719766,
"loss": 2.5946,
"step": 10715
},
{
"epoch": 3.3860064755587143,
"grad_norm": 0.08948513305462186,
"learning_rate": 0.0016724841538817072,
"loss": 2.4781,
"step": 10720
},
{
"epoch": 3.3875858801231935,
"grad_norm": 0.06768464720263621,
"learning_rate": 0.0016720759194933036,
"loss": 2.5584,
"step": 10725
},
{
"epoch": 3.3891652846876728,
"grad_norm": 0.06731032519326456,
"learning_rate": 0.0016716674807309068,
"loss": 2.501,
"step": 10730
},
{
"epoch": 3.390744689252152,
"grad_norm": 0.07611234865803017,
"learning_rate": 0.0016712588377187205,
"loss": 2.7178,
"step": 10735
},
{
"epoch": 3.3923240938166312,
"grad_norm": 0.07191634935692151,
"learning_rate": 0.0016708499905810105,
"loss": 2.6405,
"step": 10740
},
{
"epoch": 3.3939034983811105,
"grad_norm": 0.059784831832480775,
"learning_rate": 0.0016704409394421042,
"loss": 2.5167,
"step": 10745
},
{
"epoch": 3.3954829029455897,
"grad_norm": 0.06378049802286767,
"learning_rate": 0.0016700316844263923,
"loss": 2.558,
"step": 10750
},
{
"epoch": 3.3970623075100685,
"grad_norm": 0.0729254696514964,
"learning_rate": 0.0016696222256583257,
"loss": 2.6993,
"step": 10755
},
{
"epoch": 3.398641712074548,
"grad_norm": 0.06783132950569057,
"learning_rate": 0.001669212563262419,
"loss": 2.6441,
"step": 10760
},
{
"epoch": 3.400221116639027,
"grad_norm": 0.06881595096036265,
"learning_rate": 0.0016688026973632473,
"loss": 2.5529,
"step": 10765
},
{
"epoch": 3.4018005212035063,
"grad_norm": 0.0660612489397916,
"learning_rate": 0.0016683926280854485,
"loss": 2.5967,
"step": 10770
},
{
"epoch": 3.4033799257679855,
"grad_norm": 0.07434614804003171,
"learning_rate": 0.0016679823555537218,
"loss": 2.4464,
"step": 10775
},
{
"epoch": 3.404959330332465,
"grad_norm": 0.07179597580664843,
"learning_rate": 0.0016675718798928288,
"loss": 2.6128,
"step": 10780
},
{
"epoch": 3.406538734896944,
"grad_norm": 0.07487229182490646,
"learning_rate": 0.0016671612012275922,
"loss": 2.6535,
"step": 10785
},
{
"epoch": 3.408118139461423,
"grad_norm": 0.06590376440721327,
"learning_rate": 0.001666750319682897,
"loss": 2.5712,
"step": 10790
},
{
"epoch": 3.409697544025902,
"grad_norm": 0.07018738374191079,
"learning_rate": 0.0016663392353836897,
"loss": 2.6166,
"step": 10795
},
{
"epoch": 3.4112769485903813,
"grad_norm": 0.058437229908067226,
"learning_rate": 0.0016659279484549784,
"loss": 2.5166,
"step": 10800
},
{
"epoch": 3.4128563531548606,
"grad_norm": 0.06823378284757801,
"learning_rate": 0.0016655164590218324,
"loss": 2.5792,
"step": 10805
},
{
"epoch": 3.41443575771934,
"grad_norm": 0.08623700445295854,
"learning_rate": 0.0016651047672093834,
"loss": 2.5792,
"step": 10810
},
{
"epoch": 3.416015162283819,
"grad_norm": 0.06236875375960552,
"learning_rate": 0.0016646928731428238,
"loss": 2.587,
"step": 10815
},
{
"epoch": 3.4175945668482983,
"grad_norm": 0.0732907310475096,
"learning_rate": 0.001664280776947409,
"loss": 2.6817,
"step": 10820
},
{
"epoch": 3.4191739714127776,
"grad_norm": 0.07650204052590727,
"learning_rate": 0.0016638684787484536,
"loss": 2.6649,
"step": 10825
},
{
"epoch": 3.420753375977257,
"grad_norm": 0.06819073406905447,
"learning_rate": 0.001663455978671336,
"loss": 2.5444,
"step": 10830
},
{
"epoch": 3.4223327805417356,
"grad_norm": 0.06486268882268804,
"learning_rate": 0.0016630432768414936,
"loss": 2.5283,
"step": 10835
},
{
"epoch": 3.423912185106215,
"grad_norm": 0.08334601379334415,
"learning_rate": 0.0016626303733844273,
"loss": 2.638,
"step": 10840
},
{
"epoch": 3.425491589670694,
"grad_norm": 0.07005603968188574,
"learning_rate": 0.0016622172684256982,
"loss": 2.5851,
"step": 10845
},
{
"epoch": 3.4270709942351734,
"grad_norm": 0.07198131768784445,
"learning_rate": 0.0016618039620909285,
"loss": 2.5739,
"step": 10850
},
{
"epoch": 3.4286503987996526,
"grad_norm": 0.07960782279570881,
"learning_rate": 0.0016613904545058024,
"loss": 2.4816,
"step": 10855
},
{
"epoch": 3.430229803364132,
"grad_norm": 0.07389425043887975,
"learning_rate": 0.0016609767457960647,
"loss": 2.5397,
"step": 10860
},
{
"epoch": 3.431809207928611,
"grad_norm": 0.07297687012791151,
"learning_rate": 0.001660562836087522,
"loss": 2.6118,
"step": 10865
},
{
"epoch": 3.43338861249309,
"grad_norm": 0.06729323343472277,
"learning_rate": 0.0016601487255060415,
"loss": 2.5716,
"step": 10870
},
{
"epoch": 3.434968017057569,
"grad_norm": 0.07466788627005895,
"learning_rate": 0.0016597344141775507,
"loss": 2.5055,
"step": 10875
},
{
"epoch": 3.4365474216220484,
"grad_norm": 0.0669564025857904,
"learning_rate": 0.0016593199022280404,
"loss": 2.5711,
"step": 10880
},
{
"epoch": 3.4381268261865277,
"grad_norm": 0.0715612878194584,
"learning_rate": 0.0016589051897835598,
"loss": 2.5906,
"step": 10885
},
{
"epoch": 3.439706230751007,
"grad_norm": 0.06104033332557915,
"learning_rate": 0.0016584902769702212,
"loss": 2.5908,
"step": 10890
},
{
"epoch": 3.441285635315486,
"grad_norm": 0.09763031493678595,
"learning_rate": 0.0016580751639141964,
"loss": 2.6466,
"step": 10895
},
{
"epoch": 3.4428650398799654,
"grad_norm": 0.08888846981736682,
"learning_rate": 0.001657659850741719,
"loss": 2.6081,
"step": 10900
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.07025842112272405,
"learning_rate": 0.0016572443375790825,
"loss": 2.5426,
"step": 10905
},
{
"epoch": 3.4460238490089234,
"grad_norm": 0.07725833505892749,
"learning_rate": 0.0016568286245526424,
"loss": 2.6115,
"step": 10910
},
{
"epoch": 3.4476032535734027,
"grad_norm": 0.07330908357772506,
"learning_rate": 0.0016564127117888146,
"loss": 2.6395,
"step": 10915
},
{
"epoch": 3.449182658137882,
"grad_norm": 0.06651234838698582,
"learning_rate": 0.0016559965994140747,
"loss": 2.5544,
"step": 10920
},
{
"epoch": 3.450762062702361,
"grad_norm": 0.06484930161153007,
"learning_rate": 0.00165558028755496,
"loss": 2.5822,
"step": 10925
},
{
"epoch": 3.4523414672668404,
"grad_norm": 0.06158066171770069,
"learning_rate": 0.0016551637763380688,
"loss": 2.5769,
"step": 10930
},
{
"epoch": 3.4539208718313197,
"grad_norm": 0.06584894085517716,
"learning_rate": 0.0016547470658900593,
"loss": 2.5717,
"step": 10935
},
{
"epoch": 3.455500276395799,
"grad_norm": 0.07184458522706526,
"learning_rate": 0.0016543301563376497,
"loss": 2.4779,
"step": 10940
},
{
"epoch": 3.4570796809602777,
"grad_norm": 0.06801420016492464,
"learning_rate": 0.0016539130478076208,
"loss": 2.6033,
"step": 10945
},
{
"epoch": 3.458659085524757,
"grad_norm": 0.08292466353061236,
"learning_rate": 0.001653495740426812,
"loss": 2.5496,
"step": 10950
},
{
"epoch": 3.4602384900892362,
"grad_norm": 0.09120479222013012,
"learning_rate": 0.0016530782343221234,
"loss": 2.5863,
"step": 10955
},
{
"epoch": 3.4618178946537155,
"grad_norm": 0.07435860546314733,
"learning_rate": 0.0016526605296205167,
"loss": 2.6012,
"step": 10960
},
{
"epoch": 3.4633972992181947,
"grad_norm": 0.09409409753292039,
"learning_rate": 0.0016522426264490128,
"loss": 2.5539,
"step": 10965
},
{
"epoch": 3.464976703782674,
"grad_norm": 0.09574905329209961,
"learning_rate": 0.0016518245249346935,
"loss": 2.5819,
"step": 10970
},
{
"epoch": 3.466556108347153,
"grad_norm": 0.05757058226828307,
"learning_rate": 0.0016514062252047008,
"loss": 2.6066,
"step": 10975
},
{
"epoch": 3.4681355129116325,
"grad_norm": 0.06550947535422515,
"learning_rate": 0.0016509877273862368,
"loss": 2.5726,
"step": 10980
},
{
"epoch": 3.4697149174761117,
"grad_norm": 0.05960405730646586,
"learning_rate": 0.0016505690316065645,
"loss": 2.6718,
"step": 10985
},
{
"epoch": 3.4712943220405905,
"grad_norm": 0.08540369280925722,
"learning_rate": 0.0016501501379930063,
"loss": 2.657,
"step": 10990
},
{
"epoch": 3.4728737266050698,
"grad_norm": 0.07302920777857468,
"learning_rate": 0.0016497310466729448,
"loss": 2.643,
"step": 10995
},
{
"epoch": 3.474453131169549,
"grad_norm": 0.06878816207071971,
"learning_rate": 0.0016493117577738232,
"loss": 2.6026,
"step": 11000
},
{
"epoch": 3.4760325357340283,
"grad_norm": 0.05880351233861769,
"learning_rate": 0.0016488922714231451,
"loss": 2.7324,
"step": 11005
},
{
"epoch": 3.4776119402985075,
"grad_norm": 0.0753234680927505,
"learning_rate": 0.001648472587748473,
"loss": 2.4972,
"step": 11010
},
{
"epoch": 3.4791913448629868,
"grad_norm": 0.05936749176702982,
"learning_rate": 0.0016480527068774297,
"loss": 2.607,
"step": 11015
},
{
"epoch": 3.480770749427466,
"grad_norm": 0.06152356191409516,
"learning_rate": 0.001647632628937699,
"loss": 2.5759,
"step": 11020
},
{
"epoch": 3.482350153991945,
"grad_norm": 0.07671879144110781,
"learning_rate": 0.0016472123540570238,
"loss": 2.5654,
"step": 11025
},
{
"epoch": 3.483929558556424,
"grad_norm": 0.06140593598243999,
"learning_rate": 0.0016467918823632071,
"loss": 2.6347,
"step": 11030
},
{
"epoch": 3.4855089631209033,
"grad_norm": 0.08460096728720515,
"learning_rate": 0.0016463712139841112,
"loss": 2.6307,
"step": 11035
},
{
"epoch": 3.4870883676853826,
"grad_norm": 0.06913864655333987,
"learning_rate": 0.0016459503490476588,
"loss": 2.5789,
"step": 11040
},
{
"epoch": 3.488667772249862,
"grad_norm": 0.06560244455602209,
"learning_rate": 0.0016455292876818323,
"loss": 2.5676,
"step": 11045
},
{
"epoch": 3.490247176814341,
"grad_norm": 0.07174425583587295,
"learning_rate": 0.0016451080300146743,
"loss": 2.6417,
"step": 11050
},
{
"epoch": 3.4918265813788203,
"grad_norm": 0.07854517982881039,
"learning_rate": 0.0016446865761742858,
"loss": 2.5639,
"step": 11055
},
{
"epoch": 3.4934059859432995,
"grad_norm": 0.10798333983779836,
"learning_rate": 0.001644264926288828,
"loss": 2.5174,
"step": 11060
},
{
"epoch": 3.494985390507779,
"grad_norm": 0.07640438316275776,
"learning_rate": 0.0016438430804865231,
"loss": 2.5181,
"step": 11065
},
{
"epoch": 3.4965647950722576,
"grad_norm": 0.0661772313075206,
"learning_rate": 0.0016434210388956508,
"loss": 2.7018,
"step": 11070
},
{
"epoch": 3.498144199636737,
"grad_norm": 0.075491851589846,
"learning_rate": 0.0016429988016445516,
"loss": 2.6088,
"step": 11075
},
{
"epoch": 3.499723604201216,
"grad_norm": 0.06394737278526207,
"learning_rate": 0.0016425763688616248,
"loss": 2.6242,
"step": 11080
},
{
"epoch": 3.5013030087656953,
"grad_norm": 0.07171320380234876,
"learning_rate": 0.00164215374067533,
"loss": 2.5235,
"step": 11085
},
{
"epoch": 3.5028824133301746,
"grad_norm": 0.05872080150096808,
"learning_rate": 0.0016417309172141853,
"loss": 2.5743,
"step": 11090
},
{
"epoch": 3.504461817894654,
"grad_norm": 0.06144897925333228,
"learning_rate": 0.0016413078986067691,
"loss": 2.5211,
"step": 11095
},
{
"epoch": 3.5060412224591326,
"grad_norm": 0.0787162807272008,
"learning_rate": 0.0016408846849817183,
"loss": 2.6094,
"step": 11100
},
{
"epoch": 3.507620627023612,
"grad_norm": 0.07024488444314682,
"learning_rate": 0.0016404612764677293,
"loss": 2.5469,
"step": 11105
},
{
"epoch": 3.509200031588091,
"grad_norm": 0.06570148242570677,
"learning_rate": 0.0016400376731935584,
"loss": 2.5928,
"step": 11110
},
{
"epoch": 3.5107794361525704,
"grad_norm": 0.07249968344951528,
"learning_rate": 0.0016396138752880203,
"loss": 2.5755,
"step": 11115
},
{
"epoch": 3.5123588407170496,
"grad_norm": 0.07077072471171038,
"learning_rate": 0.0016391898828799895,
"loss": 2.6563,
"step": 11120
},
{
"epoch": 3.513938245281529,
"grad_norm": 0.06685448261888306,
"learning_rate": 0.001638765696098399,
"loss": 2.6498,
"step": 11125
},
{
"epoch": 3.515517649846008,
"grad_norm": 0.07346445807897378,
"learning_rate": 0.0016383413150722415,
"loss": 2.5116,
"step": 11130
},
{
"epoch": 3.5170970544104874,
"grad_norm": 0.07151114604498575,
"learning_rate": 0.0016379167399305685,
"loss": 2.6605,
"step": 11135
},
{
"epoch": 3.5186764589749666,
"grad_norm": 0.07658864985072805,
"learning_rate": 0.0016374919708024907,
"loss": 2.623,
"step": 11140
},
{
"epoch": 3.520255863539446,
"grad_norm": 0.06544775756343006,
"learning_rate": 0.001637067007817178,
"loss": 2.5293,
"step": 11145
},
{
"epoch": 3.5218352681039247,
"grad_norm": 0.09329441798346816,
"learning_rate": 0.001636641851103858,
"loss": 2.5974,
"step": 11150
},
{
"epoch": 3.523414672668404,
"grad_norm": 0.07482084483418168,
"learning_rate": 0.0016362165007918188,
"loss": 2.6311,
"step": 11155
},
{
"epoch": 3.524994077232883,
"grad_norm": 0.08007411148052147,
"learning_rate": 0.0016357909570104067,
"loss": 2.5713,
"step": 11160
},
{
"epoch": 3.5265734817973624,
"grad_norm": 0.07112811037320547,
"learning_rate": 0.001635365219889027,
"loss": 2.5326,
"step": 11165
},
{
"epoch": 3.5281528863618417,
"grad_norm": 0.06863240009007542,
"learning_rate": 0.0016349392895571434,
"loss": 2.477,
"step": 11170
},
{
"epoch": 3.529732290926321,
"grad_norm": 0.06905806940530339,
"learning_rate": 0.001634513166144278,
"loss": 2.6037,
"step": 11175
},
{
"epoch": 3.5313116954907997,
"grad_norm": 0.05760631608345864,
"learning_rate": 0.0016340868497800134,
"loss": 2.6011,
"step": 11180
},
{
"epoch": 3.532891100055279,
"grad_norm": 0.0759003904957396,
"learning_rate": 0.0016336603405939887,
"loss": 2.5307,
"step": 11185
},
{
"epoch": 3.534470504619758,
"grad_norm": 0.09208394390989348,
"learning_rate": 0.0016332336387159033,
"loss": 2.6075,
"step": 11190
},
{
"epoch": 3.5360499091842374,
"grad_norm": 0.08444672769151976,
"learning_rate": 0.001632806744275514,
"loss": 2.5832,
"step": 11195
},
{
"epoch": 3.5376293137487167,
"grad_norm": 0.07808974855886736,
"learning_rate": 0.0016323796574026369,
"loss": 2.5501,
"step": 11200
},
{
"epoch": 3.539208718313196,
"grad_norm": 0.06424278269039783,
"learning_rate": 0.001631952378227146,
"loss": 2.6595,
"step": 11205
},
{
"epoch": 3.540788122877675,
"grad_norm": 0.06820610119172915,
"learning_rate": 0.0016315249068789752,
"loss": 2.5796,
"step": 11210
},
{
"epoch": 3.5423675274421544,
"grad_norm": 0.06912948934517808,
"learning_rate": 0.001631097243488115,
"loss": 2.613,
"step": 11215
},
{
"epoch": 3.5439469320066337,
"grad_norm": 0.06574909061024409,
"learning_rate": 0.001630669388184615,
"loss": 2.5504,
"step": 11220
},
{
"epoch": 3.545526336571113,
"grad_norm": 0.07357098575986265,
"learning_rate": 0.0016302413410985838,
"loss": 2.5472,
"step": 11225
},
{
"epoch": 3.5471057411355917,
"grad_norm": 0.060739519425697566,
"learning_rate": 0.001629813102360187,
"loss": 2.6525,
"step": 11230
},
{
"epoch": 3.548685145700071,
"grad_norm": 0.07460214721162602,
"learning_rate": 0.0016293846720996505,
"loss": 2.5912,
"step": 11235
},
{
"epoch": 3.5502645502645502,
"grad_norm": 0.07731981043931724,
"learning_rate": 0.0016289560504472557,
"loss": 2.5973,
"step": 11240
},
{
"epoch": 3.5518439548290295,
"grad_norm": 0.06102617164948744,
"learning_rate": 0.001628527237533345,
"loss": 2.5308,
"step": 11245
},
{
"epoch": 3.5534233593935087,
"grad_norm": 0.075526968146759,
"learning_rate": 0.0016280982334883167,
"loss": 2.6218,
"step": 11250
},
{
"epoch": 3.555002763957988,
"grad_norm": 0.0739731983245525,
"learning_rate": 0.001627669038442629,
"loss": 2.6824,
"step": 11255
},
{
"epoch": 3.556582168522467,
"grad_norm": 0.06741919304797549,
"learning_rate": 0.0016272396525267969,
"loss": 2.5657,
"step": 11260
},
{
"epoch": 3.558161573086946,
"grad_norm": 0.07405767062357331,
"learning_rate": 0.001626810075871394,
"loss": 2.6026,
"step": 11265
},
{
"epoch": 3.5597409776514253,
"grad_norm": 0.06368924285850194,
"learning_rate": 0.0016263803086070522,
"loss": 2.5798,
"step": 11270
},
{
"epoch": 3.5613203822159045,
"grad_norm": 0.0636071014304687,
"learning_rate": 0.0016259503508644598,
"loss": 2.5691,
"step": 11275
},
{
"epoch": 3.5628997867803838,
"grad_norm": 0.06086408409466744,
"learning_rate": 0.0016255202027743655,
"loss": 2.5419,
"step": 11280
},
{
"epoch": 3.564479191344863,
"grad_norm": 0.07759136034435625,
"learning_rate": 0.0016250898644675743,
"loss": 2.5078,
"step": 11285
},
{
"epoch": 3.5660585959093423,
"grad_norm": 0.09729816531848431,
"learning_rate": 0.0016246593360749486,
"loss": 2.5457,
"step": 11290
},
{
"epoch": 3.5676380004738215,
"grad_norm": 0.074411179609676,
"learning_rate": 0.0016242286177274102,
"loss": 2.5908,
"step": 11295
},
{
"epoch": 3.5692174050383008,
"grad_norm": 0.06271173103153893,
"learning_rate": 0.0016237977095559374,
"loss": 2.6156,
"step": 11300
},
{
"epoch": 3.57079680960278,
"grad_norm": 0.07243411302383965,
"learning_rate": 0.0016233666116915665,
"loss": 2.555,
"step": 11305
},
{
"epoch": 3.572376214167259,
"grad_norm": 0.06233727605201847,
"learning_rate": 0.0016229353242653921,
"loss": 2.5773,
"step": 11310
},
{
"epoch": 3.573955618731738,
"grad_norm": 0.08847979897029239,
"learning_rate": 0.0016225038474085656,
"loss": 2.4884,
"step": 11315
},
{
"epoch": 3.5755350232962173,
"grad_norm": 0.0701888861826865,
"learning_rate": 0.001622072181252296,
"loss": 2.4875,
"step": 11320
},
{
"epoch": 3.5771144278606966,
"grad_norm": 0.06670567324457778,
"learning_rate": 0.0016216403259278513,
"loss": 2.5978,
"step": 11325
},
{
"epoch": 3.578693832425176,
"grad_norm": 0.06900210003160186,
"learning_rate": 0.0016212082815665549,
"loss": 2.5214,
"step": 11330
},
{
"epoch": 3.580273236989655,
"grad_norm": 0.056985066817711455,
"learning_rate": 0.0016207760482997889,
"loss": 2.6671,
"step": 11335
},
{
"epoch": 3.581852641554134,
"grad_norm": 0.07224028193115203,
"learning_rate": 0.0016203436262589928,
"loss": 2.52,
"step": 11340
},
{
"epoch": 3.583432046118613,
"grad_norm": 0.07025491795002735,
"learning_rate": 0.0016199110155756635,
"loss": 2.5668,
"step": 11345
},
{
"epoch": 3.5850114506830923,
"grad_norm": 0.08806917660000195,
"learning_rate": 0.0016194782163813555,
"loss": 2.5575,
"step": 11350
},
{
"epoch": 3.5865908552475716,
"grad_norm": 0.06733194550829404,
"learning_rate": 0.0016190452288076793,
"loss": 2.5797,
"step": 11355
},
{
"epoch": 3.588170259812051,
"grad_norm": 0.07468759375418589,
"learning_rate": 0.0016186120529863043,
"loss": 2.6095,
"step": 11360
},
{
"epoch": 3.58974966437653,
"grad_norm": 0.0682787771741682,
"learning_rate": 0.0016181786890489566,
"loss": 2.6025,
"step": 11365
},
{
"epoch": 3.5913290689410093,
"grad_norm": 0.09009403905876984,
"learning_rate": 0.0016177451371274195,
"loss": 2.5497,
"step": 11370
},
{
"epoch": 3.5929084735054886,
"grad_norm": 0.07012466714141395,
"learning_rate": 0.0016173113973535326,
"loss": 2.6256,
"step": 11375
},
{
"epoch": 3.594487878069968,
"grad_norm": 0.08249534596518228,
"learning_rate": 0.0016168774698591942,
"loss": 2.589,
"step": 11380
},
{
"epoch": 3.596067282634447,
"grad_norm": 0.0771951952443618,
"learning_rate": 0.0016164433547763584,
"loss": 2.6929,
"step": 11385
},
{
"epoch": 3.597646687198926,
"grad_norm": 0.07089184148912944,
"learning_rate": 0.001616009052237037,
"loss": 2.5692,
"step": 11390
},
{
"epoch": 3.599226091763405,
"grad_norm": 0.06721528818867126,
"learning_rate": 0.0016155745623732988,
"loss": 2.6388,
"step": 11395
},
{
"epoch": 3.6008054963278844,
"grad_norm": 0.08228419386883479,
"learning_rate": 0.0016151398853172687,
"loss": 2.6275,
"step": 11400
},
{
"epoch": 3.6023849008923636,
"grad_norm": 0.0655671463462765,
"learning_rate": 0.00161470502120113,
"loss": 2.6346,
"step": 11405
},
{
"epoch": 3.603964305456843,
"grad_norm": 0.06417851556288011,
"learning_rate": 0.0016142699701571217,
"loss": 2.5587,
"step": 11410
},
{
"epoch": 3.605543710021322,
"grad_norm": 0.07935220802349033,
"learning_rate": 0.0016138347323175401,
"loss": 2.5931,
"step": 11415
},
{
"epoch": 3.607123114585801,
"grad_norm": 0.06373656925396162,
"learning_rate": 0.001613399307814739,
"loss": 2.5363,
"step": 11420
},
{
"epoch": 3.60870251915028,
"grad_norm": 0.07720732300365005,
"learning_rate": 0.0016129636967811267,
"loss": 2.5865,
"step": 11425
},
{
"epoch": 3.6102819237147594,
"grad_norm": 0.061986669130409316,
"learning_rate": 0.0016125278993491708,
"loss": 2.585,
"step": 11430
},
{
"epoch": 3.6118613282792387,
"grad_norm": 0.07635466273186765,
"learning_rate": 0.0016120919156513943,
"loss": 2.6706,
"step": 11435
},
{
"epoch": 3.613440732843718,
"grad_norm": 0.07586918538839484,
"learning_rate": 0.001611655745820377,
"loss": 2.6386,
"step": 11440
},
{
"epoch": 3.615020137408197,
"grad_norm": 0.06798853097959028,
"learning_rate": 0.0016112193899887554,
"loss": 2.6089,
"step": 11445
},
{
"epoch": 3.6165995419726764,
"grad_norm": 0.08951585142582336,
"learning_rate": 0.0016107828482892223,
"loss": 2.5687,
"step": 11450
},
{
"epoch": 3.6181789465371557,
"grad_norm": 0.07872066117084035,
"learning_rate": 0.0016103461208545277,
"loss": 2.6284,
"step": 11455
},
{
"epoch": 3.619758351101635,
"grad_norm": 0.07150456315801824,
"learning_rate": 0.001609909207817477,
"loss": 2.473,
"step": 11460
},
{
"epoch": 3.6213377556661137,
"grad_norm": 0.08774636947086427,
"learning_rate": 0.0016094721093109334,
"loss": 2.5884,
"step": 11465
},
{
"epoch": 3.622917160230593,
"grad_norm": 0.07516109178896002,
"learning_rate": 0.0016090348254678153,
"loss": 2.5736,
"step": 11470
},
{
"epoch": 3.624496564795072,
"grad_norm": 0.0639443580037544,
"learning_rate": 0.001608597356421098,
"loss": 2.5588,
"step": 11475
},
{
"epoch": 3.6260759693595515,
"grad_norm": 0.0620553637240486,
"learning_rate": 0.001608159702303813,
"loss": 2.5642,
"step": 11480
},
{
"epoch": 3.6276553739240307,
"grad_norm": 0.06705275148777172,
"learning_rate": 0.0016077218632490483,
"loss": 2.6038,
"step": 11485
},
{
"epoch": 3.62923477848851,
"grad_norm": 0.0573434384133038,
"learning_rate": 0.0016072838393899477,
"loss": 2.5603,
"step": 11490
},
{
"epoch": 3.6308141830529888,
"grad_norm": 0.09390910657840079,
"learning_rate": 0.0016068456308597115,
"loss": 2.5916,
"step": 11495
},
{
"epoch": 3.632393587617468,
"grad_norm": 0.07769507200891466,
"learning_rate": 0.0016064072377915963,
"loss": 2.665,
"step": 11500
},
{
"epoch": 3.6339729921819472,
"grad_norm": 0.08015931577181488,
"learning_rate": 0.0016059686603189145,
"loss": 2.618,
"step": 11505
},
{
"epoch": 3.6355523967464265,
"grad_norm": 0.061839171520963104,
"learning_rate": 0.001605529898575035,
"loss": 2.648,
"step": 11510
},
{
"epoch": 3.6371318013109057,
"grad_norm": 0.06849901337162981,
"learning_rate": 0.0016050909526933819,
"loss": 2.6146,
"step": 11515
},
{
"epoch": 3.638711205875385,
"grad_norm": 0.06096462347936403,
"learning_rate": 0.001604651822807436,
"loss": 2.5761,
"step": 11520
},
{
"epoch": 3.6402906104398642,
"grad_norm": 0.07735773794461735,
"learning_rate": 0.0016042125090507343,
"loss": 2.6231,
"step": 11525
},
{
"epoch": 3.6418700150043435,
"grad_norm": 0.06445663947834945,
"learning_rate": 0.0016037730115568687,
"loss": 2.6138,
"step": 11530
},
{
"epoch": 3.6434494195688227,
"grad_norm": 0.06481551326652253,
"learning_rate": 0.0016033333304594883,
"loss": 2.4989,
"step": 11535
},
{
"epoch": 3.645028824133302,
"grad_norm": 0.07463445354677864,
"learning_rate": 0.0016028934658922967,
"loss": 2.5576,
"step": 11540
},
{
"epoch": 3.646608228697781,
"grad_norm": 0.05973603973950257,
"learning_rate": 0.001602453417989054,
"loss": 2.6742,
"step": 11545
},
{
"epoch": 3.64818763326226,
"grad_norm": 0.06700117854462165,
"learning_rate": 0.0016020131868835761,
"loss": 2.6284,
"step": 11550
},
{
"epoch": 3.6497670378267393,
"grad_norm": 0.07029568334101637,
"learning_rate": 0.0016015727727097348,
"loss": 2.562,
"step": 11555
},
{
"epoch": 3.6513464423912185,
"grad_norm": 0.07551743367060475,
"learning_rate": 0.0016011321756014565,
"loss": 2.612,
"step": 11560
},
{
"epoch": 3.6529258469556978,
"grad_norm": 0.07940481504751765,
"learning_rate": 0.0016006913956927243,
"loss": 2.5675,
"step": 11565
},
{
"epoch": 3.654505251520177,
"grad_norm": 0.06675841086201434,
"learning_rate": 0.0016002504331175769,
"loss": 2.5539,
"step": 11570
},
{
"epoch": 3.656084656084656,
"grad_norm": 0.08992331141368655,
"learning_rate": 0.0015998092880101075,
"loss": 2.6729,
"step": 11575
},
{
"epoch": 3.657664060649135,
"grad_norm": 0.08040146383919725,
"learning_rate": 0.0015993679605044663,
"loss": 2.6544,
"step": 11580
},
{
"epoch": 3.6592434652136143,
"grad_norm": 0.05876739234585907,
"learning_rate": 0.0015989264507348575,
"loss": 2.5403,
"step": 11585
},
{
"epoch": 3.6608228697780936,
"grad_norm": 0.07351498356217907,
"learning_rate": 0.001598484758835542,
"loss": 2.5323,
"step": 11590
},
{
"epoch": 3.662402274342573,
"grad_norm": 0.05913681391507762,
"learning_rate": 0.0015980428849408348,
"loss": 2.6195,
"step": 11595
},
{
"epoch": 3.663981678907052,
"grad_norm": 0.0581271070416612,
"learning_rate": 0.0015976008291851075,
"loss": 2.4767,
"step": 11600
},
{
"epoch": 3.6655610834715313,
"grad_norm": 0.07273962694824715,
"learning_rate": 0.0015971585917027862,
"loss": 2.5916,
"step": 11605
},
{
"epoch": 3.6671404880360106,
"grad_norm": 0.06637742106479563,
"learning_rate": 0.0015967161726283526,
"loss": 2.6021,
"step": 11610
},
{
"epoch": 3.66871989260049,
"grad_norm": 0.06949695544434134,
"learning_rate": 0.0015962735720963432,
"loss": 2.4988,
"step": 11615
},
{
"epoch": 3.670299297164969,
"grad_norm": 0.07110169966769231,
"learning_rate": 0.0015958307902413503,
"loss": 2.5254,
"step": 11620
},
{
"epoch": 3.671878701729448,
"grad_norm": 0.08016657262784661,
"learning_rate": 0.0015953878271980212,
"loss": 2.6278,
"step": 11625
},
{
"epoch": 3.673458106293927,
"grad_norm": 0.06057413148965004,
"learning_rate": 0.0015949446831010575,
"loss": 2.6439,
"step": 11630
},
{
"epoch": 3.6750375108584064,
"grad_norm": 0.07240177167192563,
"learning_rate": 0.001594501358085217,
"loss": 2.5542,
"step": 11635
},
{
"epoch": 3.6766169154228856,
"grad_norm": 0.060843434450803834,
"learning_rate": 0.001594057852285312,
"loss": 2.5908,
"step": 11640
},
{
"epoch": 3.678196319987365,
"grad_norm": 0.06525810592005284,
"learning_rate": 0.0015936141658362097,
"loss": 2.6601,
"step": 11645
},
{
"epoch": 3.679775724551844,
"grad_norm": 0.06321377368586852,
"learning_rate": 0.001593170298872832,
"loss": 2.6012,
"step": 11650
},
{
"epoch": 3.681355129116323,
"grad_norm": 0.06628932292665589,
"learning_rate": 0.0015927262515301565,
"loss": 2.5134,
"step": 11655
},
{
"epoch": 3.682934533680802,
"grad_norm": 0.06559326144054903,
"learning_rate": 0.001592282023943215,
"loss": 2.6767,
"step": 11660
},
{
"epoch": 3.6845139382452814,
"grad_norm": 0.05987102260069441,
"learning_rate": 0.001591837616247094,
"loss": 2.6319,
"step": 11665
},
{
"epoch": 3.6860933428097606,
"grad_norm": 0.06934301401894631,
"learning_rate": 0.0015913930285769355,
"loss": 2.5198,
"step": 11670
},
{
"epoch": 3.68767274737424,
"grad_norm": 0.10831757254750395,
"learning_rate": 0.0015909482610679353,
"loss": 2.5457,
"step": 11675
},
{
"epoch": 3.689252151938719,
"grad_norm": 0.08218091726770442,
"learning_rate": 0.0015905033138553448,
"loss": 2.524,
"step": 11680
},
{
"epoch": 3.6908315565031984,
"grad_norm": 0.06376451212701367,
"learning_rate": 0.0015900581870744693,
"loss": 2.4758,
"step": 11685
},
{
"epoch": 3.6924109610676776,
"grad_norm": 0.06430972855977783,
"learning_rate": 0.001589612880860669,
"loss": 2.5077,
"step": 11690
},
{
"epoch": 3.693990365632157,
"grad_norm": 0.06695416071184843,
"learning_rate": 0.0015891673953493588,
"loss": 2.568,
"step": 11695
},
{
"epoch": 3.695569770196636,
"grad_norm": 0.061729629801812624,
"learning_rate": 0.001588721730676008,
"loss": 2.5588,
"step": 11700
},
{
"epoch": 3.697149174761115,
"grad_norm": 0.05946372003089128,
"learning_rate": 0.0015882758869761404,
"loss": 2.4831,
"step": 11705
},
{
"epoch": 3.698728579325594,
"grad_norm": 0.0630251412675826,
"learning_rate": 0.001587829864385334,
"loss": 2.5141,
"step": 11710
},
{
"epoch": 3.7003079838900734,
"grad_norm": 0.05948929391037606,
"learning_rate": 0.0015873836630392218,
"loss": 2.5658,
"step": 11715
},
{
"epoch": 3.7018873884545527,
"grad_norm": 0.06547513854729227,
"learning_rate": 0.0015869372830734905,
"loss": 2.5818,
"step": 11720
},
{
"epoch": 3.703466793019032,
"grad_norm": 0.06746569379680042,
"learning_rate": 0.0015864907246238814,
"loss": 2.5168,
"step": 11725
},
{
"epoch": 3.705046197583511,
"grad_norm": 0.0774919132426451,
"learning_rate": 0.0015860439878261903,
"loss": 2.5841,
"step": 11730
},
{
"epoch": 3.70662560214799,
"grad_norm": 0.06471795811247055,
"learning_rate": 0.0015855970728162665,
"loss": 2.586,
"step": 11735
},
{
"epoch": 3.708205006712469,
"grad_norm": 0.08173911913035221,
"learning_rate": 0.0015851499797300149,
"loss": 2.5915,
"step": 11740
},
{
"epoch": 3.7097844112769485,
"grad_norm": 0.0698748086397385,
"learning_rate": 0.0015847027087033925,
"loss": 2.6078,
"step": 11745
},
{
"epoch": 3.7113638158414277,
"grad_norm": 0.058986179766953396,
"learning_rate": 0.0015842552598724123,
"loss": 2.6095,
"step": 11750
},
{
"epoch": 3.712943220405907,
"grad_norm": 0.06443992991199661,
"learning_rate": 0.0015838076333731406,
"loss": 2.6793,
"step": 11755
},
{
"epoch": 3.714522624970386,
"grad_norm": 0.06707207521817622,
"learning_rate": 0.0015833598293416979,
"loss": 2.5904,
"step": 11760
},
{
"epoch": 3.7161020295348655,
"grad_norm": 0.06908462552250887,
"learning_rate": 0.001582911847914258,
"loss": 2.6395,
"step": 11765
},
{
"epoch": 3.7176814340993447,
"grad_norm": 0.07953988614999913,
"learning_rate": 0.0015824636892270494,
"loss": 2.6607,
"step": 11770
},
{
"epoch": 3.719260838663824,
"grad_norm": 0.07122295263792892,
"learning_rate": 0.0015820153534163543,
"loss": 2.4373,
"step": 11775
},
{
"epoch": 3.720840243228303,
"grad_norm": 0.0791227253596731,
"learning_rate": 0.001581566840618509,
"loss": 2.5887,
"step": 11780
},
{
"epoch": 3.722419647792782,
"grad_norm": 0.08491488490909872,
"learning_rate": 0.0015811181509699033,
"loss": 2.5077,
"step": 11785
},
{
"epoch": 3.7239990523572613,
"grad_norm": 0.06909500785675694,
"learning_rate": 0.0015806692846069806,
"loss": 2.5626,
"step": 11790
},
{
"epoch": 3.7255784569217405,
"grad_norm": 0.08871884583658181,
"learning_rate": 0.0015802202416662383,
"loss": 2.4927,
"step": 11795
},
{
"epoch": 3.7271578614862197,
"grad_norm": 0.06983612739579836,
"learning_rate": 0.0015797710222842278,
"loss": 2.574,
"step": 11800
},
{
"epoch": 3.728737266050699,
"grad_norm": 0.07178637063925711,
"learning_rate": 0.0015793216265975539,
"loss": 2.5315,
"step": 11805
},
{
"epoch": 3.7303166706151782,
"grad_norm": 0.07198552612664164,
"learning_rate": 0.0015788720547428748,
"loss": 2.6055,
"step": 11810
},
{
"epoch": 3.731896075179657,
"grad_norm": 0.0834213905388001,
"learning_rate": 0.001578422306856902,
"loss": 2.5838,
"step": 11815
},
{
"epoch": 3.7334754797441363,
"grad_norm": 0.06497027284645868,
"learning_rate": 0.0015779723830764013,
"loss": 2.5466,
"step": 11820
},
{
"epoch": 3.7350548843086155,
"grad_norm": 0.08205936413753118,
"learning_rate": 0.0015775222835381917,
"loss": 2.4887,
"step": 11825
},
{
"epoch": 3.736634288873095,
"grad_norm": 0.07645597101149956,
"learning_rate": 0.001577072008379146,
"loss": 2.611,
"step": 11830
},
{
"epoch": 3.738213693437574,
"grad_norm": 0.06915056001015991,
"learning_rate": 0.001576621557736189,
"loss": 2.6185,
"step": 11835
},
{
"epoch": 3.7397930980020533,
"grad_norm": 0.06742031156171105,
"learning_rate": 0.001576170931746301,
"loss": 2.6044,
"step": 11840
},
{
"epoch": 3.7413725025665325,
"grad_norm": 0.08832787082109252,
"learning_rate": 0.0015757201305465133,
"loss": 2.6052,
"step": 11845
},
{
"epoch": 3.742951907131012,
"grad_norm": 0.06210399759957026,
"learning_rate": 0.0015752691542739129,
"loss": 2.551,
"step": 11850
},
{
"epoch": 3.744531311695491,
"grad_norm": 0.07501130509152855,
"learning_rate": 0.0015748180030656376,
"loss": 2.6258,
"step": 11855
},
{
"epoch": 3.74611071625997,
"grad_norm": 0.06848979870150981,
"learning_rate": 0.0015743666770588805,
"loss": 2.5335,
"step": 11860
},
{
"epoch": 3.747690120824449,
"grad_norm": 0.07729824208655418,
"learning_rate": 0.0015739151763908867,
"loss": 2.5208,
"step": 11865
},
{
"epoch": 3.7492695253889283,
"grad_norm": 0.06169126876014135,
"learning_rate": 0.0015734635011989545,
"loss": 2.5912,
"step": 11870
},
{
"epoch": 3.7508489299534076,
"grad_norm": 0.06117099440365994,
"learning_rate": 0.0015730116516204354,
"loss": 2.5647,
"step": 11875
},
{
"epoch": 3.752428334517887,
"grad_norm": 0.07141550473164313,
"learning_rate": 0.0015725596277927343,
"loss": 2.5888,
"step": 11880
},
{
"epoch": 3.754007739082366,
"grad_norm": 0.06802374034691881,
"learning_rate": 0.0015721074298533084,
"loss": 2.6387,
"step": 11885
},
{
"epoch": 3.755587143646845,
"grad_norm": 0.07395997757949198,
"learning_rate": 0.0015716550579396684,
"loss": 2.6458,
"step": 11890
},
{
"epoch": 3.757166548211324,
"grad_norm": 0.07221727705366908,
"learning_rate": 0.001571202512189378,
"loss": 2.587,
"step": 11895
},
{
"epoch": 3.7587459527758034,
"grad_norm": 0.06499442066371568,
"learning_rate": 0.0015707497927400528,
"loss": 2.5284,
"step": 11900
},
{
"epoch": 3.7603253573402826,
"grad_norm": 0.0763606951031882,
"learning_rate": 0.0015702968997293625,
"loss": 2.6571,
"step": 11905
},
{
"epoch": 3.761904761904762,
"grad_norm": 0.07570530757186843,
"learning_rate": 0.0015698438332950287,
"loss": 2.5307,
"step": 11910
},
{
"epoch": 3.763484166469241,
"grad_norm": 0.11453605716363023,
"learning_rate": 0.0015693905935748262,
"loss": 2.4929,
"step": 11915
},
{
"epoch": 3.7650635710337204,
"grad_norm": 0.07181105280729862,
"learning_rate": 0.0015689371807065815,
"loss": 2.6071,
"step": 11920
},
{
"epoch": 3.7666429755981996,
"grad_norm": 0.08144796201140979,
"learning_rate": 0.0015684835948281757,
"loss": 2.5995,
"step": 11925
},
{
"epoch": 3.768222380162679,
"grad_norm": 0.06860324974122771,
"learning_rate": 0.0015680298360775406,
"loss": 2.6382,
"step": 11930
},
{
"epoch": 3.769801784727158,
"grad_norm": 0.07704279863985637,
"learning_rate": 0.001567575904592662,
"loss": 2.6116,
"step": 11935
},
{
"epoch": 3.771381189291637,
"grad_norm": 0.06734152458036485,
"learning_rate": 0.0015671218005115766,
"loss": 2.4517,
"step": 11940
},
{
"epoch": 3.772960593856116,
"grad_norm": 0.06861923074644795,
"learning_rate": 0.0015666675239723756,
"loss": 2.5049,
"step": 11945
},
{
"epoch": 3.7745399984205954,
"grad_norm": 0.06391851461061915,
"learning_rate": 0.0015662130751132007,
"loss": 2.509,
"step": 11950
},
{
"epoch": 3.7761194029850746,
"grad_norm": 0.08905207752274513,
"learning_rate": 0.0015657584540722477,
"loss": 2.591,
"step": 11955
},
{
"epoch": 3.777698807549554,
"grad_norm": 0.07245118648464396,
"learning_rate": 0.001565303660987763,
"loss": 2.5828,
"step": 11960
},
{
"epoch": 3.779278212114033,
"grad_norm": 0.054955350922944506,
"learning_rate": 0.0015648486959980471,
"loss": 2.4764,
"step": 11965
},
{
"epoch": 3.780857616678512,
"grad_norm": 0.0701877737144574,
"learning_rate": 0.0015643935592414518,
"loss": 2.5812,
"step": 11970
},
{
"epoch": 3.782437021242991,
"grad_norm": 0.06911041937893157,
"learning_rate": 0.001563938250856381,
"loss": 2.5525,
"step": 11975
},
{
"epoch": 3.7840164258074704,
"grad_norm": 0.08435506406574682,
"learning_rate": 0.0015634827709812913,
"loss": 2.6127,
"step": 11980
},
{
"epoch": 3.7855958303719497,
"grad_norm": 0.06339153276038305,
"learning_rate": 0.001563027119754691,
"loss": 2.4767,
"step": 11985
},
{
"epoch": 3.787175234936429,
"grad_norm": 0.07001504227452558,
"learning_rate": 0.0015625712973151408,
"loss": 2.5805,
"step": 11990
},
{
"epoch": 3.788754639500908,
"grad_norm": 0.0809836856598099,
"learning_rate": 0.0015621153038012539,
"loss": 2.5752,
"step": 11995
},
{
"epoch": 3.7903340440653874,
"grad_norm": 0.08150940153250924,
"learning_rate": 0.0015616591393516944,
"loss": 2.6091,
"step": 12000
},
{
"epoch": 3.7919134486298667,
"grad_norm": 0.07727453120069605,
"learning_rate": 0.001561202804105179,
"loss": 2.5872,
"step": 12005
},
{
"epoch": 3.793492853194346,
"grad_norm": 0.08644028462606336,
"learning_rate": 0.0015607462982004763,
"loss": 2.5823,
"step": 12010
},
{
"epoch": 3.795072257758825,
"grad_norm": 0.06867645950982325,
"learning_rate": 0.0015602896217764073,
"loss": 2.5743,
"step": 12015
},
{
"epoch": 3.796651662323304,
"grad_norm": 0.06724634687778647,
"learning_rate": 0.0015598327749718442,
"loss": 2.5859,
"step": 12020
},
{
"epoch": 3.7982310668877832,
"grad_norm": 0.08950770334640352,
"learning_rate": 0.001559375757925711,
"loss": 2.587,
"step": 12025
},
{
"epoch": 3.7998104714522625,
"grad_norm": 0.0637875610038583,
"learning_rate": 0.0015589185707769837,
"loss": 2.5481,
"step": 12030
},
{
"epoch": 3.8013898760167417,
"grad_norm": 0.06560210334509306,
"learning_rate": 0.0015584612136646898,
"loss": 2.5254,
"step": 12035
},
{
"epoch": 3.802969280581221,
"grad_norm": 0.06735440160501718,
"learning_rate": 0.0015580036867279094,
"loss": 2.5454,
"step": 12040
},
{
"epoch": 3.8045486851457,
"grad_norm": 0.07069421391081696,
"learning_rate": 0.001557545990105773,
"loss": 2.5071,
"step": 12045
},
{
"epoch": 3.806128089710179,
"grad_norm": 0.062099328451962206,
"learning_rate": 0.0015570881239374632,
"loss": 2.5911,
"step": 12050
},
{
"epoch": 3.8077074942746583,
"grad_norm": 0.06056417644853614,
"learning_rate": 0.001556630088362214,
"loss": 2.628,
"step": 12055
},
{
"epoch": 3.8092868988391375,
"grad_norm": 0.06895650617867606,
"learning_rate": 0.0015561718835193118,
"loss": 2.6498,
"step": 12060
},
{
"epoch": 3.8108663034036168,
"grad_norm": 0.06309650874937076,
"learning_rate": 0.001555713509548093,
"loss": 2.5595,
"step": 12065
},
{
"epoch": 3.812445707968096,
"grad_norm": 0.06385696409028277,
"learning_rate": 0.0015552549665879462,
"loss": 2.5541,
"step": 12070
},
{
"epoch": 3.8140251125325753,
"grad_norm": 0.08318600724534689,
"learning_rate": 0.0015547962547783124,
"loss": 2.605,
"step": 12075
},
{
"epoch": 3.8156045170970545,
"grad_norm": 0.0644446170959427,
"learning_rate": 0.0015543373742586816,
"loss": 2.5562,
"step": 12080
},
{
"epoch": 3.8171839216615338,
"grad_norm": 0.06254069992465562,
"learning_rate": 0.0015538783251685972,
"loss": 2.4745,
"step": 12085
},
{
"epoch": 3.818763326226013,
"grad_norm": 0.0627387511682279,
"learning_rate": 0.001553419107647653,
"loss": 2.6228,
"step": 12090
},
{
"epoch": 3.8203427307904922,
"grad_norm": 0.05981773066384552,
"learning_rate": 0.001552959721835494,
"loss": 2.5832,
"step": 12095
},
{
"epoch": 3.821922135354971,
"grad_norm": 0.0692513869855841,
"learning_rate": 0.0015525001678718168,
"loss": 2.5631,
"step": 12100
},
{
"epoch": 3.8235015399194503,
"grad_norm": 0.074388575687594,
"learning_rate": 0.0015520404458963684,
"loss": 2.6013,
"step": 12105
},
{
"epoch": 3.8250809444839295,
"grad_norm": 0.06664639223608652,
"learning_rate": 0.0015515805560489474,
"loss": 2.4846,
"step": 12110
},
{
"epoch": 3.826660349048409,
"grad_norm": 0.07384557080674342,
"learning_rate": 0.0015511204984694036,
"loss": 2.7205,
"step": 12115
},
{
"epoch": 3.828239753612888,
"grad_norm": 0.07225928430492333,
"learning_rate": 0.0015506602732976373,
"loss": 2.5327,
"step": 12120
},
{
"epoch": 3.8298191581773673,
"grad_norm": 0.06574503198115073,
"learning_rate": 0.0015501998806736002,
"loss": 2.5849,
"step": 12125
},
{
"epoch": 3.831398562741846,
"grad_norm": 0.08582669309979607,
"learning_rate": 0.0015497393207372946,
"loss": 2.5385,
"step": 12130
},
{
"epoch": 3.8329779673063253,
"grad_norm": 0.06837292773786788,
"learning_rate": 0.0015492785936287742,
"loss": 2.5878,
"step": 12135
},
{
"epoch": 3.8345573718708046,
"grad_norm": 0.07273565814600343,
"learning_rate": 0.0015488176994881428,
"loss": 2.5865,
"step": 12140
},
{
"epoch": 3.836136776435284,
"grad_norm": 0.07224334715060578,
"learning_rate": 0.0015483566384555556,
"loss": 2.5363,
"step": 12145
},
{
"epoch": 3.837716180999763,
"grad_norm": 0.0845990886678342,
"learning_rate": 0.001547895410671218,
"loss": 2.5515,
"step": 12150
},
{
"epoch": 3.8392955855642423,
"grad_norm": 0.06549368814306274,
"learning_rate": 0.0015474340162753867,
"loss": 2.5401,
"step": 12155
},
{
"epoch": 3.8408749901287216,
"grad_norm": 0.06806248391684777,
"learning_rate": 0.0015469724554083685,
"loss": 2.5636,
"step": 12160
},
{
"epoch": 3.842454394693201,
"grad_norm": 0.06279417772461118,
"learning_rate": 0.0015465107282105217,
"loss": 2.5114,
"step": 12165
},
{
"epoch": 3.84403379925768,
"grad_norm": 0.0655147853109739,
"learning_rate": 0.0015460488348222538,
"loss": 2.5706,
"step": 12170
},
{
"epoch": 3.8456132038221593,
"grad_norm": 0.06907197386454811,
"learning_rate": 0.0015455867753840242,
"loss": 2.5016,
"step": 12175
},
{
"epoch": 3.847192608386638,
"grad_norm": 0.0736423521161705,
"learning_rate": 0.0015451245500363421,
"loss": 2.5645,
"step": 12180
},
{
"epoch": 3.8487720129511174,
"grad_norm": 0.07886210741275858,
"learning_rate": 0.0015446621589197674,
"loss": 2.5423,
"step": 12185
},
{
"epoch": 3.8503514175155966,
"grad_norm": 0.08844057753505022,
"learning_rate": 0.0015441996021749098,
"loss": 2.5896,
"step": 12190
},
{
"epoch": 3.851930822080076,
"grad_norm": 0.0719920887267598,
"learning_rate": 0.0015437368799424305,
"loss": 2.5224,
"step": 12195
},
{
"epoch": 3.853510226644555,
"grad_norm": 0.07033026347955103,
"learning_rate": 0.0015432739923630398,
"loss": 2.6151,
"step": 12200
},
{
"epoch": 3.855089631209034,
"grad_norm": 0.06221780697286292,
"learning_rate": 0.0015428109395774993,
"loss": 2.5101,
"step": 12205
},
{
"epoch": 3.856669035773513,
"grad_norm": 0.07062749021279034,
"learning_rate": 0.0015423477217266198,
"loss": 2.6095,
"step": 12210
},
{
"epoch": 3.8582484403379924,
"grad_norm": 0.06299103067926018,
"learning_rate": 0.0015418843389512636,
"loss": 2.5005,
"step": 12215
},
{
"epoch": 3.8598278449024717,
"grad_norm": 0.06589410137986898,
"learning_rate": 0.001541420791392342,
"loss": 2.5212,
"step": 12220
},
{
"epoch": 3.861407249466951,
"grad_norm": 0.06971836408575503,
"learning_rate": 0.001540957079190817,
"loss": 2.5891,
"step": 12225
},
{
"epoch": 3.86298665403143,
"grad_norm": 0.06266695473473892,
"learning_rate": 0.0015404932024877006,
"loss": 2.5676,
"step": 12230
},
{
"epoch": 3.8645660585959094,
"grad_norm": 0.059653893509713715,
"learning_rate": 0.0015400291614240543,
"loss": 2.5693,
"step": 12235
},
{
"epoch": 3.8661454631603887,
"grad_norm": 0.06475486572029057,
"learning_rate": 0.0015395649561409904,
"loss": 2.5376,
"step": 12240
},
{
"epoch": 3.867724867724868,
"grad_norm": 0.0706062062003792,
"learning_rate": 0.001539100586779671,
"loss": 2.5812,
"step": 12245
},
{
"epoch": 3.869304272289347,
"grad_norm": 0.06582491956953788,
"learning_rate": 0.0015386360534813078,
"loss": 2.5246,
"step": 12250
},
{
"epoch": 3.870883676853826,
"grad_norm": 0.06774194262370492,
"learning_rate": 0.0015381713563871616,
"loss": 2.6078,
"step": 12255
},
{
"epoch": 3.872463081418305,
"grad_norm": 0.08581647240071012,
"learning_rate": 0.0015377064956385445,
"loss": 2.5637,
"step": 12260
},
{
"epoch": 3.8740424859827844,
"grad_norm": 0.07227433739993022,
"learning_rate": 0.0015372414713768175,
"loss": 2.507,
"step": 12265
},
{
"epoch": 3.8756218905472637,
"grad_norm": 0.07068266880914315,
"learning_rate": 0.001536776283743392,
"loss": 2.5054,
"step": 12270
},
{
"epoch": 3.877201295111743,
"grad_norm": 0.06252227417494192,
"learning_rate": 0.001536310932879728,
"loss": 2.5033,
"step": 12275
},
{
"epoch": 3.878780699676222,
"grad_norm": 0.06688104956127731,
"learning_rate": 0.0015358454189273358,
"loss": 2.7985,
"step": 12280
},
{
"epoch": 3.880360104240701,
"grad_norm": 0.05763098920186217,
"learning_rate": 0.0015353797420277753,
"loss": 2.5103,
"step": 12285
},
{
"epoch": 3.8819395088051802,
"grad_norm": 0.06888282622425737,
"learning_rate": 0.0015349139023226562,
"loss": 2.5494,
"step": 12290
},
{
"epoch": 3.8835189133696595,
"grad_norm": 0.06682425490074745,
"learning_rate": 0.0015344478999536366,
"loss": 2.5212,
"step": 12295
},
{
"epoch": 3.8850983179341387,
"grad_norm": 0.05722987794976806,
"learning_rate": 0.0015339817350624257,
"loss": 2.6008,
"step": 12300
},
{
"epoch": 3.886677722498618,
"grad_norm": 0.0573207391101277,
"learning_rate": 0.0015335154077907808,
"loss": 2.5186,
"step": 12305
},
{
"epoch": 3.8882571270630972,
"grad_norm": 0.06763657629014941,
"learning_rate": 0.0015330489182805087,
"loss": 2.5378,
"step": 12310
},
{
"epoch": 3.8898365316275765,
"grad_norm": 0.07187086440873063,
"learning_rate": 0.001532582266673467,
"loss": 2.5675,
"step": 12315
},
{
"epoch": 3.8914159361920557,
"grad_norm": 0.07420773877218477,
"learning_rate": 0.0015321154531115601,
"loss": 2.6274,
"step": 12320
},
{
"epoch": 3.892995340756535,
"grad_norm": 0.07927209871341649,
"learning_rate": 0.001531648477736744,
"loss": 2.5448,
"step": 12325
},
{
"epoch": 3.894574745321014,
"grad_norm": 0.08244381956489737,
"learning_rate": 0.0015311813406910224,
"loss": 2.4763,
"step": 12330
},
{
"epoch": 3.896154149885493,
"grad_norm": 0.0899563297138929,
"learning_rate": 0.001530714042116449,
"loss": 2.6179,
"step": 12335
},
{
"epoch": 3.8977335544499723,
"grad_norm": 0.07584948909618364,
"learning_rate": 0.0015302465821551267,
"loss": 2.546,
"step": 12340
},
{
"epoch": 3.8993129590144515,
"grad_norm": 0.06545533095786493,
"learning_rate": 0.0015297789609492061,
"loss": 2.5659,
"step": 12345
},
{
"epoch": 3.9008923635789308,
"grad_norm": 0.07894341187855289,
"learning_rate": 0.0015293111786408883,
"loss": 2.5599,
"step": 12350
},
{
"epoch": 3.90247176814341,
"grad_norm": 0.062169262244883804,
"learning_rate": 0.0015288432353724232,
"loss": 2.4791,
"step": 12355
},
{
"epoch": 3.9040511727078893,
"grad_norm": 0.06556902494812732,
"learning_rate": 0.0015283751312861092,
"loss": 2.6097,
"step": 12360
},
{
"epoch": 3.905630577272368,
"grad_norm": 0.06491974805319319,
"learning_rate": 0.0015279068665242934,
"loss": 2.6049,
"step": 12365
},
{
"epoch": 3.9072099818368473,
"grad_norm": 0.07259381531482005,
"learning_rate": 0.0015274384412293722,
"loss": 2.5192,
"step": 12370
},
{
"epoch": 3.9087893864013266,
"grad_norm": 0.08063144546357277,
"learning_rate": 0.0015269698555437912,
"loss": 2.5991,
"step": 12375
},
{
"epoch": 3.910368790965806,
"grad_norm": 0.07107690741998134,
"learning_rate": 0.001526501109610044,
"loss": 2.5397,
"step": 12380
},
{
"epoch": 3.911948195530285,
"grad_norm": 0.05530165601623058,
"learning_rate": 0.0015260322035706732,
"loss": 2.5563,
"step": 12385
},
{
"epoch": 3.9135276000947643,
"grad_norm": 0.06966379636017452,
"learning_rate": 0.00152556313756827,
"loss": 2.5813,
"step": 12390
},
{
"epoch": 3.9151070046592436,
"grad_norm": 0.06841110650186709,
"learning_rate": 0.001525093911745475,
"loss": 2.5373,
"step": 12395
},
{
"epoch": 3.916686409223723,
"grad_norm": 0.062219731428798705,
"learning_rate": 0.0015246245262449762,
"loss": 2.4471,
"step": 12400
},
{
"epoch": 3.918265813788202,
"grad_norm": 0.06941377270222482,
"learning_rate": 0.0015241549812095112,
"loss": 2.5232,
"step": 12405
},
{
"epoch": 3.9198452183526813,
"grad_norm": 0.060912115315366694,
"learning_rate": 0.0015236852767818649,
"loss": 2.6102,
"step": 12410
},
{
"epoch": 3.92142462291716,
"grad_norm": 0.053355889741235285,
"learning_rate": 0.0015232154131048716,
"loss": 2.5226,
"step": 12415
},
{
"epoch": 3.9230040274816393,
"grad_norm": 0.06741970546593662,
"learning_rate": 0.0015227453903214146,
"loss": 2.5365,
"step": 12420
},
{
"epoch": 3.9245834320461186,
"grad_norm": 0.06898743897733144,
"learning_rate": 0.0015222752085744242,
"loss": 2.5389,
"step": 12425
},
{
"epoch": 3.926162836610598,
"grad_norm": 0.07828821379776077,
"learning_rate": 0.00152180486800688,
"loss": 2.4833,
"step": 12430
},
{
"epoch": 3.927742241175077,
"grad_norm": 0.07412821727493593,
"learning_rate": 0.001521334368761809,
"loss": 2.573,
"step": 12435
},
{
"epoch": 3.9293216457395563,
"grad_norm": 0.0732937633568354,
"learning_rate": 0.001520863710982287,
"loss": 2.514,
"step": 12440
},
{
"epoch": 3.930901050304035,
"grad_norm": 0.06489639713929875,
"learning_rate": 0.0015203928948114389,
"loss": 2.6071,
"step": 12445
},
{
"epoch": 3.9324804548685144,
"grad_norm": 0.06300824225072883,
"learning_rate": 0.0015199219203924366,
"loss": 2.6405,
"step": 12450
},
{
"epoch": 3.9340598594329936,
"grad_norm": 0.06632129995715534,
"learning_rate": 0.0015194507878684997,
"loss": 2.5978,
"step": 12455
},
{
"epoch": 3.935639263997473,
"grad_norm": 0.06366258558967011,
"learning_rate": 0.0015189794973828974,
"loss": 2.4811,
"step": 12460
},
{
"epoch": 3.937218668561952,
"grad_norm": 0.057852439284190545,
"learning_rate": 0.0015185080490789456,
"loss": 2.4908,
"step": 12465
},
{
"epoch": 3.9387980731264314,
"grad_norm": 0.06935274072833637,
"learning_rate": 0.0015180364431000091,
"loss": 2.6038,
"step": 12470
},
{
"epoch": 3.9403774776909106,
"grad_norm": 0.06447527949901698,
"learning_rate": 0.0015175646795895,
"loss": 2.4986,
"step": 12475
},
{
"epoch": 3.94195688225539,
"grad_norm": 0.07718286416506806,
"learning_rate": 0.0015170927586908784,
"loss": 2.5036,
"step": 12480
},
{
"epoch": 3.943536286819869,
"grad_norm": 0.09905668690842306,
"learning_rate": 0.001516620680547653,
"loss": 2.5555,
"step": 12485
},
{
"epoch": 3.9451156913843484,
"grad_norm": 0.0729132030800347,
"learning_rate": 0.001516148445303379,
"loss": 2.5573,
"step": 12490
},
{
"epoch": 3.946695095948827,
"grad_norm": 0.0828373214953166,
"learning_rate": 0.001515676053101661,
"loss": 2.5675,
"step": 12495
},
{
"epoch": 3.9482745005133064,
"grad_norm": 0.07913317264070925,
"learning_rate": 0.00151520350408615,
"loss": 2.5286,
"step": 12500
},
{
"epoch": 3.9498539050777857,
"grad_norm": 0.08777782985578693,
"learning_rate": 0.001514730798400545,
"loss": 2.5353,
"step": 12505
},
{
"epoch": 3.951433309642265,
"grad_norm": 0.06957346647678062,
"learning_rate": 0.0015142579361885926,
"loss": 2.6194,
"step": 12510
},
{
"epoch": 3.953012714206744,
"grad_norm": 0.06560414609700341,
"learning_rate": 0.0015137849175940882,
"loss": 2.5488,
"step": 12515
},
{
"epoch": 3.9545921187712234,
"grad_norm": 0.060347021480123796,
"learning_rate": 0.0015133117427608724,
"loss": 2.4683,
"step": 12520
},
{
"epoch": 3.956171523335702,
"grad_norm": 0.07389638212333845,
"learning_rate": 0.0015128384118328353,
"loss": 2.5106,
"step": 12525
},
{
"epoch": 3.9577509279001815,
"grad_norm": 0.06385706669645118,
"learning_rate": 0.001512364924953914,
"loss": 2.4939,
"step": 12530
},
{
"epoch": 3.9593303324646607,
"grad_norm": 0.0655373849546735,
"learning_rate": 0.0015118912822680924,
"loss": 2.5059,
"step": 12535
},
{
"epoch": 3.96090973702914,
"grad_norm": 0.06035286538094596,
"learning_rate": 0.0015114174839194027,
"loss": 2.5216,
"step": 12540
},
{
"epoch": 3.962489141593619,
"grad_norm": 0.07314909754776168,
"learning_rate": 0.0015109435300519238,
"loss": 2.5976,
"step": 12545
},
{
"epoch": 3.9640685461580984,
"grad_norm": 0.06358442436359428,
"learning_rate": 0.0015104694208097815,
"loss": 2.5821,
"step": 12550
},
{
"epoch": 3.9656479507225777,
"grad_norm": 0.05627148872134842,
"learning_rate": 0.00150999515633715,
"loss": 2.5797,
"step": 12555
},
{
"epoch": 3.967227355287057,
"grad_norm": 0.055213419124257625,
"learning_rate": 0.00150952073677825,
"loss": 2.514,
"step": 12560
},
{
"epoch": 3.968806759851536,
"grad_norm": 0.06548271078868903,
"learning_rate": 0.0015090461622773495,
"loss": 2.6508,
"step": 12565
},
{
"epoch": 3.9703861644160154,
"grad_norm": 0.05439279521404275,
"learning_rate": 0.001508571432978763,
"loss": 2.5444,
"step": 12570
},
{
"epoch": 3.9719655689804942,
"grad_norm": 0.05597006250704407,
"learning_rate": 0.0015080965490268533,
"loss": 2.5818,
"step": 12575
},
{
"epoch": 3.9735449735449735,
"grad_norm": 0.06480900428913591,
"learning_rate": 0.0015076215105660291,
"loss": 2.503,
"step": 12580
},
{
"epoch": 3.9751243781094527,
"grad_norm": 0.0602210100028421,
"learning_rate": 0.0015071463177407471,
"loss": 2.488,
"step": 12585
},
{
"epoch": 3.976703782673932,
"grad_norm": 0.07181151731195233,
"learning_rate": 0.0015066709706955104,
"loss": 2.5144,
"step": 12590
},
{
"epoch": 3.9782831872384112,
"grad_norm": 0.061514167955377494,
"learning_rate": 0.0015061954695748682,
"loss": 2.4351,
"step": 12595
},
{
"epoch": 3.97986259180289,
"grad_norm": 0.08114429713093464,
"learning_rate": 0.001505719814523418,
"loss": 2.598,
"step": 12600
},
{
"epoch": 3.9814419963673693,
"grad_norm": 0.06643483669026658,
"learning_rate": 0.0015052440056858036,
"loss": 2.5068,
"step": 12605
},
{
"epoch": 3.9830214009318485,
"grad_norm": 0.0621066900073868,
"learning_rate": 0.001504768043206715,
"loss": 2.6134,
"step": 12610
},
{
"epoch": 3.984600805496328,
"grad_norm": 0.10306918624677071,
"learning_rate": 0.0015042919272308896,
"loss": 2.5018,
"step": 12615
},
{
"epoch": 3.986180210060807,
"grad_norm": 0.07294016120820981,
"learning_rate": 0.0015038156579031108,
"loss": 2.602,
"step": 12620
},
{
"epoch": 3.9877596146252863,
"grad_norm": 0.06777722117317442,
"learning_rate": 0.0015033392353682095,
"loss": 2.5236,
"step": 12625
},
{
"epoch": 3.9893390191897655,
"grad_norm": 0.06556144755742266,
"learning_rate": 0.001502862659771063,
"loss": 2.5473,
"step": 12630
},
{
"epoch": 3.9909184237542448,
"grad_norm": 0.0767011021148724,
"learning_rate": 0.0015023859312565944,
"loss": 2.6054,
"step": 12635
},
{
"epoch": 3.992497828318724,
"grad_norm": 0.05778768006153058,
"learning_rate": 0.0015019090499697738,
"loss": 2.5015,
"step": 12640
},
{
"epoch": 3.9940772328832033,
"grad_norm": 0.08519303735460436,
"learning_rate": 0.001501432016055618,
"loss": 2.6381,
"step": 12645
},
{
"epoch": 3.995656637447682,
"grad_norm": 0.06211552456567722,
"learning_rate": 0.00150095482965919,
"loss": 2.5302,
"step": 12650
},
{
"epoch": 3.9972360420121613,
"grad_norm": 0.0674676598184494,
"learning_rate": 0.0015004774909255984,
"loss": 2.546,
"step": 12655
},
{
"epoch": 3.9988154465766406,
"grad_norm": 0.06318447337573145,
"learning_rate": 0.0015,
"loss": 2.6011,
"step": 12660
},
{
"epoch": 4.0,
"eval_loss": 2.5386574268341064,
"eval_runtime": 118.4604,
"eval_samples_per_second": 22.362,
"eval_steps_per_second": 5.597,
"step": 12664
},
{
"epoch": 4.000315880912896,
"grad_norm": 0.06197262595124461,
"learning_rate": 0.0014995223570275962,
"loss": 2.5741,
"step": 12665
},
{
"epoch": 4.001895285477375,
"grad_norm": 0.10585904612167955,
"learning_rate": 0.0014990445621536348,
"loss": 2.5206,
"step": 12670
},
{
"epoch": 4.0034746900418545,
"grad_norm": 0.07447969829220162,
"learning_rate": 0.0014985666155234107,
"loss": 2.5209,
"step": 12675
},
{
"epoch": 4.005054094606334,
"grad_norm": 0.0720136198965427,
"learning_rate": 0.0014980885172822646,
"loss": 2.5258,
"step": 12680
},
{
"epoch": 4.006633499170813,
"grad_norm": 0.06614382437037156,
"learning_rate": 0.0014976102675755823,
"loss": 2.5181,
"step": 12685
},
{
"epoch": 4.008212903735291,
"grad_norm": 0.08568424376359589,
"learning_rate": 0.0014971318665487972,
"loss": 2.5327,
"step": 12690
},
{
"epoch": 4.009792308299771,
"grad_norm": 0.06507721023920351,
"learning_rate": 0.0014966533143473874,
"loss": 2.5715,
"step": 12695
},
{
"epoch": 4.01137171286425,
"grad_norm": 0.06424893564735171,
"learning_rate": 0.0014961746111168783,
"loss": 2.4817,
"step": 12700
},
{
"epoch": 4.012951117428729,
"grad_norm": 0.06417979275404782,
"learning_rate": 0.00149569575700284,
"loss": 2.5266,
"step": 12705
},
{
"epoch": 4.014530521993208,
"grad_norm": 0.06326836264220499,
"learning_rate": 0.001495216752150889,
"loss": 2.5385,
"step": 12710
},
{
"epoch": 4.016109926557688,
"grad_norm": 0.0796902692938263,
"learning_rate": 0.0014947375967066879,
"loss": 2.556,
"step": 12715
},
{
"epoch": 4.017689331122167,
"grad_norm": 0.06999205525685467,
"learning_rate": 0.0014942582908159445,
"loss": 2.4686,
"step": 12720
},
{
"epoch": 4.019268735686646,
"grad_norm": 0.058974463155499354,
"learning_rate": 0.0014937788346244126,
"loss": 2.485,
"step": 12725
},
{
"epoch": 4.020848140251125,
"grad_norm": 0.07187170821335045,
"learning_rate": 0.001493299228277892,
"loss": 2.6368,
"step": 12730
},
{
"epoch": 4.022427544815605,
"grad_norm": 0.0670071291341169,
"learning_rate": 0.001492819471922228,
"loss": 2.5562,
"step": 12735
},
{
"epoch": 4.024006949380084,
"grad_norm": 0.060418395902372614,
"learning_rate": 0.001492339565703311,
"loss": 2.5904,
"step": 12740
},
{
"epoch": 4.025586353944563,
"grad_norm": 0.07524606530892788,
"learning_rate": 0.0014918595097670783,
"loss": 2.5773,
"step": 12745
},
{
"epoch": 4.027165758509042,
"grad_norm": 0.0723928195941418,
"learning_rate": 0.0014913793042595107,
"loss": 2.535,
"step": 12750
},
{
"epoch": 4.028745163073522,
"grad_norm": 0.06762103752752954,
"learning_rate": 0.0014908989493266364,
"loss": 2.4715,
"step": 12755
},
{
"epoch": 4.030324567638001,
"grad_norm": 0.0595953998093573,
"learning_rate": 0.001490418445114528,
"loss": 2.4335,
"step": 12760
},
{
"epoch": 4.03190397220248,
"grad_norm": 0.07426039937453528,
"learning_rate": 0.001489937791769304,
"loss": 2.515,
"step": 12765
},
{
"epoch": 4.0334833767669585,
"grad_norm": 0.058581189485131954,
"learning_rate": 0.0014894569894371274,
"loss": 2.5054,
"step": 12770
},
{
"epoch": 4.035062781331438,
"grad_norm": 0.06680693812756389,
"learning_rate": 0.001488976038264208,
"loss": 2.5582,
"step": 12775
},
{
"epoch": 4.036642185895917,
"grad_norm": 0.06508799516512898,
"learning_rate": 0.0014884949383967992,
"loss": 2.4895,
"step": 12780
},
{
"epoch": 4.038221590460396,
"grad_norm": 0.056520400454428035,
"learning_rate": 0.0014880136899812011,
"loss": 2.515,
"step": 12785
},
{
"epoch": 4.039800995024875,
"grad_norm": 0.06301408957681141,
"learning_rate": 0.0014875322931637573,
"loss": 2.5243,
"step": 12790
},
{
"epoch": 4.041380399589355,
"grad_norm": 0.05950046682220469,
"learning_rate": 0.0014870507480908585,
"loss": 2.6026,
"step": 12795
},
{
"epoch": 4.042959804153834,
"grad_norm": 0.07726584626318818,
"learning_rate": 0.001486569054908939,
"loss": 2.5222,
"step": 12800
},
{
"epoch": 4.044539208718313,
"grad_norm": 0.08585312308095734,
"learning_rate": 0.0014860872137644784,
"loss": 2.5153,
"step": 12805
},
{
"epoch": 4.046118613282792,
"grad_norm": 0.0778424441713349,
"learning_rate": 0.001485605224804002,
"loss": 2.5472,
"step": 12810
},
{
"epoch": 4.047698017847272,
"grad_norm": 0.07639674589051161,
"learning_rate": 0.0014851230881740797,
"loss": 2.536,
"step": 12815
},
{
"epoch": 4.049277422411751,
"grad_norm": 0.07177264548867932,
"learning_rate": 0.0014846408040213256,
"loss": 2.4346,
"step": 12820
},
{
"epoch": 4.05085682697623,
"grad_norm": 0.07111026091809729,
"learning_rate": 0.0014841583724923993,
"loss": 2.4786,
"step": 12825
},
{
"epoch": 4.052436231540709,
"grad_norm": 0.0673086220881294,
"learning_rate": 0.0014836757937340052,
"loss": 2.4916,
"step": 12830
},
{
"epoch": 4.054015636105189,
"grad_norm": 0.0730411181213416,
"learning_rate": 0.0014831930678928928,
"loss": 2.4575,
"step": 12835
},
{
"epoch": 4.055595040669668,
"grad_norm": 0.08106042561681988,
"learning_rate": 0.0014827101951158555,
"loss": 2.5814,
"step": 12840
},
{
"epoch": 4.057174445234147,
"grad_norm": 0.06488877308697392,
"learning_rate": 0.0014822271755497321,
"loss": 2.5211,
"step": 12845
},
{
"epoch": 4.0587538497986255,
"grad_norm": 0.07530413806657588,
"learning_rate": 0.0014817440093414054,
"loss": 2.5002,
"step": 12850
},
{
"epoch": 4.060333254363105,
"grad_norm": 0.0789122215794093,
"learning_rate": 0.0014812606966378037,
"loss": 2.5722,
"step": 12855
},
{
"epoch": 4.061912658927584,
"grad_norm": 0.06851900387567693,
"learning_rate": 0.0014807772375858988,
"loss": 2.4766,
"step": 12860
},
{
"epoch": 4.063492063492063,
"grad_norm": 0.06498106802399249,
"learning_rate": 0.0014802936323327078,
"loss": 2.4923,
"step": 12865
},
{
"epoch": 4.0650714680565425,
"grad_norm": 0.06676788724094512,
"learning_rate": 0.001479809881025292,
"loss": 2.6196,
"step": 12870
},
{
"epoch": 4.066650872621022,
"grad_norm": 0.06230390401692597,
"learning_rate": 0.001479325983810757,
"loss": 2.592,
"step": 12875
},
{
"epoch": 4.068230277185501,
"grad_norm": 0.06662636637151956,
"learning_rate": 0.0014788419408362525,
"loss": 2.5958,
"step": 12880
},
{
"epoch": 4.06980968174998,
"grad_norm": 0.0531409949938066,
"learning_rate": 0.0014783577522489732,
"loss": 2.5631,
"step": 12885
},
{
"epoch": 4.0713890863144595,
"grad_norm": 0.07482519940011333,
"learning_rate": 0.0014778734181961582,
"loss": 2.5397,
"step": 12890
},
{
"epoch": 4.072968490878939,
"grad_norm": 0.06921277307702906,
"learning_rate": 0.0014773889388250896,
"loss": 2.542,
"step": 12895
},
{
"epoch": 4.074547895443418,
"grad_norm": 0.06772080251241064,
"learning_rate": 0.001476904314283095,
"loss": 2.5194,
"step": 12900
},
{
"epoch": 4.076127300007897,
"grad_norm": 0.06356953667371293,
"learning_rate": 0.0014764195447175452,
"loss": 2.4988,
"step": 12905
},
{
"epoch": 4.0777067045723765,
"grad_norm": 0.06883769891653696,
"learning_rate": 0.001475934630275856,
"loss": 2.5617,
"step": 12910
},
{
"epoch": 4.079286109136856,
"grad_norm": 0.06831757524638417,
"learning_rate": 0.0014754495711054865,
"loss": 2.5954,
"step": 12915
},
{
"epoch": 4.080865513701335,
"grad_norm": 0.06404279817913411,
"learning_rate": 0.0014749643673539403,
"loss": 2.5586,
"step": 12920
},
{
"epoch": 4.082444918265814,
"grad_norm": 0.0644981951967708,
"learning_rate": 0.0014744790191687646,
"loss": 2.5238,
"step": 12925
},
{
"epoch": 4.084024322830293,
"grad_norm": 0.07938321994404159,
"learning_rate": 0.0014739935266975502,
"loss": 2.5267,
"step": 12930
},
{
"epoch": 4.085603727394772,
"grad_norm": 0.07415739623629887,
"learning_rate": 0.0014735078900879332,
"loss": 2.466,
"step": 12935
},
{
"epoch": 4.087183131959251,
"grad_norm": 0.0614255488188257,
"learning_rate": 0.0014730221094875922,
"loss": 2.476,
"step": 12940
},
{
"epoch": 4.08876253652373,
"grad_norm": 0.08282163856306972,
"learning_rate": 0.00147253618504425,
"loss": 2.4782,
"step": 12945
},
{
"epoch": 4.09034194108821,
"grad_norm": 0.06277116984145156,
"learning_rate": 0.0014720501169056726,
"loss": 2.5513,
"step": 12950
},
{
"epoch": 4.091921345652689,
"grad_norm": 0.06684679135522835,
"learning_rate": 0.001471563905219671,
"loss": 2.4652,
"step": 12955
},
{
"epoch": 4.093500750217168,
"grad_norm": 0.08479893518892212,
"learning_rate": 0.0014710775501340988,
"loss": 2.6456,
"step": 12960
},
{
"epoch": 4.095080154781647,
"grad_norm": 0.06645816817928275,
"learning_rate": 0.0014705910517968533,
"loss": 2.5211,
"step": 12965
},
{
"epoch": 4.096659559346127,
"grad_norm": 0.07848580235716644,
"learning_rate": 0.0014701044103558757,
"loss": 2.4402,
"step": 12970
},
{
"epoch": 4.098238963910606,
"grad_norm": 0.07708904246857005,
"learning_rate": 0.00146961762595915,
"loss": 2.6356,
"step": 12975
},
{
"epoch": 4.099818368475085,
"grad_norm": 0.07140746362743702,
"learning_rate": 0.0014691306987547053,
"loss": 2.4781,
"step": 12980
},
{
"epoch": 4.101397773039564,
"grad_norm": 0.07707031926981767,
"learning_rate": 0.0014686436288906123,
"loss": 2.5186,
"step": 12985
},
{
"epoch": 4.102977177604044,
"grad_norm": 0.07229371248956278,
"learning_rate": 0.001468156416514986,
"loss": 2.4977,
"step": 12990
},
{
"epoch": 4.104556582168523,
"grad_norm": 0.06754261510286533,
"learning_rate": 0.0014676690617759845,
"loss": 2.5047,
"step": 12995
},
{
"epoch": 4.106135986733002,
"grad_norm": 0.06257340617672764,
"learning_rate": 0.0014671815648218092,
"loss": 2.4436,
"step": 13000
},
{
"epoch": 4.10771539129748,
"grad_norm": 0.06069847036077875,
"learning_rate": 0.0014666939258007052,
"loss": 2.4951,
"step": 13005
},
{
"epoch": 4.10929479586196,
"grad_norm": 0.07510772935293217,
"learning_rate": 0.0014662061448609603,
"loss": 2.5079,
"step": 13010
},
{
"epoch": 4.110874200426439,
"grad_norm": 0.07199220345004556,
"learning_rate": 0.001465718222150905,
"loss": 2.5082,
"step": 13015
},
{
"epoch": 4.112453604990918,
"grad_norm": 0.062254051740544276,
"learning_rate": 0.001465230157818914,
"loss": 2.5309,
"step": 13020
},
{
"epoch": 4.114033009555397,
"grad_norm": 0.06925424720492623,
"learning_rate": 0.0014647419520134046,
"loss": 2.6036,
"step": 13025
},
{
"epoch": 4.115612414119877,
"grad_norm": 0.06289672611745836,
"learning_rate": 0.001464253604882837,
"loss": 2.5908,
"step": 13030
},
{
"epoch": 4.117191818684356,
"grad_norm": 0.07510195324152907,
"learning_rate": 0.0014637651165757143,
"loss": 2.5165,
"step": 13035
},
{
"epoch": 4.118771223248835,
"grad_norm": 0.061998057020966786,
"learning_rate": 0.0014632764872405826,
"loss": 2.5964,
"step": 13040
},
{
"epoch": 4.120350627813314,
"grad_norm": 0.07622673341832373,
"learning_rate": 0.001462787717026031,
"loss": 2.549,
"step": 13045
},
{
"epoch": 4.121930032377794,
"grad_norm": 0.061580797687959134,
"learning_rate": 0.0014622988060806917,
"loss": 2.5137,
"step": 13050
},
{
"epoch": 4.123509436942273,
"grad_norm": 0.05307281433618267,
"learning_rate": 0.0014618097545532392,
"loss": 2.5621,
"step": 13055
},
{
"epoch": 4.125088841506752,
"grad_norm": 0.06578261530618441,
"learning_rate": 0.0014613205625923908,
"loss": 2.5674,
"step": 13060
},
{
"epoch": 4.126668246071231,
"grad_norm": 0.06614447582109592,
"learning_rate": 0.0014608312303469066,
"loss": 2.6642,
"step": 13065
},
{
"epoch": 4.128247650635711,
"grad_norm": 0.07080643368952194,
"learning_rate": 0.00146034175796559,
"loss": 2.612,
"step": 13070
},
{
"epoch": 4.12982705520019,
"grad_norm": 0.06637894811759577,
"learning_rate": 0.0014598521455972855,
"loss": 2.5063,
"step": 13075
},
{
"epoch": 4.131406459764669,
"grad_norm": 0.0657513334100105,
"learning_rate": 0.0014593623933908822,
"loss": 2.5591,
"step": 13080
},
{
"epoch": 4.1329858643291475,
"grad_norm": 0.07458588668215285,
"learning_rate": 0.0014588725014953094,
"loss": 2.611,
"step": 13085
},
{
"epoch": 4.134565268893627,
"grad_norm": 0.09211346308075136,
"learning_rate": 0.001458382470059541,
"loss": 2.5932,
"step": 13090
},
{
"epoch": 4.136144673458106,
"grad_norm": 0.06169403975962911,
"learning_rate": 0.0014578922992325922,
"loss": 2.4608,
"step": 13095
},
{
"epoch": 4.137724078022585,
"grad_norm": 0.0686759567747662,
"learning_rate": 0.001457401989163521,
"loss": 2.5128,
"step": 13100
},
{
"epoch": 4.1393034825870645,
"grad_norm": 0.1079703337793503,
"learning_rate": 0.0014569115400014268,
"loss": 2.5853,
"step": 13105
},
{
"epoch": 4.140882887151544,
"grad_norm": 0.0718356218996724,
"learning_rate": 0.0014564209518954528,
"loss": 2.4582,
"step": 13110
},
{
"epoch": 4.142462291716023,
"grad_norm": 0.07375396962599524,
"learning_rate": 0.0014559302249947832,
"loss": 2.5638,
"step": 13115
},
{
"epoch": 4.144041696280502,
"grad_norm": 0.0710460434497697,
"learning_rate": 0.0014554393594486458,
"loss": 2.5623,
"step": 13120
},
{
"epoch": 4.1456211008449815,
"grad_norm": 0.05547904233441889,
"learning_rate": 0.0014549483554063087,
"loss": 2.4681,
"step": 13125
},
{
"epoch": 4.147200505409461,
"grad_norm": 0.07601044775167944,
"learning_rate": 0.0014544572130170837,
"loss": 2.5175,
"step": 13130
},
{
"epoch": 4.14877990997394,
"grad_norm": 0.07610178491642125,
"learning_rate": 0.0014539659324303235,
"loss": 2.5246,
"step": 13135
},
{
"epoch": 4.150359314538419,
"grad_norm": 0.07703159150351786,
"learning_rate": 0.001453474513795424,
"loss": 2.5618,
"step": 13140
},
{
"epoch": 4.1519387191028985,
"grad_norm": 0.06607553225197815,
"learning_rate": 0.0014529829572618221,
"loss": 2.5774,
"step": 13145
},
{
"epoch": 4.153518123667378,
"grad_norm": 0.06340095814054024,
"learning_rate": 0.001452491262978997,
"loss": 2.6844,
"step": 13150
},
{
"epoch": 4.155097528231857,
"grad_norm": 0.06394311855675062,
"learning_rate": 0.0014519994310964698,
"loss": 2.5379,
"step": 13155
},
{
"epoch": 4.156676932796336,
"grad_norm": 0.06475478450956448,
"learning_rate": 0.0014515074617638035,
"loss": 2.5873,
"step": 13160
},
{
"epoch": 4.158256337360815,
"grad_norm": 0.06510203917360406,
"learning_rate": 0.001451015355130603,
"loss": 2.5246,
"step": 13165
},
{
"epoch": 4.159835741925294,
"grad_norm": 0.07346007569585081,
"learning_rate": 0.0014505231113465147,
"loss": 2.625,
"step": 13170
},
{
"epoch": 4.161415146489773,
"grad_norm": 0.07519606396868049,
"learning_rate": 0.0014500307305612267,
"loss": 2.5642,
"step": 13175
},
{
"epoch": 4.162994551054252,
"grad_norm": 0.06468656469001448,
"learning_rate": 0.0014495382129244684,
"loss": 2.5287,
"step": 13180
},
{
"epoch": 4.164573955618732,
"grad_norm": 0.06414418265386593,
"learning_rate": 0.0014490455585860122,
"loss": 2.6108,
"step": 13185
},
{
"epoch": 4.166153360183211,
"grad_norm": 0.05953847764779467,
"learning_rate": 0.001448552767695671,
"loss": 2.4915,
"step": 13190
},
{
"epoch": 4.16773276474769,
"grad_norm": 0.0639508017646886,
"learning_rate": 0.0014480598404032984,
"loss": 2.5515,
"step": 13195
},
{
"epoch": 4.169312169312169,
"grad_norm": 0.0694933632681381,
"learning_rate": 0.001447566776858791,
"loss": 2.4187,
"step": 13200
},
{
"epoch": 4.1708915738766486,
"grad_norm": 0.06501064281565713,
"learning_rate": 0.0014470735772120866,
"loss": 2.5036,
"step": 13205
},
{
"epoch": 4.172470978441128,
"grad_norm": 0.07636337677703987,
"learning_rate": 0.001446580241613164,
"loss": 2.5329,
"step": 13210
},
{
"epoch": 4.174050383005607,
"grad_norm": 0.06685853958905062,
"learning_rate": 0.001446086770212043,
"loss": 2.4654,
"step": 13215
},
{
"epoch": 4.175629787570086,
"grad_norm": 0.07065294814870929,
"learning_rate": 0.0014455931631587853,
"loss": 2.6656,
"step": 13220
},
{
"epoch": 4.1772091921345655,
"grad_norm": 0.059600183399475946,
"learning_rate": 0.0014450994206034935,
"loss": 2.4968,
"step": 13225
},
{
"epoch": 4.178788596699045,
"grad_norm": 0.07724022069731407,
"learning_rate": 0.001444605542696312,
"loss": 2.4847,
"step": 13230
},
{
"epoch": 4.180368001263524,
"grad_norm": 0.0606023615353993,
"learning_rate": 0.0014441115295874254,
"loss": 2.4797,
"step": 13235
},
{
"epoch": 4.181947405828003,
"grad_norm": 0.0756569105829631,
"learning_rate": 0.0014436173814270604,
"loss": 2.643,
"step": 13240
},
{
"epoch": 4.183526810392482,
"grad_norm": 0.06713550346731194,
"learning_rate": 0.0014431230983654837,
"loss": 2.4975,
"step": 13245
},
{
"epoch": 4.185106214956961,
"grad_norm": 0.0663594090047207,
"learning_rate": 0.0014426286805530042,
"loss": 2.5824,
"step": 13250
},
{
"epoch": 4.18668561952144,
"grad_norm": 0.07060822775944986,
"learning_rate": 0.0014421341281399712,
"loss": 2.5178,
"step": 13255
},
{
"epoch": 4.188265024085919,
"grad_norm": 0.05985464631605249,
"learning_rate": 0.0014416394412767747,
"loss": 2.5905,
"step": 13260
},
{
"epoch": 4.189844428650399,
"grad_norm": 0.07221561707389895,
"learning_rate": 0.0014411446201138451,
"loss": 2.567,
"step": 13265
},
{
"epoch": 4.191423833214878,
"grad_norm": 0.06578663378996884,
"learning_rate": 0.0014406496648016556,
"loss": 2.5081,
"step": 13270
},
{
"epoch": 4.193003237779357,
"grad_norm": 0.06648435586133757,
"learning_rate": 0.0014401545754907186,
"loss": 2.467,
"step": 13275
},
{
"epoch": 4.194582642343836,
"grad_norm": 0.08036769567181871,
"learning_rate": 0.0014396593523315873,
"loss": 2.5222,
"step": 13280
},
{
"epoch": 4.196162046908316,
"grad_norm": 0.07546337618188616,
"learning_rate": 0.0014391639954748558,
"loss": 2.5664,
"step": 13285
},
{
"epoch": 4.197741451472795,
"grad_norm": 0.0689660209289397,
"learning_rate": 0.0014386685050711593,
"loss": 2.5394,
"step": 13290
},
{
"epoch": 4.199320856037274,
"grad_norm": 0.061357672883435126,
"learning_rate": 0.0014381728812711732,
"loss": 2.6156,
"step": 13295
},
{
"epoch": 4.200900260601753,
"grad_norm": 0.07927976534483275,
"learning_rate": 0.0014376771242256134,
"loss": 2.5732,
"step": 13300
},
{
"epoch": 4.202479665166233,
"grad_norm": 0.06605757125068425,
"learning_rate": 0.0014371812340852367,
"loss": 2.5047,
"step": 13305
},
{
"epoch": 4.204059069730712,
"grad_norm": 0.065185810579207,
"learning_rate": 0.0014366852110008397,
"loss": 2.5204,
"step": 13310
},
{
"epoch": 4.205638474295191,
"grad_norm": 0.0752369655656418,
"learning_rate": 0.00143618905512326,
"loss": 2.5801,
"step": 13315
},
{
"epoch": 4.2072178788596695,
"grad_norm": 0.08200102491569435,
"learning_rate": 0.001435692766603376,
"loss": 2.5413,
"step": 13320
},
{
"epoch": 4.208797283424149,
"grad_norm": 0.06954876890972278,
"learning_rate": 0.0014351963455921052,
"loss": 2.5658,
"step": 13325
},
{
"epoch": 4.210376687988628,
"grad_norm": 0.07493534613014184,
"learning_rate": 0.0014346997922404059,
"loss": 2.4482,
"step": 13330
},
{
"epoch": 4.211956092553107,
"grad_norm": 0.0643238145688426,
"learning_rate": 0.0014342031066992772,
"loss": 2.4539,
"step": 13335
},
{
"epoch": 4.2135354971175865,
"grad_norm": 0.07058738633050943,
"learning_rate": 0.0014337062891197582,
"loss": 2.5635,
"step": 13340
},
{
"epoch": 4.215114901682066,
"grad_norm": 0.07637776100324138,
"learning_rate": 0.0014332093396529277,
"loss": 2.5817,
"step": 13345
},
{
"epoch": 4.216694306246545,
"grad_norm": 0.06836462407951979,
"learning_rate": 0.001432712258449905,
"loss": 2.5264,
"step": 13350
},
{
"epoch": 4.218273710811024,
"grad_norm": 0.0645167368572073,
"learning_rate": 0.0014322150456618488,
"loss": 2.6583,
"step": 13355
},
{
"epoch": 4.2198531153755034,
"grad_norm": 0.08201628889960703,
"learning_rate": 0.001431717701439959,
"loss": 2.4969,
"step": 13360
},
{
"epoch": 4.221432519939983,
"grad_norm": 0.08074439771233276,
"learning_rate": 0.0014312202259354745,
"loss": 2.3899,
"step": 13365
},
{
"epoch": 4.223011924504462,
"grad_norm": 0.07422919481985707,
"learning_rate": 0.0014307226192996744,
"loss": 2.5588,
"step": 13370
},
{
"epoch": 4.224591329068941,
"grad_norm": 0.08057286421521241,
"learning_rate": 0.0014302248816838777,
"loss": 2.5392,
"step": 13375
},
{
"epoch": 4.22617073363342,
"grad_norm": 0.06877191225129876,
"learning_rate": 0.0014297270132394432,
"loss": 2.6999,
"step": 13380
},
{
"epoch": 4.2277501381979,
"grad_norm": 0.07327500934807407,
"learning_rate": 0.0014292290141177694,
"loss": 2.4763,
"step": 13385
},
{
"epoch": 4.229329542762379,
"grad_norm": 0.07953535741393597,
"learning_rate": 0.0014287308844702954,
"loss": 2.5633,
"step": 13390
},
{
"epoch": 4.230908947326858,
"grad_norm": 0.09265386453796612,
"learning_rate": 0.0014282326244484983,
"loss": 2.5173,
"step": 13395
},
{
"epoch": 4.2324883518913365,
"grad_norm": 0.06626723087506727,
"learning_rate": 0.0014277342342038962,
"loss": 2.4999,
"step": 13400
},
{
"epoch": 4.234067756455816,
"grad_norm": 0.08258546988481656,
"learning_rate": 0.0014272357138880461,
"loss": 2.523,
"step": 13405
},
{
"epoch": 4.235647161020295,
"grad_norm": 0.06070972825243939,
"learning_rate": 0.0014267370636525457,
"loss": 2.4885,
"step": 13410
},
{
"epoch": 4.237226565584774,
"grad_norm": 0.06550670579870467,
"learning_rate": 0.0014262382836490303,
"loss": 2.5208,
"step": 13415
},
{
"epoch": 4.2388059701492535,
"grad_norm": 0.06939688767864273,
"learning_rate": 0.0014257393740291762,
"loss": 2.7158,
"step": 13420
},
{
"epoch": 4.240385374713733,
"grad_norm": 0.059725013485192754,
"learning_rate": 0.0014252403349446984,
"loss": 2.5409,
"step": 13425
},
{
"epoch": 4.241964779278212,
"grad_norm": 0.0674800569829641,
"learning_rate": 0.001424741166547352,
"loss": 2.4904,
"step": 13430
},
{
"epoch": 4.243544183842691,
"grad_norm": 0.07642877842093784,
"learning_rate": 0.0014242418689889304,
"loss": 2.4923,
"step": 13435
},
{
"epoch": 4.2451235884071705,
"grad_norm": 0.06707539009249994,
"learning_rate": 0.0014237424424212673,
"loss": 2.5957,
"step": 13440
},
{
"epoch": 4.24670299297165,
"grad_norm": 0.05478056652010952,
"learning_rate": 0.0014232428869962344,
"loss": 2.4415,
"step": 13445
},
{
"epoch": 4.248282397536129,
"grad_norm": 0.06647832348332867,
"learning_rate": 0.001422743202865744,
"loss": 2.5179,
"step": 13450
},
{
"epoch": 4.249861802100608,
"grad_norm": 0.07599209964906246,
"learning_rate": 0.0014222433901817466,
"loss": 2.5496,
"step": 13455
},
{
"epoch": 4.2514412066650875,
"grad_norm": 0.07508252444522776,
"learning_rate": 0.001421743449096232,
"loss": 2.566,
"step": 13460
},
{
"epoch": 4.253020611229567,
"grad_norm": 0.07034251601863514,
"learning_rate": 0.0014212433797612292,
"loss": 2.4595,
"step": 13465
},
{
"epoch": 4.254600015794046,
"grad_norm": 0.07715542862621001,
"learning_rate": 0.0014207431823288058,
"loss": 2.4418,
"step": 13470
},
{
"epoch": 4.256179420358524,
"grad_norm": 0.09268138219206884,
"learning_rate": 0.0014202428569510689,
"loss": 2.5466,
"step": 13475
},
{
"epoch": 4.257758824923004,
"grad_norm": 0.05898184787647431,
"learning_rate": 0.0014197424037801643,
"loss": 2.5578,
"step": 13480
},
{
"epoch": 4.259338229487483,
"grad_norm": 0.07989363701752655,
"learning_rate": 0.0014192418229682765,
"loss": 2.5104,
"step": 13485
},
{
"epoch": 4.260917634051962,
"grad_norm": 0.05928033650949228,
"learning_rate": 0.001418741114667629,
"loss": 2.5307,
"step": 13490
},
{
"epoch": 4.262497038616441,
"grad_norm": 0.07230013353506216,
"learning_rate": 0.0014182402790304837,
"loss": 2.6218,
"step": 13495
},
{
"epoch": 4.264076443180921,
"grad_norm": 0.0699251003611797,
"learning_rate": 0.001417739316209142,
"loss": 2.5339,
"step": 13500
},
{
"epoch": 4.2656558477454,
"grad_norm": 0.0661603496513996,
"learning_rate": 0.001417238226355943,
"loss": 2.5763,
"step": 13505
},
{
"epoch": 4.267235252309879,
"grad_norm": 0.0657835755499118,
"learning_rate": 0.0014167370096232657,
"loss": 2.4802,
"step": 13510
},
{
"epoch": 4.268814656874358,
"grad_norm": 0.06717937981458183,
"learning_rate": 0.001416235666163526,
"loss": 2.5466,
"step": 13515
},
{
"epoch": 4.270394061438838,
"grad_norm": 0.061219877946853664,
"learning_rate": 0.0014157341961291796,
"loss": 2.4778,
"step": 13520
},
{
"epoch": 4.271973466003317,
"grad_norm": 0.05999685422441346,
"learning_rate": 0.0014152325996727205,
"loss": 2.4325,
"step": 13525
},
{
"epoch": 4.273552870567796,
"grad_norm": 0.06287209203260331,
"learning_rate": 0.001414730876946681,
"loss": 2.4705,
"step": 13530
},
{
"epoch": 4.275132275132275,
"grad_norm": 0.08434710498720295,
"learning_rate": 0.001414229028103631,
"loss": 2.4909,
"step": 13535
},
{
"epoch": 4.276711679696755,
"grad_norm": 0.09622578602626065,
"learning_rate": 0.0014137270532961807,
"loss": 2.5129,
"step": 13540
},
{
"epoch": 4.278291084261234,
"grad_norm": 0.09314345440254598,
"learning_rate": 0.0014132249526769764,
"loss": 2.508,
"step": 13545
},
{
"epoch": 4.279870488825713,
"grad_norm": 0.06649245162780568,
"learning_rate": 0.0014127227263987046,
"loss": 2.6429,
"step": 13550
},
{
"epoch": 4.281449893390192,
"grad_norm": 0.06377941458760608,
"learning_rate": 0.0014122203746140885,
"loss": 2.5423,
"step": 13555
},
{
"epoch": 4.283029297954671,
"grad_norm": 0.07387226422492497,
"learning_rate": 0.00141171789747589,
"loss": 2.6465,
"step": 13560
},
{
"epoch": 4.28460870251915,
"grad_norm": 0.0657952911027925,
"learning_rate": 0.0014112152951369097,
"loss": 2.5298,
"step": 13565
},
{
"epoch": 4.286188107083629,
"grad_norm": 0.07001044405093522,
"learning_rate": 0.0014107125677499854,
"loss": 2.532,
"step": 13570
},
{
"epoch": 4.287767511648108,
"grad_norm": 0.0670565039718826,
"learning_rate": 0.0014102097154679936,
"loss": 2.4576,
"step": 13575
},
{
"epoch": 4.289346916212588,
"grad_norm": 0.05471371081590369,
"learning_rate": 0.001409706738443848,
"loss": 2.5445,
"step": 13580
},
{
"epoch": 4.290926320777067,
"grad_norm": 0.058653601582677364,
"learning_rate": 0.0014092036368305008,
"loss": 2.5299,
"step": 13585
},
{
"epoch": 4.292505725341546,
"grad_norm": 0.06909500250533289,
"learning_rate": 0.0014087004107809422,
"loss": 2.5063,
"step": 13590
},
{
"epoch": 4.294085129906025,
"grad_norm": 0.0653758506917844,
"learning_rate": 0.0014081970604482002,
"loss": 2.4977,
"step": 13595
},
{
"epoch": 4.295664534470505,
"grad_norm": 0.062247214562781795,
"learning_rate": 0.00140769358598534,
"loss": 2.4858,
"step": 13600
},
{
"epoch": 4.297243939034984,
"grad_norm": 0.06125582463343084,
"learning_rate": 0.001407189987545465,
"loss": 2.4963,
"step": 13605
},
{
"epoch": 4.298823343599463,
"grad_norm": 0.05558309017172807,
"learning_rate": 0.0014066862652817164,
"loss": 2.5758,
"step": 13610
},
{
"epoch": 4.300402748163942,
"grad_norm": 0.063779260157762,
"learning_rate": 0.001406182419347273,
"loss": 2.4866,
"step": 13615
},
{
"epoch": 4.301982152728422,
"grad_norm": 0.07980311631193766,
"learning_rate": 0.001405678449895351,
"loss": 2.6886,
"step": 13620
},
{
"epoch": 4.303561557292901,
"grad_norm": 0.06793474841846672,
"learning_rate": 0.0014051743570792047,
"loss": 2.5536,
"step": 13625
},
{
"epoch": 4.30514096185738,
"grad_norm": 0.07823166306740174,
"learning_rate": 0.0014046701410521246,
"loss": 2.5734,
"step": 13630
},
{
"epoch": 4.3067203664218585,
"grad_norm": 0.0696554159561388,
"learning_rate": 0.0014041658019674403,
"loss": 2.6522,
"step": 13635
},
{
"epoch": 4.308299770986338,
"grad_norm": 0.0767424290972511,
"learning_rate": 0.0014036613399785178,
"loss": 2.5352,
"step": 13640
},
{
"epoch": 4.309879175550817,
"grad_norm": 0.06860946097200132,
"learning_rate": 0.001403156755238761,
"loss": 2.5275,
"step": 13645
},
{
"epoch": 4.311458580115296,
"grad_norm": 0.058943783543938116,
"learning_rate": 0.001402652047901611,
"loss": 2.4936,
"step": 13650
},
{
"epoch": 4.3130379846797755,
"grad_norm": 0.07162936228619683,
"learning_rate": 0.0014021472181205456,
"loss": 2.5556,
"step": 13655
},
{
"epoch": 4.314617389244255,
"grad_norm": 0.05559098577268416,
"learning_rate": 0.0014016422660490806,
"loss": 2.5328,
"step": 13660
},
{
"epoch": 4.316196793808734,
"grad_norm": 0.06925217836926897,
"learning_rate": 0.0014011371918407685,
"loss": 2.501,
"step": 13665
},
{
"epoch": 4.317776198373213,
"grad_norm": 0.06574442097202614,
"learning_rate": 0.0014006319956491996,
"loss": 2.4949,
"step": 13670
},
{
"epoch": 4.3193556029376925,
"grad_norm": 0.06163159097364715,
"learning_rate": 0.0014001266776280004,
"loss": 2.4374,
"step": 13675
},
{
"epoch": 4.320935007502172,
"grad_norm": 0.07527784022922095,
"learning_rate": 0.0013996212379308352,
"loss": 2.5651,
"step": 13680
},
{
"epoch": 4.322514412066651,
"grad_norm": 0.06016455600422493,
"learning_rate": 0.0013991156767114044,
"loss": 2.489,
"step": 13685
},
{
"epoch": 4.32409381663113,
"grad_norm": 0.06711337906775994,
"learning_rate": 0.0013986099941234466,
"loss": 2.4754,
"step": 13690
},
{
"epoch": 4.3256732211956095,
"grad_norm": 0.07728217337140471,
"learning_rate": 0.0013981041903207362,
"loss": 2.6236,
"step": 13695
},
{
"epoch": 4.327252625760089,
"grad_norm": 0.09202475709476805,
"learning_rate": 0.001397598265457085,
"loss": 2.559,
"step": 13700
},
{
"epoch": 4.328832030324568,
"grad_norm": 0.06573246850646428,
"learning_rate": 0.001397092219686342,
"loss": 2.4991,
"step": 13705
},
{
"epoch": 4.330411434889047,
"grad_norm": 0.06543036372936917,
"learning_rate": 0.001396586053162392,
"loss": 2.5055,
"step": 13710
},
{
"epoch": 4.3319908394535265,
"grad_norm": 0.0807115846713941,
"learning_rate": 0.001396079766039157,
"loss": 2.5171,
"step": 13715
},
{
"epoch": 4.333570244018005,
"grad_norm": 0.0809838594844694,
"learning_rate": 0.0013955733584705957,
"loss": 2.5522,
"step": 13720
},
{
"epoch": 4.335149648582484,
"grad_norm": 0.07523546962854005,
"learning_rate": 0.0013950668306107034,
"loss": 2.5382,
"step": 13725
},
{
"epoch": 4.336729053146963,
"grad_norm": 0.08278129700893977,
"learning_rate": 0.0013945601826135122,
"loss": 2.519,
"step": 13730
},
{
"epoch": 4.338308457711443,
"grad_norm": 0.05914581025615629,
"learning_rate": 0.0013940534146330906,
"loss": 2.5308,
"step": 13735
},
{
"epoch": 4.339887862275922,
"grad_norm": 0.08798118929940507,
"learning_rate": 0.0013935465268235428,
"loss": 2.5847,
"step": 13740
},
{
"epoch": 4.341467266840401,
"grad_norm": 0.07153656693636579,
"learning_rate": 0.0013930395193390108,
"loss": 2.5399,
"step": 13745
},
{
"epoch": 4.34304667140488,
"grad_norm": 0.08199461753960192,
"learning_rate": 0.0013925323923336724,
"loss": 2.5723,
"step": 13750
},
{
"epoch": 4.34462607596936,
"grad_norm": 0.05920696830235281,
"learning_rate": 0.0013920251459617413,
"loss": 2.5572,
"step": 13755
},
{
"epoch": 4.346205480533839,
"grad_norm": 0.06898148636456203,
"learning_rate": 0.001391517780377468,
"loss": 2.5598,
"step": 13760
},
{
"epoch": 4.347784885098318,
"grad_norm": 0.056436110889154144,
"learning_rate": 0.001391010295735139,
"loss": 2.6277,
"step": 13765
},
{
"epoch": 4.349364289662797,
"grad_norm": 0.06356611277294899,
"learning_rate": 0.0013905026921890778,
"loss": 2.5147,
"step": 13770
},
{
"epoch": 4.350943694227277,
"grad_norm": 0.056308671341577105,
"learning_rate": 0.0013899949698936425,
"loss": 2.5154,
"step": 13775
},
{
"epoch": 4.352523098791756,
"grad_norm": 0.06676695503706351,
"learning_rate": 0.0013894871290032285,
"loss": 2.5106,
"step": 13780
},
{
"epoch": 4.354102503356235,
"grad_norm": 0.0859017385397016,
"learning_rate": 0.0013889791696722676,
"loss": 2.5461,
"step": 13785
},
{
"epoch": 4.355681907920714,
"grad_norm": 0.0641269088382948,
"learning_rate": 0.001388471092055226,
"loss": 2.5252,
"step": 13790
},
{
"epoch": 4.357261312485193,
"grad_norm": 0.07454163088702107,
"learning_rate": 0.0013879628963066075,
"loss": 2.448,
"step": 13795
},
{
"epoch": 4.358840717049672,
"grad_norm": 0.09356299937022537,
"learning_rate": 0.001387454582580951,
"loss": 2.5333,
"step": 13800
},
{
"epoch": 4.360420121614151,
"grad_norm": 0.06864964432028661,
"learning_rate": 0.0013869461510328314,
"loss": 2.5283,
"step": 13805
},
{
"epoch": 4.36199952617863,
"grad_norm": 0.07310291256869031,
"learning_rate": 0.0013864376018168595,
"loss": 2.5727,
"step": 13810
},
{
"epoch": 4.36357893074311,
"grad_norm": 0.07242835709121473,
"learning_rate": 0.001385928935087682,
"loss": 2.5139,
"step": 13815
},
{
"epoch": 4.365158335307589,
"grad_norm": 0.05428627981157836,
"learning_rate": 0.0013854201509999808,
"loss": 2.5324,
"step": 13820
},
{
"epoch": 4.366737739872068,
"grad_norm": 0.06937838622486542,
"learning_rate": 0.0013849112497084746,
"loss": 2.5905,
"step": 13825
},
{
"epoch": 4.368317144436547,
"grad_norm": 0.06755586765132361,
"learning_rate": 0.0013844022313679167,
"loss": 2.4154,
"step": 13830
},
{
"epoch": 4.369896549001027,
"grad_norm": 0.05777380943619152,
"learning_rate": 0.0013838930961330958,
"loss": 2.4106,
"step": 13835
},
{
"epoch": 4.371475953565506,
"grad_norm": 0.06083955833853959,
"learning_rate": 0.0013833838441588374,
"loss": 2.6462,
"step": 13840
},
{
"epoch": 4.373055358129985,
"grad_norm": 0.07551156041609827,
"learning_rate": 0.0013828744756000013,
"loss": 2.4989,
"step": 13845
},
{
"epoch": 4.374634762694464,
"grad_norm": 0.07542939198719763,
"learning_rate": 0.0013823649906114838,
"loss": 2.4391,
"step": 13850
},
{
"epoch": 4.376214167258944,
"grad_norm": 0.06506167630590956,
"learning_rate": 0.0013818553893482153,
"loss": 2.5238,
"step": 13855
},
{
"epoch": 4.377793571823423,
"grad_norm": 0.0597584835306194,
"learning_rate": 0.001381345671965163,
"loss": 2.5894,
"step": 13860
},
{
"epoch": 4.379372976387902,
"grad_norm": 0.06378945860954864,
"learning_rate": 0.0013808358386173279,
"loss": 2.484,
"step": 13865
},
{
"epoch": 4.380952380952381,
"grad_norm": 0.058409645223530246,
"learning_rate": 0.0013803258894597478,
"loss": 2.5072,
"step": 13870
},
{
"epoch": 4.38253178551686,
"grad_norm": 0.07616177601855426,
"learning_rate": 0.0013798158246474946,
"loss": 2.5044,
"step": 13875
},
{
"epoch": 4.384111190081339,
"grad_norm": 0.06595676858821979,
"learning_rate": 0.0013793056443356757,
"loss": 2.5473,
"step": 13880
},
{
"epoch": 4.385690594645818,
"grad_norm": 0.06969982767798963,
"learning_rate": 0.001378795348679434,
"loss": 2.5624,
"step": 13885
},
{
"epoch": 4.3872699992102975,
"grad_norm": 0.06868063538537598,
"learning_rate": 0.0013782849378339468,
"loss": 2.4366,
"step": 13890
},
{
"epoch": 4.388849403774777,
"grad_norm": 0.060562599314615474,
"learning_rate": 0.0013777744119544272,
"loss": 2.4932,
"step": 13895
},
{
"epoch": 4.390428808339256,
"grad_norm": 0.05862464836495264,
"learning_rate": 0.0013772637711961223,
"loss": 2.6184,
"step": 13900
},
{
"epoch": 4.392008212903735,
"grad_norm": 0.06286076947517723,
"learning_rate": 0.0013767530157143154,
"loss": 2.6207,
"step": 13905
},
{
"epoch": 4.3935876174682145,
"grad_norm": 0.07489822376604878,
"learning_rate": 0.001376242145664323,
"loss": 2.4922,
"step": 13910
},
{
"epoch": 4.395167022032694,
"grad_norm": 0.07289292764396378,
"learning_rate": 0.0013757311612014982,
"loss": 2.5087,
"step": 13915
},
{
"epoch": 4.396746426597173,
"grad_norm": 0.06649281228468891,
"learning_rate": 0.001375220062481228,
"loss": 2.4794,
"step": 13920
},
{
"epoch": 4.398325831161652,
"grad_norm": 0.084598796226276,
"learning_rate": 0.0013747088496589342,
"loss": 2.5119,
"step": 13925
},
{
"epoch": 4.3999052357261315,
"grad_norm": 0.07204598530837651,
"learning_rate": 0.0013741975228900732,
"loss": 2.452,
"step": 13930
},
{
"epoch": 4.401484640290611,
"grad_norm": 0.07094799318360079,
"learning_rate": 0.0013736860823301362,
"loss": 2.5321,
"step": 13935
},
{
"epoch": 4.40306404485509,
"grad_norm": 0.08547681655392879,
"learning_rate": 0.001373174528134649,
"loss": 2.4505,
"step": 13940
},
{
"epoch": 4.404643449419569,
"grad_norm": 0.07430050047282415,
"learning_rate": 0.0013726628604591724,
"loss": 2.5687,
"step": 13945
},
{
"epoch": 4.406222853984048,
"grad_norm": 0.0767070217749806,
"learning_rate": 0.001372151079459301,
"loss": 2.5587,
"step": 13950
},
{
"epoch": 4.407802258548527,
"grad_norm": 0.083602479556262,
"learning_rate": 0.0013716391852906637,
"loss": 2.4874,
"step": 13955
},
{
"epoch": 4.409381663113006,
"grad_norm": 0.08026159327952571,
"learning_rate": 0.001371127178108925,
"loss": 2.4348,
"step": 13960
},
{
"epoch": 4.410961067677485,
"grad_norm": 0.07921769390500488,
"learning_rate": 0.0013706150580697824,
"loss": 2.6105,
"step": 13965
},
{
"epoch": 4.4125404722419646,
"grad_norm": 0.06758457402712514,
"learning_rate": 0.0013701028253289686,
"loss": 2.4873,
"step": 13970
},
{
"epoch": 4.414119876806444,
"grad_norm": 0.052999579791167505,
"learning_rate": 0.0013695904800422505,
"loss": 2.5906,
"step": 13975
},
{
"epoch": 4.415699281370923,
"grad_norm": 0.057999719672230304,
"learning_rate": 0.0013690780223654284,
"loss": 2.5637,
"step": 13980
},
{
"epoch": 4.417278685935402,
"grad_norm": 0.0923377865259392,
"learning_rate": 0.0013685654524543379,
"loss": 2.5586,
"step": 13985
},
{
"epoch": 4.4188580904998815,
"grad_norm": 0.07699622829811727,
"learning_rate": 0.0013680527704648484,
"loss": 2.5492,
"step": 13990
},
{
"epoch": 4.420437495064361,
"grad_norm": 0.059955279849050386,
"learning_rate": 0.001367539976552863,
"loss": 2.4955,
"step": 13995
},
{
"epoch": 4.42201689962884,
"grad_norm": 0.06040037504820496,
"learning_rate": 0.0013670270708743186,
"loss": 2.5267,
"step": 14000
},
{
"epoch": 4.423596304193319,
"grad_norm": 0.08281551864708046,
"learning_rate": 0.001366514053585187,
"loss": 2.4655,
"step": 14005
},
{
"epoch": 4.4251757087577985,
"grad_norm": 0.05768791872116924,
"learning_rate": 0.0013660009248414736,
"loss": 2.5257,
"step": 14010
},
{
"epoch": 4.426755113322278,
"grad_norm": 0.06502174163101598,
"learning_rate": 0.0013654876847992174,
"loss": 2.5056,
"step": 14015
},
{
"epoch": 4.428334517886757,
"grad_norm": 0.05408102748420643,
"learning_rate": 0.0013649743336144914,
"loss": 2.5622,
"step": 14020
},
{
"epoch": 4.429913922451236,
"grad_norm": 0.06350269471766899,
"learning_rate": 0.0013644608714434025,
"loss": 2.5551,
"step": 14025
},
{
"epoch": 4.4314933270157155,
"grad_norm": 0.06237235890280388,
"learning_rate": 0.001363947298442091,
"loss": 2.4809,
"step": 14030
},
{
"epoch": 4.433072731580194,
"grad_norm": 0.060069035801989634,
"learning_rate": 0.0013634336147667317,
"loss": 2.4924,
"step": 14035
},
{
"epoch": 4.434652136144673,
"grad_norm": 0.06472959109812577,
"learning_rate": 0.001362919820573532,
"loss": 2.4054,
"step": 14040
},
{
"epoch": 4.436231540709152,
"grad_norm": 0.05414354543169224,
"learning_rate": 0.0013624059160187336,
"loss": 2.6368,
"step": 14045
},
{
"epoch": 4.437810945273632,
"grad_norm": 0.07275754959987431,
"learning_rate": 0.0013618919012586114,
"loss": 2.4423,
"step": 14050
},
{
"epoch": 4.439390349838111,
"grad_norm": 0.05630187983993725,
"learning_rate": 0.0013613777764494746,
"loss": 2.4456,
"step": 14055
},
{
"epoch": 4.44096975440259,
"grad_norm": 0.059532361130487516,
"learning_rate": 0.0013608635417476647,
"loss": 2.4708,
"step": 14060
},
{
"epoch": 4.442549158967069,
"grad_norm": 0.07262488825091169,
"learning_rate": 0.0013603491973095574,
"loss": 2.5457,
"step": 14065
},
{
"epoch": 4.444128563531549,
"grad_norm": 0.06222639433243145,
"learning_rate": 0.0013598347432915616,
"loss": 2.623,
"step": 14070
},
{
"epoch": 4.445707968096028,
"grad_norm": 0.06217185296538233,
"learning_rate": 0.0013593201798501192,
"loss": 2.4725,
"step": 14075
},
{
"epoch": 4.447287372660507,
"grad_norm": 0.06983882051728081,
"learning_rate": 0.0013588055071417063,
"loss": 2.6554,
"step": 14080
},
{
"epoch": 4.448866777224986,
"grad_norm": 0.07225143823696022,
"learning_rate": 0.001358290725322831,
"loss": 2.5879,
"step": 14085
},
{
"epoch": 4.450446181789466,
"grad_norm": 0.04946280398519253,
"learning_rate": 0.001357775834550035,
"loss": 2.504,
"step": 14090
},
{
"epoch": 4.452025586353945,
"grad_norm": 0.05885832040929403,
"learning_rate": 0.0013572608349798937,
"loss": 2.5389,
"step": 14095
},
{
"epoch": 4.453604990918424,
"grad_norm": 0.08591016654882142,
"learning_rate": 0.001356745726769015,
"loss": 2.5392,
"step": 14100
},
{
"epoch": 4.455184395482903,
"grad_norm": 0.07319105728497587,
"learning_rate": 0.0013562305100740404,
"loss": 2.5936,
"step": 14105
},
{
"epoch": 4.456763800047382,
"grad_norm": 0.0578954510511244,
"learning_rate": 0.0013557151850516439,
"loss": 2.452,
"step": 14110
},
{
"epoch": 4.458343204611861,
"grad_norm": 0.06653209710843927,
"learning_rate": 0.0013551997518585317,
"loss": 2.4913,
"step": 14115
},
{
"epoch": 4.45992260917634,
"grad_norm": 0.07802101976031472,
"learning_rate": 0.0013546842106514447,
"loss": 2.4905,
"step": 14120
},
{
"epoch": 4.4615020137408195,
"grad_norm": 0.06717026553531497,
"learning_rate": 0.0013541685615871555,
"loss": 2.5445,
"step": 14125
},
{
"epoch": 4.463081418305299,
"grad_norm": 0.0817827723762572,
"learning_rate": 0.0013536528048224696,
"loss": 2.5179,
"step": 14130
},
{
"epoch": 4.464660822869778,
"grad_norm": 0.06288364664081973,
"learning_rate": 0.001353136940514225,
"loss": 2.5217,
"step": 14135
},
{
"epoch": 4.466240227434257,
"grad_norm": 0.06522979915748774,
"learning_rate": 0.0013526209688192931,
"loss": 2.5435,
"step": 14140
},
{
"epoch": 4.467819631998736,
"grad_norm": 0.06396210312239645,
"learning_rate": 0.0013521048898945778,
"loss": 2.5306,
"step": 14145
},
{
"epoch": 4.469399036563216,
"grad_norm": 0.08620056053629571,
"learning_rate": 0.001351588703897015,
"loss": 2.6416,
"step": 14150
},
{
"epoch": 4.470978441127695,
"grad_norm": 0.07435867299037283,
"learning_rate": 0.0013510724109835738,
"loss": 2.4844,
"step": 14155
},
{
"epoch": 4.472557845692174,
"grad_norm": 0.07096473189861796,
"learning_rate": 0.0013505560113112555,
"loss": 2.5824,
"step": 14160
},
{
"epoch": 4.474137250256653,
"grad_norm": 0.0913140577177938,
"learning_rate": 0.0013500395050370937,
"loss": 2.4796,
"step": 14165
},
{
"epoch": 4.475716654821133,
"grad_norm": 0.062042548534127946,
"learning_rate": 0.001349522892318155,
"loss": 2.4454,
"step": 14170
},
{
"epoch": 4.477296059385612,
"grad_norm": 0.07373986458147358,
"learning_rate": 0.0013490061733115381,
"loss": 2.5725,
"step": 14175
},
{
"epoch": 4.478875463950091,
"grad_norm": 0.055063662338057706,
"learning_rate": 0.0013484893481743735,
"loss": 2.4965,
"step": 14180
},
{
"epoch": 4.48045486851457,
"grad_norm": 0.0665836744697159,
"learning_rate": 0.0013479724170638247,
"loss": 2.4605,
"step": 14185
},
{
"epoch": 4.48203427307905,
"grad_norm": 0.06170549589690821,
"learning_rate": 0.001347455380137087,
"loss": 2.5759,
"step": 14190
},
{
"epoch": 4.483613677643528,
"grad_norm": 0.06027017881844297,
"learning_rate": 0.0013469382375513885,
"loss": 2.4885,
"step": 14195
},
{
"epoch": 4.485193082208007,
"grad_norm": 0.06967962993886911,
"learning_rate": 0.0013464209894639885,
"loss": 2.5976,
"step": 14200
},
{
"epoch": 4.4867724867724865,
"grad_norm": 0.07882222499929464,
"learning_rate": 0.0013459036360321788,
"loss": 2.4848,
"step": 14205
},
{
"epoch": 4.488351891336966,
"grad_norm": 0.08115387174683329,
"learning_rate": 0.0013453861774132836,
"loss": 2.5393,
"step": 14210
},
{
"epoch": 4.489931295901445,
"grad_norm": 0.07249459700051057,
"learning_rate": 0.0013448686137646586,
"loss": 2.6291,
"step": 14215
},
{
"epoch": 4.491510700465924,
"grad_norm": 0.05952391465582586,
"learning_rate": 0.0013443509452436915,
"loss": 2.3992,
"step": 14220
},
{
"epoch": 4.4930901050304035,
"grad_norm": 0.06351042392140363,
"learning_rate": 0.0013438331720078019,
"loss": 2.6077,
"step": 14225
},
{
"epoch": 4.494669509594883,
"grad_norm": 0.09179406626814433,
"learning_rate": 0.0013433152942144417,
"loss": 2.5609,
"step": 14230
},
{
"epoch": 4.496248914159362,
"grad_norm": 0.06948199961598737,
"learning_rate": 0.0013427973120210938,
"loss": 2.4387,
"step": 14235
},
{
"epoch": 4.497828318723841,
"grad_norm": 0.06807742010409668,
"learning_rate": 0.0013422792255852738,
"loss": 2.5461,
"step": 14240
},
{
"epoch": 4.4994077232883205,
"grad_norm": 0.05565111213818792,
"learning_rate": 0.0013417610350645282,
"loss": 2.4781,
"step": 14245
},
{
"epoch": 4.5009871278528,
"grad_norm": 0.06018365938532872,
"learning_rate": 0.0013412427406164352,
"loss": 2.6829,
"step": 14250
},
{
"epoch": 4.502566532417279,
"grad_norm": 0.07509777643215441,
"learning_rate": 0.001340724342398605,
"loss": 2.629,
"step": 14255
},
{
"epoch": 4.504145936981758,
"grad_norm": 0.07356464439759725,
"learning_rate": 0.0013402058405686797,
"loss": 2.5301,
"step": 14260
},
{
"epoch": 4.505725341546237,
"grad_norm": 0.06645424332119493,
"learning_rate": 0.0013396872352843317,
"loss": 2.5281,
"step": 14265
},
{
"epoch": 4.507304746110716,
"grad_norm": 0.07558749040618944,
"learning_rate": 0.0013391685267032654,
"loss": 2.5062,
"step": 14270
},
{
"epoch": 4.508884150675195,
"grad_norm": 0.05613251020843608,
"learning_rate": 0.0013386497149832173,
"loss": 2.428,
"step": 14275
},
{
"epoch": 4.510463555239674,
"grad_norm": 0.06177974682298049,
"learning_rate": 0.0013381308002819545,
"loss": 2.4758,
"step": 14280
},
{
"epoch": 4.512042959804154,
"grad_norm": 0.0681251274627974,
"learning_rate": 0.001337611782757276,
"loss": 2.5083,
"step": 14285
},
{
"epoch": 4.513622364368633,
"grad_norm": 0.0692859935576207,
"learning_rate": 0.0013370926625670115,
"loss": 2.4362,
"step": 14290
},
{
"epoch": 4.515201768933112,
"grad_norm": 0.06808243851966223,
"learning_rate": 0.0013365734398690216,
"loss": 2.592,
"step": 14295
},
{
"epoch": 4.516781173497591,
"grad_norm": 0.05856059595516205,
"learning_rate": 0.0013360541148211994,
"loss": 2.5013,
"step": 14300
},
{
"epoch": 4.518360578062071,
"grad_norm": 0.0686920741590947,
"learning_rate": 0.0013355346875814679,
"loss": 2.529,
"step": 14305
},
{
"epoch": 4.51993998262655,
"grad_norm": 0.06560875792670096,
"learning_rate": 0.0013350151583077818,
"loss": 2.4386,
"step": 14310
},
{
"epoch": 4.521519387191029,
"grad_norm": 0.07705004270678456,
"learning_rate": 0.0013344955271581262,
"loss": 2.5246,
"step": 14315
},
{
"epoch": 4.523098791755508,
"grad_norm": 0.0694705060803293,
"learning_rate": 0.0013339757942905182,
"loss": 2.5614,
"step": 14320
},
{
"epoch": 4.524678196319988,
"grad_norm": 0.06658893824711391,
"learning_rate": 0.001333455959863005,
"loss": 2.5895,
"step": 14325
},
{
"epoch": 4.526257600884467,
"grad_norm": 0.06252886108747881,
"learning_rate": 0.001332936024033665,
"loss": 2.5424,
"step": 14330
},
{
"epoch": 4.527837005448946,
"grad_norm": 0.06417874942970586,
"learning_rate": 0.0013324159869606072,
"loss": 2.5635,
"step": 14335
},
{
"epoch": 4.529416410013425,
"grad_norm": 0.07692020820283244,
"learning_rate": 0.0013318958488019715,
"loss": 2.6134,
"step": 14340
},
{
"epoch": 4.530995814577905,
"grad_norm": 0.06719147959402327,
"learning_rate": 0.0013313756097159287,
"loss": 2.4861,
"step": 14345
},
{
"epoch": 4.532575219142384,
"grad_norm": 0.07078783998658239,
"learning_rate": 0.0013308552698606804,
"loss": 2.6335,
"step": 14350
},
{
"epoch": 4.534154623706862,
"grad_norm": 0.06913700433344214,
"learning_rate": 0.0013303348293944584,
"loss": 2.4,
"step": 14355
},
{
"epoch": 4.535734028271341,
"grad_norm": 0.0631198430913686,
"learning_rate": 0.001329814288475525,
"loss": 2.5384,
"step": 14360
},
{
"epoch": 4.537313432835821,
"grad_norm": 0.06690628991153875,
"learning_rate": 0.001329293647262174,
"loss": 2.4874,
"step": 14365
},
{
"epoch": 4.5388928374003,
"grad_norm": 0.061245030936560724,
"learning_rate": 0.0013287729059127287,
"loss": 2.579,
"step": 14370
},
{
"epoch": 4.540472241964779,
"grad_norm": 0.07265244960888212,
"learning_rate": 0.0013282520645855435,
"loss": 2.6145,
"step": 14375
},
{
"epoch": 4.542051646529258,
"grad_norm": 0.05562980129409219,
"learning_rate": 0.001327731123439003,
"loss": 2.5371,
"step": 14380
},
{
"epoch": 4.543631051093738,
"grad_norm": 0.07501342212715247,
"learning_rate": 0.001327210082631521,
"loss": 2.5584,
"step": 14385
},
{
"epoch": 4.545210455658217,
"grad_norm": 0.05621475207377457,
"learning_rate": 0.0013266889423215438,
"loss": 2.5589,
"step": 14390
},
{
"epoch": 4.546789860222696,
"grad_norm": 0.07727632887718167,
"learning_rate": 0.0013261677026675468,
"loss": 2.5207,
"step": 14395
},
{
"epoch": 4.548369264787175,
"grad_norm": 0.06575705110727952,
"learning_rate": 0.001325646363828035,
"loss": 2.5446,
"step": 14400
},
{
"epoch": 4.549948669351655,
"grad_norm": 0.0726735090024027,
"learning_rate": 0.0013251249259615449,
"loss": 2.494,
"step": 14405
},
{
"epoch": 4.551528073916134,
"grad_norm": 0.06988209331509379,
"learning_rate": 0.0013246033892266417,
"loss": 2.4647,
"step": 14410
},
{
"epoch": 4.553107478480613,
"grad_norm": 0.07207187368145113,
"learning_rate": 0.0013240817537819218,
"loss": 2.5596,
"step": 14415
},
{
"epoch": 4.5546868830450915,
"grad_norm": 0.059501193560655585,
"learning_rate": 0.0013235600197860117,
"loss": 2.4478,
"step": 14420
},
{
"epoch": 4.556266287609571,
"grad_norm": 0.06572261116435447,
"learning_rate": 0.0013230381873975666,
"loss": 2.5149,
"step": 14425
},
{
"epoch": 4.55784569217405,
"grad_norm": 0.06339768919779079,
"learning_rate": 0.0013225162567752724,
"loss": 2.4566,
"step": 14430
},
{
"epoch": 4.559425096738529,
"grad_norm": 0.0540220761112062,
"learning_rate": 0.0013219942280778454,
"loss": 2.5326,
"step": 14435
},
{
"epoch": 4.5610045013030085,
"grad_norm": 0.07110542106791799,
"learning_rate": 0.001321472101464031,
"loss": 2.4537,
"step": 14440
},
{
"epoch": 4.562583905867488,
"grad_norm": 0.06603071822844594,
"learning_rate": 0.0013209498770926044,
"loss": 2.5216,
"step": 14445
},
{
"epoch": 4.564163310431967,
"grad_norm": 0.06210510436605857,
"learning_rate": 0.0013204275551223707,
"loss": 2.4913,
"step": 14450
},
{
"epoch": 4.565742714996446,
"grad_norm": 0.06696826280590645,
"learning_rate": 0.0013199051357121645,
"loss": 2.5407,
"step": 14455
},
{
"epoch": 4.5673221195609255,
"grad_norm": 0.06457040402972691,
"learning_rate": 0.0013193826190208507,
"loss": 2.6159,
"step": 14460
},
{
"epoch": 4.568901524125405,
"grad_norm": 0.08403683367973355,
"learning_rate": 0.0013188600052073233,
"loss": 2.523,
"step": 14465
},
{
"epoch": 4.570480928689884,
"grad_norm": 0.09516774150664915,
"learning_rate": 0.0013183372944305055,
"loss": 2.5409,
"step": 14470
},
{
"epoch": 4.572060333254363,
"grad_norm": 0.06953612060835043,
"learning_rate": 0.00131781448684935,
"loss": 2.5325,
"step": 14475
},
{
"epoch": 4.5736397378188425,
"grad_norm": 0.06073184199451849,
"learning_rate": 0.0013172915826228397,
"loss": 2.537,
"step": 14480
},
{
"epoch": 4.575219142383322,
"grad_norm": 0.06047110009531498,
"learning_rate": 0.0013167685819099868,
"loss": 2.4982,
"step": 14485
},
{
"epoch": 4.576798546947801,
"grad_norm": 0.05516820416705633,
"learning_rate": 0.0013162454848698317,
"loss": 2.5509,
"step": 14490
},
{
"epoch": 4.57837795151228,
"grad_norm": 0.08413960458826529,
"learning_rate": 0.0013157222916614453,
"loss": 2.5301,
"step": 14495
},
{
"epoch": 4.5799573560767595,
"grad_norm": 0.061943460655179555,
"learning_rate": 0.0013151990024439272,
"loss": 2.5253,
"step": 14500
},
{
"epoch": 4.581536760641239,
"grad_norm": 0.07448364722954148,
"learning_rate": 0.001314675617376406,
"loss": 2.4996,
"step": 14505
},
{
"epoch": 4.583116165205717,
"grad_norm": 0.07394426552927498,
"learning_rate": 0.0013141521366180407,
"loss": 2.4658,
"step": 14510
},
{
"epoch": 4.584695569770196,
"grad_norm": 0.07090188960059321,
"learning_rate": 0.0013136285603280173,
"loss": 2.6264,
"step": 14515
},
{
"epoch": 4.586274974334676,
"grad_norm": 0.07835259049130651,
"learning_rate": 0.0013131048886655529,
"loss": 2.4878,
"step": 14520
},
{
"epoch": 4.587854378899155,
"grad_norm": 0.06973864384128355,
"learning_rate": 0.001312581121789892,
"loss": 2.5461,
"step": 14525
},
{
"epoch": 4.589433783463634,
"grad_norm": 0.05713278782611401,
"learning_rate": 0.0013120572598603094,
"loss": 2.6036,
"step": 14530
},
{
"epoch": 4.591013188028113,
"grad_norm": 0.06448552012510772,
"learning_rate": 0.0013115333030361076,
"loss": 2.6607,
"step": 14535
},
{
"epoch": 4.592592592592593,
"grad_norm": 0.06146515545218298,
"learning_rate": 0.001311009251476619,
"loss": 2.4407,
"step": 14540
},
{
"epoch": 4.594171997157072,
"grad_norm": 0.06085502924025703,
"learning_rate": 0.001310485105341204,
"loss": 2.5601,
"step": 14545
},
{
"epoch": 4.595751401721551,
"grad_norm": 0.06121460588993167,
"learning_rate": 0.0013099608647892521,
"loss": 2.3711,
"step": 14550
},
{
"epoch": 4.59733080628603,
"grad_norm": 0.06140176313807265,
"learning_rate": 0.001309436529980182,
"loss": 2.4567,
"step": 14555
},
{
"epoch": 4.5989102108505096,
"grad_norm": 0.06233078186399792,
"learning_rate": 0.0013089121010734397,
"loss": 2.4931,
"step": 14560
},
{
"epoch": 4.600489615414989,
"grad_norm": 0.07457805665337602,
"learning_rate": 0.0013083875782285016,
"loss": 2.4842,
"step": 14565
},
{
"epoch": 4.602069019979468,
"grad_norm": 0.07665835344387141,
"learning_rate": 0.001307862961604871,
"loss": 2.5222,
"step": 14570
},
{
"epoch": 4.603648424543947,
"grad_norm": 0.06806119545622995,
"learning_rate": 0.0013073382513620808,
"loss": 2.4976,
"step": 14575
},
{
"epoch": 4.605227829108426,
"grad_norm": 0.0555807009259313,
"learning_rate": 0.001306813447659692,
"loss": 2.4158,
"step": 14580
},
{
"epoch": 4.606807233672905,
"grad_norm": 0.05942410575287233,
"learning_rate": 0.0013062885506572944,
"loss": 2.4696,
"step": 14585
},
{
"epoch": 4.608386638237384,
"grad_norm": 0.05378692212406273,
"learning_rate": 0.0013057635605145048,
"loss": 2.5466,
"step": 14590
},
{
"epoch": 4.609966042801863,
"grad_norm": 0.05445137241232568,
"learning_rate": 0.0013052384773909705,
"loss": 2.4467,
"step": 14595
},
{
"epoch": 4.611545447366343,
"grad_norm": 0.06240141748299297,
"learning_rate": 0.0013047133014463654,
"loss": 2.5343,
"step": 14600
},
{
"epoch": 4.613124851930822,
"grad_norm": 0.062130909008503925,
"learning_rate": 0.001304188032840392,
"loss": 2.5451,
"step": 14605
},
{
"epoch": 4.614704256495301,
"grad_norm": 0.05765365607352223,
"learning_rate": 0.0013036626717327817,
"loss": 2.5551,
"step": 14610
},
{
"epoch": 4.61628366105978,
"grad_norm": 0.05839180617057512,
"learning_rate": 0.0013031372182832927,
"loss": 2.5071,
"step": 14615
},
{
"epoch": 4.61786306562426,
"grad_norm": 0.05267195123274345,
"learning_rate": 0.0013026116726517127,
"loss": 2.441,
"step": 14620
},
{
"epoch": 4.619442470188739,
"grad_norm": 0.05737658725604644,
"learning_rate": 0.0013020860349978562,
"loss": 2.5407,
"step": 14625
},
{
"epoch": 4.621021874753218,
"grad_norm": 0.05957690197529863,
"learning_rate": 0.0013015603054815667,
"loss": 2.4947,
"step": 14630
},
{
"epoch": 4.622601279317697,
"grad_norm": 0.05908030966268051,
"learning_rate": 0.0013010344842627154,
"loss": 2.6356,
"step": 14635
},
{
"epoch": 4.624180683882177,
"grad_norm": 0.06633542156038126,
"learning_rate": 0.0013005085715012002,
"loss": 2.5547,
"step": 14640
},
{
"epoch": 4.625760088446656,
"grad_norm": 0.06864905776747134,
"learning_rate": 0.0012999825673569488,
"loss": 2.6052,
"step": 14645
},
{
"epoch": 4.627339493011135,
"grad_norm": 0.07436619834506039,
"learning_rate": 0.0012994564719899149,
"loss": 2.5173,
"step": 14650
},
{
"epoch": 4.628918897575614,
"grad_norm": 0.05911882126334825,
"learning_rate": 0.0012989302855600814,
"loss": 2.4682,
"step": 14655
},
{
"epoch": 4.630498302140094,
"grad_norm": 0.06081199141600818,
"learning_rate": 0.001298404008227458,
"loss": 2.452,
"step": 14660
},
{
"epoch": 4.632077706704573,
"grad_norm": 0.060944164412464016,
"learning_rate": 0.0012978776401520824,
"loss": 2.5599,
"step": 14665
},
{
"epoch": 4.633657111269051,
"grad_norm": 0.07119386080046132,
"learning_rate": 0.0012973511814940192,
"loss": 2.5004,
"step": 14670
},
{
"epoch": 4.6352365158335305,
"grad_norm": 0.05575195182785549,
"learning_rate": 0.001296824632413362,
"loss": 2.5344,
"step": 14675
},
{
"epoch": 4.63681592039801,
"grad_norm": 0.062358567930570354,
"learning_rate": 0.0012962979930702303,
"loss": 2.518,
"step": 14680
},
{
"epoch": 4.638395324962489,
"grad_norm": 0.06279756886633003,
"learning_rate": 0.001295771263624772,
"loss": 2.5308,
"step": 14685
},
{
"epoch": 4.639974729526968,
"grad_norm": 0.06673592524557007,
"learning_rate": 0.0012952444442371623,
"loss": 2.4142,
"step": 14690
},
{
"epoch": 4.6415541340914475,
"grad_norm": 0.06218592477734958,
"learning_rate": 0.0012947175350676032,
"loss": 2.4998,
"step": 14695
},
{
"epoch": 4.643133538655927,
"grad_norm": 0.07279936719488737,
"learning_rate": 0.0012941905362763252,
"loss": 2.5579,
"step": 14700
},
{
"epoch": 4.644712943220406,
"grad_norm": 0.07435222246168129,
"learning_rate": 0.0012936634480235842,
"loss": 2.4673,
"step": 14705
},
{
"epoch": 4.646292347784885,
"grad_norm": 0.0632941669230261,
"learning_rate": 0.0012931362704696652,
"loss": 2.5094,
"step": 14710
},
{
"epoch": 4.6478717523493644,
"grad_norm": 0.07212217279803702,
"learning_rate": 0.0012926090037748792,
"loss": 2.5115,
"step": 14715
},
{
"epoch": 4.649451156913844,
"grad_norm": 0.06989878259512469,
"learning_rate": 0.0012920816480995645,
"loss": 2.5446,
"step": 14720
},
{
"epoch": 4.651030561478323,
"grad_norm": 0.06845073031606655,
"learning_rate": 0.001291554203604087,
"loss": 2.4885,
"step": 14725
},
{
"epoch": 4.652609966042802,
"grad_norm": 0.07007950023259411,
"learning_rate": 0.0012910266704488388,
"loss": 2.4828,
"step": 14730
},
{
"epoch": 4.654189370607281,
"grad_norm": 0.06702450252188678,
"learning_rate": 0.0012904990487942398,
"loss": 2.5228,
"step": 14735
},
{
"epoch": 4.65576877517176,
"grad_norm": 0.06533141251356811,
"learning_rate": 0.0012899713388007362,
"loss": 2.4774,
"step": 14740
},
{
"epoch": 4.657348179736239,
"grad_norm": 0.06062891469610465,
"learning_rate": 0.001289443540628801,
"loss": 2.4858,
"step": 14745
},
{
"epoch": 4.658927584300718,
"grad_norm": 0.05986021154478703,
"learning_rate": 0.0012889156544389343,
"loss": 2.5261,
"step": 14750
},
{
"epoch": 4.6605069888651975,
"grad_norm": 0.08522016594476302,
"learning_rate": 0.001288387680391663,
"loss": 2.519,
"step": 14755
},
{
"epoch": 4.662086393429677,
"grad_norm": 0.0731687262962632,
"learning_rate": 0.0012878596186475407,
"loss": 2.4651,
"step": 14760
},
{
"epoch": 4.663665797994156,
"grad_norm": 0.06355692861166644,
"learning_rate": 0.0012873314693671474,
"loss": 2.4626,
"step": 14765
},
{
"epoch": 4.665245202558635,
"grad_norm": 0.07824357408268898,
"learning_rate": 0.0012868032327110904,
"loss": 2.498,
"step": 14770
},
{
"epoch": 4.6668246071231145,
"grad_norm": 0.06509502457557328,
"learning_rate": 0.0012862749088400026,
"loss": 2.4943,
"step": 14775
},
{
"epoch": 4.668404011687594,
"grad_norm": 0.07135187141990396,
"learning_rate": 0.0012857464979145442,
"loss": 2.5456,
"step": 14780
},
{
"epoch": 4.669983416252073,
"grad_norm": 0.07224918585527369,
"learning_rate": 0.001285218000095401,
"loss": 2.4876,
"step": 14785
},
{
"epoch": 4.671562820816552,
"grad_norm": 0.07418437782732842,
"learning_rate": 0.0012846894155432867,
"loss": 2.5208,
"step": 14790
},
{
"epoch": 4.6731422253810315,
"grad_norm": 0.0709939637700474,
"learning_rate": 0.00128416074441894,
"loss": 2.4447,
"step": 14795
},
{
"epoch": 4.674721629945511,
"grad_norm": 0.06160707757163909,
"learning_rate": 0.0012836319868831268,
"loss": 2.5425,
"step": 14800
},
{
"epoch": 4.67630103450999,
"grad_norm": 0.0713384844027798,
"learning_rate": 0.001283103143096638,
"loss": 2.5299,
"step": 14805
},
{
"epoch": 4.677880439074469,
"grad_norm": 0.06761132129526373,
"learning_rate": 0.0012825742132202924,
"loss": 2.4755,
"step": 14810
},
{
"epoch": 4.6794598436389485,
"grad_norm": 0.058001357479072355,
"learning_rate": 0.0012820451974149341,
"loss": 2.3942,
"step": 14815
},
{
"epoch": 4.681039248203428,
"grad_norm": 0.07610367614504171,
"learning_rate": 0.0012815160958414332,
"loss": 2.49,
"step": 14820
},
{
"epoch": 4.682618652767906,
"grad_norm": 0.058818872817078344,
"learning_rate": 0.0012809869086606862,
"loss": 2.5079,
"step": 14825
},
{
"epoch": 4.684198057332385,
"grad_norm": 0.061137964717995257,
"learning_rate": 0.0012804576360336156,
"loss": 2.4274,
"step": 14830
},
{
"epoch": 4.685777461896865,
"grad_norm": 0.06570390088117294,
"learning_rate": 0.0012799282781211696,
"loss": 2.5274,
"step": 14835
},
{
"epoch": 4.687356866461344,
"grad_norm": 0.06073055469705799,
"learning_rate": 0.001279398835084323,
"loss": 2.4647,
"step": 14840
},
{
"epoch": 4.688936271025823,
"grad_norm": 0.060531944183211055,
"learning_rate": 0.0012788693070840758,
"loss": 2.5147,
"step": 14845
},
{
"epoch": 4.690515675590302,
"grad_norm": 0.06494266452306915,
"learning_rate": 0.0012783396942814538,
"loss": 2.5203,
"step": 14850
},
{
"epoch": 4.692095080154782,
"grad_norm": 0.07470619420782981,
"learning_rate": 0.0012778099968375092,
"loss": 2.4989,
"step": 14855
},
{
"epoch": 4.693674484719261,
"grad_norm": 0.07292076646736712,
"learning_rate": 0.0012772802149133196,
"loss": 2.4739,
"step": 14860
},
{
"epoch": 4.69525388928374,
"grad_norm": 0.07413512363981896,
"learning_rate": 0.0012767503486699884,
"loss": 2.546,
"step": 14865
},
{
"epoch": 4.696833293848219,
"grad_norm": 0.07548964649732978,
"learning_rate": 0.001276220398268644,
"loss": 2.4961,
"step": 14870
},
{
"epoch": 4.698412698412699,
"grad_norm": 0.07620750111593708,
"learning_rate": 0.0012756903638704413,
"loss": 2.4796,
"step": 14875
},
{
"epoch": 4.699992102977178,
"grad_norm": 0.0685213813223197,
"learning_rate": 0.0012751602456365608,
"loss": 2.5224,
"step": 14880
},
{
"epoch": 4.701571507541657,
"grad_norm": 0.07549811567322803,
"learning_rate": 0.0012746300437282074,
"loss": 2.4686,
"step": 14885
},
{
"epoch": 4.703150912106136,
"grad_norm": 0.07007093001892828,
"learning_rate": 0.0012740997583066125,
"loss": 2.5181,
"step": 14890
},
{
"epoch": 4.704730316670615,
"grad_norm": 0.06992912149096699,
"learning_rate": 0.0012735693895330324,
"loss": 2.4495,
"step": 14895
},
{
"epoch": 4.706309721235094,
"grad_norm": 0.07049608013635306,
"learning_rate": 0.0012730389375687485,
"loss": 2.5377,
"step": 14900
},
{
"epoch": 4.707889125799573,
"grad_norm": 0.07458682983718855,
"learning_rate": 0.0012725084025750682,
"loss": 2.5174,
"step": 14905
},
{
"epoch": 4.709468530364052,
"grad_norm": 0.07688364261040968,
"learning_rate": 0.0012719777847133241,
"loss": 2.5228,
"step": 14910
},
{
"epoch": 4.711047934928532,
"grad_norm": 0.05854700594985617,
"learning_rate": 0.0012714470841448733,
"loss": 2.4756,
"step": 14915
},
{
"epoch": 4.712627339493011,
"grad_norm": 0.06417359188466547,
"learning_rate": 0.0012709163010310985,
"loss": 2.4729,
"step": 14920
},
{
"epoch": 4.71420674405749,
"grad_norm": 0.06532722612555884,
"learning_rate": 0.0012703854355334073,
"loss": 2.5088,
"step": 14925
},
{
"epoch": 4.715786148621969,
"grad_norm": 0.06393029628497983,
"learning_rate": 0.001269854487813233,
"loss": 2.606,
"step": 14930
},
{
"epoch": 4.717365553186449,
"grad_norm": 0.075638289548756,
"learning_rate": 0.0012693234580320332,
"loss": 2.5032,
"step": 14935
},
{
"epoch": 4.718944957750928,
"grad_norm": 0.06346267592489585,
"learning_rate": 0.00126879234635129,
"loss": 2.426,
"step": 14940
},
{
"epoch": 4.720524362315407,
"grad_norm": 0.0753710407317556,
"learning_rate": 0.0012682611529325118,
"loss": 2.5639,
"step": 14945
},
{
"epoch": 4.722103766879886,
"grad_norm": 0.07113730530575336,
"learning_rate": 0.0012677298779372314,
"loss": 2.5784,
"step": 14950
},
{
"epoch": 4.723683171444366,
"grad_norm": 0.07059577267826218,
"learning_rate": 0.0012671985215270054,
"loss": 2.6693,
"step": 14955
},
{
"epoch": 4.725262576008845,
"grad_norm": 0.06972960409937519,
"learning_rate": 0.0012666670838634162,
"loss": 2.4031,
"step": 14960
},
{
"epoch": 4.726841980573324,
"grad_norm": 0.08810771753828298,
"learning_rate": 0.0012661355651080706,
"loss": 2.5473,
"step": 14965
},
{
"epoch": 4.728421385137803,
"grad_norm": 0.08301596809598058,
"learning_rate": 0.0012656039654225998,
"loss": 2.5494,
"step": 14970
},
{
"epoch": 4.730000789702283,
"grad_norm": 0.06278329136880133,
"learning_rate": 0.0012650722849686608,
"loss": 2.476,
"step": 14975
},
{
"epoch": 4.731580194266762,
"grad_norm": 0.06671055745535334,
"learning_rate": 0.0012645405239079329,
"loss": 2.5091,
"step": 14980
},
{
"epoch": 4.73315959883124,
"grad_norm": 0.061039369127283666,
"learning_rate": 0.001264008682402122,
"loss": 2.4996,
"step": 14985
},
{
"epoch": 4.7347390033957195,
"grad_norm": 0.08628748772802185,
"learning_rate": 0.0012634767606129575,
"loss": 2.5259,
"step": 14990
},
{
"epoch": 4.736318407960199,
"grad_norm": 0.07475897519624002,
"learning_rate": 0.0012629447587021935,
"loss": 2.4534,
"step": 14995
},
{
"epoch": 4.737897812524678,
"grad_norm": 0.0705938424551623,
"learning_rate": 0.0012624126768316086,
"loss": 2.5447,
"step": 15000
},
{
"epoch": 4.739477217089157,
"grad_norm": 0.0651629087395383,
"learning_rate": 0.0012618805151630053,
"loss": 2.5074,
"step": 15005
},
{
"epoch": 4.7410566216536365,
"grad_norm": 0.08525149958053733,
"learning_rate": 0.0012613482738582102,
"loss": 2.4931,
"step": 15010
},
{
"epoch": 4.742636026218116,
"grad_norm": 0.07045853188635649,
"learning_rate": 0.001260815953079075,
"loss": 2.4835,
"step": 15015
},
{
"epoch": 4.744215430782595,
"grad_norm": 0.06038041471515805,
"learning_rate": 0.0012602835529874749,
"loss": 2.5318,
"step": 15020
},
{
"epoch": 4.745794835347074,
"grad_norm": 0.059304704263382704,
"learning_rate": 0.0012597510737453097,
"loss": 2.4827,
"step": 15025
},
{
"epoch": 4.7473742399115535,
"grad_norm": 0.06122825638178242,
"learning_rate": 0.0012592185155145023,
"loss": 2.5176,
"step": 15030
},
{
"epoch": 4.748953644476033,
"grad_norm": 0.06576469785246933,
"learning_rate": 0.0012586858784570001,
"loss": 2.5614,
"step": 15035
},
{
"epoch": 4.750533049040512,
"grad_norm": 0.05216499044386397,
"learning_rate": 0.0012581531627347752,
"loss": 2.5391,
"step": 15040
},
{
"epoch": 4.752112453604991,
"grad_norm": 0.05816811441187202,
"learning_rate": 0.0012576203685098232,
"loss": 2.3904,
"step": 15045
},
{
"epoch": 4.7536918581694705,
"grad_norm": 0.06327988859865588,
"learning_rate": 0.0012570874959441634,
"loss": 2.4144,
"step": 15050
},
{
"epoch": 4.755271262733949,
"grad_norm": 0.05954686621291838,
"learning_rate": 0.0012565545451998382,
"loss": 2.5184,
"step": 15055
},
{
"epoch": 4.756850667298428,
"grad_norm": 0.08534564677467046,
"learning_rate": 0.0012560215164389148,
"loss": 2.485,
"step": 15060
},
{
"epoch": 4.758430071862907,
"grad_norm": 0.09411002400331668,
"learning_rate": 0.0012554884098234843,
"loss": 2.5018,
"step": 15065
},
{
"epoch": 4.760009476427387,
"grad_norm": 0.05829188110744158,
"learning_rate": 0.001254955225515661,
"loss": 2.4612,
"step": 15070
},
{
"epoch": 4.761588880991866,
"grad_norm": 0.06260111830916247,
"learning_rate": 0.0012544219636775819,
"loss": 2.4897,
"step": 15075
},
{
"epoch": 4.763168285556345,
"grad_norm": 0.09311945584816322,
"learning_rate": 0.0012538886244714096,
"loss": 2.3988,
"step": 15080
},
{
"epoch": 4.764747690120824,
"grad_norm": 0.09055351042402385,
"learning_rate": 0.0012533552080593285,
"loss": 2.466,
"step": 15085
},
{
"epoch": 4.766327094685304,
"grad_norm": 0.06199831535615724,
"learning_rate": 0.0012528217146035477,
"loss": 2.4885,
"step": 15090
},
{
"epoch": 4.767906499249783,
"grad_norm": 0.06763264055913053,
"learning_rate": 0.0012522881442662988,
"loss": 2.5595,
"step": 15095
},
{
"epoch": 4.769485903814262,
"grad_norm": 0.05847901990048384,
"learning_rate": 0.001251754497209837,
"loss": 2.4948,
"step": 15100
},
{
"epoch": 4.771065308378741,
"grad_norm": 0.052060540215396786,
"learning_rate": 0.001251220773596441,
"loss": 2.4376,
"step": 15105
},
{
"epoch": 4.772644712943221,
"grad_norm": 0.05157419424925148,
"learning_rate": 0.0012506869735884128,
"loss": 2.4315,
"step": 15110
},
{
"epoch": 4.7742241175077,
"grad_norm": 0.06833545413268356,
"learning_rate": 0.001250153097348078,
"loss": 2.5331,
"step": 15115
},
{
"epoch": 4.775803522072179,
"grad_norm": 0.06807438736130686,
"learning_rate": 0.0012496191450377843,
"loss": 2.5571,
"step": 15120
},
{
"epoch": 4.777382926636658,
"grad_norm": 0.08947326196212828,
"learning_rate": 0.0012490851168199036,
"loss": 2.5824,
"step": 15125
},
{
"epoch": 4.778962331201138,
"grad_norm": 0.07719919698525221,
"learning_rate": 0.00124855101285683,
"loss": 2.5316,
"step": 15130
},
{
"epoch": 4.780541735765617,
"grad_norm": 0.08025552743432796,
"learning_rate": 0.0012480168333109819,
"loss": 2.4447,
"step": 15135
},
{
"epoch": 4.782121140330095,
"grad_norm": 0.05261629415497841,
"learning_rate": 0.0012474825783447992,
"loss": 2.5032,
"step": 15140
},
{
"epoch": 4.783700544894574,
"grad_norm": 0.0757771632437342,
"learning_rate": 0.0012469482481207454,
"loss": 2.5653,
"step": 15145
},
{
"epoch": 4.785279949459054,
"grad_norm": 0.05623537967074141,
"learning_rate": 0.0012464138428013073,
"loss": 2.4591,
"step": 15150
},
{
"epoch": 4.786859354023533,
"grad_norm": 0.08262533487856788,
"learning_rate": 0.001245879362548994,
"loss": 2.4795,
"step": 15155
},
{
"epoch": 4.788438758588012,
"grad_norm": 0.06902115051220888,
"learning_rate": 0.001245344807526338,
"loss": 2.5178,
"step": 15160
},
{
"epoch": 4.790018163152491,
"grad_norm": 0.059367330068955296,
"learning_rate": 0.001244810177895893,
"loss": 2.4574,
"step": 15165
},
{
"epoch": 4.791597567716971,
"grad_norm": 0.05293086324890953,
"learning_rate": 0.001244275473820237,
"loss": 2.4589,
"step": 15170
},
{
"epoch": 4.79317697228145,
"grad_norm": 0.062351455497511706,
"learning_rate": 0.00124374069546197,
"loss": 2.518,
"step": 15175
},
{
"epoch": 4.794756376845929,
"grad_norm": 0.06960504551811326,
"learning_rate": 0.0012432058429837152,
"loss": 2.5104,
"step": 15180
},
{
"epoch": 4.796335781410408,
"grad_norm": 0.0785582773003688,
"learning_rate": 0.0012426709165481175,
"loss": 2.5301,
"step": 15185
},
{
"epoch": 4.797915185974888,
"grad_norm": 0.09538357866394954,
"learning_rate": 0.0012421359163178442,
"loss": 2.4908,
"step": 15190
},
{
"epoch": 4.799494590539367,
"grad_norm": 0.060926918043546184,
"learning_rate": 0.001241600842455586,
"loss": 2.5181,
"step": 15195
},
{
"epoch": 4.801073995103846,
"grad_norm": 0.06927795277655609,
"learning_rate": 0.001241065695124055,
"loss": 2.5287,
"step": 15200
},
{
"epoch": 4.802653399668325,
"grad_norm": 0.057548110512531184,
"learning_rate": 0.001240530474485987,
"loss": 2.4556,
"step": 15205
},
{
"epoch": 4.804232804232804,
"grad_norm": 0.06710418212774051,
"learning_rate": 0.0012399951807041379,
"loss": 2.5648,
"step": 15210
},
{
"epoch": 4.805812208797283,
"grad_norm": 0.06517248472080968,
"learning_rate": 0.001239459813941288,
"loss": 2.5899,
"step": 15215
},
{
"epoch": 4.807391613361762,
"grad_norm": 0.0671403527288489,
"learning_rate": 0.0012389243743602383,
"loss": 2.5127,
"step": 15220
},
{
"epoch": 4.8089710179262415,
"grad_norm": 0.07095804854043555,
"learning_rate": 0.001238388862123813,
"loss": 2.5935,
"step": 15225
},
{
"epoch": 4.810550422490721,
"grad_norm": 0.07413226220034597,
"learning_rate": 0.001237853277394858,
"loss": 2.5664,
"step": 15230
},
{
"epoch": 4.8121298270552,
"grad_norm": 0.06326029739527476,
"learning_rate": 0.001237317620336241,
"loss": 2.5517,
"step": 15235
},
{
"epoch": 4.813709231619679,
"grad_norm": 0.07277779313715153,
"learning_rate": 0.0012367818911108517,
"loss": 2.4122,
"step": 15240
},
{
"epoch": 4.8152886361841585,
"grad_norm": 0.06869560434543806,
"learning_rate": 0.0012362460898816025,
"loss": 2.5078,
"step": 15245
},
{
"epoch": 4.816868040748638,
"grad_norm": 0.0643629530844895,
"learning_rate": 0.0012357102168114268,
"loss": 2.5108,
"step": 15250
},
{
"epoch": 4.818447445313117,
"grad_norm": 0.06015315343653677,
"learning_rate": 0.0012351742720632798,
"loss": 2.4923,
"step": 15255
},
{
"epoch": 4.820026849877596,
"grad_norm": 0.05775598319572417,
"learning_rate": 0.0012346382558001392,
"loss": 2.468,
"step": 15260
},
{
"epoch": 4.8216062544420755,
"grad_norm": 0.06140546049655784,
"learning_rate": 0.0012341021681850045,
"loss": 2.5224,
"step": 15265
},
{
"epoch": 4.823185659006555,
"grad_norm": 0.0692881651243686,
"learning_rate": 0.001233566009380896,
"loss": 2.4801,
"step": 15270
},
{
"epoch": 4.824765063571034,
"grad_norm": 0.06165949337963385,
"learning_rate": 0.0012330297795508564,
"loss": 2.5157,
"step": 15275
},
{
"epoch": 4.826344468135513,
"grad_norm": 0.0682195228399358,
"learning_rate": 0.00123249347885795,
"loss": 2.5445,
"step": 15280
},
{
"epoch": 4.8279238726999925,
"grad_norm": 0.06226870586096772,
"learning_rate": 0.0012319571074652614,
"loss": 2.4146,
"step": 15285
},
{
"epoch": 4.829503277264472,
"grad_norm": 0.06679105092346499,
"learning_rate": 0.0012314206655358987,
"loss": 2.5325,
"step": 15290
},
{
"epoch": 4.831082681828951,
"grad_norm": 0.06162451599728066,
"learning_rate": 0.0012308841532329905,
"loss": 2.4682,
"step": 15295
},
{
"epoch": 4.832662086393429,
"grad_norm": 0.05503800445837147,
"learning_rate": 0.0012303475707196865,
"loss": 2.3985,
"step": 15300
},
{
"epoch": 4.834241490957909,
"grad_norm": 0.07987179045443299,
"learning_rate": 0.0012298109181591577,
"loss": 2.3869,
"step": 15305
},
{
"epoch": 4.835820895522388,
"grad_norm": 0.05973667980788249,
"learning_rate": 0.001229274195714597,
"loss": 2.466,
"step": 15310
},
{
"epoch": 4.837400300086867,
"grad_norm": 0.06080607906192062,
"learning_rate": 0.0012287374035492183,
"loss": 2.4171,
"step": 15315
},
{
"epoch": 4.838979704651346,
"grad_norm": 0.0929911394564496,
"learning_rate": 0.0012282005418262569,
"loss": 2.4351,
"step": 15320
},
{
"epoch": 4.8405591092158256,
"grad_norm": 0.07042123203050357,
"learning_rate": 0.0012276636107089684,
"loss": 2.4744,
"step": 15325
},
{
"epoch": 4.842138513780305,
"grad_norm": 0.061855793739237726,
"learning_rate": 0.0012271266103606304,
"loss": 2.6037,
"step": 15330
},
{
"epoch": 4.843717918344784,
"grad_norm": 0.06073958843808026,
"learning_rate": 0.0012265895409445413,
"loss": 2.4685,
"step": 15335
},
{
"epoch": 4.845297322909263,
"grad_norm": 0.06711465508022725,
"learning_rate": 0.001226052402624021,
"loss": 2.511,
"step": 15340
},
{
"epoch": 4.8468767274737425,
"grad_norm": 0.0808742963943968,
"learning_rate": 0.001225515195562409,
"loss": 2.518,
"step": 15345
},
{
"epoch": 4.848456132038222,
"grad_norm": 0.06842571928423273,
"learning_rate": 0.0012249779199230671,
"loss": 2.4423,
"step": 15350
},
{
"epoch": 4.850035536602701,
"grad_norm": 0.05912567812868452,
"learning_rate": 0.001224440575869377,
"loss": 2.4587,
"step": 15355
},
{
"epoch": 4.85161494116718,
"grad_norm": 0.0681131395124921,
"learning_rate": 0.0012239031635647418,
"loss": 2.5492,
"step": 15360
},
{
"epoch": 4.8531943457316595,
"grad_norm": 0.07921069271484028,
"learning_rate": 0.0012233656831725853,
"loss": 2.5395,
"step": 15365
},
{
"epoch": 4.854773750296138,
"grad_norm": 0.06666648532781783,
"learning_rate": 0.0012228281348563512,
"loss": 2.447,
"step": 15370
},
{
"epoch": 4.856353154860617,
"grad_norm": 0.06854146074690375,
"learning_rate": 0.0012222905187795053,
"loss": 2.4339,
"step": 15375
},
{
"epoch": 4.857932559425096,
"grad_norm": 0.0711578400136005,
"learning_rate": 0.0012217528351055327,
"loss": 2.5131,
"step": 15380
},
{
"epoch": 4.859511963989576,
"grad_norm": 0.06374172026154729,
"learning_rate": 0.0012212150839979402,
"loss": 2.4771,
"step": 15385
},
{
"epoch": 4.861091368554055,
"grad_norm": 0.06380930678887842,
"learning_rate": 0.0012206772656202537,
"loss": 2.5038,
"step": 15390
},
{
"epoch": 4.862670773118534,
"grad_norm": 0.053049498300829126,
"learning_rate": 0.0012201393801360208,
"loss": 2.5873,
"step": 15395
},
{
"epoch": 4.864250177683013,
"grad_norm": 0.060002911633182825,
"learning_rate": 0.0012196014277088088,
"loss": 2.5473,
"step": 15400
},
{
"epoch": 4.865829582247493,
"grad_norm": 0.06974729192567168,
"learning_rate": 0.0012190634085022056,
"loss": 2.551,
"step": 15405
},
{
"epoch": 4.867408986811972,
"grad_norm": 0.06401003903677191,
"learning_rate": 0.0012185253226798195,
"loss": 2.4325,
"step": 15410
},
{
"epoch": 4.868988391376451,
"grad_norm": 0.05765104930964299,
"learning_rate": 0.0012179871704052793,
"loss": 2.4814,
"step": 15415
},
{
"epoch": 4.87056779594093,
"grad_norm": 0.09917967750843776,
"learning_rate": 0.0012174489518422332,
"loss": 2.4686,
"step": 15420
},
{
"epoch": 4.87214720050541,
"grad_norm": 0.0790620977932766,
"learning_rate": 0.0012169106671543499,
"loss": 2.4991,
"step": 15425
},
{
"epoch": 4.873726605069889,
"grad_norm": 0.0746542162946544,
"learning_rate": 0.0012163723165053192,
"loss": 2.4402,
"step": 15430
},
{
"epoch": 4.875306009634368,
"grad_norm": 0.0688687534963303,
"learning_rate": 0.0012158339000588492,
"loss": 2.4895,
"step": 15435
},
{
"epoch": 4.876885414198847,
"grad_norm": 0.06709837817692976,
"learning_rate": 0.001215295417978669,
"loss": 2.5206,
"step": 15440
},
{
"epoch": 4.878464818763327,
"grad_norm": 0.0726724345962881,
"learning_rate": 0.0012147568704285276,
"loss": 2.5356,
"step": 15445
},
{
"epoch": 4.880044223327806,
"grad_norm": 0.07535157295869183,
"learning_rate": 0.0012142182575721945,
"loss": 2.4808,
"step": 15450
},
{
"epoch": 4.881623627892285,
"grad_norm": 0.08290435815113797,
"learning_rate": 0.0012136795795734576,
"loss": 2.4617,
"step": 15455
},
{
"epoch": 4.8832030324567635,
"grad_norm": 0.060912276319679534,
"learning_rate": 0.0012131408365961263,
"loss": 2.49,
"step": 15460
},
{
"epoch": 4.884782437021243,
"grad_norm": 0.0654902783875126,
"learning_rate": 0.0012126020288040279,
"loss": 2.3803,
"step": 15465
},
{
"epoch": 4.886361841585722,
"grad_norm": 0.05976371183664185,
"learning_rate": 0.0012120631563610107,
"loss": 2.4806,
"step": 15470
},
{
"epoch": 4.887941246150201,
"grad_norm": 0.0643486484882794,
"learning_rate": 0.001211524219430943,
"loss": 2.5788,
"step": 15475
},
{
"epoch": 4.8895206507146804,
"grad_norm": 0.0622585687911665,
"learning_rate": 0.0012109852181777117,
"loss": 2.5253,
"step": 15480
},
{
"epoch": 4.89110005527916,
"grad_norm": 0.07622978591804572,
"learning_rate": 0.0012104461527652232,
"loss": 2.4864,
"step": 15485
},
{
"epoch": 4.892679459843639,
"grad_norm": 0.058348614197018894,
"learning_rate": 0.0012099070233574044,
"loss": 2.4738,
"step": 15490
},
{
"epoch": 4.894258864408118,
"grad_norm": 0.0664664094650849,
"learning_rate": 0.0012093678301182012,
"loss": 2.6012,
"step": 15495
},
{
"epoch": 4.895838268972597,
"grad_norm": 0.07200468939704618,
"learning_rate": 0.001208828573211578,
"loss": 2.497,
"step": 15500
},
{
"epoch": 4.897417673537077,
"grad_norm": 0.08119284123858826,
"learning_rate": 0.0012082892528015204,
"loss": 2.544,
"step": 15505
},
{
"epoch": 4.898997078101556,
"grad_norm": 0.06423832044178142,
"learning_rate": 0.0012077498690520314,
"loss": 2.5031,
"step": 15510
},
{
"epoch": 4.900576482666035,
"grad_norm": 0.07950608792266346,
"learning_rate": 0.001207210422127135,
"loss": 2.4954,
"step": 15515
},
{
"epoch": 4.902155887230514,
"grad_norm": 0.08852436333106962,
"learning_rate": 0.001206670912190873,
"loss": 2.3703,
"step": 15520
},
{
"epoch": 4.903735291794993,
"grad_norm": 0.05748637559319089,
"learning_rate": 0.0012061313394073068,
"loss": 2.4484,
"step": 15525
},
{
"epoch": 4.905314696359472,
"grad_norm": 0.05847823169655169,
"learning_rate": 0.0012055917039405176,
"loss": 2.497,
"step": 15530
},
{
"epoch": 4.906894100923951,
"grad_norm": 0.058178719585947246,
"learning_rate": 0.0012050520059546047,
"loss": 2.4362,
"step": 15535
},
{
"epoch": 4.9084735054884305,
"grad_norm": 0.06385032415775303,
"learning_rate": 0.001204512245613687,
"loss": 2.4866,
"step": 15540
},
{
"epoch": 4.91005291005291,
"grad_norm": 0.07410022568643862,
"learning_rate": 0.0012039724230819017,
"loss": 2.4655,
"step": 15545
},
{
"epoch": 4.911632314617389,
"grad_norm": 0.06652306615010183,
"learning_rate": 0.0012034325385234061,
"loss": 2.4223,
"step": 15550
},
{
"epoch": 4.913211719181868,
"grad_norm": 0.05840175803667608,
"learning_rate": 0.0012028925921023753,
"loss": 2.4936,
"step": 15555
},
{
"epoch": 4.9147911237463475,
"grad_norm": 0.06306712385491223,
"learning_rate": 0.0012023525839830037,
"loss": 2.4684,
"step": 15560
},
{
"epoch": 4.916370528310827,
"grad_norm": 0.07921764574952989,
"learning_rate": 0.0012018125143295037,
"loss": 2.4417,
"step": 15565
},
{
"epoch": 4.917949932875306,
"grad_norm": 0.06100750172891688,
"learning_rate": 0.0012012723833061077,
"loss": 2.4098,
"step": 15570
},
{
"epoch": 4.919529337439785,
"grad_norm": 0.06070684128663812,
"learning_rate": 0.0012007321910770662,
"loss": 2.4733,
"step": 15575
},
{
"epoch": 4.9211087420042645,
"grad_norm": 0.05769476252807484,
"learning_rate": 0.0012001919378066474,
"loss": 2.4538,
"step": 15580
},
{
"epoch": 4.922688146568744,
"grad_norm": 0.06108438140943986,
"learning_rate": 0.0011996516236591397,
"loss": 2.4146,
"step": 15585
},
{
"epoch": 4.924267551133223,
"grad_norm": 0.06125485985860159,
"learning_rate": 0.0011991112487988488,
"loss": 2.4685,
"step": 15590
},
{
"epoch": 4.925846955697702,
"grad_norm": 0.0874930175823073,
"learning_rate": 0.0011985708133900993,
"loss": 2.5056,
"step": 15595
},
{
"epoch": 4.9274263602621815,
"grad_norm": 0.06731369921187405,
"learning_rate": 0.0011980303175972342,
"loss": 2.4626,
"step": 15600
},
{
"epoch": 4.929005764826661,
"grad_norm": 0.07144118656733887,
"learning_rate": 0.0011974897615846147,
"loss": 2.4938,
"step": 15605
},
{
"epoch": 4.93058516939114,
"grad_norm": 0.07594671817521159,
"learning_rate": 0.0011969491455166206,
"loss": 2.4213,
"step": 15610
},
{
"epoch": 4.932164573955618,
"grad_norm": 0.06292956355831127,
"learning_rate": 0.0011964084695576496,
"loss": 2.4995,
"step": 15615
},
{
"epoch": 4.933743978520098,
"grad_norm": 0.05708705811745932,
"learning_rate": 0.001195867733872118,
"loss": 2.5237,
"step": 15620
},
{
"epoch": 4.935323383084577,
"grad_norm": 0.07187282633092404,
"learning_rate": 0.0011953269386244597,
"loss": 2.4704,
"step": 15625
},
{
"epoch": 4.936902787649056,
"grad_norm": 0.06076469305755183,
"learning_rate": 0.0011947860839791277,
"loss": 2.4851,
"step": 15630
},
{
"epoch": 4.938482192213535,
"grad_norm": 0.05770019943142122,
"learning_rate": 0.0011942451701005918,
"loss": 2.528,
"step": 15635
},
{
"epoch": 4.940061596778015,
"grad_norm": 0.08398343738666424,
"learning_rate": 0.0011937041971533406,
"loss": 2.4163,
"step": 15640
},
{
"epoch": 4.941641001342494,
"grad_norm": 0.07544733825597935,
"learning_rate": 0.001193163165301881,
"loss": 2.4659,
"step": 15645
},
{
"epoch": 4.943220405906973,
"grad_norm": 0.07023987207599723,
"learning_rate": 0.0011926220747107371,
"loss": 2.4363,
"step": 15650
},
{
"epoch": 4.944799810471452,
"grad_norm": 0.07764060150434646,
"learning_rate": 0.0011920809255444506,
"loss": 2.5797,
"step": 15655
},
{
"epoch": 4.946379215035932,
"grad_norm": 0.0659107035007278,
"learning_rate": 0.001191539717967582,
"loss": 2.4964,
"step": 15660
},
{
"epoch": 4.947958619600411,
"grad_norm": 0.06660798481623668,
"learning_rate": 0.001190998452144709,
"loss": 2.3891,
"step": 15665
},
{
"epoch": 4.94953802416489,
"grad_norm": 0.062241320891403946,
"learning_rate": 0.001190457128240427,
"loss": 2.4861,
"step": 15670
},
{
"epoch": 4.951117428729369,
"grad_norm": 0.07985351613599452,
"learning_rate": 0.0011899157464193492,
"loss": 2.6429,
"step": 15675
},
{
"epoch": 4.952696833293849,
"grad_norm": 0.05954129453508578,
"learning_rate": 0.0011893743068461062,
"loss": 2.5111,
"step": 15680
},
{
"epoch": 4.954276237858327,
"grad_norm": 0.06007797509115445,
"learning_rate": 0.0011888328096853465,
"loss": 2.5346,
"step": 15685
},
{
"epoch": 4.955855642422806,
"grad_norm": 0.05823114187959376,
"learning_rate": 0.0011882912551017361,
"loss": 2.5305,
"step": 15690
},
{
"epoch": 4.957435046987285,
"grad_norm": 0.05736393358705801,
"learning_rate": 0.001187749643259958,
"loss": 2.4991,
"step": 15695
},
{
"epoch": 4.959014451551765,
"grad_norm": 0.08238671258192874,
"learning_rate": 0.0011872079743247125,
"loss": 2.4749,
"step": 15700
},
{
"epoch": 4.960593856116244,
"grad_norm": 0.06298808861703957,
"learning_rate": 0.0011866662484607184,
"loss": 2.4865,
"step": 15705
},
{
"epoch": 4.962173260680723,
"grad_norm": 0.05941530849476449,
"learning_rate": 0.0011861244658327112,
"loss": 2.4975,
"step": 15710
},
{
"epoch": 4.963752665245202,
"grad_norm": 0.0666754246041905,
"learning_rate": 0.0011855826266054424,
"loss": 2.4319,
"step": 15715
},
{
"epoch": 4.965332069809682,
"grad_norm": 0.05855819542081744,
"learning_rate": 0.001185040730943683,
"loss": 2.4817,
"step": 15720
},
{
"epoch": 4.966911474374161,
"grad_norm": 0.062468705971845444,
"learning_rate": 0.0011844987790122195,
"loss": 2.4615,
"step": 15725
},
{
"epoch": 4.96849087893864,
"grad_norm": 0.06404754778845122,
"learning_rate": 0.0011839567709758558,
"loss": 2.4788,
"step": 15730
},
{
"epoch": 4.970070283503119,
"grad_norm": 0.058087083590399054,
"learning_rate": 0.001183414706999414,
"loss": 2.5381,
"step": 15735
},
{
"epoch": 4.971649688067599,
"grad_norm": 0.05669964896838692,
"learning_rate": 0.0011828725872477313,
"loss": 2.5212,
"step": 15740
},
{
"epoch": 4.973229092632078,
"grad_norm": 0.061419590892372596,
"learning_rate": 0.001182330411885663,
"loss": 2.5256,
"step": 15745
},
{
"epoch": 4.974808497196557,
"grad_norm": 0.06465431498544376,
"learning_rate": 0.0011817881810780816,
"loss": 2.513,
"step": 15750
},
{
"epoch": 4.976387901761036,
"grad_norm": 0.07385204635840029,
"learning_rate": 0.0011812458949898759,
"loss": 2.4556,
"step": 15755
},
{
"epoch": 4.977967306325516,
"grad_norm": 0.05803605018835446,
"learning_rate": 0.0011807035537859513,
"loss": 2.4638,
"step": 15760
},
{
"epoch": 4.979546710889995,
"grad_norm": 0.06856048210261467,
"learning_rate": 0.001180161157631231,
"loss": 2.4593,
"step": 15765
},
{
"epoch": 4.981126115454474,
"grad_norm": 0.052510067991917486,
"learning_rate": 0.0011796187066906534,
"loss": 2.5206,
"step": 15770
},
{
"epoch": 4.9827055200189525,
"grad_norm": 0.06025897027411644,
"learning_rate": 0.0011790762011291748,
"loss": 2.4054,
"step": 15775
},
{
"epoch": 4.984284924583432,
"grad_norm": 0.06761575243212609,
"learning_rate": 0.0011785336411117675,
"loss": 2.5053,
"step": 15780
},
{
"epoch": 4.985864329147911,
"grad_norm": 0.057767353934116694,
"learning_rate": 0.0011779910268034208,
"loss": 2.5243,
"step": 15785
},
{
"epoch": 4.98744373371239,
"grad_norm": 0.06466723147966426,
"learning_rate": 0.0011774483583691397,
"loss": 2.4687,
"step": 15790
},
{
"epoch": 4.9890231382768695,
"grad_norm": 0.060452145161182434,
"learning_rate": 0.001176905635973947,
"loss": 2.4447,
"step": 15795
},
{
"epoch": 4.990602542841349,
"grad_norm": 0.06364205478369672,
"learning_rate": 0.0011763628597828803,
"loss": 2.6067,
"step": 15800
},
{
"epoch": 4.992181947405828,
"grad_norm": 0.060599709328997095,
"learning_rate": 0.0011758200299609952,
"loss": 2.3731,
"step": 15805
},
{
"epoch": 4.993761351970307,
"grad_norm": 0.06830949840866171,
"learning_rate": 0.001175277146673362,
"loss": 2.4716,
"step": 15810
},
{
"epoch": 4.9953407565347865,
"grad_norm": 0.09082852984432674,
"learning_rate": 0.0011747342100850685,
"loss": 2.4882,
"step": 15815
},
{
"epoch": 4.996920161099266,
"grad_norm": 0.06611274814656555,
"learning_rate": 0.001174191220361218,
"loss": 2.4938,
"step": 15820
},
{
"epoch": 4.998499565663745,
"grad_norm": 0.0651948980122603,
"learning_rate": 0.0011736481776669307,
"loss": 2.4455,
"step": 15825
},
{
"epoch": 5.0,
"grad_norm": 0.10218245823941818,
"learning_rate": 0.0011731050821673417,
"loss": 2.5349,
"step": 15830
},
{
"epoch": 5.0,
"eval_loss": 2.4879767894744873,
"eval_runtime": 118.4334,
"eval_samples_per_second": 22.367,
"eval_steps_per_second": 5.598,
"step": 15830
},
{
"epoch": 5.001579404564479,
"grad_norm": 0.05936455869948697,
"learning_rate": 0.001172561934027603,
"loss": 2.4368,
"step": 15835
},
{
"epoch": 5.0031588091289585,
"grad_norm": 0.05178263702404628,
"learning_rate": 0.0011720187334128829,
"loss": 2.5044,
"step": 15840
},
{
"epoch": 5.004738213693438,
"grad_norm": 0.05428143758672858,
"learning_rate": 0.001171475480488365,
"loss": 2.4597,
"step": 15845
},
{
"epoch": 5.006317618257917,
"grad_norm": 0.07558504457842352,
"learning_rate": 0.0011709321754192492,
"loss": 2.4644,
"step": 15850
},
{
"epoch": 5.007897022822396,
"grad_norm": 0.05747418106633769,
"learning_rate": 0.0011703888183707512,
"loss": 2.4854,
"step": 15855
},
{
"epoch": 5.0094764273868755,
"grad_norm": 0.08243436868767362,
"learning_rate": 0.0011698454095081018,
"loss": 2.4989,
"step": 15860
},
{
"epoch": 5.011055831951355,
"grad_norm": 0.05952542577324716,
"learning_rate": 0.0011693019489965484,
"loss": 2.4299,
"step": 15865
},
{
"epoch": 5.012635236515833,
"grad_norm": 0.07313668146611062,
"learning_rate": 0.0011687584370013544,
"loss": 2.475,
"step": 15870
},
{
"epoch": 5.014214641080312,
"grad_norm": 0.06373345839897392,
"learning_rate": 0.001168214873687798,
"loss": 2.4881,
"step": 15875
},
{
"epoch": 5.015794045644792,
"grad_norm": 0.07053297624719784,
"learning_rate": 0.0011676712592211729,
"loss": 2.3989,
"step": 15880
},
{
"epoch": 5.017373450209271,
"grad_norm": 0.06959343635310479,
"learning_rate": 0.0011671275937667894,
"loss": 2.4974,
"step": 15885
},
{
"epoch": 5.01895285477375,
"grad_norm": 0.05298602620521121,
"learning_rate": 0.0011665838774899719,
"loss": 2.4695,
"step": 15890
},
{
"epoch": 5.020532259338229,
"grad_norm": 0.06081738360944861,
"learning_rate": 0.0011660401105560623,
"loss": 2.4367,
"step": 15895
},
{
"epoch": 5.022111663902709,
"grad_norm": 0.05957421304787897,
"learning_rate": 0.0011654962931304158,
"loss": 2.607,
"step": 15900
},
{
"epoch": 5.023691068467188,
"grad_norm": 0.058059781037168315,
"learning_rate": 0.0011649524253784036,
"loss": 2.4989,
"step": 15905
},
{
"epoch": 5.025270473031667,
"grad_norm": 0.06481902623330911,
"learning_rate": 0.001164408507465413,
"loss": 2.4182,
"step": 15910
},
{
"epoch": 5.026849877596146,
"grad_norm": 0.06644572366396971,
"learning_rate": 0.0011638645395568457,
"loss": 2.4966,
"step": 15915
},
{
"epoch": 5.028429282160626,
"grad_norm": 0.09413712656237916,
"learning_rate": 0.0011633205218181191,
"loss": 2.4376,
"step": 15920
},
{
"epoch": 5.030008686725105,
"grad_norm": 0.09649674809369083,
"learning_rate": 0.001162776454414665,
"loss": 2.4057,
"step": 15925
},
{
"epoch": 5.031588091289584,
"grad_norm": 0.06840301674179984,
"learning_rate": 0.001162232337511931,
"loss": 2.4578,
"step": 15930
},
{
"epoch": 5.033167495854063,
"grad_norm": 0.06298972187905075,
"learning_rate": 0.0011616881712753799,
"loss": 2.5106,
"step": 15935
},
{
"epoch": 5.034746900418543,
"grad_norm": 0.08500448183013605,
"learning_rate": 0.001161143955870489,
"loss": 2.4424,
"step": 15940
},
{
"epoch": 5.036326304983022,
"grad_norm": 0.0666700102574268,
"learning_rate": 0.0011605996914627508,
"loss": 2.5119,
"step": 15945
},
{
"epoch": 5.0379057095475,
"grad_norm": 0.059598529645478314,
"learning_rate": 0.0011600553782176724,
"loss": 2.4163,
"step": 15950
},
{
"epoch": 5.039485114111979,
"grad_norm": 0.07605493015186099,
"learning_rate": 0.0011595110163007758,
"loss": 2.5509,
"step": 15955
},
{
"epoch": 5.041064518676459,
"grad_norm": 0.0648202884729895,
"learning_rate": 0.0011589666058775985,
"loss": 2.4685,
"step": 15960
},
{
"epoch": 5.042643923240938,
"grad_norm": 0.08600988911396924,
"learning_rate": 0.0011584221471136924,
"loss": 2.5668,
"step": 15965
},
{
"epoch": 5.044223327805417,
"grad_norm": 0.07635689265264523,
"learning_rate": 0.0011578776401746232,
"loss": 2.5022,
"step": 15970
},
{
"epoch": 5.045802732369896,
"grad_norm": 0.06792785847659778,
"learning_rate": 0.0011573330852259723,
"loss": 2.5037,
"step": 15975
},
{
"epoch": 5.047382136934376,
"grad_norm": 0.0792531105851889,
"learning_rate": 0.0011567884824333352,
"loss": 2.4579,
"step": 15980
},
{
"epoch": 5.048961541498855,
"grad_norm": 0.05561313486502166,
"learning_rate": 0.001156243831962323,
"loss": 2.4794,
"step": 15985
},
{
"epoch": 5.050540946063334,
"grad_norm": 0.060426396256911145,
"learning_rate": 0.0011556991339785594,
"loss": 2.4779,
"step": 15990
},
{
"epoch": 5.052120350627813,
"grad_norm": 0.0743286410045955,
"learning_rate": 0.001155154388647684,
"loss": 2.5053,
"step": 15995
},
{
"epoch": 5.053699755192293,
"grad_norm": 0.0642410054621103,
"learning_rate": 0.00115460959613535,
"loss": 2.4671,
"step": 16000
},
{
"epoch": 5.055279159756772,
"grad_norm": 0.08813869602163044,
"learning_rate": 0.0011540647566072257,
"loss": 2.5639,
"step": 16005
},
{
"epoch": 5.056858564321251,
"grad_norm": 0.06600521276012898,
"learning_rate": 0.0011535198702289939,
"loss": 2.437,
"step": 16010
},
{
"epoch": 5.05843796888573,
"grad_norm": 0.07586319419867858,
"learning_rate": 0.00115297493716635,
"loss": 2.4102,
"step": 16015
},
{
"epoch": 5.06001737345021,
"grad_norm": 0.05790710792159861,
"learning_rate": 0.0011524299575850047,
"loss": 2.5159,
"step": 16020
},
{
"epoch": 5.061596778014689,
"grad_norm": 0.06365084586078266,
"learning_rate": 0.0011518849316506836,
"loss": 2.432,
"step": 16025
},
{
"epoch": 5.063176182579167,
"grad_norm": 0.08387513396665641,
"learning_rate": 0.0011513398595291253,
"loss": 2.4663,
"step": 16030
},
{
"epoch": 5.0647555871436465,
"grad_norm": 0.07357946696135872,
"learning_rate": 0.0011507947413860826,
"loss": 2.6234,
"step": 16035
},
{
"epoch": 5.066334991708126,
"grad_norm": 0.08477276604753632,
"learning_rate": 0.0011502495773873225,
"loss": 2.525,
"step": 16040
},
{
"epoch": 5.067914396272605,
"grad_norm": 0.06536941662454575,
"learning_rate": 0.0011497043676986255,
"loss": 2.5088,
"step": 16045
},
{
"epoch": 5.069493800837084,
"grad_norm": 0.06999318385254447,
"learning_rate": 0.0011491591124857873,
"loss": 2.5295,
"step": 16050
},
{
"epoch": 5.0710732054015635,
"grad_norm": 0.0785145953519047,
"learning_rate": 0.0011486138119146162,
"loss": 2.6605,
"step": 16055
},
{
"epoch": 5.072652609966043,
"grad_norm": 0.05697907119882617,
"learning_rate": 0.0011480684661509337,
"loss": 2.4912,
"step": 16060
},
{
"epoch": 5.074232014530522,
"grad_norm": 0.058279369150634495,
"learning_rate": 0.001147523075360577,
"loss": 2.478,
"step": 16065
},
{
"epoch": 5.075811419095001,
"grad_norm": 0.09506533540972528,
"learning_rate": 0.0011469776397093955,
"loss": 2.6287,
"step": 16070
},
{
"epoch": 5.0773908236594805,
"grad_norm": 0.05870646987675661,
"learning_rate": 0.0011464321593632532,
"loss": 2.4934,
"step": 16075
},
{
"epoch": 5.07897022822396,
"grad_norm": 0.0751172381201278,
"learning_rate": 0.0011458866344880266,
"loss": 2.4517,
"step": 16080
},
{
"epoch": 5.080549632788439,
"grad_norm": 0.04860545935891463,
"learning_rate": 0.0011453410652496063,
"loss": 2.5204,
"step": 16085
},
{
"epoch": 5.082129037352918,
"grad_norm": 0.113731797770735,
"learning_rate": 0.001144795451813897,
"loss": 2.4957,
"step": 16090
},
{
"epoch": 5.0837084419173975,
"grad_norm": 0.06105826352877905,
"learning_rate": 0.0011442497943468157,
"loss": 2.5007,
"step": 16095
},
{
"epoch": 5.085287846481877,
"grad_norm": 0.06997079522607595,
"learning_rate": 0.001143704093014294,
"loss": 2.4797,
"step": 16100
},
{
"epoch": 5.086867251046356,
"grad_norm": 0.07272291158991666,
"learning_rate": 0.0011431583479822754,
"loss": 2.449,
"step": 16105
},
{
"epoch": 5.088446655610834,
"grad_norm": 0.06902835792256261,
"learning_rate": 0.001142612559416718,
"loss": 2.5247,
"step": 16110
},
{
"epoch": 5.090026060175314,
"grad_norm": 0.07561989736986836,
"learning_rate": 0.001142066727483592,
"loss": 2.5351,
"step": 16115
},
{
"epoch": 5.091605464739793,
"grad_norm": 0.09065227667689367,
"learning_rate": 0.0011415208523488825,
"loss": 2.4434,
"step": 16120
},
{
"epoch": 5.093184869304272,
"grad_norm": 0.054130632758493794,
"learning_rate": 0.0011409749341785857,
"loss": 2.4761,
"step": 16125
},
{
"epoch": 5.094764273868751,
"grad_norm": 0.05406569880209989,
"learning_rate": 0.0011404289731387122,
"loss": 2.5257,
"step": 16130
},
{
"epoch": 5.0963436784332306,
"grad_norm": 0.07165719721614144,
"learning_rate": 0.001139882969395285,
"loss": 2.5085,
"step": 16135
},
{
"epoch": 5.09792308299771,
"grad_norm": 0.061909141866988184,
"learning_rate": 0.0011393369231143405,
"loss": 2.4395,
"step": 16140
},
{
"epoch": 5.099502487562189,
"grad_norm": 0.08023843002538841,
"learning_rate": 0.0011387908344619281,
"loss": 2.5965,
"step": 16145
},
{
"epoch": 5.101081892126668,
"grad_norm": 0.07634235626975495,
"learning_rate": 0.00113824470360411,
"loss": 2.5738,
"step": 16150
},
{
"epoch": 5.1026612966911475,
"grad_norm": 0.0604585853738121,
"learning_rate": 0.0011376985307069605,
"loss": 2.4193,
"step": 16155
},
{
"epoch": 5.104240701255627,
"grad_norm": 0.057772266673428295,
"learning_rate": 0.0011371523159365675,
"loss": 2.4257,
"step": 16160
},
{
"epoch": 5.105820105820106,
"grad_norm": 0.0718835525811546,
"learning_rate": 0.0011366060594590317,
"loss": 2.5369,
"step": 16165
},
{
"epoch": 5.107399510384585,
"grad_norm": 0.06032148212252233,
"learning_rate": 0.0011360597614404663,
"loss": 2.4418,
"step": 16170
},
{
"epoch": 5.1089789149490645,
"grad_norm": 0.06628365602520163,
"learning_rate": 0.001135513422046996,
"loss": 2.5456,
"step": 16175
},
{
"epoch": 5.110558319513544,
"grad_norm": 0.061422640210239805,
"learning_rate": 0.0011349670414447603,
"loss": 2.4793,
"step": 16180
},
{
"epoch": 5.112137724078023,
"grad_norm": 0.05771049193758285,
"learning_rate": 0.0011344206197999094,
"loss": 2.4786,
"step": 16185
},
{
"epoch": 5.113717128642501,
"grad_norm": 0.06091115608367704,
"learning_rate": 0.0011338741572786072,
"loss": 2.5138,
"step": 16190
},
{
"epoch": 5.115296533206981,
"grad_norm": 0.06074604808714699,
"learning_rate": 0.0011333276540470292,
"loss": 2.3742,
"step": 16195
},
{
"epoch": 5.11687593777146,
"grad_norm": 0.05479575421173105,
"learning_rate": 0.0011327811102713632,
"loss": 2.534,
"step": 16200
},
{
"epoch": 5.118455342335939,
"grad_norm": 0.06195142036541382,
"learning_rate": 0.0011322345261178097,
"loss": 2.5255,
"step": 16205
},
{
"epoch": 5.120034746900418,
"grad_norm": 0.06262974702071916,
"learning_rate": 0.001131687901752582,
"loss": 2.4125,
"step": 16210
},
{
"epoch": 5.121614151464898,
"grad_norm": 0.0633672990276845,
"learning_rate": 0.001131141237341905,
"loss": 2.4914,
"step": 16215
},
{
"epoch": 5.123193556029377,
"grad_norm": 0.0643598624522212,
"learning_rate": 0.0011305945330520152,
"loss": 2.4156,
"step": 16220
},
{
"epoch": 5.124772960593856,
"grad_norm": 0.061094639303892535,
"learning_rate": 0.0011300477890491623,
"loss": 2.4471,
"step": 16225
},
{
"epoch": 5.126352365158335,
"grad_norm": 0.052171533316898965,
"learning_rate": 0.0011295010054996077,
"loss": 2.4441,
"step": 16230
},
{
"epoch": 5.127931769722815,
"grad_norm": 0.056466913910174894,
"learning_rate": 0.0011289541825696247,
"loss": 2.485,
"step": 16235
},
{
"epoch": 5.129511174287294,
"grad_norm": 0.07572727322827039,
"learning_rate": 0.001128407320425499,
"loss": 2.484,
"step": 16240
},
{
"epoch": 5.131090578851773,
"grad_norm": 0.06482262428381254,
"learning_rate": 0.0011278604192335273,
"loss": 2.4612,
"step": 16245
},
{
"epoch": 5.132669983416252,
"grad_norm": 0.06568181975286745,
"learning_rate": 0.001127313479160019,
"loss": 2.377,
"step": 16250
},
{
"epoch": 5.134249387980732,
"grad_norm": 0.06519208053816444,
"learning_rate": 0.0011267665003712951,
"loss": 2.4845,
"step": 16255
},
{
"epoch": 5.135828792545211,
"grad_norm": 0.056410431836072604,
"learning_rate": 0.0011262194830336887,
"loss": 2.5283,
"step": 16260
},
{
"epoch": 5.137408197109689,
"grad_norm": 0.06500325338501298,
"learning_rate": 0.0011256724273135438,
"loss": 2.4445,
"step": 16265
},
{
"epoch": 5.1389876016741685,
"grad_norm": 0.06452541716306441,
"learning_rate": 0.0011251253333772165,
"loss": 2.3963,
"step": 16270
},
{
"epoch": 5.140567006238648,
"grad_norm": 0.05911542687700847,
"learning_rate": 0.0011245782013910748,
"loss": 2.4702,
"step": 16275
},
{
"epoch": 5.142146410803127,
"grad_norm": 0.05073585115407527,
"learning_rate": 0.001124031031521498,
"loss": 2.3791,
"step": 16280
},
{
"epoch": 5.143725815367606,
"grad_norm": 0.05977816183764443,
"learning_rate": 0.0011234838239348773,
"loss": 2.6144,
"step": 16285
},
{
"epoch": 5.1453052199320854,
"grad_norm": 0.11924068568162381,
"learning_rate": 0.0011229365787976144,
"loss": 2.5134,
"step": 16290
},
{
"epoch": 5.146884624496565,
"grad_norm": 0.0734698954709752,
"learning_rate": 0.0011223892962761233,
"loss": 2.5135,
"step": 16295
},
{
"epoch": 5.148464029061044,
"grad_norm": 0.06039434476566112,
"learning_rate": 0.0011218419765368294,
"loss": 2.5427,
"step": 16300
},
{
"epoch": 5.150043433625523,
"grad_norm": 0.06217142680674239,
"learning_rate": 0.0011212946197461686,
"loss": 2.5361,
"step": 16305
},
{
"epoch": 5.151622838190002,
"grad_norm": 0.06420585743081299,
"learning_rate": 0.0011207472260705894,
"loss": 2.3974,
"step": 16310
},
{
"epoch": 5.153202242754482,
"grad_norm": 0.07112681958076077,
"learning_rate": 0.0011201997956765497,
"loss": 2.3866,
"step": 16315
},
{
"epoch": 5.154781647318961,
"grad_norm": 0.06579358055731502,
"learning_rate": 0.0011196523287305203,
"loss": 2.4988,
"step": 16320
},
{
"epoch": 5.15636105188344,
"grad_norm": 0.06422583481040922,
"learning_rate": 0.0011191048253989823,
"loss": 2.5402,
"step": 16325
},
{
"epoch": 5.157940456447919,
"grad_norm": 0.05614613307542216,
"learning_rate": 0.001118557285848428,
"loss": 2.5435,
"step": 16330
},
{
"epoch": 5.159519861012399,
"grad_norm": 0.054122449335019904,
"learning_rate": 0.0011180097102453605,
"loss": 2.4795,
"step": 16335
},
{
"epoch": 5.161099265576878,
"grad_norm": 0.07874484568863568,
"learning_rate": 0.0011174620987562936,
"loss": 2.5427,
"step": 16340
},
{
"epoch": 5.162678670141356,
"grad_norm": 0.07263009788499131,
"learning_rate": 0.0011169144515477537,
"loss": 2.4887,
"step": 16345
},
{
"epoch": 5.1642580747058355,
"grad_norm": 0.06027699366074942,
"learning_rate": 0.0011163667687862755,
"loss": 2.4958,
"step": 16350
},
{
"epoch": 5.165837479270315,
"grad_norm": 0.05915559937051019,
"learning_rate": 0.0011158190506384068,
"loss": 2.5644,
"step": 16355
},
{
"epoch": 5.167416883834794,
"grad_norm": 0.059476055705571074,
"learning_rate": 0.0011152712972707045,
"loss": 2.6209,
"step": 16360
},
{
"epoch": 5.168996288399273,
"grad_norm": 0.06808222938767634,
"learning_rate": 0.001114723508849737,
"loss": 2.5696,
"step": 16365
},
{
"epoch": 5.1705756929637525,
"grad_norm": 0.06521035300158985,
"learning_rate": 0.0011141756855420838,
"loss": 2.4274,
"step": 16370
},
{
"epoch": 5.172155097528232,
"grad_norm": 0.06339165426714281,
"learning_rate": 0.0011136278275143342,
"loss": 2.6055,
"step": 16375
},
{
"epoch": 5.173734502092711,
"grad_norm": 0.05411371687477742,
"learning_rate": 0.001113079934933088,
"loss": 2.4679,
"step": 16380
},
{
"epoch": 5.17531390665719,
"grad_norm": 0.06111465209279007,
"learning_rate": 0.0011125320079649562,
"loss": 2.4941,
"step": 16385
},
{
"epoch": 5.1768933112216695,
"grad_norm": 0.058314264656633696,
"learning_rate": 0.00111198404677656,
"loss": 2.4186,
"step": 16390
},
{
"epoch": 5.178472715786149,
"grad_norm": 0.06611542635281509,
"learning_rate": 0.00111143605153453,
"loss": 2.4831,
"step": 16395
},
{
"epoch": 5.180052120350628,
"grad_norm": 0.06365065892450729,
"learning_rate": 0.0011108880224055093,
"loss": 2.5002,
"step": 16400
},
{
"epoch": 5.181631524915107,
"grad_norm": 0.05149579258538051,
"learning_rate": 0.0011103399595561493,
"loss": 2.4647,
"step": 16405
},
{
"epoch": 5.1832109294795865,
"grad_norm": 0.06531925817820286,
"learning_rate": 0.0011097918631531123,
"loss": 2.4938,
"step": 16410
},
{
"epoch": 5.184790334044066,
"grad_norm": 0.05606313145169872,
"learning_rate": 0.0011092437333630716,
"loss": 2.5484,
"step": 16415
},
{
"epoch": 5.186369738608545,
"grad_norm": 0.061918571489501605,
"learning_rate": 0.0011086955703527093,
"loss": 2.5153,
"step": 16420
},
{
"epoch": 5.187949143173023,
"grad_norm": 0.06796321611167673,
"learning_rate": 0.001108147374288719,
"loss": 2.5846,
"step": 16425
},
{
"epoch": 5.189528547737503,
"grad_norm": 0.06654686032721133,
"learning_rate": 0.0011075991453378025,
"loss": 2.5631,
"step": 16430
},
{
"epoch": 5.191107952301982,
"grad_norm": 0.060261990753233845,
"learning_rate": 0.0011070508836666737,
"loss": 2.5073,
"step": 16435
},
{
"epoch": 5.192687356866461,
"grad_norm": 0.06296014676202179,
"learning_rate": 0.0011065025894420552,
"loss": 2.4725,
"step": 16440
},
{
"epoch": 5.19426676143094,
"grad_norm": 0.07877531687828125,
"learning_rate": 0.0011059542628306797,
"loss": 2.5068,
"step": 16445
},
{
"epoch": 5.19584616599542,
"grad_norm": 0.0687432738343893,
"learning_rate": 0.0011054059039992895,
"loss": 2.4373,
"step": 16450
},
{
"epoch": 5.197425570559899,
"grad_norm": 0.06456809106330956,
"learning_rate": 0.0011048575131146377,
"loss": 2.4645,
"step": 16455
},
{
"epoch": 5.199004975124378,
"grad_norm": 0.06532940947281059,
"learning_rate": 0.001104309090343486,
"loss": 2.4395,
"step": 16460
},
{
"epoch": 5.200584379688857,
"grad_norm": 0.06951168067612123,
"learning_rate": 0.0011037606358526065,
"loss": 2.5234,
"step": 16465
},
{
"epoch": 5.202163784253337,
"grad_norm": 0.06605425908188053,
"learning_rate": 0.0011032121498087805,
"loss": 2.3909,
"step": 16470
},
{
"epoch": 5.203743188817816,
"grad_norm": 0.06139388448666948,
"learning_rate": 0.001102663632378799,
"loss": 2.4513,
"step": 16475
},
{
"epoch": 5.205322593382295,
"grad_norm": 0.05321250714202924,
"learning_rate": 0.0011021150837294631,
"loss": 2.4193,
"step": 16480
},
{
"epoch": 5.206901997946774,
"grad_norm": 0.0670144536976953,
"learning_rate": 0.0011015665040275827,
"loss": 2.466,
"step": 16485
},
{
"epoch": 5.208481402511254,
"grad_norm": 0.0640672036717467,
"learning_rate": 0.0011010178934399773,
"loss": 2.4192,
"step": 16490
},
{
"epoch": 5.210060807075733,
"grad_norm": 0.06885519764125651,
"learning_rate": 0.0011004692521334755,
"loss": 2.4949,
"step": 16495
},
{
"epoch": 5.211640211640212,
"grad_norm": 0.06479142893373602,
"learning_rate": 0.0010999205802749163,
"loss": 2.519,
"step": 16500
},
{
"epoch": 5.21321961620469,
"grad_norm": 0.06367553196577055,
"learning_rate": 0.0010993718780311474,
"loss": 2.4375,
"step": 16505
},
{
"epoch": 5.21479902076917,
"grad_norm": 0.07410082618851017,
"learning_rate": 0.001098823145569025,
"loss": 2.398,
"step": 16510
},
{
"epoch": 5.216378425333649,
"grad_norm": 0.06301957247302468,
"learning_rate": 0.0010982743830554155,
"loss": 2.5229,
"step": 16515
},
{
"epoch": 5.217957829898128,
"grad_norm": 0.06421095569993603,
"learning_rate": 0.0010977255906571939,
"loss": 2.4504,
"step": 16520
},
{
"epoch": 5.219537234462607,
"grad_norm": 0.06917348371322869,
"learning_rate": 0.0010971767685412448,
"loss": 2.4585,
"step": 16525
},
{
"epoch": 5.221116639027087,
"grad_norm": 0.0762081917098472,
"learning_rate": 0.001096627916874461,
"loss": 2.4095,
"step": 16530
},
{
"epoch": 5.222696043591566,
"grad_norm": 0.06776761418065072,
"learning_rate": 0.0010960790358237448,
"loss": 2.6017,
"step": 16535
},
{
"epoch": 5.224275448156045,
"grad_norm": 0.054990387693461554,
"learning_rate": 0.0010955301255560085,
"loss": 2.4561,
"step": 16540
},
{
"epoch": 5.225854852720524,
"grad_norm": 0.052096822769913995,
"learning_rate": 0.0010949811862381706,
"loss": 2.5755,
"step": 16545
},
{
"epoch": 5.227434257285004,
"grad_norm": 0.05360652247727314,
"learning_rate": 0.0010944322180371612,
"loss": 2.5952,
"step": 16550
},
{
"epoch": 5.229013661849483,
"grad_norm": 0.06870679333822231,
"learning_rate": 0.0010938832211199177,
"loss": 2.4554,
"step": 16555
},
{
"epoch": 5.230593066413962,
"grad_norm": 0.0708017886233242,
"learning_rate": 0.0010933341956533863,
"loss": 2.5121,
"step": 16560
},
{
"epoch": 5.232172470978441,
"grad_norm": 0.0888433051481054,
"learning_rate": 0.0010927851418045223,
"loss": 2.3919,
"step": 16565
},
{
"epoch": 5.233751875542921,
"grad_norm": 0.05876750386387108,
"learning_rate": 0.0010922360597402899,
"loss": 2.5121,
"step": 16570
},
{
"epoch": 5.2353312801074,
"grad_norm": 0.07072961142914727,
"learning_rate": 0.0010916869496276605,
"loss": 2.4312,
"step": 16575
},
{
"epoch": 5.236910684671878,
"grad_norm": 0.06580191859037744,
"learning_rate": 0.0010911378116336156,
"loss": 2.4287,
"step": 16580
},
{
"epoch": 5.2384900892363575,
"grad_norm": 0.06248045185532465,
"learning_rate": 0.001090588645925145,
"loss": 2.3803,
"step": 16585
},
{
"epoch": 5.240069493800837,
"grad_norm": 0.059890358250749606,
"learning_rate": 0.0010900394526692453,
"loss": 2.5089,
"step": 16590
},
{
"epoch": 5.241648898365316,
"grad_norm": 0.0585610711878174,
"learning_rate": 0.0010894902320329237,
"loss": 2.4623,
"step": 16595
},
{
"epoch": 5.243228302929795,
"grad_norm": 0.05983185438216206,
"learning_rate": 0.0010889409841831942,
"loss": 2.3765,
"step": 16600
},
{
"epoch": 5.2448077074942745,
"grad_norm": 0.060187315671030875,
"learning_rate": 0.0010883917092870796,
"loss": 2.4824,
"step": 16605
},
{
"epoch": 5.246387112058754,
"grad_norm": 0.0585721389921002,
"learning_rate": 0.0010878424075116112,
"loss": 2.5855,
"step": 16610
},
{
"epoch": 5.247966516623233,
"grad_norm": 0.05396930882771886,
"learning_rate": 0.0010872930790238279,
"loss": 2.4659,
"step": 16615
},
{
"epoch": 5.249545921187712,
"grad_norm": 0.058200285764090955,
"learning_rate": 0.0010867437239907764,
"loss": 2.5575,
"step": 16620
},
{
"epoch": 5.2511253257521915,
"grad_norm": 0.07066251591128414,
"learning_rate": 0.0010861943425795131,
"loss": 2.4811,
"step": 16625
},
{
"epoch": 5.252704730316671,
"grad_norm": 0.06454918154698588,
"learning_rate": 0.001085644934957101,
"loss": 2.5084,
"step": 16630
},
{
"epoch": 5.25428413488115,
"grad_norm": 0.06916896566986358,
"learning_rate": 0.0010850955012906113,
"loss": 2.467,
"step": 16635
},
{
"epoch": 5.255863539445629,
"grad_norm": 0.07621274124652579,
"learning_rate": 0.0010845460417471236,
"loss": 2.4866,
"step": 16640
},
{
"epoch": 5.2574429440101085,
"grad_norm": 0.058369653895255,
"learning_rate": 0.0010839965564937244,
"loss": 2.4595,
"step": 16645
},
{
"epoch": 5.259022348574588,
"grad_norm": 0.060666486896924936,
"learning_rate": 0.0010834470456975091,
"loss": 2.4964,
"step": 16650
},
{
"epoch": 5.260601753139067,
"grad_norm": 0.05894951924037973,
"learning_rate": 0.0010828975095255806,
"loss": 2.4953,
"step": 16655
},
{
"epoch": 5.262181157703546,
"grad_norm": 0.06684114577856183,
"learning_rate": 0.001082347948145049,
"loss": 2.5224,
"step": 16660
},
{
"epoch": 5.263760562268025,
"grad_norm": 0.07270527632123917,
"learning_rate": 0.0010817983617230325,
"loss": 2.5415,
"step": 16665
},
{
"epoch": 5.265339966832504,
"grad_norm": 0.05330178952705446,
"learning_rate": 0.0010812487504266565,
"loss": 2.5755,
"step": 16670
},
{
"epoch": 5.266919371396983,
"grad_norm": 0.059499155784585894,
"learning_rate": 0.001080699114423055,
"loss": 2.4563,
"step": 16675
},
{
"epoch": 5.268498775961462,
"grad_norm": 0.06704501210732458,
"learning_rate": 0.0010801494538793684,
"loss": 2.6167,
"step": 16680
},
{
"epoch": 5.270078180525942,
"grad_norm": 0.05814805570784973,
"learning_rate": 0.0010795997689627451,
"loss": 2.4289,
"step": 16685
},
{
"epoch": 5.271657585090421,
"grad_norm": 0.0669517010977098,
"learning_rate": 0.0010790500598403402,
"loss": 2.4127,
"step": 16690
},
{
"epoch": 5.2732369896549,
"grad_norm": 0.06309378267967428,
"learning_rate": 0.001078500326679317,
"loss": 2.4365,
"step": 16695
},
{
"epoch": 5.274816394219379,
"grad_norm": 0.06584312219840473,
"learning_rate": 0.0010779505696468469,
"loss": 2.4437,
"step": 16700
},
{
"epoch": 5.276395798783859,
"grad_norm": 0.0748847383545806,
"learning_rate": 0.0010774007889101061,
"loss": 2.5373,
"step": 16705
},
{
"epoch": 5.277975203348338,
"grad_norm": 0.05230783420322393,
"learning_rate": 0.0010768509846362797,
"loss": 2.5201,
"step": 16710
},
{
"epoch": 5.279554607912817,
"grad_norm": 0.061470445691836156,
"learning_rate": 0.00107630115699256,
"loss": 2.4913,
"step": 16715
},
{
"epoch": 5.281134012477296,
"grad_norm": 0.056911010107714786,
"learning_rate": 0.0010757513061461462,
"loss": 2.4145,
"step": 16720
},
{
"epoch": 5.2827134170417755,
"grad_norm": 0.06249572583749286,
"learning_rate": 0.001075201432264244,
"loss": 2.5379,
"step": 16725
},
{
"epoch": 5.284292821606255,
"grad_norm": 0.05916696895884777,
"learning_rate": 0.001074651535514067,
"loss": 2.5474,
"step": 16730
},
{
"epoch": 5.285872226170734,
"grad_norm": 0.07074457981481334,
"learning_rate": 0.0010741016160628345,
"loss": 2.5206,
"step": 16735
},
{
"epoch": 5.287451630735212,
"grad_norm": 0.07527858926282108,
"learning_rate": 0.0010735516740777741,
"loss": 2.5098,
"step": 16740
},
{
"epoch": 5.289031035299692,
"grad_norm": 0.06153733507179991,
"learning_rate": 0.00107300170972612,
"loss": 2.4103,
"step": 16745
},
{
"epoch": 5.290610439864171,
"grad_norm": 0.06310848763976958,
"learning_rate": 0.0010724517231751123,
"loss": 2.4628,
"step": 16750
},
{
"epoch": 5.29218984442865,
"grad_norm": 0.05508495712248767,
"learning_rate": 0.0010719017145919983,
"loss": 2.4478,
"step": 16755
},
{
"epoch": 5.293769248993129,
"grad_norm": 0.06435719349082726,
"learning_rate": 0.0010713516841440321,
"loss": 2.5433,
"step": 16760
},
{
"epoch": 5.295348653557609,
"grad_norm": 0.0798223299576822,
"learning_rate": 0.001070801631998475,
"loss": 2.447,
"step": 16765
},
{
"epoch": 5.296928058122088,
"grad_norm": 0.060059753102617686,
"learning_rate": 0.0010702515583225936,
"loss": 2.4276,
"step": 16770
},
{
"epoch": 5.298507462686567,
"grad_norm": 0.05718298632033585,
"learning_rate": 0.0010697014632836627,
"loss": 2.4071,
"step": 16775
},
{
"epoch": 5.300086867251046,
"grad_norm": 0.06008953594279999,
"learning_rate": 0.0010691513470489616,
"loss": 2.488,
"step": 16780
},
{
"epoch": 5.301666271815526,
"grad_norm": 0.08472308898028438,
"learning_rate": 0.0010686012097857777,
"loss": 2.4654,
"step": 16785
},
{
"epoch": 5.303245676380005,
"grad_norm": 0.07276775293262801,
"learning_rate": 0.0010680510516614045,
"loss": 2.4008,
"step": 16790
},
{
"epoch": 5.304825080944484,
"grad_norm": 0.08578372585501848,
"learning_rate": 0.0010675008728431414,
"loss": 2.4863,
"step": 16795
},
{
"epoch": 5.306404485508963,
"grad_norm": 0.076242387761487,
"learning_rate": 0.001066950673498294,
"loss": 2.5704,
"step": 16800
},
{
"epoch": 5.307983890073443,
"grad_norm": 0.06923296405260501,
"learning_rate": 0.0010664004537941742,
"loss": 2.5675,
"step": 16805
},
{
"epoch": 5.309563294637922,
"grad_norm": 0.07172642450659165,
"learning_rate": 0.0010658502138981008,
"loss": 2.5564,
"step": 16810
},
{
"epoch": 5.311142699202401,
"grad_norm": 0.07214633031703806,
"learning_rate": 0.0010652999539773984,
"loss": 2.52,
"step": 16815
},
{
"epoch": 5.3127221037668795,
"grad_norm": 0.0673649488725695,
"learning_rate": 0.001064749674199397,
"loss": 2.4927,
"step": 16820
},
{
"epoch": 5.314301508331359,
"grad_norm": 0.08050938669838298,
"learning_rate": 0.0010641993747314334,
"loss": 2.5102,
"step": 16825
},
{
"epoch": 5.315880912895838,
"grad_norm": 0.07195246233541937,
"learning_rate": 0.00106364905574085,
"loss": 2.4801,
"step": 16830
},
{
"epoch": 5.317460317460317,
"grad_norm": 0.05612902846451449,
"learning_rate": 0.0010630987173949958,
"loss": 2.4484,
"step": 16835
},
{
"epoch": 5.3190397220247965,
"grad_norm": 0.0496078995641674,
"learning_rate": 0.0010625483598612246,
"loss": 2.5132,
"step": 16840
},
{
"epoch": 5.320619126589276,
"grad_norm": 0.06237915696779509,
"learning_rate": 0.0010619979833068965,
"loss": 2.4461,
"step": 16845
},
{
"epoch": 5.322198531153755,
"grad_norm": 0.05606620687503389,
"learning_rate": 0.001061447587899378,
"loss": 2.4537,
"step": 16850
},
{
"epoch": 5.323777935718234,
"grad_norm": 0.05437527681426904,
"learning_rate": 0.0010608971738060404,
"loss": 2.5118,
"step": 16855
},
{
"epoch": 5.3253573402827135,
"grad_norm": 0.06628402939707274,
"learning_rate": 0.0010603467411942618,
"loss": 2.4164,
"step": 16860
},
{
"epoch": 5.326936744847193,
"grad_norm": 0.06435070608088721,
"learning_rate": 0.0010597962902314246,
"loss": 2.52,
"step": 16865
},
{
"epoch": 5.328516149411672,
"grad_norm": 0.07830690204343072,
"learning_rate": 0.0010592458210849174,
"loss": 2.4917,
"step": 16870
},
{
"epoch": 5.330095553976151,
"grad_norm": 0.08642147176922639,
"learning_rate": 0.0010586953339221346,
"loss": 2.5402,
"step": 16875
},
{
"epoch": 5.3316749585406304,
"grad_norm": 0.09187424195378634,
"learning_rate": 0.0010581448289104759,
"loss": 2.3867,
"step": 16880
},
{
"epoch": 5.33325436310511,
"grad_norm": 0.14183700498762744,
"learning_rate": 0.0010575943062173462,
"loss": 2.4946,
"step": 16885
},
{
"epoch": 5.334833767669589,
"grad_norm": 0.0643752778203694,
"learning_rate": 0.001057043766010156,
"loss": 2.4688,
"step": 16890
},
{
"epoch": 5.336413172234067,
"grad_norm": 0.05696581393811108,
"learning_rate": 0.0010564932084563207,
"loss": 2.5694,
"step": 16895
},
{
"epoch": 5.3379925767985466,
"grad_norm": 0.06514340707364054,
"learning_rate": 0.0010559426337232618,
"loss": 2.459,
"step": 16900
},
{
"epoch": 5.339571981363026,
"grad_norm": 0.06423140329868181,
"learning_rate": 0.0010553920419784056,
"loss": 2.3664,
"step": 16905
},
{
"epoch": 5.341151385927505,
"grad_norm": 0.05756396111491676,
"learning_rate": 0.0010548414333891834,
"loss": 2.4964,
"step": 16910
},
{
"epoch": 5.342730790491984,
"grad_norm": 0.06258406731521991,
"learning_rate": 0.0010542908081230314,
"loss": 2.3496,
"step": 16915
},
{
"epoch": 5.3443101950564635,
"grad_norm": 0.05578908244242506,
"learning_rate": 0.0010537401663473916,
"loss": 2.4955,
"step": 16920
},
{
"epoch": 5.345889599620943,
"grad_norm": 0.05874669174522487,
"learning_rate": 0.0010531895082297107,
"loss": 2.5043,
"step": 16925
},
{
"epoch": 5.347469004185422,
"grad_norm": 0.05615463876654649,
"learning_rate": 0.0010526388339374402,
"loss": 2.4556,
"step": 16930
},
{
"epoch": 5.349048408749901,
"grad_norm": 0.0717665194726691,
"learning_rate": 0.0010520881436380364,
"loss": 2.434,
"step": 16935
},
{
"epoch": 5.3506278133143805,
"grad_norm": 0.06025021098987067,
"learning_rate": 0.001051537437498961,
"loss": 2.4606,
"step": 16940
},
{
"epoch": 5.35220721787886,
"grad_norm": 0.0723709397590929,
"learning_rate": 0.0010509867156876802,
"loss": 2.529,
"step": 16945
},
{
"epoch": 5.353786622443339,
"grad_norm": 0.07275350241449625,
"learning_rate": 0.001050435978371665,
"loss": 2.4838,
"step": 16950
},
{
"epoch": 5.355366027007818,
"grad_norm": 0.06785690614428286,
"learning_rate": 0.001049885225718391,
"loss": 2.463,
"step": 16955
},
{
"epoch": 5.3569454315722975,
"grad_norm": 0.0666591076212561,
"learning_rate": 0.0010493344578953385,
"loss": 2.3827,
"step": 16960
},
{
"epoch": 5.358524836136777,
"grad_norm": 0.06319052158087238,
"learning_rate": 0.0010487836750699925,
"loss": 2.5529,
"step": 16965
},
{
"epoch": 5.360104240701256,
"grad_norm": 0.054779998660577985,
"learning_rate": 0.0010482328774098428,
"loss": 2.4198,
"step": 16970
},
{
"epoch": 5.361683645265735,
"grad_norm": 0.06595481664767047,
"learning_rate": 0.0010476820650823834,
"loss": 2.5699,
"step": 16975
},
{
"epoch": 5.363263049830214,
"grad_norm": 0.05765694032285817,
"learning_rate": 0.0010471312382551122,
"loss": 2.5256,
"step": 16980
},
{
"epoch": 5.364842454394693,
"grad_norm": 0.06406264763633265,
"learning_rate": 0.0010465803970955325,
"loss": 2.4457,
"step": 16985
},
{
"epoch": 5.366421858959172,
"grad_norm": 0.058716196334157277,
"learning_rate": 0.0010460295417711518,
"loss": 2.4894,
"step": 16990
},
{
"epoch": 5.368001263523651,
"grad_norm": 0.055264350923520805,
"learning_rate": 0.0010454786724494818,
"loss": 2.5555,
"step": 16995
},
{
"epoch": 5.369580668088131,
"grad_norm": 0.0779942238783775,
"learning_rate": 0.0010449277892980381,
"loss": 2.4478,
"step": 17000
},
{
"epoch": 5.37116007265261,
"grad_norm": 0.05737792547478927,
"learning_rate": 0.0010443768924843404,
"loss": 2.4918,
"step": 17005
},
{
"epoch": 5.372739477217089,
"grad_norm": 0.07708306947393631,
"learning_rate": 0.0010438259821759133,
"loss": 2.3815,
"step": 17010
},
{
"epoch": 5.374318881781568,
"grad_norm": 0.07054007734865732,
"learning_rate": 0.0010432750585402852,
"loss": 2.424,
"step": 17015
},
{
"epoch": 5.375898286346048,
"grad_norm": 0.0706022313990559,
"learning_rate": 0.0010427241217449885,
"loss": 2.4761,
"step": 17020
},
{
"epoch": 5.377477690910527,
"grad_norm": 0.06813243778394985,
"learning_rate": 0.0010421731719575588,
"loss": 2.4499,
"step": 17025
},
{
"epoch": 5.379057095475006,
"grad_norm": 0.05391903605438501,
"learning_rate": 0.0010416222093455373,
"loss": 2.4157,
"step": 17030
},
{
"epoch": 5.380636500039485,
"grad_norm": 0.059391745910537766,
"learning_rate": 0.0010410712340764676,
"loss": 2.5278,
"step": 17035
},
{
"epoch": 5.382215904603965,
"grad_norm": 0.06332603147895359,
"learning_rate": 0.0010405202463178984,
"loss": 2.4639,
"step": 17040
},
{
"epoch": 5.383795309168444,
"grad_norm": 0.0711124646613565,
"learning_rate": 0.0010399692462373811,
"loss": 2.4097,
"step": 17045
},
{
"epoch": 5.385374713732923,
"grad_norm": 0.0637810374649417,
"learning_rate": 0.0010394182340024711,
"loss": 2.434,
"step": 17050
},
{
"epoch": 5.3869541182974015,
"grad_norm": 0.055547887293987096,
"learning_rate": 0.0010388672097807281,
"loss": 2.4207,
"step": 17055
},
{
"epoch": 5.388533522861881,
"grad_norm": 0.06417690698464526,
"learning_rate": 0.0010383161737397154,
"loss": 2.4728,
"step": 17060
},
{
"epoch": 5.39011292742636,
"grad_norm": 0.06593119769528422,
"learning_rate": 0.0010377651260469987,
"loss": 2.4172,
"step": 17065
},
{
"epoch": 5.391692331990839,
"grad_norm": 0.06845694761919878,
"learning_rate": 0.0010372140668701482,
"loss": 2.4343,
"step": 17070
},
{
"epoch": 5.393271736555318,
"grad_norm": 0.05436828003384813,
"learning_rate": 0.001036662996376738,
"loss": 2.4925,
"step": 17075
},
{
"epoch": 5.394851141119798,
"grad_norm": 0.061883791335234604,
"learning_rate": 0.0010361119147343448,
"loss": 2.5331,
"step": 17080
},
{
"epoch": 5.396430545684277,
"grad_norm": 0.06348242440565674,
"learning_rate": 0.001035560822110549,
"loss": 2.5279,
"step": 17085
},
{
"epoch": 5.398009950248756,
"grad_norm": 0.0672056420447635,
"learning_rate": 0.001035009718672935,
"loss": 2.4785,
"step": 17090
},
{
"epoch": 5.399589354813235,
"grad_norm": 0.06675203584026605,
"learning_rate": 0.0010344586045890882,
"loss": 2.52,
"step": 17095
},
{
"epoch": 5.401168759377715,
"grad_norm": 0.06551572667439134,
"learning_rate": 0.0010339074800266004,
"loss": 2.5402,
"step": 17100
},
{
"epoch": 5.402748163942194,
"grad_norm": 0.06334548154625982,
"learning_rate": 0.0010333563451530648,
"loss": 2.5497,
"step": 17105
},
{
"epoch": 5.404327568506673,
"grad_norm": 0.10586308533135143,
"learning_rate": 0.0010328052001360778,
"loss": 2.5112,
"step": 17110
},
{
"epoch": 5.405906973071152,
"grad_norm": 0.07921321629017972,
"learning_rate": 0.0010322540451432386,
"loss": 2.5719,
"step": 17115
},
{
"epoch": 5.407486377635632,
"grad_norm": 0.07478108021776518,
"learning_rate": 0.0010317028803421505,
"loss": 2.4436,
"step": 17120
},
{
"epoch": 5.409065782200111,
"grad_norm": 0.06353594190898273,
"learning_rate": 0.001031151705900419,
"loss": 2.4052,
"step": 17125
},
{
"epoch": 5.41064518676459,
"grad_norm": 0.06858545588224939,
"learning_rate": 0.0010306005219856528,
"loss": 2.5134,
"step": 17130
},
{
"epoch": 5.412224591329069,
"grad_norm": 0.11161139739640971,
"learning_rate": 0.0010300493287654635,
"loss": 2.4786,
"step": 17135
},
{
"epoch": 5.413803995893548,
"grad_norm": 0.07029261166704179,
"learning_rate": 0.0010294981264074652,
"loss": 2.5099,
"step": 17140
},
{
"epoch": 5.415383400458027,
"grad_norm": 0.07834550948404823,
"learning_rate": 0.0010289469150792751,
"loss": 2.3872,
"step": 17145
},
{
"epoch": 5.416962805022506,
"grad_norm": 0.12133522402306104,
"learning_rate": 0.001028395694948513,
"loss": 2.4681,
"step": 17150
},
{
"epoch": 5.4185422095869855,
"grad_norm": 0.07107773751852352,
"learning_rate": 0.0010278444661828018,
"loss": 2.5221,
"step": 17155
},
{
"epoch": 5.420121614151465,
"grad_norm": 0.06698901319417577,
"learning_rate": 0.0010272932289497663,
"loss": 2.5352,
"step": 17160
},
{
"epoch": 5.421701018715944,
"grad_norm": 0.07078386952715568,
"learning_rate": 0.0010267419834170339,
"loss": 2.4919,
"step": 17165
},
{
"epoch": 5.423280423280423,
"grad_norm": 0.06088678320734555,
"learning_rate": 0.0010261907297522354,
"loss": 2.4718,
"step": 17170
},
{
"epoch": 5.4248598278449025,
"grad_norm": 0.06240042207023365,
"learning_rate": 0.0010256394681230035,
"loss": 2.5504,
"step": 17175
},
{
"epoch": 5.426439232409382,
"grad_norm": 0.07121927254555442,
"learning_rate": 0.0010250881986969731,
"loss": 2.4199,
"step": 17180
},
{
"epoch": 5.428018636973861,
"grad_norm": 0.05428715785274347,
"learning_rate": 0.0010245369216417817,
"loss": 2.4463,
"step": 17185
},
{
"epoch": 5.42959804153834,
"grad_norm": 0.08288380913410646,
"learning_rate": 0.001023985637125069,
"loss": 2.4475,
"step": 17190
},
{
"epoch": 5.4311774461028195,
"grad_norm": 0.06595287674137955,
"learning_rate": 0.0010234343453144777,
"loss": 2.4012,
"step": 17195
},
{
"epoch": 5.432756850667299,
"grad_norm": 0.06925202479359453,
"learning_rate": 0.0010228830463776513,
"loss": 2.4934,
"step": 17200
},
{
"epoch": 5.434336255231778,
"grad_norm": 0.06264816824230261,
"learning_rate": 0.001022331740482237,
"loss": 2.4051,
"step": 17205
},
{
"epoch": 5.435915659796256,
"grad_norm": 0.06786139302297989,
"learning_rate": 0.0010217804277958828,
"loss": 2.479,
"step": 17210
},
{
"epoch": 5.437495064360736,
"grad_norm": 0.07021697171462576,
"learning_rate": 0.0010212291084862398,
"loss": 2.3518,
"step": 17215
},
{
"epoch": 5.439074468925215,
"grad_norm": 0.06656502176587634,
"learning_rate": 0.0010206777827209607,
"loss": 2.5177,
"step": 17220
},
{
"epoch": 5.440653873489694,
"grad_norm": 0.062355951345659966,
"learning_rate": 0.0010201264506676999,
"loss": 2.3527,
"step": 17225
},
{
"epoch": 5.442233278054173,
"grad_norm": 0.05410415444862466,
"learning_rate": 0.001019575112494114,
"loss": 2.4348,
"step": 17230
},
{
"epoch": 5.443812682618653,
"grad_norm": 0.0749066429064683,
"learning_rate": 0.0010190237683678613,
"loss": 2.5108,
"step": 17235
},
{
"epoch": 5.445392087183132,
"grad_norm": 0.08340716169128037,
"learning_rate": 0.0010184724184566028,
"loss": 2.4268,
"step": 17240
},
{
"epoch": 5.446971491747611,
"grad_norm": 0.0740558184922576,
"learning_rate": 0.0010179210629279992,
"loss": 2.4082,
"step": 17245
},
{
"epoch": 5.44855089631209,
"grad_norm": 0.06154226785203741,
"learning_rate": 0.0010173697019497153,
"loss": 2.5026,
"step": 17250
},
{
"epoch": 5.45013030087657,
"grad_norm": 0.061934212314750915,
"learning_rate": 0.0010168183356894156,
"loss": 2.4739,
"step": 17255
},
{
"epoch": 5.451709705441049,
"grad_norm": 0.11389227639836863,
"learning_rate": 0.0010162669643147676,
"loss": 2.5456,
"step": 17260
},
{
"epoch": 5.453289110005528,
"grad_norm": 0.06718105948625795,
"learning_rate": 0.00101571558799344,
"loss": 2.4807,
"step": 17265
},
{
"epoch": 5.454868514570007,
"grad_norm": 0.07172866049825512,
"learning_rate": 0.0010151642068931023,
"loss": 2.5509,
"step": 17270
},
{
"epoch": 5.456447919134487,
"grad_norm": 0.05455984207106459,
"learning_rate": 0.001014612821181426,
"loss": 2.3514,
"step": 17275
},
{
"epoch": 5.458027323698966,
"grad_norm": 0.058908508581679266,
"learning_rate": 0.0010140614310260843,
"loss": 2.4898,
"step": 17280
},
{
"epoch": 5.459606728263445,
"grad_norm": 0.0609450518634079,
"learning_rate": 0.0010135100365947513,
"loss": 2.412,
"step": 17285
},
{
"epoch": 5.461186132827924,
"grad_norm": 0.060831067041835656,
"learning_rate": 0.0010129586380551027,
"loss": 2.4479,
"step": 17290
},
{
"epoch": 5.462765537392403,
"grad_norm": 0.06320656444773806,
"learning_rate": 0.0010124072355748148,
"loss": 2.3886,
"step": 17295
},
{
"epoch": 5.464344941956882,
"grad_norm": 0.06503437000744444,
"learning_rate": 0.0010118558293215657,
"loss": 2.5205,
"step": 17300
},
{
"epoch": 5.465924346521361,
"grad_norm": 0.059692468617380576,
"learning_rate": 0.0010113044194630348,
"loss": 2.5019,
"step": 17305
},
{
"epoch": 5.46750375108584,
"grad_norm": 0.05487462808337112,
"learning_rate": 0.0010107530061669021,
"loss": 2.4946,
"step": 17310
},
{
"epoch": 5.46908315565032,
"grad_norm": 0.06649084437197074,
"learning_rate": 0.001010201589600849,
"loss": 2.4836,
"step": 17315
},
{
"epoch": 5.470662560214799,
"grad_norm": 0.07901367919834779,
"learning_rate": 0.0010096501699325578,
"loss": 2.5324,
"step": 17320
},
{
"epoch": 5.472241964779278,
"grad_norm": 0.0720978856191932,
"learning_rate": 0.0010090987473297113,
"loss": 2.4307,
"step": 17325
},
{
"epoch": 5.473821369343757,
"grad_norm": 0.07054417946975411,
"learning_rate": 0.001008547321959994,
"loss": 2.465,
"step": 17330
},
{
"epoch": 5.475400773908237,
"grad_norm": 0.05523354163994097,
"learning_rate": 0.001007995893991091,
"loss": 2.4587,
"step": 17335
},
{
"epoch": 5.476980178472716,
"grad_norm": 0.07469524334146507,
"learning_rate": 0.0010074444635906875,
"loss": 2.4247,
"step": 17340
},
{
"epoch": 5.478559583037195,
"grad_norm": 0.05889256875418852,
"learning_rate": 0.00100689303092647,
"loss": 2.4254,
"step": 17345
},
{
"epoch": 5.480138987601674,
"grad_norm": 0.0579479853221772,
"learning_rate": 0.0010063415961661258,
"loss": 2.5711,
"step": 17350
},
{
"epoch": 5.481718392166154,
"grad_norm": 0.07167989050379651,
"learning_rate": 0.0010057901594773431,
"loss": 2.5187,
"step": 17355
},
{
"epoch": 5.483297796730633,
"grad_norm": 0.06150262610378773,
"learning_rate": 0.0010052387210278096,
"loss": 2.4176,
"step": 17360
},
{
"epoch": 5.484877201295112,
"grad_norm": 0.060729870908633486,
"learning_rate": 0.0010046872809852147,
"loss": 2.4863,
"step": 17365
},
{
"epoch": 5.4864566058595905,
"grad_norm": 0.060531187020126144,
"learning_rate": 0.0010041358395172474,
"loss": 2.4736,
"step": 17370
},
{
"epoch": 5.48803601042407,
"grad_norm": 0.058692155895619304,
"learning_rate": 0.001003584396791598,
"loss": 2.4325,
"step": 17375
},
{
"epoch": 5.489615414988549,
"grad_norm": 0.06928908884049095,
"learning_rate": 0.001003032952975956,
"loss": 2.5205,
"step": 17380
},
{
"epoch": 5.491194819553028,
"grad_norm": 0.06234210873800841,
"learning_rate": 0.001002481508238013,
"loss": 2.4895,
"step": 17385
},
{
"epoch": 5.4927742241175075,
"grad_norm": 0.06461681068296442,
"learning_rate": 0.0010019300627454586,
"loss": 2.4935,
"step": 17390
},
{
"epoch": 5.494353628681987,
"grad_norm": 0.06541091432144396,
"learning_rate": 0.0010013786166659846,
"loss": 2.4708,
"step": 17395
},
{
"epoch": 5.495933033246466,
"grad_norm": 0.06942867472925346,
"learning_rate": 0.0010008271701672823,
"loss": 2.5403,
"step": 17400
},
{
"epoch": 5.497512437810945,
"grad_norm": 0.05600043018162233,
"learning_rate": 0.0010002757234170428,
"loss": 2.4785,
"step": 17405
},
{
"epoch": 5.4990918423754245,
"grad_norm": 0.0682744087396039,
"learning_rate": 0.0009997242765829575,
"loss": 2.453,
"step": 17410
},
{
"epoch": 5.500671246939904,
"grad_norm": 0.053749717340257915,
"learning_rate": 0.000999172829832718,
"loss": 2.3701,
"step": 17415
},
{
"epoch": 5.502250651504383,
"grad_norm": 0.06969763757322571,
"learning_rate": 0.0009986213833340155,
"loss": 2.4837,
"step": 17420
},
{
"epoch": 5.503830056068862,
"grad_norm": 0.06641338974341425,
"learning_rate": 0.0009980699372545419,
"loss": 2.4402,
"step": 17425
},
{
"epoch": 5.5054094606333415,
"grad_norm": 0.07644320259996816,
"learning_rate": 0.0009975184917619872,
"loss": 2.564,
"step": 17430
},
{
"epoch": 5.506988865197821,
"grad_norm": 0.07194674371193889,
"learning_rate": 0.000996967047024044,
"loss": 2.4611,
"step": 17435
},
{
"epoch": 5.5085682697623,
"grad_norm": 0.061514768644023464,
"learning_rate": 0.0009964156032084021,
"loss": 2.4047,
"step": 17440
},
{
"epoch": 5.510147674326779,
"grad_norm": 0.06052995303564927,
"learning_rate": 0.0009958641604827527,
"loss": 2.4338,
"step": 17445
},
{
"epoch": 5.5117270788912585,
"grad_norm": 0.07290696842185718,
"learning_rate": 0.0009953127190147858,
"loss": 2.4264,
"step": 17450
},
{
"epoch": 5.513306483455737,
"grad_norm": 0.06714044171667788,
"learning_rate": 0.0009947612789721904,
"loss": 2.535,
"step": 17455
},
{
"epoch": 5.514885888020216,
"grad_norm": 0.07210098271810879,
"learning_rate": 0.0009942098405226571,
"loss": 2.5275,
"step": 17460
},
{
"epoch": 5.516465292584695,
"grad_norm": 0.08397579566338907,
"learning_rate": 0.0009936584038338742,
"loss": 2.4639,
"step": 17465
},
{
"epoch": 5.518044697149175,
"grad_norm": 0.08134901166923597,
"learning_rate": 0.00099310696907353,
"loss": 2.4538,
"step": 17470
},
{
"epoch": 5.519624101713654,
"grad_norm": 0.06500796543368963,
"learning_rate": 0.000992555536409313,
"loss": 2.4564,
"step": 17475
},
{
"epoch": 5.521203506278133,
"grad_norm": 0.07085037792573211,
"learning_rate": 0.000992004106008909,
"loss": 2.4866,
"step": 17480
},
{
"epoch": 5.522782910842612,
"grad_norm": 0.0870294260785915,
"learning_rate": 0.000991452678040006,
"loss": 2.5225,
"step": 17485
},
{
"epoch": 5.5243623154070916,
"grad_norm": 0.07228562028116617,
"learning_rate": 0.0009909012526702887,
"loss": 2.5377,
"step": 17490
},
{
"epoch": 5.525941719971571,
"grad_norm": 0.08333109488041902,
"learning_rate": 0.0009903498300674425,
"loss": 2.4739,
"step": 17495
},
{
"epoch": 5.52752112453605,
"grad_norm": 0.06330541636085896,
"learning_rate": 0.0009897984103991511,
"loss": 2.4587,
"step": 17500
},
{
"epoch": 5.529100529100529,
"grad_norm": 0.056951745501475054,
"learning_rate": 0.0009892469938330981,
"loss": 2.5458,
"step": 17505
},
{
"epoch": 5.5306799336650085,
"grad_norm": 0.05321501357293479,
"learning_rate": 0.0009886955805369654,
"loss": 2.4727,
"step": 17510
},
{
"epoch": 5.532259338229488,
"grad_norm": 0.05821182962415963,
"learning_rate": 0.0009881441706784348,
"loss": 2.488,
"step": 17515
},
{
"epoch": 5.533838742793967,
"grad_norm": 0.06260039710575123,
"learning_rate": 0.0009875927644251855,
"loss": 2.548,
"step": 17520
},
{
"epoch": 5.535418147358445,
"grad_norm": 0.0781565614405411,
"learning_rate": 0.0009870413619448976,
"loss": 2.4291,
"step": 17525
},
{
"epoch": 5.536997551922925,
"grad_norm": 0.08756626675054255,
"learning_rate": 0.0009864899634052487,
"loss": 2.4711,
"step": 17530
},
{
"epoch": 5.538576956487404,
"grad_norm": 0.06891074352458357,
"learning_rate": 0.0009859385689739157,
"loss": 2.5283,
"step": 17535
},
{
"epoch": 5.540156361051883,
"grad_norm": 0.09753062283035774,
"learning_rate": 0.0009853871788185742,
"loss": 2.5116,
"step": 17540
},
{
"epoch": 5.541735765616362,
"grad_norm": 0.07846124506397864,
"learning_rate": 0.0009848357931068977,
"loss": 2.4321,
"step": 17545
},
{
"epoch": 5.543315170180842,
"grad_norm": 0.06235575304439498,
"learning_rate": 0.0009842844120065601,
"loss": 2.4801,
"step": 17550
},
{
"epoch": 5.544894574745321,
"grad_norm": 0.06137945296351546,
"learning_rate": 0.0009837330356852324,
"loss": 2.5149,
"step": 17555
},
{
"epoch": 5.5464739793098,
"grad_norm": 0.06121198130376255,
"learning_rate": 0.0009831816643105845,
"loss": 2.4464,
"step": 17560
},
{
"epoch": 5.548053383874279,
"grad_norm": 0.06269174896054455,
"learning_rate": 0.0009826302980502852,
"loss": 2.4166,
"step": 17565
},
{
"epoch": 5.549632788438759,
"grad_norm": 0.05876432486725242,
"learning_rate": 0.0009820789370720007,
"loss": 2.4987,
"step": 17570
},
{
"epoch": 5.551212193003238,
"grad_norm": 0.06401956237757549,
"learning_rate": 0.0009815275815433975,
"loss": 2.469,
"step": 17575
},
{
"epoch": 5.552791597567717,
"grad_norm": 0.060370186784724854,
"learning_rate": 0.0009809762316321388,
"loss": 2.6298,
"step": 17580
},
{
"epoch": 5.554371002132196,
"grad_norm": 0.0738976588830285,
"learning_rate": 0.0009804248875058862,
"loss": 2.4293,
"step": 17585
},
{
"epoch": 5.555950406696676,
"grad_norm": 0.05452476583067413,
"learning_rate": 0.0009798735493323004,
"loss": 2.4461,
"step": 17590
},
{
"epoch": 5.557529811261155,
"grad_norm": 0.0642538767603594,
"learning_rate": 0.0009793222172790395,
"loss": 2.4149,
"step": 17595
},
{
"epoch": 5.559109215825634,
"grad_norm": 0.06065766885817183,
"learning_rate": 0.0009787708915137603,
"loss": 2.4054,
"step": 17600
},
{
"epoch": 5.560688620390113,
"grad_norm": 0.06602400522764282,
"learning_rate": 0.0009782195722041174,
"loss": 2.5549,
"step": 17605
},
{
"epoch": 5.562268024954593,
"grad_norm": 0.06339566592751954,
"learning_rate": 0.000977668259517763,
"loss": 2.5543,
"step": 17610
},
{
"epoch": 5.563847429519071,
"grad_norm": 0.062141568435096384,
"learning_rate": 0.000977116953622349,
"loss": 2.5201,
"step": 17615
},
{
"epoch": 5.56542683408355,
"grad_norm": 0.05748031048120429,
"learning_rate": 0.0009765656546855226,
"loss": 2.4949,
"step": 17620
},
{
"epoch": 5.5670062386480295,
"grad_norm": 0.05976838766209523,
"learning_rate": 0.0009760143628749312,
"loss": 2.4857,
"step": 17625
},
{
"epoch": 5.568585643212509,
"grad_norm": 0.052765602810347645,
"learning_rate": 0.0009754630783582188,
"loss": 2.5324,
"step": 17630
},
{
"epoch": 5.570165047776988,
"grad_norm": 0.05883039123763326,
"learning_rate": 0.000974911801303027,
"loss": 2.4809,
"step": 17635
},
{
"epoch": 5.571744452341467,
"grad_norm": 0.08782768304510996,
"learning_rate": 0.0009743605318769967,
"loss": 2.4721,
"step": 17640
},
{
"epoch": 5.5733238569059464,
"grad_norm": 0.05907416732092804,
"learning_rate": 0.0009738092702477646,
"loss": 2.4432,
"step": 17645
},
{
"epoch": 5.574903261470426,
"grad_norm": 0.0685264295057429,
"learning_rate": 0.0009732580165829662,
"loss": 2.4322,
"step": 17650
},
{
"epoch": 5.576482666034905,
"grad_norm": 0.05612377288970562,
"learning_rate": 0.0009727067710502341,
"loss": 2.5235,
"step": 17655
},
{
"epoch": 5.578062070599384,
"grad_norm": 0.05649579334506815,
"learning_rate": 0.0009721555338171982,
"loss": 2.5793,
"step": 17660
},
{
"epoch": 5.579641475163863,
"grad_norm": 0.0651808114232073,
"learning_rate": 0.0009716043050514869,
"loss": 2.4136,
"step": 17665
},
{
"epoch": 5.581220879728343,
"grad_norm": 0.06795332757210341,
"learning_rate": 0.0009710530849207249,
"loss": 2.4253,
"step": 17670
},
{
"epoch": 5.582800284292822,
"grad_norm": 0.07165329380615267,
"learning_rate": 0.0009705018735925349,
"loss": 2.4546,
"step": 17675
},
{
"epoch": 5.584379688857301,
"grad_norm": 0.08186143079488234,
"learning_rate": 0.0009699506712345368,
"loss": 2.5271,
"step": 17680
},
{
"epoch": 5.5859590934217795,
"grad_norm": 0.06961513474365384,
"learning_rate": 0.0009693994780143473,
"loss": 2.4857,
"step": 17685
},
{
"epoch": 5.587538497986259,
"grad_norm": 0.07662382334266588,
"learning_rate": 0.0009688482940995813,
"loss": 2.4929,
"step": 17690
},
{
"epoch": 5.589117902550738,
"grad_norm": 0.07356845960705671,
"learning_rate": 0.00096829711965785,
"loss": 2.4273,
"step": 17695
},
{
"epoch": 5.590697307115217,
"grad_norm": 0.060151403946030704,
"learning_rate": 0.0009677459548567617,
"loss": 2.4488,
"step": 17700
},
{
"epoch": 5.5922767116796965,
"grad_norm": 0.07285569534142783,
"learning_rate": 0.0009671947998639228,
"loss": 2.4341,
"step": 17705
},
{
"epoch": 5.593856116244176,
"grad_norm": 0.08477060368378804,
"learning_rate": 0.0009666436548469354,
"loss": 2.4338,
"step": 17710
},
{
"epoch": 5.595435520808655,
"grad_norm": 0.08023375374829808,
"learning_rate": 0.0009660925199733996,
"loss": 2.4462,
"step": 17715
},
{
"epoch": 5.597014925373134,
"grad_norm": 0.055123349431664234,
"learning_rate": 0.000965541395410912,
"loss": 2.4051,
"step": 17720
},
{
"epoch": 5.5985943299376135,
"grad_norm": 0.06205980066098349,
"learning_rate": 0.0009649902813270655,
"loss": 2.3988,
"step": 17725
},
{
"epoch": 5.600173734502093,
"grad_norm": 0.05587189952928736,
"learning_rate": 0.000964439177889451,
"loss": 2.4597,
"step": 17730
},
{
"epoch": 5.601753139066572,
"grad_norm": 0.057582250498120324,
"learning_rate": 0.0009638880852656552,
"loss": 2.5108,
"step": 17735
},
{
"epoch": 5.603332543631051,
"grad_norm": 0.06339550672783038,
"learning_rate": 0.0009633370036232622,
"loss": 2.5168,
"step": 17740
},
{
"epoch": 5.6049119481955305,
"grad_norm": 0.057733986727466144,
"learning_rate": 0.0009627859331298521,
"loss": 2.4336,
"step": 17745
},
{
"epoch": 5.60649135276001,
"grad_norm": 0.06390094290114952,
"learning_rate": 0.0009622348739530016,
"loss": 2.5104,
"step": 17750
},
{
"epoch": 5.608070757324489,
"grad_norm": 0.08318532157702971,
"learning_rate": 0.000961683826260285,
"loss": 2.3577,
"step": 17755
},
{
"epoch": 5.609650161888968,
"grad_norm": 0.07191966723564622,
"learning_rate": 0.0009611327902192718,
"loss": 2.5667,
"step": 17760
},
{
"epoch": 5.6112295664534475,
"grad_norm": 0.06433803397010941,
"learning_rate": 0.000960581765997529,
"loss": 2.4523,
"step": 17765
},
{
"epoch": 5.612808971017926,
"grad_norm": 0.0702119826574869,
"learning_rate": 0.0009600307537626193,
"loss": 2.4192,
"step": 17770
},
{
"epoch": 5.614388375582405,
"grad_norm": 0.07572920151883997,
"learning_rate": 0.0009594797536821018,
"loss": 2.406,
"step": 17775
},
{
"epoch": 5.615967780146884,
"grad_norm": 0.06286998282516897,
"learning_rate": 0.0009589287659235326,
"loss": 2.4889,
"step": 17780
},
{
"epoch": 5.617547184711364,
"grad_norm": 0.06618826119811236,
"learning_rate": 0.0009583777906544627,
"loss": 2.5588,
"step": 17785
},
{
"epoch": 5.619126589275843,
"grad_norm": 0.0859103412752953,
"learning_rate": 0.0009578268280424413,
"loss": 2.3832,
"step": 17790
},
{
"epoch": 5.620705993840322,
"grad_norm": 0.07137853429429822,
"learning_rate": 0.000957275878255012,
"loss": 2.5166,
"step": 17795
},
{
"epoch": 5.622285398404801,
"grad_norm": 0.06758649234475232,
"learning_rate": 0.0009567249414597148,
"loss": 2.4675,
"step": 17800
},
{
"epoch": 5.623864802969281,
"grad_norm": 0.059088756328290484,
"learning_rate": 0.0009561740178240868,
"loss": 2.4409,
"step": 17805
},
{
"epoch": 5.62544420753376,
"grad_norm": 0.08041593622491673,
"learning_rate": 0.0009556231075156598,
"loss": 2.4645,
"step": 17810
},
{
"epoch": 5.627023612098239,
"grad_norm": 0.05685364383901346,
"learning_rate": 0.000955072210701962,
"loss": 2.5194,
"step": 17815
},
{
"epoch": 5.628603016662718,
"grad_norm": 0.06871867375902899,
"learning_rate": 0.0009545213275505182,
"loss": 2.4839,
"step": 17820
},
{
"epoch": 5.630182421227198,
"grad_norm": 0.08478486893781242,
"learning_rate": 0.0009539704582288479,
"loss": 2.4521,
"step": 17825
},
{
"epoch": 5.631761825791677,
"grad_norm": 0.06389375572876728,
"learning_rate": 0.0009534196029044676,
"loss": 2.4364,
"step": 17830
},
{
"epoch": 5.633341230356156,
"grad_norm": 0.06457230801716601,
"learning_rate": 0.0009528687617448882,
"loss": 2.4594,
"step": 17835
},
{
"epoch": 5.634920634920634,
"grad_norm": 0.058486007512611106,
"learning_rate": 0.0009523179349176169,
"loss": 2.4242,
"step": 17840
},
{
"epoch": 5.636500039485114,
"grad_norm": 0.0594000856430798,
"learning_rate": 0.0009517671225901574,
"loss": 2.307,
"step": 17845
},
{
"epoch": 5.638079444049593,
"grad_norm": 0.05988223693010041,
"learning_rate": 0.0009512163249300074,
"loss": 2.4774,
"step": 17850
},
{
"epoch": 5.639658848614072,
"grad_norm": 0.06761927429607896,
"learning_rate": 0.0009506655421046616,
"loss": 2.5086,
"step": 17855
},
{
"epoch": 5.641238253178551,
"grad_norm": 0.06698682955923135,
"learning_rate": 0.0009501147742816093,
"loss": 2.4664,
"step": 17860
},
{
"epoch": 5.642817657743031,
"grad_norm": 0.06694964877617479,
"learning_rate": 0.0009495640216283352,
"loss": 2.4891,
"step": 17865
},
{
"epoch": 5.64439706230751,
"grad_norm": 0.06878856138544699,
"learning_rate": 0.0009490132843123201,
"loss": 2.5097,
"step": 17870
},
{
"epoch": 5.645976466871989,
"grad_norm": 0.060872884646230065,
"learning_rate": 0.0009484625625010388,
"loss": 2.4395,
"step": 17875
},
{
"epoch": 5.647555871436468,
"grad_norm": 0.06669403635612096,
"learning_rate": 0.0009479118563619636,
"loss": 2.4011,
"step": 17880
},
{
"epoch": 5.649135276000948,
"grad_norm": 0.06080545042600969,
"learning_rate": 0.00094736116606256,
"loss": 2.4199,
"step": 17885
},
{
"epoch": 5.650714680565427,
"grad_norm": 0.05368013530662863,
"learning_rate": 0.0009468104917702894,
"loss": 2.4389,
"step": 17890
},
{
"epoch": 5.652294085129906,
"grad_norm": 0.06946420227942268,
"learning_rate": 0.0009462598336526086,
"loss": 2.5133,
"step": 17895
},
{
"epoch": 5.653873489694385,
"grad_norm": 0.06366908445615306,
"learning_rate": 0.0009457091918769685,
"loss": 2.4904,
"step": 17900
},
{
"epoch": 5.655452894258865,
"grad_norm": 0.07362315822428503,
"learning_rate": 0.0009451585666108167,
"loss": 2.5063,
"step": 17905
},
{
"epoch": 5.657032298823344,
"grad_norm": 0.09533561009131834,
"learning_rate": 0.0009446079580215945,
"loss": 2.4746,
"step": 17910
},
{
"epoch": 5.658611703387823,
"grad_norm": 0.06077729252304897,
"learning_rate": 0.0009440573662767381,
"loss": 2.5094,
"step": 17915
},
{
"epoch": 5.660191107952302,
"grad_norm": 0.0662968301309586,
"learning_rate": 0.0009435067915436794,
"loss": 2.3658,
"step": 17920
},
{
"epoch": 5.661770512516782,
"grad_norm": 0.0769241530947481,
"learning_rate": 0.0009429562339898445,
"loss": 2.5339,
"step": 17925
},
{
"epoch": 5.66334991708126,
"grad_norm": 0.06592696376918891,
"learning_rate": 0.0009424056937826538,
"loss": 2.5045,
"step": 17930
},
{
"epoch": 5.664929321645739,
"grad_norm": 0.08464207015232636,
"learning_rate": 0.0009418551710895242,
"loss": 2.5892,
"step": 17935
},
{
"epoch": 5.6665087262102185,
"grad_norm": 0.07033232531022446,
"learning_rate": 0.0009413046660778654,
"loss": 2.4835,
"step": 17940
},
{
"epoch": 5.668088130774698,
"grad_norm": 0.06718718776882328,
"learning_rate": 0.0009407541789150828,
"loss": 2.455,
"step": 17945
},
{
"epoch": 5.669667535339177,
"grad_norm": 0.07220635578147377,
"learning_rate": 0.0009402037097685759,
"loss": 2.4157,
"step": 17950
},
{
"epoch": 5.671246939903656,
"grad_norm": 0.07669643712845939,
"learning_rate": 0.0009396532588057384,
"loss": 2.5414,
"step": 17955
},
{
"epoch": 5.6728263444681355,
"grad_norm": 0.06481064584496879,
"learning_rate": 0.0009391028261939597,
"loss": 2.4669,
"step": 17960
},
{
"epoch": 5.674405749032615,
"grad_norm": 0.05562313944754461,
"learning_rate": 0.000938552412100622,
"loss": 2.428,
"step": 17965
},
{
"epoch": 5.675985153597094,
"grad_norm": 0.060502393132632275,
"learning_rate": 0.0009380020166931036,
"loss": 2.4722,
"step": 17970
},
{
"epoch": 5.677564558161573,
"grad_norm": 0.06509747628332405,
"learning_rate": 0.0009374516401387759,
"loss": 2.5836,
"step": 17975
},
{
"epoch": 5.6791439627260525,
"grad_norm": 0.05353349416140083,
"learning_rate": 0.0009369012826050045,
"loss": 2.4497,
"step": 17980
},
{
"epoch": 5.680723367290532,
"grad_norm": 0.08962014160900758,
"learning_rate": 0.0009363509442591501,
"loss": 2.4346,
"step": 17985
},
{
"epoch": 5.682302771855011,
"grad_norm": 0.0562749201074952,
"learning_rate": 0.0009358006252685666,
"loss": 2.4374,
"step": 17990
},
{
"epoch": 5.68388217641949,
"grad_norm": 0.05772870332111024,
"learning_rate": 0.0009352503258006031,
"loss": 2.4886,
"step": 17995
},
{
"epoch": 5.685461580983969,
"grad_norm": 0.056923818553177526,
"learning_rate": 0.0009347000460226019,
"loss": 2.3856,
"step": 18000
},
{
"epoch": 5.687040985548448,
"grad_norm": 0.05729033792146132,
"learning_rate": 0.0009341497861018992,
"loss": 2.48,
"step": 18005
},
{
"epoch": 5.688620390112927,
"grad_norm": 0.06981635363934481,
"learning_rate": 0.000933599546205826,
"loss": 2.4828,
"step": 18010
},
{
"epoch": 5.690199794677406,
"grad_norm": 0.05164015352664017,
"learning_rate": 0.0009330493265017061,
"loss": 2.4892,
"step": 18015
},
{
"epoch": 5.691779199241886,
"grad_norm": 0.057934213648398486,
"learning_rate": 0.0009324991271568588,
"loss": 2.4577,
"step": 18020
},
{
"epoch": 5.693358603806365,
"grad_norm": 0.05694147473041426,
"learning_rate": 0.0009319489483385955,
"loss": 2.4117,
"step": 18025
},
{
"epoch": 5.694938008370844,
"grad_norm": 0.05096467033027819,
"learning_rate": 0.0009313987902142222,
"loss": 2.4195,
"step": 18030
},
{
"epoch": 5.696517412935323,
"grad_norm": 0.057477673552068995,
"learning_rate": 0.0009308486529510386,
"loss": 2.3689,
"step": 18035
},
{
"epoch": 5.698096817499803,
"grad_norm": 0.06798699127392135,
"learning_rate": 0.0009302985367163379,
"loss": 2.4072,
"step": 18040
},
{
"epoch": 5.699676222064282,
"grad_norm": 0.05563695899204988,
"learning_rate": 0.0009297484416774066,
"loss": 2.4454,
"step": 18045
},
{
"epoch": 5.701255626628761,
"grad_norm": 0.09863742171806429,
"learning_rate": 0.0009291983680015254,
"loss": 2.4635,
"step": 18050
},
{
"epoch": 5.70283503119324,
"grad_norm": 0.0628812994958859,
"learning_rate": 0.0009286483158559679,
"loss": 2.3879,
"step": 18055
},
{
"epoch": 5.70441443575772,
"grad_norm": 0.0638576863793566,
"learning_rate": 0.0009280982854080021,
"loss": 2.4721,
"step": 18060
},
{
"epoch": 5.705993840322199,
"grad_norm": 0.05476592238086014,
"learning_rate": 0.0009275482768248881,
"loss": 2.4724,
"step": 18065
},
{
"epoch": 5.707573244886678,
"grad_norm": 0.0725197139200165,
"learning_rate": 0.0009269982902738802,
"loss": 2.6608,
"step": 18070
},
{
"epoch": 5.709152649451157,
"grad_norm": 0.06572976807566827,
"learning_rate": 0.0009264483259222259,
"loss": 2.4306,
"step": 18075
},
{
"epoch": 5.7107320540156365,
"grad_norm": 0.0681474422557826,
"learning_rate": 0.0009258983839371655,
"loss": 2.4726,
"step": 18080
},
{
"epoch": 5.712311458580115,
"grad_norm": 0.06744277166816749,
"learning_rate": 0.0009253484644859332,
"loss": 2.4281,
"step": 18085
},
{
"epoch": 5.713890863144594,
"grad_norm": 0.07761470290397195,
"learning_rate": 0.0009247985677357562,
"loss": 2.4706,
"step": 18090
},
{
"epoch": 5.715470267709073,
"grad_norm": 0.057970051991407166,
"learning_rate": 0.000924248693853854,
"loss": 2.4427,
"step": 18095
},
{
"epoch": 5.717049672273553,
"grad_norm": 0.08015945925405381,
"learning_rate": 0.0009236988430074401,
"loss": 2.4668,
"step": 18100
},
{
"epoch": 5.718629076838032,
"grad_norm": 0.06550079456203349,
"learning_rate": 0.0009231490153637202,
"loss": 2.5061,
"step": 18105
},
{
"epoch": 5.720208481402511,
"grad_norm": 0.0668134312212946,
"learning_rate": 0.0009225992110898941,
"loss": 2.411,
"step": 18110
},
{
"epoch": 5.72178788596699,
"grad_norm": 0.10618086040739694,
"learning_rate": 0.0009220494303531534,
"loss": 2.4976,
"step": 18115
},
{
"epoch": 5.72336729053147,
"grad_norm": 0.10057209914764205,
"learning_rate": 0.0009214996733206826,
"loss": 2.4485,
"step": 18120
},
{
"epoch": 5.724946695095949,
"grad_norm": 0.09196840925031985,
"learning_rate": 0.00092094994015966,
"loss": 2.4374,
"step": 18125
},
{
"epoch": 5.726526099660428,
"grad_norm": 0.08473209885560502,
"learning_rate": 0.000920400231037255,
"loss": 2.5291,
"step": 18130
},
{
"epoch": 5.728105504224907,
"grad_norm": 0.07124496206061115,
"learning_rate": 0.0009198505461206318,
"loss": 2.4709,
"step": 18135
},
{
"epoch": 5.729684908789387,
"grad_norm": 0.06020663786883897,
"learning_rate": 0.0009193008855769452,
"loss": 2.4731,
"step": 18140
},
{
"epoch": 5.731264313353866,
"grad_norm": 0.04977407449166733,
"learning_rate": 0.0009187512495733432,
"loss": 2.4211,
"step": 18145
},
{
"epoch": 5.732843717918345,
"grad_norm": 0.0545026125366248,
"learning_rate": 0.0009182016382769676,
"loss": 2.5419,
"step": 18150
},
{
"epoch": 5.7344231224828235,
"grad_norm": 0.05529547977282229,
"learning_rate": 0.0009176520518549512,
"loss": 2.4584,
"step": 18155
},
{
"epoch": 5.736002527047303,
"grad_norm": 0.053943009477869834,
"learning_rate": 0.0009171024904744195,
"loss": 2.3883,
"step": 18160
},
{
"epoch": 5.737581931611782,
"grad_norm": 0.0785572884148887,
"learning_rate": 0.0009165529543024909,
"loss": 2.4702,
"step": 18165
},
{
"epoch": 5.739161336176261,
"grad_norm": 0.06983162962876663,
"learning_rate": 0.0009160034435062755,
"loss": 2.5775,
"step": 18170
},
{
"epoch": 5.7407407407407405,
"grad_norm": 0.07317556418866439,
"learning_rate": 0.0009154539582528766,
"loss": 2.4661,
"step": 18175
},
{
"epoch": 5.74232014530522,
"grad_norm": 0.05152038341836128,
"learning_rate": 0.0009149044987093887,
"loss": 2.4231,
"step": 18180
},
{
"epoch": 5.743899549869699,
"grad_norm": 0.06470217308342688,
"learning_rate": 0.000914355065042899,
"loss": 2.3834,
"step": 18185
},
{
"epoch": 5.745478954434178,
"grad_norm": 0.05801791502540421,
"learning_rate": 0.0009138056574204869,
"loss": 2.4993,
"step": 18190
},
{
"epoch": 5.7470583589986575,
"grad_norm": 0.049807297826352216,
"learning_rate": 0.0009132562760092234,
"loss": 2.3345,
"step": 18195
},
{
"epoch": 5.748637763563137,
"grad_norm": 0.05176149164337185,
"learning_rate": 0.0009127069209761725,
"loss": 2.3969,
"step": 18200
},
{
"epoch": 5.750217168127616,
"grad_norm": 0.05540983496219784,
"learning_rate": 0.0009121575924883891,
"loss": 2.5172,
"step": 18205
},
{
"epoch": 5.751796572692095,
"grad_norm": 0.07062737019904822,
"learning_rate": 0.0009116082907129204,
"loss": 2.4267,
"step": 18210
},
{
"epoch": 5.7533759772565745,
"grad_norm": 0.05713448525959423,
"learning_rate": 0.0009110590158168061,
"loss": 2.4438,
"step": 18215
},
{
"epoch": 5.754955381821054,
"grad_norm": 0.07565938761972303,
"learning_rate": 0.0009105097679670763,
"loss": 2.5628,
"step": 18220
},
{
"epoch": 5.756534786385533,
"grad_norm": 0.07144974280431203,
"learning_rate": 0.000909960547330755,
"loss": 2.4112,
"step": 18225
},
{
"epoch": 5.758114190950012,
"grad_norm": 0.08004658227388475,
"learning_rate": 0.0009094113540748556,
"loss": 2.4569,
"step": 18230
},
{
"epoch": 5.7596935955144914,
"grad_norm": 0.06325848747050856,
"learning_rate": 0.0009088621883663843,
"loss": 2.473,
"step": 18235
},
{
"epoch": 5.761273000078971,
"grad_norm": 0.0658128971018369,
"learning_rate": 0.0009083130503723397,
"loss": 2.4052,
"step": 18240
},
{
"epoch": 5.762852404643449,
"grad_norm": 0.07131593128764628,
"learning_rate": 0.0009077639402597104,
"loss": 2.4612,
"step": 18245
},
{
"epoch": 5.764431809207928,
"grad_norm": 0.05815984418376559,
"learning_rate": 0.0009072148581954777,
"loss": 2.445,
"step": 18250
},
{
"epoch": 5.7660112137724076,
"grad_norm": 0.06696390988086089,
"learning_rate": 0.000906665804346614,
"loss": 2.3816,
"step": 18255
},
{
"epoch": 5.767590618336887,
"grad_norm": 0.06072912908978546,
"learning_rate": 0.0009061167788800824,
"loss": 2.4931,
"step": 18260
},
{
"epoch": 5.769170022901366,
"grad_norm": 0.054937370178212874,
"learning_rate": 0.0009055677819628388,
"loss": 2.4928,
"step": 18265
},
{
"epoch": 5.770749427465845,
"grad_norm": 0.06507034650605066,
"learning_rate": 0.0009050188137618295,
"loss": 2.3613,
"step": 18270
},
{
"epoch": 5.7723288320303245,
"grad_norm": 0.05901245015000263,
"learning_rate": 0.0009044698744439918,
"loss": 2.4248,
"step": 18275
},
{
"epoch": 5.773908236594804,
"grad_norm": 0.05660425661902344,
"learning_rate": 0.0009039209641762551,
"loss": 2.4165,
"step": 18280
},
{
"epoch": 5.775487641159283,
"grad_norm": 0.06049866366270378,
"learning_rate": 0.000903372083125539,
"loss": 2.4223,
"step": 18285
},
{
"epoch": 5.777067045723762,
"grad_norm": 0.060957914090694364,
"learning_rate": 0.0009028232314587555,
"loss": 2.4436,
"step": 18290
},
{
"epoch": 5.7786464502882415,
"grad_norm": 0.053613679381113895,
"learning_rate": 0.0009022744093428063,
"loss": 2.5136,
"step": 18295
},
{
"epoch": 5.780225854852721,
"grad_norm": 0.05835608721294925,
"learning_rate": 0.0009017256169445846,
"loss": 2.4515,
"step": 18300
},
{
"epoch": 5.7818052594172,
"grad_norm": 0.07079845036186541,
"learning_rate": 0.0009011768544309751,
"loss": 2.3503,
"step": 18305
},
{
"epoch": 5.783384663981679,
"grad_norm": 0.0652389566633875,
"learning_rate": 0.0009006281219688525,
"loss": 2.4271,
"step": 18310
},
{
"epoch": 5.784964068546158,
"grad_norm": 0.06330923838643326,
"learning_rate": 0.0009000794197250837,
"loss": 2.376,
"step": 18315
},
{
"epoch": 5.786543473110637,
"grad_norm": 0.06052683005905966,
"learning_rate": 0.0008995307478665246,
"loss": 2.4239,
"step": 18320
},
{
"epoch": 5.788122877675116,
"grad_norm": 0.06390808341534668,
"learning_rate": 0.000898982106560023,
"loss": 2.3723,
"step": 18325
},
{
"epoch": 5.789702282239595,
"grad_norm": 0.06987840184368298,
"learning_rate": 0.0008984334959724177,
"loss": 2.5108,
"step": 18330
},
{
"epoch": 5.791281686804075,
"grad_norm": 0.05559648943533013,
"learning_rate": 0.0008978849162705369,
"loss": 2.4836,
"step": 18335
},
{
"epoch": 5.792861091368554,
"grad_norm": 0.059576872584581095,
"learning_rate": 0.000897336367621201,
"loss": 2.5455,
"step": 18340
},
{
"epoch": 5.794440495933033,
"grad_norm": 0.06256260950604808,
"learning_rate": 0.0008967878501912199,
"loss": 2.4057,
"step": 18345
},
{
"epoch": 5.796019900497512,
"grad_norm": 0.0634177189700817,
"learning_rate": 0.0008962393641473935,
"loss": 2.4772,
"step": 18350
},
{
"epoch": 5.797599305061992,
"grad_norm": 0.061569379281350534,
"learning_rate": 0.000895690909656514,
"loss": 2.4593,
"step": 18355
},
{
"epoch": 5.799178709626471,
"grad_norm": 0.07243432736085856,
"learning_rate": 0.0008951424868853622,
"loss": 2.4282,
"step": 18360
},
{
"epoch": 5.80075811419095,
"grad_norm": 0.08064467427424583,
"learning_rate": 0.0008945940960007105,
"loss": 2.4573,
"step": 18365
},
{
"epoch": 5.802337518755429,
"grad_norm": 0.05480200407215235,
"learning_rate": 0.0008940457371693207,
"loss": 2.4848,
"step": 18370
},
{
"epoch": 5.803916923319909,
"grad_norm": 0.05697400727478003,
"learning_rate": 0.0008934974105579448,
"loss": 2.3826,
"step": 18375
},
{
"epoch": 5.805496327884388,
"grad_norm": 0.060818124238185944,
"learning_rate": 0.0008929491163333263,
"loss": 2.4257,
"step": 18380
},
{
"epoch": 5.807075732448867,
"grad_norm": 0.05427907215517275,
"learning_rate": 0.0008924008546621977,
"loss": 2.5639,
"step": 18385
},
{
"epoch": 5.808655137013346,
"grad_norm": 0.06286923950584318,
"learning_rate": 0.0008918526257112813,
"loss": 2.4455,
"step": 18390
},
{
"epoch": 5.810234541577826,
"grad_norm": 0.05475356012061112,
"learning_rate": 0.0008913044296472907,
"loss": 2.3673,
"step": 18395
},
{
"epoch": 5.811813946142305,
"grad_norm": 0.09123960529197869,
"learning_rate": 0.0008907562666369283,
"loss": 2.4212,
"step": 18400
},
{
"epoch": 5.813393350706783,
"grad_norm": 0.06160750325735122,
"learning_rate": 0.0008902081368468877,
"loss": 2.4737,
"step": 18405
},
{
"epoch": 5.8149727552712625,
"grad_norm": 0.06042864285383974,
"learning_rate": 0.0008896600404438512,
"loss": 2.4076,
"step": 18410
},
{
"epoch": 5.816552159835742,
"grad_norm": 0.062130405319900335,
"learning_rate": 0.0008891119775944908,
"loss": 2.4252,
"step": 18415
},
{
"epoch": 5.818131564400221,
"grad_norm": 0.052148374032821625,
"learning_rate": 0.0008885639484654701,
"loss": 2.3184,
"step": 18420
},
{
"epoch": 5.8197109689647,
"grad_norm": 0.07799748157479257,
"learning_rate": 0.0008880159532234403,
"loss": 2.4437,
"step": 18425
},
{
"epoch": 5.821290373529179,
"grad_norm": 0.06354087937293497,
"learning_rate": 0.000887467992035044,
"loss": 2.3482,
"step": 18430
},
{
"epoch": 5.822869778093659,
"grad_norm": 0.06776266791706592,
"learning_rate": 0.0008869200650669123,
"loss": 2.4484,
"step": 18435
},
{
"epoch": 5.824449182658138,
"grad_norm": 0.06986149822627638,
"learning_rate": 0.0008863721724856658,
"loss": 2.4601,
"step": 18440
},
{
"epoch": 5.826028587222617,
"grad_norm": 0.0636207961069725,
"learning_rate": 0.0008858243144579162,
"loss": 2.5333,
"step": 18445
},
{
"epoch": 5.827607991787096,
"grad_norm": 0.05633739964135409,
"learning_rate": 0.0008852764911502629,
"loss": 2.4579,
"step": 18450
},
{
"epoch": 5.829187396351576,
"grad_norm": 0.07580112598710743,
"learning_rate": 0.0008847287027292959,
"loss": 2.4581,
"step": 18455
},
{
"epoch": 5.830766800916055,
"grad_norm": 0.0584897564914947,
"learning_rate": 0.0008841809493615937,
"loss": 2.4064,
"step": 18460
},
{
"epoch": 5.832346205480534,
"grad_norm": 0.05670195848256627,
"learning_rate": 0.0008836332312137245,
"loss": 2.4902,
"step": 18465
},
{
"epoch": 5.8339256100450125,
"grad_norm": 0.06261669554059426,
"learning_rate": 0.0008830855484522467,
"loss": 2.3873,
"step": 18470
},
{
"epoch": 5.835505014609492,
"grad_norm": 0.05497956371246285,
"learning_rate": 0.0008825379012437065,
"loss": 2.3732,
"step": 18475
},
{
"epoch": 5.837084419173971,
"grad_norm": 0.058994243345202174,
"learning_rate": 0.0008819902897546399,
"loss": 2.44,
"step": 18480
},
{
"epoch": 5.83866382373845,
"grad_norm": 0.05702003258862989,
"learning_rate": 0.0008814427141515724,
"loss": 2.4532,
"step": 18485
},
{
"epoch": 5.8402432283029295,
"grad_norm": 0.054639243278408696,
"learning_rate": 0.0008808951746010176,
"loss": 2.4148,
"step": 18490
},
{
"epoch": 5.841822632867409,
"grad_norm": 0.06417166303687594,
"learning_rate": 0.0008803476712694799,
"loss": 2.5821,
"step": 18495
},
{
"epoch": 5.843402037431888,
"grad_norm": 0.06281475530121881,
"learning_rate": 0.0008798002043234507,
"loss": 2.4995,
"step": 18500
},
{
"epoch": 5.844981441996367,
"grad_norm": 0.05850353517807507,
"learning_rate": 0.0008792527739294109,
"loss": 2.5369,
"step": 18505
},
{
"epoch": 5.8465608465608465,
"grad_norm": 0.06933387569229595,
"learning_rate": 0.0008787053802538315,
"loss": 2.5295,
"step": 18510
},
{
"epoch": 5.848140251125326,
"grad_norm": 0.05310284438947429,
"learning_rate": 0.0008781580234631707,
"loss": 2.436,
"step": 18515
},
{
"epoch": 5.849719655689805,
"grad_norm": 0.05225047738231209,
"learning_rate": 0.0008776107037238768,
"loss": 2.4643,
"step": 18520
},
{
"epoch": 5.851299060254284,
"grad_norm": 0.06069573506210318,
"learning_rate": 0.000877063421202386,
"loss": 2.5021,
"step": 18525
},
{
"epoch": 5.8528784648187635,
"grad_norm": 0.06725734461704204,
"learning_rate": 0.0008765161760651228,
"loss": 2.448,
"step": 18530
},
{
"epoch": 5.854457869383243,
"grad_norm": 0.06705519985775174,
"learning_rate": 0.000875968968478502,
"loss": 2.4865,
"step": 18535
},
{
"epoch": 5.856037273947722,
"grad_norm": 0.06879518724372008,
"learning_rate": 0.0008754217986089252,
"loss": 2.544,
"step": 18540
},
{
"epoch": 5.857616678512201,
"grad_norm": 0.06783341102034925,
"learning_rate": 0.0008748746666227837,
"loss": 2.4572,
"step": 18545
},
{
"epoch": 5.8591960830766805,
"grad_norm": 0.06202947189676426,
"learning_rate": 0.0008743275726864567,
"loss": 2.5136,
"step": 18550
},
{
"epoch": 5.86077548764116,
"grad_norm": 0.06858077927005182,
"learning_rate": 0.0008737805169663113,
"loss": 2.5023,
"step": 18555
},
{
"epoch": 5.862354892205638,
"grad_norm": 0.07833625418904053,
"learning_rate": 0.0008732334996287048,
"loss": 2.4177,
"step": 18560
},
{
"epoch": 5.863934296770117,
"grad_norm": 0.06708193322427108,
"learning_rate": 0.000872686520839981,
"loss": 2.4835,
"step": 18565
},
{
"epoch": 5.865513701334597,
"grad_norm": 0.06777483951535948,
"learning_rate": 0.000872139580766473,
"loss": 2.3961,
"step": 18570
},
{
"epoch": 5.867093105899076,
"grad_norm": 0.05897782997028614,
"learning_rate": 0.0008715926795745013,
"loss": 2.4551,
"step": 18575
},
{
"epoch": 5.868672510463555,
"grad_norm": 0.05648603893901046,
"learning_rate": 0.000871045817430375,
"loss": 2.4382,
"step": 18580
},
{
"epoch": 5.870251915028034,
"grad_norm": 0.058115341169490256,
"learning_rate": 0.0008704989945003925,
"loss": 2.521,
"step": 18585
},
{
"epoch": 5.871831319592514,
"grad_norm": 0.058398666016500775,
"learning_rate": 0.0008699522109508381,
"loss": 2.5097,
"step": 18590
},
{
"epoch": 5.873410724156993,
"grad_norm": 0.05148611445163005,
"learning_rate": 0.0008694054669479849,
"loss": 2.4309,
"step": 18595
},
{
"epoch": 5.874990128721472,
"grad_norm": 0.05600237584439396,
"learning_rate": 0.0008688587626580953,
"loss": 2.4497,
"step": 18600
},
{
"epoch": 5.876569533285951,
"grad_norm": 0.055734415645769995,
"learning_rate": 0.000868312098247418,
"loss": 2.4417,
"step": 18605
},
{
"epoch": 5.878148937850431,
"grad_norm": 0.049748014401075,
"learning_rate": 0.0008677654738821904,
"loss": 2.4085,
"step": 18610
},
{
"epoch": 5.87972834241491,
"grad_norm": 0.05825774563921929,
"learning_rate": 0.0008672188897286372,
"loss": 2.4668,
"step": 18615
},
{
"epoch": 5.881307746979389,
"grad_norm": 0.057123010762611504,
"learning_rate": 0.000866672345952971,
"loss": 2.4945,
"step": 18620
},
{
"epoch": 5.882887151543868,
"grad_norm": 0.07302755947184687,
"learning_rate": 0.0008661258427213929,
"loss": 2.5048,
"step": 18625
},
{
"epoch": 5.884466556108347,
"grad_norm": 0.06749009836319973,
"learning_rate": 0.0008655793802000904,
"loss": 2.4409,
"step": 18630
},
{
"epoch": 5.886045960672826,
"grad_norm": 0.060230827309487846,
"learning_rate": 0.0008650329585552399,
"loss": 2.4291,
"step": 18635
},
{
"epoch": 5.887625365237305,
"grad_norm": 0.07520235279964461,
"learning_rate": 0.0008644865779530043,
"loss": 2.4758,
"step": 18640
},
{
"epoch": 5.889204769801784,
"grad_norm": 0.06257962555596458,
"learning_rate": 0.0008639402385595341,
"loss": 2.3305,
"step": 18645
},
{
"epoch": 5.890784174366264,
"grad_norm": 0.05010892685311434,
"learning_rate": 0.0008633939405409684,
"loss": 2.4512,
"step": 18650
},
{
"epoch": 5.892363578930743,
"grad_norm": 0.05780345477019086,
"learning_rate": 0.0008628476840634326,
"loss": 2.3788,
"step": 18655
},
{
"epoch": 5.893942983495222,
"grad_norm": 0.07516486107590714,
"learning_rate": 0.0008623014692930398,
"loss": 2.4626,
"step": 18660
},
{
"epoch": 5.895522388059701,
"grad_norm": 0.07086214642004705,
"learning_rate": 0.0008617552963958903,
"loss": 2.4828,
"step": 18665
},
{
"epoch": 5.897101792624181,
"grad_norm": 0.06525854181400412,
"learning_rate": 0.0008612091655380717,
"loss": 2.4876,
"step": 18670
},
{
"epoch": 5.89868119718866,
"grad_norm": 0.06473701514578614,
"learning_rate": 0.0008606630768856596,
"loss": 2.4663,
"step": 18675
},
{
"epoch": 5.900260601753139,
"grad_norm": 0.11143040114895895,
"learning_rate": 0.000860117030604715,
"loss": 2.4441,
"step": 18680
},
{
"epoch": 5.901840006317618,
"grad_norm": 0.08641539861758633,
"learning_rate": 0.0008595710268612881,
"loss": 2.4274,
"step": 18685
},
{
"epoch": 5.903419410882098,
"grad_norm": 0.07521524967995398,
"learning_rate": 0.0008590250658214147,
"loss": 2.5591,
"step": 18690
},
{
"epoch": 5.904998815446577,
"grad_norm": 0.0536980553622514,
"learning_rate": 0.0008584791476511178,
"loss": 2.4091,
"step": 18695
},
{
"epoch": 5.906578220011056,
"grad_norm": 0.05573623517051879,
"learning_rate": 0.0008579332725164082,
"loss": 2.4902,
"step": 18700
},
{
"epoch": 5.908157624575535,
"grad_norm": 0.05927694066712838,
"learning_rate": 0.0008573874405832827,
"loss": 2.6475,
"step": 18705
},
{
"epoch": 5.909737029140015,
"grad_norm": 0.07787533712544467,
"learning_rate": 0.0008568416520177248,
"loss": 2.444,
"step": 18710
},
{
"epoch": 5.911316433704494,
"grad_norm": 0.07580753073276318,
"learning_rate": 0.0008562959069857063,
"loss": 2.4167,
"step": 18715
},
{
"epoch": 5.912895838268972,
"grad_norm": 0.08533835749634594,
"learning_rate": 0.0008557502056531843,
"loss": 2.3843,
"step": 18720
},
{
"epoch": 5.9144752428334515,
"grad_norm": 0.06919443483491658,
"learning_rate": 0.0008552045481861033,
"loss": 2.438,
"step": 18725
},
{
"epoch": 5.916054647397931,
"grad_norm": 0.08056489336824202,
"learning_rate": 0.000854658934750394,
"loss": 2.4123,
"step": 18730
},
{
"epoch": 5.91763405196241,
"grad_norm": 0.05431615132944134,
"learning_rate": 0.0008541133655119736,
"loss": 2.4532,
"step": 18735
},
{
"epoch": 5.919213456526889,
"grad_norm": 0.07022673657018344,
"learning_rate": 0.0008535678406367471,
"loss": 2.6008,
"step": 18740
},
{
"epoch": 5.9207928610913685,
"grad_norm": 0.08312006482231832,
"learning_rate": 0.0008530223602906045,
"loss": 2.3474,
"step": 18745
},
{
"epoch": 5.922372265655848,
"grad_norm": 0.07106942527735756,
"learning_rate": 0.0008524769246394232,
"loss": 2.5367,
"step": 18750
},
{
"epoch": 5.923951670220327,
"grad_norm": 0.054139765634404945,
"learning_rate": 0.0008519315338490666,
"loss": 2.4478,
"step": 18755
},
{
"epoch": 5.925531074784806,
"grad_norm": 0.05964575416082492,
"learning_rate": 0.0008513861880853842,
"loss": 2.4661,
"step": 18760
},
{
"epoch": 5.9271104793492855,
"grad_norm": 0.06845435248458885,
"learning_rate": 0.000850840887514213,
"loss": 2.4567,
"step": 18765
},
{
"epoch": 5.928689883913765,
"grad_norm": 0.06691190090607119,
"learning_rate": 0.0008502956323013742,
"loss": 2.4561,
"step": 18770
},
{
"epoch": 5.930269288478244,
"grad_norm": 0.05742529746381336,
"learning_rate": 0.0008497504226126776,
"loss": 2.334,
"step": 18775
},
{
"epoch": 5.931848693042723,
"grad_norm": 0.053918374898314594,
"learning_rate": 0.0008492052586139176,
"loss": 2.414,
"step": 18780
},
{
"epoch": 5.9334280976072025,
"grad_norm": 0.06588285090390467,
"learning_rate": 0.0008486601404708748,
"loss": 2.4861,
"step": 18785
},
{
"epoch": 5.935007502171681,
"grad_norm": 0.06592636654504706,
"learning_rate": 0.0008481150683493165,
"loss": 2.5156,
"step": 18790
},
{
"epoch": 5.93658690673616,
"grad_norm": 0.06268926063718763,
"learning_rate": 0.000847570042414995,
"loss": 2.4163,
"step": 18795
},
{
"epoch": 5.938166311300639,
"grad_norm": 0.05884941027164353,
"learning_rate": 0.0008470250628336502,
"loss": 2.5446,
"step": 18800
},
{
"epoch": 5.939745715865119,
"grad_norm": 0.05866739879678812,
"learning_rate": 0.0008464801297710065,
"loss": 2.4007,
"step": 18805
},
{
"epoch": 5.941325120429598,
"grad_norm": 0.05844027310580414,
"learning_rate": 0.0008459352433927742,
"loss": 2.4184,
"step": 18810
},
{
"epoch": 5.942904524994077,
"grad_norm": 0.061943910450837116,
"learning_rate": 0.0008453904038646502,
"loss": 2.4625,
"step": 18815
},
{
"epoch": 5.944483929558556,
"grad_norm": 0.10225139082580605,
"learning_rate": 0.0008448456113523165,
"loss": 2.4255,
"step": 18820
},
{
"epoch": 5.946063334123036,
"grad_norm": 0.06259875674488143,
"learning_rate": 0.0008443008660214407,
"loss": 2.4698,
"step": 18825
},
{
"epoch": 5.947642738687515,
"grad_norm": 0.07473013511729105,
"learning_rate": 0.0008437561680376773,
"loss": 2.4987,
"step": 18830
},
{
"epoch": 5.949222143251994,
"grad_norm": 0.10129259076352053,
"learning_rate": 0.0008432115175666646,
"loss": 2.4721,
"step": 18835
},
{
"epoch": 5.950801547816473,
"grad_norm": 0.0699598712898237,
"learning_rate": 0.000842666914774028,
"loss": 2.4591,
"step": 18840
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.0997115556126127,
"learning_rate": 0.0008421223598253772,
"loss": 2.4802,
"step": 18845
},
{
"epoch": 5.953960356945432,
"grad_norm": 0.05240629597794155,
"learning_rate": 0.0008415778528863077,
"loss": 2.4607,
"step": 18850
},
{
"epoch": 5.955539761509911,
"grad_norm": 0.0600795497830552,
"learning_rate": 0.0008410333941224016,
"loss": 2.4358,
"step": 18855
},
{
"epoch": 5.95711916607439,
"grad_norm": 0.07904368495608866,
"learning_rate": 0.0008404889836992241,
"loss": 2.4552,
"step": 18860
},
{
"epoch": 5.9586985706388695,
"grad_norm": 0.07204247109410876,
"learning_rate": 0.0008399446217823279,
"loss": 2.3968,
"step": 18865
},
{
"epoch": 5.960277975203349,
"grad_norm": 0.05683872136202642,
"learning_rate": 0.0008394003085372496,
"loss": 2.4381,
"step": 18870
},
{
"epoch": 5.961857379767827,
"grad_norm": 0.08522386567809322,
"learning_rate": 0.0008388560441295112,
"loss": 2.4795,
"step": 18875
},
{
"epoch": 5.963436784332306,
"grad_norm": 0.05579606099491831,
"learning_rate": 0.0008383118287246203,
"loss": 2.4209,
"step": 18880
},
{
"epoch": 5.965016188896786,
"grad_norm": 0.07102608009357568,
"learning_rate": 0.0008377676624880687,
"loss": 2.3678,
"step": 18885
},
{
"epoch": 5.966595593461265,
"grad_norm": 0.053367495741105525,
"learning_rate": 0.0008372235455853352,
"loss": 2.4579,
"step": 18890
},
{
"epoch": 5.968174998025744,
"grad_norm": 0.0604002108347439,
"learning_rate": 0.0008366794781818812,
"loss": 2.4009,
"step": 18895
},
{
"epoch": 5.969754402590223,
"grad_norm": 0.07157847356370406,
"learning_rate": 0.0008361354604431543,
"loss": 2.5228,
"step": 18900
},
{
"epoch": 5.971333807154703,
"grad_norm": 0.059942760889739315,
"learning_rate": 0.0008355914925345871,
"loss": 2.3524,
"step": 18905
},
{
"epoch": 5.972913211719182,
"grad_norm": 0.066784733945027,
"learning_rate": 0.0008350475746215962,
"loss": 2.4295,
"step": 18910
},
{
"epoch": 5.974492616283661,
"grad_norm": 0.07602771233408008,
"learning_rate": 0.0008345037068695844,
"loss": 2.3876,
"step": 18915
},
{
"epoch": 5.97607202084814,
"grad_norm": 0.06951439257665362,
"learning_rate": 0.0008339598894439379,
"loss": 2.4417,
"step": 18920
},
{
"epoch": 5.97765142541262,
"grad_norm": 0.07068684262286003,
"learning_rate": 0.0008334161225100279,
"loss": 2.4353,
"step": 18925
},
{
"epoch": 5.979230829977099,
"grad_norm": 0.05926446479911698,
"learning_rate": 0.0008328724062332109,
"loss": 2.4387,
"step": 18930
},
{
"epoch": 5.980810234541578,
"grad_norm": 0.05714291646293212,
"learning_rate": 0.0008323287407788275,
"loss": 2.4758,
"step": 18935
},
{
"epoch": 5.982389639106057,
"grad_norm": 0.06065052122055392,
"learning_rate": 0.0008317851263122023,
"loss": 2.4815,
"step": 18940
},
{
"epoch": 5.983969043670536,
"grad_norm": 0.050265879107395724,
"learning_rate": 0.000831241562998646,
"loss": 2.4395,
"step": 18945
},
{
"epoch": 5.985548448235015,
"grad_norm": 0.08877890773804512,
"learning_rate": 0.0008306980510034514,
"loss": 2.4496,
"step": 18950
},
{
"epoch": 5.987127852799494,
"grad_norm": 0.09088066285243378,
"learning_rate": 0.0008301545904918985,
"loss": 2.4562,
"step": 18955
},
{
"epoch": 5.9887072573639735,
"grad_norm": 0.07212537493855416,
"learning_rate": 0.0008296111816292494,
"loss": 2.3848,
"step": 18960
},
{
"epoch": 5.990286661928453,
"grad_norm": 0.09530438007539307,
"learning_rate": 0.0008290678245807509,
"loss": 2.4474,
"step": 18965
},
{
"epoch": 5.991866066492932,
"grad_norm": 0.06269179023551791,
"learning_rate": 0.0008285245195116351,
"loss": 2.4097,
"step": 18970
},
{
"epoch": 5.993445471057411,
"grad_norm": 0.05876585534017766,
"learning_rate": 0.0008279812665871171,
"loss": 2.4617,
"step": 18975
},
{
"epoch": 5.9950248756218905,
"grad_norm": 0.05600082094942641,
"learning_rate": 0.000827438065972397,
"loss": 2.5398,
"step": 18980
},
{
"epoch": 5.99660428018637,
"grad_norm": 0.06387783912326771,
"learning_rate": 0.0008268949178326588,
"loss": 2.4397,
"step": 18985
},
{
"epoch": 5.998183684750849,
"grad_norm": 0.0661590272942373,
"learning_rate": 0.0008263518223330697,
"loss": 2.4274,
"step": 18990
},
{
"epoch": 5.999763089315328,
"grad_norm": 0.07063174926051552,
"learning_rate": 0.0008258087796387822,
"loss": 2.5648,
"step": 18995
},
{
"epoch": 6.0,
"eval_loss": 2.452611207962036,
"eval_runtime": 118.4455,
"eval_samples_per_second": 22.365,
"eval_steps_per_second": 5.598,
"step": 18996
},
{
"epoch": 6.001263523651583,
"grad_norm": 0.06755485745393365,
"learning_rate": 0.0008252657899149315,
"loss": 2.4559,
"step": 19000
},
{
"epoch": 6.0028429282160625,
"grad_norm": 0.06995856223474835,
"learning_rate": 0.0008247228533266381,
"loss": 2.4591,
"step": 19005
},
{
"epoch": 6.004422332780542,
"grad_norm": 0.06036922504256234,
"learning_rate": 0.0008241799700390051,
"loss": 2.457,
"step": 19010
},
{
"epoch": 6.006001737345021,
"grad_norm": 0.07337472446882752,
"learning_rate": 0.0008236371402171197,
"loss": 2.4634,
"step": 19015
},
{
"epoch": 6.0075811419095,
"grad_norm": 0.057428936378157186,
"learning_rate": 0.0008230943640260534,
"loss": 2.5075,
"step": 19020
},
{
"epoch": 6.0091605464739795,
"grad_norm": 0.05892214975458761,
"learning_rate": 0.00082255164163086,
"loss": 2.4597,
"step": 19025
},
{
"epoch": 6.010739951038459,
"grad_norm": 0.05256753445267508,
"learning_rate": 0.0008220089731965794,
"loss": 2.4061,
"step": 19030
},
{
"epoch": 6.012319355602938,
"grad_norm": 0.06046687041545794,
"learning_rate": 0.0008214663588882328,
"loss": 2.4363,
"step": 19035
},
{
"epoch": 6.013898760167417,
"grad_norm": 0.05863844764285577,
"learning_rate": 0.0008209237988708254,
"loss": 2.392,
"step": 19040
},
{
"epoch": 6.0154781647318964,
"grad_norm": 0.05603381098865268,
"learning_rate": 0.0008203812933093469,
"loss": 2.511,
"step": 19045
},
{
"epoch": 6.017057569296376,
"grad_norm": 0.0536891131990583,
"learning_rate": 0.0008198388423687694,
"loss": 2.4732,
"step": 19050
},
{
"epoch": 6.018636973860854,
"grad_norm": 0.05725119713649916,
"learning_rate": 0.0008192964462140487,
"loss": 2.4728,
"step": 19055
},
{
"epoch": 6.020216378425333,
"grad_norm": 0.06486729711490877,
"learning_rate": 0.0008187541050101244,
"loss": 2.4254,
"step": 19060
},
{
"epoch": 6.0217957829898126,
"grad_norm": 0.05603893699604805,
"learning_rate": 0.0008182118189219183,
"loss": 2.3838,
"step": 19065
},
{
"epoch": 6.023375187554292,
"grad_norm": 0.059224961336993114,
"learning_rate": 0.0008176695881143371,
"loss": 2.4123,
"step": 19070
},
{
"epoch": 6.024954592118771,
"grad_norm": 0.05398166783902062,
"learning_rate": 0.0008171274127522692,
"loss": 2.4329,
"step": 19075
},
{
"epoch": 6.02653399668325,
"grad_norm": 0.0507008907302069,
"learning_rate": 0.0008165852930005863,
"loss": 2.459,
"step": 19080
},
{
"epoch": 6.0281134012477295,
"grad_norm": 0.04947190101267776,
"learning_rate": 0.0008160432290241443,
"loss": 2.496,
"step": 19085
},
{
"epoch": 6.029692805812209,
"grad_norm": 0.06868271629697459,
"learning_rate": 0.0008155012209877805,
"loss": 2.4549,
"step": 19090
},
{
"epoch": 6.031272210376688,
"grad_norm": 0.05516426059948859,
"learning_rate": 0.0008149592690563171,
"loss": 2.4232,
"step": 19095
},
{
"epoch": 6.032851614941167,
"grad_norm": 0.06117699149357592,
"learning_rate": 0.0008144173733945578,
"loss": 2.4359,
"step": 19100
},
{
"epoch": 6.0344310195056465,
"grad_norm": 0.05511416994441555,
"learning_rate": 0.0008138755341672892,
"loss": 2.4632,
"step": 19105
},
{
"epoch": 6.036010424070126,
"grad_norm": 0.07941509896758685,
"learning_rate": 0.0008133337515392817,
"loss": 2.3763,
"step": 19110
},
{
"epoch": 6.037589828634605,
"grad_norm": 0.08006670063910017,
"learning_rate": 0.0008127920256752873,
"loss": 2.4321,
"step": 19115
},
{
"epoch": 6.039169233199084,
"grad_norm": 0.057388186750708,
"learning_rate": 0.0008122503567400422,
"loss": 2.4903,
"step": 19120
},
{
"epoch": 6.0407486377635635,
"grad_norm": 0.058150501456731375,
"learning_rate": 0.0008117087448982643,
"loss": 2.4013,
"step": 19125
},
{
"epoch": 6.042328042328043,
"grad_norm": 0.057680383693818144,
"learning_rate": 0.0008111671903146534,
"loss": 2.4884,
"step": 19130
},
{
"epoch": 6.043907446892521,
"grad_norm": 0.06935938361387563,
"learning_rate": 0.0008106256931538938,
"loss": 2.4594,
"step": 19135
},
{
"epoch": 6.045486851457,
"grad_norm": 0.0700477907794558,
"learning_rate": 0.0008100842535806508,
"loss": 2.4168,
"step": 19140
},
{
"epoch": 6.04706625602148,
"grad_norm": 0.07296062388623109,
"learning_rate": 0.0008095428717595731,
"loss": 2.5121,
"step": 19145
},
{
"epoch": 6.048645660585959,
"grad_norm": 0.05502508806338148,
"learning_rate": 0.0008090015478552912,
"loss": 2.3913,
"step": 19150
},
{
"epoch": 6.050225065150438,
"grad_norm": 0.05750008280959948,
"learning_rate": 0.0008084602820324179,
"loss": 2.3773,
"step": 19155
},
{
"epoch": 6.051804469714917,
"grad_norm": 0.067930747346504,
"learning_rate": 0.0008079190744555495,
"loss": 2.4459,
"step": 19160
},
{
"epoch": 6.053383874279397,
"grad_norm": 0.05830080957083264,
"learning_rate": 0.0008073779252892633,
"loss": 2.3981,
"step": 19165
},
{
"epoch": 6.054963278843876,
"grad_norm": 0.07333606080282067,
"learning_rate": 0.0008068368346981191,
"loss": 2.4511,
"step": 19170
},
{
"epoch": 6.056542683408355,
"grad_norm": 0.056744436786467525,
"learning_rate": 0.0008062958028466594,
"loss": 2.4713,
"step": 19175
},
{
"epoch": 6.058122087972834,
"grad_norm": 0.057703650483882755,
"learning_rate": 0.0008057548298994082,
"loss": 2.3564,
"step": 19180
},
{
"epoch": 6.059701492537314,
"grad_norm": 0.0502145706242276,
"learning_rate": 0.0008052139160208725,
"loss": 2.4217,
"step": 19185
},
{
"epoch": 6.061280897101793,
"grad_norm": 0.053239458975068536,
"learning_rate": 0.0008046730613755404,
"loss": 2.4373,
"step": 19190
},
{
"epoch": 6.062860301666272,
"grad_norm": 0.06313006413371712,
"learning_rate": 0.0008041322661278823,
"loss": 2.5361,
"step": 19195
},
{
"epoch": 6.064439706230751,
"grad_norm": 0.0817887604251127,
"learning_rate": 0.0008035915304423506,
"loss": 2.3477,
"step": 19200
},
{
"epoch": 6.066019110795231,
"grad_norm": 0.07800824773512383,
"learning_rate": 0.0008030508544833794,
"loss": 2.4476,
"step": 19205
},
{
"epoch": 6.067598515359709,
"grad_norm": 0.05161799834997871,
"learning_rate": 0.0008025102384153853,
"loss": 2.4375,
"step": 19210
},
{
"epoch": 6.069177919924188,
"grad_norm": 0.06052198038533024,
"learning_rate": 0.0008019696824027663,
"loss": 2.4775,
"step": 19215
},
{
"epoch": 6.0707573244886675,
"grad_norm": 0.05953805064958084,
"learning_rate": 0.0008014291866099007,
"loss": 2.4548,
"step": 19220
},
{
"epoch": 6.072336729053147,
"grad_norm": 0.05988878487502156,
"learning_rate": 0.0008008887512011513,
"loss": 2.4502,
"step": 19225
},
{
"epoch": 6.073916133617626,
"grad_norm": 0.06897539510277752,
"learning_rate": 0.0008003483763408604,
"loss": 2.4402,
"step": 19230
},
{
"epoch": 6.075495538182105,
"grad_norm": 0.060776502782774085,
"learning_rate": 0.0007998080621933527,
"loss": 2.4286,
"step": 19235
},
{
"epoch": 6.077074942746584,
"grad_norm": 0.05985056424299122,
"learning_rate": 0.0007992678089229344,
"loss": 2.4727,
"step": 19240
},
{
"epoch": 6.078654347311064,
"grad_norm": 0.054963308123563426,
"learning_rate": 0.0007987276166938923,
"loss": 2.4795,
"step": 19245
},
{
"epoch": 6.080233751875543,
"grad_norm": 0.0541319804727999,
"learning_rate": 0.0007981874856704964,
"loss": 2.4764,
"step": 19250
},
{
"epoch": 6.081813156440022,
"grad_norm": 0.06476768923529816,
"learning_rate": 0.0007976474160169966,
"loss": 2.4733,
"step": 19255
},
{
"epoch": 6.083392561004501,
"grad_norm": 0.06039699006882269,
"learning_rate": 0.0007971074078976249,
"loss": 2.4292,
"step": 19260
},
{
"epoch": 6.084971965568981,
"grad_norm": 0.054339609303525015,
"learning_rate": 0.0007965674614765942,
"loss": 2.4195,
"step": 19265
},
{
"epoch": 6.08655137013346,
"grad_norm": 0.07341912919361576,
"learning_rate": 0.0007960275769180982,
"loss": 2.3928,
"step": 19270
},
{
"epoch": 6.088130774697939,
"grad_norm": 0.07088938821426911,
"learning_rate": 0.0007954877543863133,
"loss": 2.4505,
"step": 19275
},
{
"epoch": 6.089710179262418,
"grad_norm": 0.0755487703825606,
"learning_rate": 0.0007949479940453956,
"loss": 2.4253,
"step": 19280
},
{
"epoch": 6.091289583826898,
"grad_norm": 0.07742352758165552,
"learning_rate": 0.0007944082960594825,
"loss": 2.4225,
"step": 19285
},
{
"epoch": 6.092868988391376,
"grad_norm": 0.06083384088162834,
"learning_rate": 0.0007938686605926934,
"loss": 2.4465,
"step": 19290
},
{
"epoch": 6.094448392955855,
"grad_norm": 0.08586216693868441,
"learning_rate": 0.000793329087809127,
"loss": 2.461,
"step": 19295
},
{
"epoch": 6.0960277975203345,
"grad_norm": 0.10215266319373814,
"learning_rate": 0.0007927895778728651,
"loss": 2.4836,
"step": 19300
},
{
"epoch": 6.097607202084814,
"grad_norm": 0.07178187717373394,
"learning_rate": 0.0007922501309479688,
"loss": 2.5602,
"step": 19305
},
{
"epoch": 6.099186606649293,
"grad_norm": 0.054302517877230484,
"learning_rate": 0.0007917107471984798,
"loss": 2.5433,
"step": 19310
},
{
"epoch": 6.100766011213772,
"grad_norm": 0.061096824637253154,
"learning_rate": 0.0007911714267884221,
"loss": 2.4354,
"step": 19315
},
{
"epoch": 6.1023454157782515,
"grad_norm": 0.06577886495946768,
"learning_rate": 0.000790632169881799,
"loss": 2.3632,
"step": 19320
},
{
"epoch": 6.103924820342731,
"grad_norm": 0.06098322790980453,
"learning_rate": 0.0007900929766425957,
"loss": 2.5382,
"step": 19325
},
{
"epoch": 6.10550422490721,
"grad_norm": 0.05641490027858137,
"learning_rate": 0.000789553847234777,
"loss": 2.5491,
"step": 19330
},
{
"epoch": 6.107083629471689,
"grad_norm": 0.08712990033275116,
"learning_rate": 0.0007890147818222884,
"loss": 2.432,
"step": 19335
},
{
"epoch": 6.1086630340361685,
"grad_norm": 0.05542758442883438,
"learning_rate": 0.0007884757805690572,
"loss": 2.5538,
"step": 19340
},
{
"epoch": 6.110242438600648,
"grad_norm": 0.06324436303455877,
"learning_rate": 0.0007879368436389891,
"loss": 2.4013,
"step": 19345
},
{
"epoch": 6.111821843165127,
"grad_norm": 0.07341142734295356,
"learning_rate": 0.0007873979711959723,
"loss": 2.4703,
"step": 19350
},
{
"epoch": 6.113401247729606,
"grad_norm": 0.0544160751976481,
"learning_rate": 0.0007868591634038742,
"loss": 2.4309,
"step": 19355
},
{
"epoch": 6.1149806522940855,
"grad_norm": 0.08064278246626877,
"learning_rate": 0.0007863204204265422,
"loss": 2.4925,
"step": 19360
},
{
"epoch": 6.116560056858565,
"grad_norm": 0.06617168976754821,
"learning_rate": 0.0007857817424278056,
"loss": 2.409,
"step": 19365
},
{
"epoch": 6.118139461423043,
"grad_norm": 0.052907287990285504,
"learning_rate": 0.0007852431295714722,
"loss": 2.4569,
"step": 19370
},
{
"epoch": 6.119718865987522,
"grad_norm": 0.05681403097049076,
"learning_rate": 0.0007847045820213312,
"loss": 2.4856,
"step": 19375
},
{
"epoch": 6.121298270552002,
"grad_norm": 0.05434375282784585,
"learning_rate": 0.0007841660999411513,
"loss": 2.4537,
"step": 19380
},
{
"epoch": 6.122877675116481,
"grad_norm": 0.05946687736962094,
"learning_rate": 0.0007836276834946808,
"loss": 2.3871,
"step": 19385
},
{
"epoch": 6.12445707968096,
"grad_norm": 0.06053216424873867,
"learning_rate": 0.0007830893328456501,
"loss": 2.432,
"step": 19390
},
{
"epoch": 6.126036484245439,
"grad_norm": 0.07109957967941444,
"learning_rate": 0.0007825510481577671,
"loss": 2.5123,
"step": 19395
},
{
"epoch": 6.127615888809919,
"grad_norm": 0.058225125492934855,
"learning_rate": 0.0007820128295947206,
"loss": 2.4557,
"step": 19400
},
{
"epoch": 6.129195293374398,
"grad_norm": 0.052451647177555134,
"learning_rate": 0.0007814746773201804,
"loss": 2.4639,
"step": 19405
},
{
"epoch": 6.130774697938877,
"grad_norm": 0.06406265660562299,
"learning_rate": 0.0007809365914977944,
"loss": 2.4812,
"step": 19410
},
{
"epoch": 6.132354102503356,
"grad_norm": 0.06349179734314664,
"learning_rate": 0.0007803985722911915,
"loss": 2.3316,
"step": 19415
},
{
"epoch": 6.133933507067836,
"grad_norm": 0.06851849340426248,
"learning_rate": 0.0007798606198639798,
"loss": 2.4043,
"step": 19420
},
{
"epoch": 6.135512911632315,
"grad_norm": 0.05910445645432784,
"learning_rate": 0.0007793227343797464,
"loss": 2.4166,
"step": 19425
},
{
"epoch": 6.137092316196794,
"grad_norm": 0.05322964382562131,
"learning_rate": 0.00077878491600206,
"loss": 2.4909,
"step": 19430
},
{
"epoch": 6.138671720761273,
"grad_norm": 0.0568388791717399,
"learning_rate": 0.0007782471648944673,
"loss": 2.5081,
"step": 19435
},
{
"epoch": 6.140251125325753,
"grad_norm": 0.05336020271954184,
"learning_rate": 0.0007777094812204949,
"loss": 2.4632,
"step": 19440
},
{
"epoch": 6.141830529890232,
"grad_norm": 0.06311225934248536,
"learning_rate": 0.000777171865143649,
"loss": 2.4742,
"step": 19445
},
{
"epoch": 6.14340993445471,
"grad_norm": 0.08329850738087803,
"learning_rate": 0.0007766343168274149,
"loss": 2.4213,
"step": 19450
},
{
"epoch": 6.144989339019189,
"grad_norm": 0.06946732510040121,
"learning_rate": 0.0007760968364352584,
"loss": 2.4884,
"step": 19455
},
{
"epoch": 6.146568743583669,
"grad_norm": 0.08711521025253743,
"learning_rate": 0.0007755594241306231,
"loss": 2.4959,
"step": 19460
},
{
"epoch": 6.148148148148148,
"grad_norm": 0.07855381937421685,
"learning_rate": 0.0007750220800769333,
"loss": 2.3949,
"step": 19465
},
{
"epoch": 6.149727552712627,
"grad_norm": 0.06113392449692607,
"learning_rate": 0.0007744848044375912,
"loss": 2.4374,
"step": 19470
},
{
"epoch": 6.151306957277106,
"grad_norm": 0.06288587860316672,
"learning_rate": 0.000773947597375979,
"loss": 2.4297,
"step": 19475
},
{
"epoch": 6.152886361841586,
"grad_norm": 0.0653770187253421,
"learning_rate": 0.0007734104590554587,
"loss": 2.4974,
"step": 19480
},
{
"epoch": 6.154465766406065,
"grad_norm": 0.05761192000773616,
"learning_rate": 0.0007728733896393699,
"loss": 2.4836,
"step": 19485
},
{
"epoch": 6.156045170970544,
"grad_norm": 0.07031759093297552,
"learning_rate": 0.0007723363892910318,
"loss": 2.3831,
"step": 19490
},
{
"epoch": 6.157624575535023,
"grad_norm": 0.06353387610842912,
"learning_rate": 0.0007717994581737435,
"loss": 2.4409,
"step": 19495
},
{
"epoch": 6.159203980099503,
"grad_norm": 0.05250149475425236,
"learning_rate": 0.0007712625964507818,
"loss": 2.3626,
"step": 19500
},
{
"epoch": 6.160783384663982,
"grad_norm": 0.0507851420947074,
"learning_rate": 0.0007707258042854032,
"loss": 2.502,
"step": 19505
},
{
"epoch": 6.162362789228461,
"grad_norm": 0.053488995679231016,
"learning_rate": 0.0007701890818408427,
"loss": 2.4677,
"step": 19510
},
{
"epoch": 6.16394219379294,
"grad_norm": 0.08382190036423479,
"learning_rate": 0.0007696524292803137,
"loss": 2.5104,
"step": 19515
},
{
"epoch": 6.16552159835742,
"grad_norm": 0.06078629464656898,
"learning_rate": 0.0007691158467670096,
"loss": 2.3855,
"step": 19520
},
{
"epoch": 6.167101002921898,
"grad_norm": 0.06442201467040917,
"learning_rate": 0.0007685793344641012,
"loss": 2.2965,
"step": 19525
},
{
"epoch": 6.168680407486377,
"grad_norm": 0.04867253028066429,
"learning_rate": 0.0007680428925347386,
"loss": 2.464,
"step": 19530
},
{
"epoch": 6.1702598120508565,
"grad_norm": 0.07103714043573228,
"learning_rate": 0.0007675065211420507,
"loss": 2.4888,
"step": 19535
},
{
"epoch": 6.171839216615336,
"grad_norm": 0.06499738123502725,
"learning_rate": 0.0007669702204491436,
"loss": 2.4052,
"step": 19540
},
{
"epoch": 6.173418621179815,
"grad_norm": 0.058453433265348194,
"learning_rate": 0.0007664339906191042,
"loss": 2.3744,
"step": 19545
},
{
"epoch": 6.174998025744294,
"grad_norm": 0.06112806537988322,
"learning_rate": 0.0007658978318149957,
"loss": 2.4018,
"step": 19550
},
{
"epoch": 6.1765774303087735,
"grad_norm": 0.06365063449863004,
"learning_rate": 0.0007653617441998608,
"loss": 2.3682,
"step": 19555
},
{
"epoch": 6.178156834873253,
"grad_norm": 0.06203737080784095,
"learning_rate": 0.0007648257279367206,
"loss": 2.4238,
"step": 19560
},
{
"epoch": 6.179736239437732,
"grad_norm": 0.07088864858319288,
"learning_rate": 0.0007642897831885735,
"loss": 2.4125,
"step": 19565
},
{
"epoch": 6.181315644002211,
"grad_norm": 0.07057223116169063,
"learning_rate": 0.0007637539101183979,
"loss": 2.3309,
"step": 19570
},
{
"epoch": 6.1828950485666905,
"grad_norm": 0.06290781688397977,
"learning_rate": 0.0007632181088891482,
"loss": 2.46,
"step": 19575
},
{
"epoch": 6.18447445313117,
"grad_norm": 0.06570377591134992,
"learning_rate": 0.0007626823796637592,
"loss": 2.4906,
"step": 19580
},
{
"epoch": 6.186053857695649,
"grad_norm": 0.060461208411296756,
"learning_rate": 0.0007621467226051422,
"loss": 2.4814,
"step": 19585
},
{
"epoch": 6.187633262260128,
"grad_norm": 0.10724116753160037,
"learning_rate": 0.0007616111378761871,
"loss": 2.3867,
"step": 19590
},
{
"epoch": 6.1892126668246075,
"grad_norm": 0.06311108522890622,
"learning_rate": 0.000761075625639762,
"loss": 2.512,
"step": 19595
},
{
"epoch": 6.190792071389087,
"grad_norm": 0.05713061134094679,
"learning_rate": 0.0007605401860587126,
"loss": 2.4465,
"step": 19600
},
{
"epoch": 6.192371475953566,
"grad_norm": 0.06088272053471607,
"learning_rate": 0.0007600048192958622,
"loss": 2.4067,
"step": 19605
},
{
"epoch": 6.193950880518044,
"grad_norm": 0.05431585064737753,
"learning_rate": 0.0007594695255140134,
"loss": 2.3904,
"step": 19610
},
{
"epoch": 6.195530285082524,
"grad_norm": 0.06016840675239864,
"learning_rate": 0.0007589343048759449,
"loss": 2.5428,
"step": 19615
},
{
"epoch": 6.197109689647003,
"grad_norm": 0.05380648367219518,
"learning_rate": 0.0007583991575444142,
"loss": 2.4529,
"step": 19620
},
{
"epoch": 6.198689094211482,
"grad_norm": 0.0775100601412376,
"learning_rate": 0.0007578640836821561,
"loss": 2.3903,
"step": 19625
},
{
"epoch": 6.200268498775961,
"grad_norm": 0.08526232121545854,
"learning_rate": 0.0007573290834518827,
"loss": 2.445,
"step": 19630
},
{
"epoch": 6.201847903340441,
"grad_norm": 0.05991259904388278,
"learning_rate": 0.0007567941570162848,
"loss": 2.525,
"step": 19635
},
{
"epoch": 6.20342730790492,
"grad_norm": 0.05185741511355774,
"learning_rate": 0.0007562593045380299,
"loss": 2.429,
"step": 19640
},
{
"epoch": 6.205006712469399,
"grad_norm": 0.08706659397624288,
"learning_rate": 0.0007557245261797633,
"loss": 2.4428,
"step": 19645
},
{
"epoch": 6.206586117033878,
"grad_norm": 0.06709441806701733,
"learning_rate": 0.0007551898221041076,
"loss": 2.4473,
"step": 19650
},
{
"epoch": 6.2081655215983576,
"grad_norm": 0.05821670970822972,
"learning_rate": 0.0007546551924736625,
"loss": 2.4678,
"step": 19655
},
{
"epoch": 6.209744926162837,
"grad_norm": 0.062191125073990654,
"learning_rate": 0.0007541206374510062,
"loss": 2.4385,
"step": 19660
},
{
"epoch": 6.211324330727316,
"grad_norm": 0.0625137100432733,
"learning_rate": 0.0007535861571986926,
"loss": 2.4372,
"step": 19665
},
{
"epoch": 6.212903735291795,
"grad_norm": 0.061725845909325915,
"learning_rate": 0.0007530517518792547,
"loss": 2.4223,
"step": 19670
},
{
"epoch": 6.2144831398562745,
"grad_norm": 0.06026508757658309,
"learning_rate": 0.0007525174216552013,
"loss": 2.4125,
"step": 19675
},
{
"epoch": 6.216062544420754,
"grad_norm": 0.0801349638909046,
"learning_rate": 0.0007519831666890184,
"loss": 2.4602,
"step": 19680
},
{
"epoch": 6.217641948985232,
"grad_norm": 0.0748061969276911,
"learning_rate": 0.0007514489871431702,
"loss": 2.3577,
"step": 19685
},
{
"epoch": 6.219221353549711,
"grad_norm": 0.06703059419819289,
"learning_rate": 0.0007509148831800965,
"loss": 2.4924,
"step": 19690
},
{
"epoch": 6.220800758114191,
"grad_norm": 0.06602078693858335,
"learning_rate": 0.0007503808549622158,
"loss": 2.4308,
"step": 19695
},
{
"epoch": 6.22238016267867,
"grad_norm": 0.06339145900477593,
"learning_rate": 0.0007498469026519223,
"loss": 2.3763,
"step": 19700
},
{
"epoch": 6.223959567243149,
"grad_norm": 0.05530531986340535,
"learning_rate": 0.000749313026411587,
"loss": 2.451,
"step": 19705
},
{
"epoch": 6.225538971807628,
"grad_norm": 0.07442021183242264,
"learning_rate": 0.0007487792264035592,
"loss": 2.4233,
"step": 19710
},
{
"epoch": 6.227118376372108,
"grad_norm": 0.052265710521129105,
"learning_rate": 0.0007482455027901635,
"loss": 2.4508,
"step": 19715
},
{
"epoch": 6.228697780936587,
"grad_norm": 0.05920679774013846,
"learning_rate": 0.0007477118557337012,
"loss": 2.4679,
"step": 19720
},
{
"epoch": 6.230277185501066,
"grad_norm": 0.05938237683838213,
"learning_rate": 0.0007471782853964524,
"loss": 2.5072,
"step": 19725
},
{
"epoch": 6.231856590065545,
"grad_norm": 0.05694161128480649,
"learning_rate": 0.0007466447919406713,
"loss": 2.4131,
"step": 19730
},
{
"epoch": 6.233435994630025,
"grad_norm": 0.06333470851146386,
"learning_rate": 0.0007461113755285907,
"loss": 2.455,
"step": 19735
},
{
"epoch": 6.235015399194504,
"grad_norm": 0.05424511788662894,
"learning_rate": 0.0007455780363224184,
"loss": 2.4136,
"step": 19740
},
{
"epoch": 6.236594803758983,
"grad_norm": 0.07083251242734387,
"learning_rate": 0.0007450447744843393,
"loss": 2.4706,
"step": 19745
},
{
"epoch": 6.238174208323462,
"grad_norm": 0.05102356920787792,
"learning_rate": 0.0007445115901765161,
"loss": 2.4865,
"step": 19750
},
{
"epoch": 6.239753612887942,
"grad_norm": 0.05770373850241559,
"learning_rate": 0.0007439784835610852,
"loss": 2.4086,
"step": 19755
},
{
"epoch": 6.241333017452421,
"grad_norm": 0.054102109406934856,
"learning_rate": 0.0007434454548001621,
"loss": 2.4218,
"step": 19760
},
{
"epoch": 6.242912422016899,
"grad_norm": 0.06613804076706183,
"learning_rate": 0.0007429125040558371,
"loss": 2.4291,
"step": 19765
},
{
"epoch": 6.2444918265813785,
"grad_norm": 0.052500542600787924,
"learning_rate": 0.0007423796314901768,
"loss": 2.4256,
"step": 19770
},
{
"epoch": 6.246071231145858,
"grad_norm": 0.06909005559887729,
"learning_rate": 0.0007418468372652248,
"loss": 2.4589,
"step": 19775
},
{
"epoch": 6.247650635710337,
"grad_norm": 0.06303785575072766,
"learning_rate": 0.0007413141215429998,
"loss": 2.4335,
"step": 19780
},
{
"epoch": 6.249230040274816,
"grad_norm": 0.06056915510263313,
"learning_rate": 0.0007407814844854981,
"loss": 2.454,
"step": 19785
},
{
"epoch": 6.2508094448392955,
"grad_norm": 0.06941117388350045,
"learning_rate": 0.0007402489262546908,
"loss": 2.4128,
"step": 19790
},
{
"epoch": 6.252388849403775,
"grad_norm": 0.05504060593786002,
"learning_rate": 0.000739716447012525,
"loss": 2.3883,
"step": 19795
},
{
"epoch": 6.253968253968254,
"grad_norm": 0.061787067926771924,
"learning_rate": 0.000739184046920925,
"loss": 2.4231,
"step": 19800
},
{
"epoch": 6.255547658532733,
"grad_norm": 0.07911758040593335,
"learning_rate": 0.0007386517261417896,
"loss": 2.5322,
"step": 19805
},
{
"epoch": 6.2571270630972124,
"grad_norm": 0.07059955428855016,
"learning_rate": 0.0007381194848369947,
"loss": 2.5197,
"step": 19810
},
{
"epoch": 6.258706467661692,
"grad_norm": 0.05793352587725986,
"learning_rate": 0.0007375873231683915,
"loss": 2.4764,
"step": 19815
},
{
"epoch": 6.260285872226171,
"grad_norm": 0.06790131801420428,
"learning_rate": 0.0007370552412978064,
"loss": 2.5465,
"step": 19820
},
{
"epoch": 6.26186527679065,
"grad_norm": 0.07514729263157002,
"learning_rate": 0.0007365232393870427,
"loss": 2.5099,
"step": 19825
},
{
"epoch": 6.263444681355129,
"grad_norm": 0.06619360340136478,
"learning_rate": 0.0007359913175978783,
"loss": 2.4732,
"step": 19830
},
{
"epoch": 6.265024085919609,
"grad_norm": 0.05833907471011041,
"learning_rate": 0.0007354594760920672,
"loss": 2.4916,
"step": 19835
},
{
"epoch": 6.266603490484087,
"grad_norm": 0.05677736917333003,
"learning_rate": 0.0007349277150313398,
"loss": 2.4314,
"step": 19840
},
{
"epoch": 6.268182895048566,
"grad_norm": 0.0558191327993684,
"learning_rate": 0.0007343960345774,
"loss": 2.3966,
"step": 19845
},
{
"epoch": 6.2697622996130455,
"grad_norm": 0.05193655545478352,
"learning_rate": 0.0007338644348919295,
"loss": 2.4396,
"step": 19850
},
{
"epoch": 6.271341704177525,
"grad_norm": 0.057162696128829744,
"learning_rate": 0.0007333329161365841,
"loss": 2.4253,
"step": 19855
},
{
"epoch": 6.272921108742004,
"grad_norm": 0.055770002545731084,
"learning_rate": 0.0007328014784729948,
"loss": 2.3843,
"step": 19860
},
{
"epoch": 6.274500513306483,
"grad_norm": 0.05727410758040963,
"learning_rate": 0.000732270122062769,
"loss": 2.4839,
"step": 19865
},
{
"epoch": 6.2760799178709625,
"grad_norm": 0.057229089242360455,
"learning_rate": 0.000731738847067488,
"loss": 2.4468,
"step": 19870
},
{
"epoch": 6.277659322435442,
"grad_norm": 0.05769316358896857,
"learning_rate": 0.00073120765364871,
"loss": 2.5816,
"step": 19875
},
{
"epoch": 6.279238726999921,
"grad_norm": 0.0571768306769889,
"learning_rate": 0.0007306765419679673,
"loss": 2.4462,
"step": 19880
},
{
"epoch": 6.2808181315644,
"grad_norm": 0.053190686271972334,
"learning_rate": 0.0007301455121867671,
"loss": 2.5108,
"step": 19885
},
{
"epoch": 6.2823975361288795,
"grad_norm": 0.0713570291442539,
"learning_rate": 0.0007296145644665928,
"loss": 2.4145,
"step": 19890
},
{
"epoch": 6.283976940693359,
"grad_norm": 0.05917801670607476,
"learning_rate": 0.0007290836989689015,
"loss": 2.4244,
"step": 19895
},
{
"epoch": 6.285556345257838,
"grad_norm": 0.054512218469362134,
"learning_rate": 0.0007285529158551267,
"loss": 2.5235,
"step": 19900
},
{
"epoch": 6.287135749822317,
"grad_norm": 0.05601623728749516,
"learning_rate": 0.000728022215286676,
"loss": 2.3833,
"step": 19905
},
{
"epoch": 6.2887151543867965,
"grad_norm": 0.06088975945160917,
"learning_rate": 0.0007274915974249316,
"loss": 2.4772,
"step": 19910
},
{
"epoch": 6.290294558951276,
"grad_norm": 0.0536131533215133,
"learning_rate": 0.0007269610624312517,
"loss": 2.4868,
"step": 19915
},
{
"epoch": 6.291873963515755,
"grad_norm": 0.05750908257419262,
"learning_rate": 0.0007264306104669678,
"loss": 2.4313,
"step": 19920
},
{
"epoch": 6.293453368080233,
"grad_norm": 0.059299770139471,
"learning_rate": 0.0007259002416933876,
"loss": 2.3861,
"step": 19925
},
{
"epoch": 6.295032772644713,
"grad_norm": 0.06457815583700187,
"learning_rate": 0.0007253699562717929,
"loss": 2.3659,
"step": 19930
},
{
"epoch": 6.296612177209192,
"grad_norm": 0.07102289405919454,
"learning_rate": 0.0007248397543634392,
"loss": 2.4526,
"step": 19935
},
{
"epoch": 6.298191581773671,
"grad_norm": 0.058997874582867565,
"learning_rate": 0.0007243096361295587,
"loss": 2.4705,
"step": 19940
},
{
"epoch": 6.29977098633815,
"grad_norm": 0.06266048043881087,
"learning_rate": 0.0007237796017313563,
"loss": 2.5197,
"step": 19945
},
{
"epoch": 6.30135039090263,
"grad_norm": 0.06221028004346952,
"learning_rate": 0.000723249651330012,
"loss": 2.491,
"step": 19950
},
{
"epoch": 6.302929795467109,
"grad_norm": 0.06018431461748382,
"learning_rate": 0.0007227197850866807,
"loss": 2.4182,
"step": 19955
},
{
"epoch": 6.304509200031588,
"grad_norm": 0.059088934210273426,
"learning_rate": 0.0007221900031624908,
"loss": 2.4473,
"step": 19960
},
{
"epoch": 6.306088604596067,
"grad_norm": 0.06053952326016882,
"learning_rate": 0.0007216603057185465,
"loss": 2.3726,
"step": 19965
},
{
"epoch": 6.307668009160547,
"grad_norm": 0.05734903425797904,
"learning_rate": 0.0007211306929159247,
"loss": 2.4523,
"step": 19970
},
{
"epoch": 6.309247413725026,
"grad_norm": 0.059347340635324056,
"learning_rate": 0.0007206011649156772,
"loss": 2.4244,
"step": 19975
},
{
"epoch": 6.310826818289505,
"grad_norm": 0.057641619829113484,
"learning_rate": 0.0007200717218788307,
"loss": 2.4737,
"step": 19980
},
{
"epoch": 6.312406222853984,
"grad_norm": 0.07366682908731811,
"learning_rate": 0.0007195423639663844,
"loss": 2.4987,
"step": 19985
},
{
"epoch": 6.313985627418464,
"grad_norm": 0.08410507754995093,
"learning_rate": 0.0007190130913393139,
"loss": 2.4592,
"step": 19990
},
{
"epoch": 6.315565031982943,
"grad_norm": 0.08943630780959341,
"learning_rate": 0.000718483904158567,
"loss": 2.4048,
"step": 19995
},
{
"epoch": 6.317144436547421,
"grad_norm": 0.06330160880031208,
"learning_rate": 0.0007179548025850659,
"loss": 2.4437,
"step": 20000
},
{
"epoch": 6.3187238411119,
"grad_norm": 0.06236266066667164,
"learning_rate": 0.0007174257867797078,
"loss": 2.4187,
"step": 20005
},
{
"epoch": 6.32030324567638,
"grad_norm": 0.06880313643879914,
"learning_rate": 0.0007168968569033618,
"loss": 2.397,
"step": 20010
},
{
"epoch": 6.321882650240859,
"grad_norm": 0.0796116848627419,
"learning_rate": 0.0007163680131168735,
"loss": 2.4144,
"step": 20015
},
{
"epoch": 6.323462054805338,
"grad_norm": 0.06082918554738977,
"learning_rate": 0.0007158392555810602,
"loss": 2.4489,
"step": 20020
},
{
"epoch": 6.325041459369817,
"grad_norm": 0.05528001497547358,
"learning_rate": 0.0007153105844567133,
"loss": 2.3929,
"step": 20025
},
{
"epoch": 6.326620863934297,
"grad_norm": 0.06956583597351566,
"learning_rate": 0.0007147819999045991,
"loss": 2.4127,
"step": 20030
},
{
"epoch": 6.328200268498776,
"grad_norm": 0.05928270622597066,
"learning_rate": 0.0007142535020854561,
"loss": 2.4676,
"step": 20035
},
{
"epoch": 6.329779673063255,
"grad_norm": 0.060333763608973995,
"learning_rate": 0.0007137250911599978,
"loss": 2.4146,
"step": 20040
},
{
"epoch": 6.331359077627734,
"grad_norm": 0.05991178940603649,
"learning_rate": 0.0007131967672889101,
"loss": 2.3577,
"step": 20045
},
{
"epoch": 6.332938482192214,
"grad_norm": 0.05558170961877607,
"learning_rate": 0.0007126685306328525,
"loss": 2.4753,
"step": 20050
},
{
"epoch": 6.334517886756693,
"grad_norm": 0.05785550291586159,
"learning_rate": 0.0007121403813524595,
"loss": 2.392,
"step": 20055
},
{
"epoch": 6.336097291321172,
"grad_norm": 0.05491722701683667,
"learning_rate": 0.0007116123196083373,
"loss": 2.3809,
"step": 20060
},
{
"epoch": 6.337676695885651,
"grad_norm": 0.06068570278142025,
"learning_rate": 0.000711084345561066,
"loss": 2.3726,
"step": 20065
},
{
"epoch": 6.339256100450131,
"grad_norm": 0.055891629425172874,
"learning_rate": 0.0007105564593711995,
"loss": 2.4416,
"step": 20070
},
{
"epoch": 6.34083550501461,
"grad_norm": 0.05462412335867579,
"learning_rate": 0.0007100286611992639,
"loss": 2.2954,
"step": 20075
},
{
"epoch": 6.342414909579089,
"grad_norm": 0.05393312209152095,
"learning_rate": 0.0007095009512057602,
"loss": 2.5259,
"step": 20080
},
{
"epoch": 6.3439943141435675,
"grad_norm": 0.06552752866128936,
"learning_rate": 0.0007089733295511611,
"loss": 2.4799,
"step": 20085
},
{
"epoch": 6.345573718708047,
"grad_norm": 0.1118648236173462,
"learning_rate": 0.000708445796395913,
"loss": 2.4729,
"step": 20090
},
{
"epoch": 6.347153123272526,
"grad_norm": 0.06886127615642178,
"learning_rate": 0.0007079183519004355,
"loss": 2.5257,
"step": 20095
},
{
"epoch": 6.348732527837005,
"grad_norm": 0.0703385290289815,
"learning_rate": 0.0007073909962251209,
"loss": 2.4716,
"step": 20100
},
{
"epoch": 6.3503119324014845,
"grad_norm": 0.07464735487394716,
"learning_rate": 0.0007068637295303349,
"loss": 2.4992,
"step": 20105
},
{
"epoch": 6.351891336965964,
"grad_norm": 0.06789224630431206,
"learning_rate": 0.0007063365519764162,
"loss": 2.422,
"step": 20110
},
{
"epoch": 6.353470741530443,
"grad_norm": 0.059206546098195346,
"learning_rate": 0.0007058094637236752,
"loss": 2.4291,
"step": 20115
},
{
"epoch": 6.355050146094922,
"grad_norm": 0.05399741185812805,
"learning_rate": 0.0007052824649323969,
"loss": 2.3992,
"step": 20120
},
{
"epoch": 6.3566295506594015,
"grad_norm": 0.06147701896292227,
"learning_rate": 0.0007047555557628379,
"loss": 2.4161,
"step": 20125
},
{
"epoch": 6.358208955223881,
"grad_norm": 0.05166087438320311,
"learning_rate": 0.0007042287363752283,
"loss": 2.5049,
"step": 20130
},
{
"epoch": 6.35978835978836,
"grad_norm": 0.06354875474376977,
"learning_rate": 0.0007037020069297702,
"loss": 2.4589,
"step": 20135
},
{
"epoch": 6.361367764352839,
"grad_norm": 0.05775554305567858,
"learning_rate": 0.0007031753675866381,
"loss": 2.3615,
"step": 20140
},
{
"epoch": 6.3629471689173185,
"grad_norm": 0.06717018758488477,
"learning_rate": 0.0007026488185059808,
"loss": 2.5662,
"step": 20145
},
{
"epoch": 6.364526573481798,
"grad_norm": 0.07125756890253258,
"learning_rate": 0.0007021223598479179,
"loss": 2.4711,
"step": 20150
},
{
"epoch": 6.366105978046276,
"grad_norm": 0.0552105505115895,
"learning_rate": 0.0007015959917725421,
"loss": 2.3321,
"step": 20155
},
{
"epoch": 6.367685382610755,
"grad_norm": 0.0711688256270678,
"learning_rate": 0.0007010697144399187,
"loss": 2.3948,
"step": 20160
},
{
"epoch": 6.369264787175235,
"grad_norm": 0.05391905398869705,
"learning_rate": 0.000700543528010085,
"loss": 2.3398,
"step": 20165
},
{
"epoch": 6.370844191739714,
"grad_norm": 0.06534707008655423,
"learning_rate": 0.0007000174326430515,
"loss": 2.4308,
"step": 20170
},
{
"epoch": 6.372423596304193,
"grad_norm": 0.07938743461028204,
"learning_rate": 0.0006994914284988001,
"loss": 2.4693,
"step": 20175
},
{
"epoch": 6.374003000868672,
"grad_norm": 0.06811162638520542,
"learning_rate": 0.000698965515737285,
"loss": 2.4052,
"step": 20180
},
{
"epoch": 6.375582405433152,
"grad_norm": 0.06482877963590887,
"learning_rate": 0.0006984396945184335,
"loss": 2.5106,
"step": 20185
},
{
"epoch": 6.377161809997631,
"grad_norm": 0.0621878075448932,
"learning_rate": 0.0006979139650021435,
"loss": 2.4139,
"step": 20190
},
{
"epoch": 6.37874121456211,
"grad_norm": 0.05717168618430722,
"learning_rate": 0.0006973883273482874,
"loss": 2.5794,
"step": 20195
},
{
"epoch": 6.380320619126589,
"grad_norm": 0.058384350907489155,
"learning_rate": 0.0006968627817167076,
"loss": 2.4317,
"step": 20200
},
{
"epoch": 6.381900023691069,
"grad_norm": 0.06208971981672258,
"learning_rate": 0.0006963373282672185,
"loss": 2.4561,
"step": 20205
},
{
"epoch": 6.383479428255548,
"grad_norm": 0.06027034578128326,
"learning_rate": 0.000695811967159608,
"loss": 2.4484,
"step": 20210
},
{
"epoch": 6.385058832820027,
"grad_norm": 0.05957450555656316,
"learning_rate": 0.0006952866985536347,
"loss": 2.4317,
"step": 20215
},
{
"epoch": 6.386638237384506,
"grad_norm": 0.05822686780947575,
"learning_rate": 0.0006947615226090297,
"loss": 2.5867,
"step": 20220
},
{
"epoch": 6.388217641948986,
"grad_norm": 0.06484165103161406,
"learning_rate": 0.0006942364394854954,
"loss": 2.3842,
"step": 20225
},
{
"epoch": 6.389797046513465,
"grad_norm": 0.05559231605792102,
"learning_rate": 0.0006937114493427059,
"loss": 2.4256,
"step": 20230
},
{
"epoch": 6.391376451077944,
"grad_norm": 0.06433577974811837,
"learning_rate": 0.0006931865523403082,
"loss": 2.4834,
"step": 20235
},
{
"epoch": 6.392955855642422,
"grad_norm": 0.05649681627652346,
"learning_rate": 0.0006926617486379194,
"loss": 2.4661,
"step": 20240
},
{
"epoch": 6.394535260206902,
"grad_norm": 0.06293263126241654,
"learning_rate": 0.0006921370383951293,
"loss": 2.4375,
"step": 20245
},
{
"epoch": 6.396114664771381,
"grad_norm": 0.051549970176822474,
"learning_rate": 0.0006916124217714989,
"loss": 2.4145,
"step": 20250
},
{
"epoch": 6.39769406933586,
"grad_norm": 0.06901527332210579,
"learning_rate": 0.0006910878989265603,
"loss": 2.4099,
"step": 20255
},
{
"epoch": 6.399273473900339,
"grad_norm": 0.0797135766335866,
"learning_rate": 0.0006905634700198183,
"loss": 2.4296,
"step": 20260
},
{
"epoch": 6.400852878464819,
"grad_norm": 0.05621303030996471,
"learning_rate": 0.0006900391352107478,
"loss": 2.3952,
"step": 20265
},
{
"epoch": 6.402432283029298,
"grad_norm": 0.05646658463741475,
"learning_rate": 0.0006895148946587962,
"loss": 2.3726,
"step": 20270
},
{
"epoch": 6.404011687593777,
"grad_norm": 0.06780048965410568,
"learning_rate": 0.0006889907485233813,
"loss": 2.4438,
"step": 20275
},
{
"epoch": 6.405591092158256,
"grad_norm": 0.056144378783592644,
"learning_rate": 0.0006884666969638924,
"loss": 2.381,
"step": 20280
},
{
"epoch": 6.407170496722736,
"grad_norm": 0.07509907298138832,
"learning_rate": 0.0006879427401396908,
"loss": 2.4764,
"step": 20285
},
{
"epoch": 6.408749901287215,
"grad_norm": 0.059945601070473024,
"learning_rate": 0.0006874188782101084,
"loss": 2.5221,
"step": 20290
},
{
"epoch": 6.410329305851694,
"grad_norm": 0.054707228885569524,
"learning_rate": 0.0006868951113344473,
"loss": 2.5151,
"step": 20295
},
{
"epoch": 6.411908710416173,
"grad_norm": 0.06278686032168498,
"learning_rate": 0.0006863714396719829,
"loss": 2.4056,
"step": 20300
},
{
"epoch": 6.413488114980653,
"grad_norm": 0.07822803264136353,
"learning_rate": 0.0006858478633819596,
"loss": 2.3835,
"step": 20305
},
{
"epoch": 6.415067519545132,
"grad_norm": 0.05451919846700328,
"learning_rate": 0.000685324382623594,
"loss": 2.4672,
"step": 20310
},
{
"epoch": 6.41664692410961,
"grad_norm": 0.06343652940984232,
"learning_rate": 0.0006848009975560732,
"loss": 2.3666,
"step": 20315
},
{
"epoch": 6.4182263286740895,
"grad_norm": 0.06640771837812848,
"learning_rate": 0.0006842777083385548,
"loss": 2.4597,
"step": 20320
},
{
"epoch": 6.419805733238569,
"grad_norm": 0.059272823868647025,
"learning_rate": 0.0006837545151301685,
"loss": 2.4158,
"step": 20325
},
{
"epoch": 6.421385137803048,
"grad_norm": 0.06747797623613917,
"learning_rate": 0.0006832314180900133,
"loss": 2.4111,
"step": 20330
},
{
"epoch": 6.422964542367527,
"grad_norm": 0.05532566188652506,
"learning_rate": 0.0006827084173771603,
"loss": 2.4859,
"step": 20335
},
{
"epoch": 6.4245439469320065,
"grad_norm": 0.05718193786836203,
"learning_rate": 0.0006821855131506502,
"loss": 2.4076,
"step": 20340
},
{
"epoch": 6.426123351496486,
"grad_norm": 0.08859672081989957,
"learning_rate": 0.0006816627055694946,
"loss": 2.4557,
"step": 20345
},
{
"epoch": 6.427702756060965,
"grad_norm": 0.05611112160445568,
"learning_rate": 0.0006811399947926768,
"loss": 2.5086,
"step": 20350
},
{
"epoch": 6.429282160625444,
"grad_norm": 0.0765003452427302,
"learning_rate": 0.0006806173809791492,
"loss": 2.5024,
"step": 20355
},
{
"epoch": 6.4308615651899235,
"grad_norm": 0.06173073387982078,
"learning_rate": 0.0006800948642878355,
"loss": 2.409,
"step": 20360
},
{
"epoch": 6.432440969754403,
"grad_norm": 0.07566938586979603,
"learning_rate": 0.0006795724448776297,
"loss": 2.4258,
"step": 20365
},
{
"epoch": 6.434020374318882,
"grad_norm": 0.06657405835538575,
"learning_rate": 0.0006790501229073958,
"loss": 2.4711,
"step": 20370
},
{
"epoch": 6.435599778883361,
"grad_norm": 0.05398736598825769,
"learning_rate": 0.0006785278985359692,
"loss": 2.3043,
"step": 20375
},
{
"epoch": 6.4371791834478405,
"grad_norm": 0.0590280207460199,
"learning_rate": 0.0006780057719221551,
"loss": 2.3536,
"step": 20380
},
{
"epoch": 6.43875858801232,
"grad_norm": 0.05536512813414772,
"learning_rate": 0.0006774837432247276,
"loss": 2.3979,
"step": 20385
},
{
"epoch": 6.440337992576799,
"grad_norm": 0.06999585369846131,
"learning_rate": 0.0006769618126024337,
"loss": 2.4737,
"step": 20390
},
{
"epoch": 6.441917397141278,
"grad_norm": 0.0935108775167,
"learning_rate": 0.0006764399802139885,
"loss": 2.4879,
"step": 20395
},
{
"epoch": 6.443496801705757,
"grad_norm": 0.07381233374689976,
"learning_rate": 0.0006759182462180782,
"loss": 2.3666,
"step": 20400
},
{
"epoch": 6.445076206270236,
"grad_norm": 0.05131499052796765,
"learning_rate": 0.0006753966107733586,
"loss": 2.3764,
"step": 20405
},
{
"epoch": 6.446655610834715,
"grad_norm": 0.05533009728866976,
"learning_rate": 0.0006748750740384553,
"loss": 2.3707,
"step": 20410
},
{
"epoch": 6.448235015399194,
"grad_norm": 0.05390546171598726,
"learning_rate": 0.0006743536361719651,
"loss": 2.4821,
"step": 20415
},
{
"epoch": 6.4498144199636736,
"grad_norm": 0.07512206993558714,
"learning_rate": 0.0006738322973324534,
"loss": 2.436,
"step": 20420
},
{
"epoch": 6.451393824528153,
"grad_norm": 0.054634898457488874,
"learning_rate": 0.0006733110576784563,
"loss": 2.3275,
"step": 20425
},
{
"epoch": 6.452973229092632,
"grad_norm": 0.05450585017125078,
"learning_rate": 0.0006727899173684793,
"loss": 2.437,
"step": 20430
},
{
"epoch": 6.454552633657111,
"grad_norm": 0.06061471159834075,
"learning_rate": 0.0006722688765609975,
"loss": 2.3199,
"step": 20435
},
{
"epoch": 6.4561320382215905,
"grad_norm": 0.052256092507753815,
"learning_rate": 0.0006717479354144567,
"loss": 2.2832,
"step": 20440
},
{
"epoch": 6.45771144278607,
"grad_norm": 0.06333135818279083,
"learning_rate": 0.0006712270940872712,
"loss": 2.4067,
"step": 20445
},
{
"epoch": 6.459290847350549,
"grad_norm": 0.06693645700514088,
"learning_rate": 0.0006707063527378261,
"loss": 2.4187,
"step": 20450
},
{
"epoch": 6.460870251915028,
"grad_norm": 0.09742855878142472,
"learning_rate": 0.0006701857115244752,
"loss": 2.3825,
"step": 20455
},
{
"epoch": 6.4624496564795075,
"grad_norm": 0.06080532997515874,
"learning_rate": 0.0006696651706055418,
"loss": 2.5544,
"step": 20460
},
{
"epoch": 6.464029061043987,
"grad_norm": 0.06693902176632435,
"learning_rate": 0.0006691447301393199,
"loss": 2.4168,
"step": 20465
},
{
"epoch": 6.465608465608465,
"grad_norm": 0.0809218694358045,
"learning_rate": 0.0006686243902840714,
"loss": 2.5445,
"step": 20470
},
{
"epoch": 6.467187870172944,
"grad_norm": 0.06069332355264493,
"learning_rate": 0.0006681041511980288,
"loss": 2.3824,
"step": 20475
},
{
"epoch": 6.468767274737424,
"grad_norm": 0.06571379404356267,
"learning_rate": 0.0006675840130393933,
"loss": 2.4237,
"step": 20480
},
{
"epoch": 6.470346679301903,
"grad_norm": 0.07572609411052877,
"learning_rate": 0.0006670639759663353,
"loss": 2.4618,
"step": 20485
},
{
"epoch": 6.471926083866382,
"grad_norm": 0.05471074733766293,
"learning_rate": 0.0006665440401369953,
"loss": 2.5023,
"step": 20490
},
{
"epoch": 6.473505488430861,
"grad_norm": 0.0712791647410548,
"learning_rate": 0.0006660242057094821,
"loss": 2.4129,
"step": 20495
},
{
"epoch": 6.475084892995341,
"grad_norm": 0.07005922454887745,
"learning_rate": 0.0006655044728418738,
"loss": 2.4285,
"step": 20500
},
{
"epoch": 6.47666429755982,
"grad_norm": 0.06958646111624173,
"learning_rate": 0.0006649848416922186,
"loss": 2.3992,
"step": 20505
},
{
"epoch": 6.478243702124299,
"grad_norm": 0.07334876732527172,
"learning_rate": 0.0006644653124185323,
"loss": 2.4153,
"step": 20510
},
{
"epoch": 6.479823106688778,
"grad_norm": 0.07258404844749798,
"learning_rate": 0.0006639458851788009,
"loss": 2.3949,
"step": 20515
},
{
"epoch": 6.481402511253258,
"grad_norm": 0.10411797321323364,
"learning_rate": 0.0006634265601309787,
"loss": 2.3937,
"step": 20520
},
{
"epoch": 6.482981915817737,
"grad_norm": 0.09113918435988624,
"learning_rate": 0.0006629073374329888,
"loss": 2.3915,
"step": 20525
},
{
"epoch": 6.484561320382216,
"grad_norm": 0.05540007410136057,
"learning_rate": 0.0006623882172427241,
"loss": 2.4174,
"step": 20530
},
{
"epoch": 6.486140724946695,
"grad_norm": 0.08202788053405924,
"learning_rate": 0.0006618691997180455,
"loss": 2.4115,
"step": 20535
},
{
"epoch": 6.487720129511175,
"grad_norm": 0.09924470624914969,
"learning_rate": 0.0006613502850167829,
"loss": 2.502,
"step": 20540
},
{
"epoch": 6.489299534075654,
"grad_norm": 0.07684790883534742,
"learning_rate": 0.000660831473296735,
"loss": 2.3873,
"step": 20545
},
{
"epoch": 6.490878938640133,
"grad_norm": 0.05693514419709399,
"learning_rate": 0.0006603127647156686,
"loss": 2.4283,
"step": 20550
},
{
"epoch": 6.4924583432046115,
"grad_norm": 0.06553061754463374,
"learning_rate": 0.0006597941594313206,
"loss": 2.4539,
"step": 20555
},
{
"epoch": 6.494037747769091,
"grad_norm": 0.06355261773818827,
"learning_rate": 0.0006592756576013949,
"loss": 2.3769,
"step": 20560
},
{
"epoch": 6.49561715233357,
"grad_norm": 0.05852829833587037,
"learning_rate": 0.0006587572593835649,
"loss": 2.2922,
"step": 20565
},
{
"epoch": 6.497196556898049,
"grad_norm": 0.06057719520523879,
"learning_rate": 0.0006582389649354721,
"loss": 2.4873,
"step": 20570
},
{
"epoch": 6.4987759614625284,
"grad_norm": 0.06404783121293027,
"learning_rate": 0.0006577207744147262,
"loss": 2.5823,
"step": 20575
},
{
"epoch": 6.500355366027008,
"grad_norm": 0.06238517706108964,
"learning_rate": 0.0006572026879789063,
"loss": 2.4048,
"step": 20580
},
{
"epoch": 6.501934770591487,
"grad_norm": 0.084566544252628,
"learning_rate": 0.0006566847057855583,
"loss": 2.495,
"step": 20585
},
{
"epoch": 6.503514175155966,
"grad_norm": 0.08866824050321336,
"learning_rate": 0.0006561668279921982,
"loss": 2.4044,
"step": 20590
},
{
"epoch": 6.505093579720445,
"grad_norm": 0.06242580125310291,
"learning_rate": 0.0006556490547563089,
"loss": 2.5171,
"step": 20595
},
{
"epoch": 6.506672984284925,
"grad_norm": 0.06005794753994994,
"learning_rate": 0.0006551313862353417,
"loss": 2.3504,
"step": 20600
},
{
"epoch": 6.508252388849404,
"grad_norm": 0.06397329256961332,
"learning_rate": 0.0006546138225867167,
"loss": 2.3979,
"step": 20605
},
{
"epoch": 6.509831793413883,
"grad_norm": 0.07405986725381314,
"learning_rate": 0.0006540963639678214,
"loss": 2.3973,
"step": 20610
},
{
"epoch": 6.511411197978362,
"grad_norm": 0.06530724972921835,
"learning_rate": 0.0006535790105360116,
"loss": 2.4029,
"step": 20615
},
{
"epoch": 6.512990602542842,
"grad_norm": 0.05498352465599929,
"learning_rate": 0.0006530617624486118,
"loss": 2.4322,
"step": 20620
},
{
"epoch": 6.514570007107321,
"grad_norm": 0.05581188949448493,
"learning_rate": 0.0006525446198629129,
"loss": 2.4059,
"step": 20625
},
{
"epoch": 6.516149411671799,
"grad_norm": 0.08415959336897978,
"learning_rate": 0.0006520275829361755,
"loss": 2.4613,
"step": 20630
},
{
"epoch": 6.5177288162362785,
"grad_norm": 0.06862904566833473,
"learning_rate": 0.0006515106518256269,
"loss": 2.4917,
"step": 20635
},
{
"epoch": 6.519308220800758,
"grad_norm": 0.09169453611375598,
"learning_rate": 0.000650993826688462,
"loss": 2.3704,
"step": 20640
},
{
"epoch": 6.520887625365237,
"grad_norm": 0.06956139951956909,
"learning_rate": 0.0006504771076818451,
"loss": 2.3782,
"step": 20645
},
{
"epoch": 6.522467029929716,
"grad_norm": 0.06277690395283149,
"learning_rate": 0.0006499604949629064,
"loss": 2.4124,
"step": 20650
},
{
"epoch": 6.5240464344941955,
"grad_norm": 0.08322856565190066,
"learning_rate": 0.0006494439886887448,
"loss": 2.444,
"step": 20655
},
{
"epoch": 6.525625839058675,
"grad_norm": 0.05389248672929697,
"learning_rate": 0.0006489275890164264,
"loss": 2.4816,
"step": 20660
},
{
"epoch": 6.527205243623154,
"grad_norm": 0.05018136361957522,
"learning_rate": 0.0006484112961029851,
"loss": 2.4618,
"step": 20665
},
{
"epoch": 6.528784648187633,
"grad_norm": 0.07115480752235714,
"learning_rate": 0.0006478951101054225,
"loss": 2.4771,
"step": 20670
},
{
"epoch": 6.5303640527521125,
"grad_norm": 0.06140523510481152,
"learning_rate": 0.0006473790311807066,
"loss": 2.4204,
"step": 20675
},
{
"epoch": 6.531943457316592,
"grad_norm": 0.056264873222821726,
"learning_rate": 0.0006468630594857749,
"loss": 2.4072,
"step": 20680
},
{
"epoch": 6.533522861881071,
"grad_norm": 0.04836118315341504,
"learning_rate": 0.0006463471951775307,
"loss": 2.444,
"step": 20685
},
{
"epoch": 6.53510226644555,
"grad_norm": 0.057506584093042185,
"learning_rate": 0.0006458314384128447,
"loss": 2.5414,
"step": 20690
},
{
"epoch": 6.5366816710100295,
"grad_norm": 0.05276356453937048,
"learning_rate": 0.0006453157893485555,
"loss": 2.4246,
"step": 20695
},
{
"epoch": 6.538261075574509,
"grad_norm": 0.054430250916809086,
"learning_rate": 0.000644800248141468,
"loss": 2.4457,
"step": 20700
},
{
"epoch": 6.539840480138988,
"grad_norm": 0.07702321509104175,
"learning_rate": 0.0006442848149483565,
"loss": 2.3848,
"step": 20705
},
{
"epoch": 6.541419884703467,
"grad_norm": 0.0669765829276087,
"learning_rate": 0.0006437694899259597,
"loss": 2.4353,
"step": 20710
},
{
"epoch": 6.542999289267946,
"grad_norm": 0.06911662331662145,
"learning_rate": 0.0006432542732309849,
"loss": 2.434,
"step": 20715
},
{
"epoch": 6.544578693832425,
"grad_norm": 0.04670729422642143,
"learning_rate": 0.0006427391650201064,
"loss": 2.3938,
"step": 20720
},
{
"epoch": 6.546158098396904,
"grad_norm": 0.05420350055764316,
"learning_rate": 0.0006422241654499654,
"loss": 2.4174,
"step": 20725
},
{
"epoch": 6.547737502961383,
"grad_norm": 0.06064627205328386,
"learning_rate": 0.0006417092746771693,
"loss": 2.3583,
"step": 20730
},
{
"epoch": 6.549316907525863,
"grad_norm": 0.0724627474102502,
"learning_rate": 0.000641194492858294,
"loss": 2.4769,
"step": 20735
},
{
"epoch": 6.550896312090342,
"grad_norm": 0.06145175058664715,
"learning_rate": 0.0006406798201498806,
"loss": 2.5493,
"step": 20740
},
{
"epoch": 6.552475716654821,
"grad_norm": 0.07168946019155933,
"learning_rate": 0.0006401652567084386,
"loss": 2.4802,
"step": 20745
},
{
"epoch": 6.5540551212193,
"grad_norm": 0.05391704872919303,
"learning_rate": 0.0006396508026904428,
"loss": 2.4745,
"step": 20750
},
{
"epoch": 6.55563452578378,
"grad_norm": 0.05155171999881381,
"learning_rate": 0.0006391364582523355,
"loss": 2.383,
"step": 20755
},
{
"epoch": 6.557213930348259,
"grad_norm": 0.07862364403967002,
"learning_rate": 0.0006386222235505257,
"loss": 2.4155,
"step": 20760
},
{
"epoch": 6.558793334912738,
"grad_norm": 0.06171940669513623,
"learning_rate": 0.0006381080987413884,
"loss": 2.3414,
"step": 20765
},
{
"epoch": 6.560372739477217,
"grad_norm": 0.05391005538952901,
"learning_rate": 0.0006375940839812666,
"loss": 2.4023,
"step": 20770
},
{
"epoch": 6.561952144041697,
"grad_norm": 0.059746498294211754,
"learning_rate": 0.0006370801794264682,
"loss": 2.4768,
"step": 20775
},
{
"epoch": 6.563531548606176,
"grad_norm": 0.059891714288636426,
"learning_rate": 0.0006365663852332684,
"loss": 2.3974,
"step": 20780
},
{
"epoch": 6.565110953170654,
"grad_norm": 0.06020696840420814,
"learning_rate": 0.0006360527015579092,
"loss": 2.5119,
"step": 20785
},
{
"epoch": 6.566690357735133,
"grad_norm": 0.06637291578503648,
"learning_rate": 0.0006355391285565974,
"loss": 2.5168,
"step": 20790
},
{
"epoch": 6.568269762299613,
"grad_norm": 0.057836322139409785,
"learning_rate": 0.0006350256663855085,
"loss": 2.388,
"step": 20795
},
{
"epoch": 6.569849166864092,
"grad_norm": 0.059381685790223855,
"learning_rate": 0.0006345123152007826,
"loss": 2.4761,
"step": 20800
},
{
"epoch": 6.571428571428571,
"grad_norm": 0.07339162811532468,
"learning_rate": 0.0006339990751585264,
"loss": 2.485,
"step": 20805
},
{
"epoch": 6.57300797599305,
"grad_norm": 0.053332487819192885,
"learning_rate": 0.0006334859464148131,
"loss": 2.3739,
"step": 20810
},
{
"epoch": 6.57458738055753,
"grad_norm": 0.058735428240274216,
"learning_rate": 0.0006329729291256814,
"loss": 2.425,
"step": 20815
},
{
"epoch": 6.576166785122009,
"grad_norm": 0.05697964987032508,
"learning_rate": 0.0006324600234471372,
"loss": 2.4277,
"step": 20820
},
{
"epoch": 6.577746189686488,
"grad_norm": 0.05514067686993358,
"learning_rate": 0.0006319472295351517,
"loss": 2.4361,
"step": 20825
},
{
"epoch": 6.579325594250967,
"grad_norm": 0.06102788490633586,
"learning_rate": 0.000631434547545662,
"loss": 2.5325,
"step": 20830
},
{
"epoch": 6.580904998815447,
"grad_norm": 0.05897696248779253,
"learning_rate": 0.0006309219776345717,
"loss": 2.4727,
"step": 20835
},
{
"epoch": 6.582484403379926,
"grad_norm": 0.06727456644683324,
"learning_rate": 0.00063040951995775,
"loss": 2.3821,
"step": 20840
},
{
"epoch": 6.584063807944405,
"grad_norm": 0.04967440222945126,
"learning_rate": 0.0006298971746710316,
"loss": 2.2977,
"step": 20845
},
{
"epoch": 6.585643212508884,
"grad_norm": 0.05384296029749339,
"learning_rate": 0.0006293849419302178,
"loss": 2.4543,
"step": 20850
},
{
"epoch": 6.587222617073364,
"grad_norm": 0.06125422766949562,
"learning_rate": 0.0006288728218910751,
"loss": 2.4878,
"step": 20855
},
{
"epoch": 6.588802021637843,
"grad_norm": 0.07072270452560224,
"learning_rate": 0.0006283608147093362,
"loss": 2.4066,
"step": 20860
},
{
"epoch": 6.590381426202322,
"grad_norm": 0.08089157778628371,
"learning_rate": 0.0006278489205406992,
"loss": 2.4273,
"step": 20865
},
{
"epoch": 6.591960830766801,
"grad_norm": 0.06807327668370637,
"learning_rate": 0.0006273371395408276,
"loss": 2.4643,
"step": 20870
},
{
"epoch": 6.59354023533128,
"grad_norm": 0.051338173826827205,
"learning_rate": 0.000626825471865351,
"loss": 2.4302,
"step": 20875
},
{
"epoch": 6.595119639895759,
"grad_norm": 0.06388132555176677,
"learning_rate": 0.0006263139176698638,
"loss": 2.4533,
"step": 20880
},
{
"epoch": 6.596699044460238,
"grad_norm": 0.06100198666281842,
"learning_rate": 0.0006258024771099269,
"loss": 2.3827,
"step": 20885
},
{
"epoch": 6.5982784490247175,
"grad_norm": 0.0615362542419615,
"learning_rate": 0.0006252911503410661,
"loss": 2.3859,
"step": 20890
},
{
"epoch": 6.599857853589197,
"grad_norm": 0.05506183334065402,
"learning_rate": 0.000624779937518772,
"loss": 2.3833,
"step": 20895
},
{
"epoch": 6.601437258153676,
"grad_norm": 0.06586044247997229,
"learning_rate": 0.000624268838798502,
"loss": 2.4417,
"step": 20900
},
{
"epoch": 6.603016662718155,
"grad_norm": 0.05622418928177649,
"learning_rate": 0.0006237578543356769,
"loss": 2.4432,
"step": 20905
},
{
"epoch": 6.6045960672826345,
"grad_norm": 0.054937089771462236,
"learning_rate": 0.0006232469842856849,
"loss": 2.4022,
"step": 20910
},
{
"epoch": 6.606175471847114,
"grad_norm": 0.0585384939232035,
"learning_rate": 0.0006227362288038778,
"loss": 2.3998,
"step": 20915
},
{
"epoch": 6.607754876411593,
"grad_norm": 0.0694134000321541,
"learning_rate": 0.000622225588045573,
"loss": 2.4481,
"step": 20920
},
{
"epoch": 6.609334280976072,
"grad_norm": 0.05339212873887325,
"learning_rate": 0.0006217150621660532,
"loss": 2.372,
"step": 20925
},
{
"epoch": 6.6109136855405515,
"grad_norm": 0.059635604148924506,
"learning_rate": 0.0006212046513205661,
"loss": 2.3724,
"step": 20930
},
{
"epoch": 6.612493090105031,
"grad_norm": 0.05840989832969823,
"learning_rate": 0.0006206943556643246,
"loss": 2.4719,
"step": 20935
},
{
"epoch": 6.61407249466951,
"grad_norm": 0.05419546777536966,
"learning_rate": 0.0006201841753525058,
"loss": 2.3878,
"step": 20940
},
{
"epoch": 6.615651899233988,
"grad_norm": 0.0694169198910676,
"learning_rate": 0.0006196741105402524,
"loss": 2.5015,
"step": 20945
},
{
"epoch": 6.617231303798468,
"grad_norm": 0.05556344170108016,
"learning_rate": 0.0006191641613826723,
"loss": 2.3889,
"step": 20950
},
{
"epoch": 6.618810708362947,
"grad_norm": 0.052620835978537304,
"learning_rate": 0.0006186543280348375,
"loss": 2.3962,
"step": 20955
},
{
"epoch": 6.620390112927426,
"grad_norm": 0.06339890729547158,
"learning_rate": 0.0006181446106517849,
"loss": 2.455,
"step": 20960
},
{
"epoch": 6.621969517491905,
"grad_norm": 0.05505793978721506,
"learning_rate": 0.0006176350093885166,
"loss": 2.3367,
"step": 20965
},
{
"epoch": 6.623548922056385,
"grad_norm": 0.057843051055069136,
"learning_rate": 0.0006171255243999987,
"loss": 2.5125,
"step": 20970
},
{
"epoch": 6.625128326620864,
"grad_norm": 0.06214587643757465,
"learning_rate": 0.0006166161558411627,
"loss": 2.4258,
"step": 20975
},
{
"epoch": 6.626707731185343,
"grad_norm": 0.05623495978859743,
"learning_rate": 0.0006161069038669044,
"loss": 2.4656,
"step": 20980
},
{
"epoch": 6.628287135749822,
"grad_norm": 0.09347695348850525,
"learning_rate": 0.0006155977686320837,
"loss": 2.4758,
"step": 20985
},
{
"epoch": 6.629866540314302,
"grad_norm": 0.08204971148501496,
"learning_rate": 0.0006150887502915257,
"loss": 2.5468,
"step": 20990
},
{
"epoch": 6.631445944878781,
"grad_norm": 0.061384071592050546,
"learning_rate": 0.000614579849000019,
"loss": 2.424,
"step": 20995
},
{
"epoch": 6.63302534944326,
"grad_norm": 0.0758429337434268,
"learning_rate": 0.0006140710649123182,
"loss": 2.4577,
"step": 21000
},
{
"epoch": 6.634604754007739,
"grad_norm": 0.06750234585741496,
"learning_rate": 0.0006135623981831408,
"loss": 2.4611,
"step": 21005
},
{
"epoch": 6.6361841585722185,
"grad_norm": 0.05212622446955658,
"learning_rate": 0.0006130538489671688,
"loss": 2.4119,
"step": 21010
},
{
"epoch": 6.637763563136698,
"grad_norm": 0.05171825318302362,
"learning_rate": 0.0006125454174190492,
"loss": 2.4197,
"step": 21015
},
{
"epoch": 6.639342967701177,
"grad_norm": 0.04999694587867551,
"learning_rate": 0.0006120371036933927,
"loss": 2.4476,
"step": 21020
},
{
"epoch": 6.640922372265656,
"grad_norm": 0.05063313641490577,
"learning_rate": 0.0006115289079447742,
"loss": 2.5165,
"step": 21025
},
{
"epoch": 6.642501776830135,
"grad_norm": 0.0644335498574247,
"learning_rate": 0.0006110208303277329,
"loss": 2.3849,
"step": 21030
},
{
"epoch": 6.644081181394614,
"grad_norm": 0.06294764797698174,
"learning_rate": 0.0006105128709967714,
"loss": 2.4369,
"step": 21035
},
{
"epoch": 6.645660585959093,
"grad_norm": 0.055502495101548276,
"learning_rate": 0.0006100050301063577,
"loss": 2.4349,
"step": 21040
},
{
"epoch": 6.647239990523572,
"grad_norm": 0.06655289353912996,
"learning_rate": 0.0006094973078109222,
"loss": 2.4072,
"step": 21045
},
{
"epoch": 6.648819395088052,
"grad_norm": 0.05201586001744827,
"learning_rate": 0.0006089897042648609,
"loss": 2.39,
"step": 21050
},
{
"epoch": 6.650398799652531,
"grad_norm": 0.053748384847827874,
"learning_rate": 0.0006084822196225322,
"loss": 2.4438,
"step": 21055
},
{
"epoch": 6.65197820421701,
"grad_norm": 0.051300648982207465,
"learning_rate": 0.0006079748540382587,
"loss": 2.3912,
"step": 21060
},
{
"epoch": 6.653557608781489,
"grad_norm": 0.04687662341409184,
"learning_rate": 0.0006074676076663277,
"loss": 2.4639,
"step": 21065
},
{
"epoch": 6.655137013345969,
"grad_norm": 0.07092567169247159,
"learning_rate": 0.0006069604806609893,
"loss": 2.4718,
"step": 21070
},
{
"epoch": 6.656716417910448,
"grad_norm": 0.05984170026138167,
"learning_rate": 0.0006064534731764573,
"loss": 2.4754,
"step": 21075
},
{
"epoch": 6.658295822474927,
"grad_norm": 0.0635421823482248,
"learning_rate": 0.0006059465853669098,
"loss": 2.429,
"step": 21080
},
{
"epoch": 6.659875227039406,
"grad_norm": 0.06890486031509065,
"learning_rate": 0.0006054398173864876,
"loss": 2.4638,
"step": 21085
},
{
"epoch": 6.661454631603886,
"grad_norm": 0.0690713561702384,
"learning_rate": 0.0006049331693892965,
"loss": 2.3999,
"step": 21090
},
{
"epoch": 6.663034036168365,
"grad_norm": 0.06404351030765236,
"learning_rate": 0.0006044266415294046,
"loss": 2.3991,
"step": 21095
},
{
"epoch": 6.664613440732843,
"grad_norm": 0.061888142256296186,
"learning_rate": 0.0006039202339608432,
"loss": 2.5241,
"step": 21100
},
{
"epoch": 6.6661928452973225,
"grad_norm": 0.0729176166350399,
"learning_rate": 0.0006034139468376083,
"loss": 2.3942,
"step": 21105
},
{
"epoch": 6.667772249861802,
"grad_norm": 0.04947800128694307,
"learning_rate": 0.0006029077803136581,
"loss": 2.419,
"step": 21110
},
{
"epoch": 6.669351654426281,
"grad_norm": 0.06841577856807288,
"learning_rate": 0.0006024017345429149,
"loss": 2.4606,
"step": 21115
},
{
"epoch": 6.67093105899076,
"grad_norm": 0.05770427309484664,
"learning_rate": 0.0006018958096792641,
"loss": 2.439,
"step": 21120
},
{
"epoch": 6.6725104635552395,
"grad_norm": 0.05522581275167986,
"learning_rate": 0.0006013900058765535,
"loss": 2.4211,
"step": 21125
},
{
"epoch": 6.674089868119719,
"grad_norm": 0.0600817695752141,
"learning_rate": 0.0006008843232885958,
"loss": 2.4781,
"step": 21130
},
{
"epoch": 6.675669272684198,
"grad_norm": 0.07080242556859254,
"learning_rate": 0.0006003787620691651,
"loss": 2.4881,
"step": 21135
},
{
"epoch": 6.677248677248677,
"grad_norm": 0.06125436041243013,
"learning_rate": 0.0005998733223719998,
"loss": 2.4683,
"step": 21140
},
{
"epoch": 6.6788280818131565,
"grad_norm": 0.08281825486541163,
"learning_rate": 0.0005993680043508007,
"loss": 2.4371,
"step": 21145
},
{
"epoch": 6.680407486377636,
"grad_norm": 0.07741584389134457,
"learning_rate": 0.0005988628081592313,
"loss": 2.4019,
"step": 21150
},
{
"epoch": 6.681986890942115,
"grad_norm": 0.08786921274591279,
"learning_rate": 0.0005983577339509196,
"loss": 2.4488,
"step": 21155
},
{
"epoch": 6.683566295506594,
"grad_norm": 0.06565792219426568,
"learning_rate": 0.0005978527818794545,
"loss": 2.4433,
"step": 21160
},
{
"epoch": 6.6851457000710734,
"grad_norm": 0.05681726799483692,
"learning_rate": 0.0005973479520983892,
"loss": 2.4619,
"step": 21165
},
{
"epoch": 6.686725104635553,
"grad_norm": 0.0679320979214565,
"learning_rate": 0.0005968432447612391,
"loss": 2.4669,
"step": 21170
},
{
"epoch": 6.688304509200032,
"grad_norm": 0.05167237326549837,
"learning_rate": 0.000596338660021482,
"loss": 2.53,
"step": 21175
},
{
"epoch": 6.689883913764511,
"grad_norm": 0.052642134871684294,
"learning_rate": 0.0005958341980325598,
"loss": 2.3834,
"step": 21180
},
{
"epoch": 6.69146331832899,
"grad_norm": 0.05183197534273639,
"learning_rate": 0.0005953298589478757,
"loss": 2.5494,
"step": 21185
},
{
"epoch": 6.693042722893469,
"grad_norm": 0.058016467201555635,
"learning_rate": 0.0005948256429207957,
"loss": 2.4402,
"step": 21190
},
{
"epoch": 6.694622127457948,
"grad_norm": 0.05504376462993496,
"learning_rate": 0.0005943215501046492,
"loss": 2.4724,
"step": 21195
},
{
"epoch": 6.696201532022427,
"grad_norm": 0.04805213505710295,
"learning_rate": 0.000593817580652727,
"loss": 2.432,
"step": 21200
},
{
"epoch": 6.6977809365869065,
"grad_norm": 0.05438841893764982,
"learning_rate": 0.0005933137347182838,
"loss": 2.4917,
"step": 21205
},
{
"epoch": 6.699360341151386,
"grad_norm": 0.06253126037078709,
"learning_rate": 0.0005928100124545355,
"loss": 2.3982,
"step": 21210
},
{
"epoch": 6.700939745715865,
"grad_norm": 0.06946817209182686,
"learning_rate": 0.0005923064140146602,
"loss": 2.3622,
"step": 21215
},
{
"epoch": 6.702519150280344,
"grad_norm": 0.055943700526094424,
"learning_rate": 0.0005918029395518001,
"loss": 2.432,
"step": 21220
},
{
"epoch": 6.7040985548448235,
"grad_norm": 0.0625940926110543,
"learning_rate": 0.0005912995892190578,
"loss": 2.4415,
"step": 21225
},
{
"epoch": 6.705677959409303,
"grad_norm": 0.05202327389180163,
"learning_rate": 0.0005907963631694993,
"loss": 2.4042,
"step": 21230
},
{
"epoch": 6.707257363973782,
"grad_norm": 0.07192885151510676,
"learning_rate": 0.0005902932615561524,
"loss": 2.4617,
"step": 21235
},
{
"epoch": 6.708836768538261,
"grad_norm": 0.05317858526574907,
"learning_rate": 0.0005897902845320064,
"loss": 2.4618,
"step": 21240
},
{
"epoch": 6.7104161731027405,
"grad_norm": 0.05493955034968431,
"learning_rate": 0.0005892874322500146,
"loss": 2.4495,
"step": 21245
},
{
"epoch": 6.71199557766722,
"grad_norm": 0.057507254717356474,
"learning_rate": 0.0005887847048630902,
"loss": 2.4012,
"step": 21250
},
{
"epoch": 6.713574982231699,
"grad_norm": 0.06348054251390015,
"learning_rate": 0.00058828210252411,
"loss": 2.4883,
"step": 21255
},
{
"epoch": 6.715154386796177,
"grad_norm": 0.05405229919959808,
"learning_rate": 0.0005877796253859118,
"loss": 2.4362,
"step": 21260
},
{
"epoch": 6.716733791360657,
"grad_norm": 0.06359025309672207,
"learning_rate": 0.0005872772736012955,
"loss": 2.4992,
"step": 21265
},
{
"epoch": 6.718313195925136,
"grad_norm": 0.05766892929360303,
"learning_rate": 0.0005867750473230235,
"loss": 2.5039,
"step": 21270
},
{
"epoch": 6.719892600489615,
"grad_norm": 0.05185065731909846,
"learning_rate": 0.0005862729467038195,
"loss": 2.4521,
"step": 21275
},
{
"epoch": 6.721472005054094,
"grad_norm": 0.0638969724670669,
"learning_rate": 0.000585770971896369,
"loss": 2.4547,
"step": 21280
},
{
"epoch": 6.723051409618574,
"grad_norm": 0.07557132807061367,
"learning_rate": 0.0005852691230533196,
"loss": 2.5275,
"step": 21285
},
{
"epoch": 6.724630814183053,
"grad_norm": 0.06444976114625953,
"learning_rate": 0.0005847674003272797,
"loss": 2.4865,
"step": 21290
},
{
"epoch": 6.726210218747532,
"grad_norm": 0.048931507948122496,
"learning_rate": 0.0005842658038708206,
"loss": 2.418,
"step": 21295
},
{
"epoch": 6.727789623312011,
"grad_norm": 0.059369553415906454,
"learning_rate": 0.0005837643338364744,
"loss": 2.3805,
"step": 21300
},
{
"epoch": 6.729369027876491,
"grad_norm": 0.05844950933586259,
"learning_rate": 0.0005832629903767345,
"loss": 2.402,
"step": 21305
},
{
"epoch": 6.73094843244097,
"grad_norm": 0.05893507399240853,
"learning_rate": 0.0005827617736440569,
"loss": 2.4748,
"step": 21310
},
{
"epoch": 6.732527837005449,
"grad_norm": 0.05084658590116852,
"learning_rate": 0.0005822606837908578,
"loss": 2.3668,
"step": 21315
},
{
"epoch": 6.734107241569928,
"grad_norm": 0.049818689273461586,
"learning_rate": 0.0005817597209695162,
"loss": 2.4421,
"step": 21320
},
{
"epoch": 6.735686646134408,
"grad_norm": 0.06560679436585973,
"learning_rate": 0.0005812588853323713,
"loss": 2.4364,
"step": 21325
},
{
"epoch": 6.737266050698887,
"grad_norm": 0.05177158420238849,
"learning_rate": 0.0005807581770317237,
"loss": 2.3318,
"step": 21330
},
{
"epoch": 6.738845455263366,
"grad_norm": 0.05844167575968277,
"learning_rate": 0.000580257596219836,
"loss": 2.5628,
"step": 21335
},
{
"epoch": 6.740424859827845,
"grad_norm": 0.05560042809757776,
"learning_rate": 0.0005797571430489311,
"loss": 2.4057,
"step": 21340
},
{
"epoch": 6.742004264392325,
"grad_norm": 0.055891772567053834,
"learning_rate": 0.0005792568176711944,
"loss": 2.4307,
"step": 21345
},
{
"epoch": 6.743583668956803,
"grad_norm": 0.05737511434738402,
"learning_rate": 0.0005787566202387713,
"loss": 2.4589,
"step": 21350
},
{
"epoch": 6.745163073521282,
"grad_norm": 0.0631938128075815,
"learning_rate": 0.000578256550903768,
"loss": 2.4153,
"step": 21355
},
{
"epoch": 6.746742478085761,
"grad_norm": 0.060357085037496414,
"learning_rate": 0.0005777566098182536,
"loss": 2.4025,
"step": 21360
},
{
"epoch": 6.748321882650241,
"grad_norm": 0.04874718966699242,
"learning_rate": 0.0005772567971342557,
"loss": 2.4402,
"step": 21365
},
{
"epoch": 6.74990128721472,
"grad_norm": 0.06512903170789627,
"learning_rate": 0.0005767571130037654,
"loss": 2.4672,
"step": 21370
},
{
"epoch": 6.751480691779199,
"grad_norm": 0.053667384264513535,
"learning_rate": 0.0005762575575787332,
"loss": 2.4265,
"step": 21375
},
{
"epoch": 6.753060096343678,
"grad_norm": 0.07297903970161429,
"learning_rate": 0.0005757581310110696,
"loss": 2.4725,
"step": 21380
},
{
"epoch": 6.754639500908158,
"grad_norm": 0.05872227870883123,
"learning_rate": 0.0005752588334526483,
"loss": 2.373,
"step": 21385
},
{
"epoch": 6.756218905472637,
"grad_norm": 0.06336969603677607,
"learning_rate": 0.0005747596650553019,
"loss": 2.3794,
"step": 21390
},
{
"epoch": 6.757798310037116,
"grad_norm": 0.06503276039389301,
"learning_rate": 0.000574260625970824,
"loss": 2.5187,
"step": 21395
},
{
"epoch": 6.759377714601595,
"grad_norm": 0.060933732085715314,
"learning_rate": 0.0005737617163509701,
"loss": 2.3795,
"step": 21400
},
{
"epoch": 6.760957119166075,
"grad_norm": 0.0605127703286891,
"learning_rate": 0.0005732629363474544,
"loss": 2.4924,
"step": 21405
},
{
"epoch": 6.762536523730554,
"grad_norm": 0.07433786962800859,
"learning_rate": 0.0005727642861119537,
"loss": 2.4789,
"step": 21410
},
{
"epoch": 6.764115928295032,
"grad_norm": 0.05739265169637819,
"learning_rate": 0.0005722657657961041,
"loss": 2.3773,
"step": 21415
},
{
"epoch": 6.7656953328595115,
"grad_norm": 0.05821728913037662,
"learning_rate": 0.000571767375551502,
"loss": 2.4953,
"step": 21420
},
{
"epoch": 6.767274737423991,
"grad_norm": 0.05726282173331931,
"learning_rate": 0.0005712691155297052,
"loss": 2.4107,
"step": 21425
},
{
"epoch": 6.76885414198847,
"grad_norm": 0.060644972908402456,
"learning_rate": 0.0005707709858822305,
"loss": 2.4309,
"step": 21430
},
{
"epoch": 6.770433546552949,
"grad_norm": 0.05875530299857965,
"learning_rate": 0.0005702729867605571,
"loss": 2.4803,
"step": 21435
},
{
"epoch": 6.7720129511174285,
"grad_norm": 0.07191803996274608,
"learning_rate": 0.0005697751183161228,
"loss": 2.4641,
"step": 21440
},
{
"epoch": 6.773592355681908,
"grad_norm": 0.05775390158910223,
"learning_rate": 0.0005692773807003257,
"loss": 2.4321,
"step": 21445
},
{
"epoch": 6.775171760246387,
"grad_norm": 0.05510099252873888,
"learning_rate": 0.0005687797740645257,
"loss": 2.3841,
"step": 21450
},
{
"epoch": 6.776751164810866,
"grad_norm": 0.05358811307984855,
"learning_rate": 0.0005682822985600409,
"loss": 2.4162,
"step": 21455
},
{
"epoch": 6.7783305693753455,
"grad_norm": 0.05802284171031527,
"learning_rate": 0.000567784954338151,
"loss": 2.4454,
"step": 21460
},
{
"epoch": 6.779909973939825,
"grad_norm": 0.0505949926437704,
"learning_rate": 0.0005672877415500956,
"loss": 2.3944,
"step": 21465
},
{
"epoch": 6.781489378504304,
"grad_norm": 0.05866143634554086,
"learning_rate": 0.0005667906603470723,
"loss": 2.3932,
"step": 21470
},
{
"epoch": 6.783068783068783,
"grad_norm": 0.05793737809646166,
"learning_rate": 0.000566293710880242,
"loss": 2.4064,
"step": 21475
},
{
"epoch": 6.7846481876332625,
"grad_norm": 0.0519726652750793,
"learning_rate": 0.0005657968933007227,
"loss": 2.4746,
"step": 21480
},
{
"epoch": 6.786227592197742,
"grad_norm": 0.058300041427298825,
"learning_rate": 0.0005653002077595944,
"loss": 2.4135,
"step": 21485
},
{
"epoch": 6.787806996762221,
"grad_norm": 0.05234486328538095,
"learning_rate": 0.0005648036544078954,
"loss": 2.3928,
"step": 21490
},
{
"epoch": 6.7893864013267,
"grad_norm": 0.055602872160708736,
"learning_rate": 0.0005643072333966242,
"loss": 2.4309,
"step": 21495
},
{
"epoch": 6.7909658058911795,
"grad_norm": 0.05173743221141718,
"learning_rate": 0.0005638109448767399,
"loss": 2.4736,
"step": 21500
},
{
"epoch": 6.792545210455658,
"grad_norm": 0.05907175588242061,
"learning_rate": 0.0005633147889991606,
"loss": 2.6019,
"step": 21505
},
{
"epoch": 6.794124615020137,
"grad_norm": 0.0615331667397914,
"learning_rate": 0.0005628187659147637,
"loss": 2.5359,
"step": 21510
},
{
"epoch": 6.795704019584616,
"grad_norm": 0.06127667387678013,
"learning_rate": 0.000562322875774387,
"loss": 2.4743,
"step": 21515
},
{
"epoch": 6.797283424149096,
"grad_norm": 0.10062733082611985,
"learning_rate": 0.0005618271187288269,
"loss": 2.3991,
"step": 21520
},
{
"epoch": 6.798862828713575,
"grad_norm": 0.0649846458859756,
"learning_rate": 0.0005613314949288408,
"loss": 2.4433,
"step": 21525
},
{
"epoch": 6.800442233278054,
"grad_norm": 0.05435369272893038,
"learning_rate": 0.0005608360045251445,
"loss": 2.4667,
"step": 21530
},
{
"epoch": 6.802021637842533,
"grad_norm": 0.05969015357945196,
"learning_rate": 0.0005603406476684128,
"loss": 2.4626,
"step": 21535
},
{
"epoch": 6.803601042407013,
"grad_norm": 0.05751701647723755,
"learning_rate": 0.0005598454245092816,
"loss": 2.4328,
"step": 21540
},
{
"epoch": 6.805180446971492,
"grad_norm": 0.04962348059479472,
"learning_rate": 0.0005593503351983441,
"loss": 2.36,
"step": 21545
},
{
"epoch": 6.806759851535971,
"grad_norm": 0.057983305152976485,
"learning_rate": 0.0005588553798861547,
"loss": 2.379,
"step": 21550
},
{
"epoch": 6.80833925610045,
"grad_norm": 0.049851774150790285,
"learning_rate": 0.0005583605587232261,
"loss": 2.3591,
"step": 21555
},
{
"epoch": 6.80991866066493,
"grad_norm": 0.06148293087382803,
"learning_rate": 0.0005578658718600291,
"loss": 2.346,
"step": 21560
},
{
"epoch": 6.811498065229409,
"grad_norm": 0.0607279741352406,
"learning_rate": 0.0005573713194469961,
"loss": 2.4491,
"step": 21565
},
{
"epoch": 6.813077469793888,
"grad_norm": 0.0611488176613782,
"learning_rate": 0.0005568769016345162,
"loss": 2.4978,
"step": 21570
},
{
"epoch": 6.814656874358366,
"grad_norm": 0.06097889717255455,
"learning_rate": 0.0005563826185729398,
"loss": 2.3682,
"step": 21575
},
{
"epoch": 6.816236278922846,
"grad_norm": 0.05115123269234018,
"learning_rate": 0.0005558884704125748,
"loss": 2.3969,
"step": 21580
},
{
"epoch": 6.817815683487325,
"grad_norm": 0.07194913074065468,
"learning_rate": 0.0005553944573036879,
"loss": 2.3681,
"step": 21585
},
{
"epoch": 6.819395088051804,
"grad_norm": 0.06114575712306162,
"learning_rate": 0.0005549005793965065,
"loss": 2.4123,
"step": 21590
},
{
"epoch": 6.820974492616283,
"grad_norm": 0.05394160080735589,
"learning_rate": 0.0005544068368412149,
"loss": 2.3875,
"step": 21595
},
{
"epoch": 6.822553897180763,
"grad_norm": 0.06470531739214352,
"learning_rate": 0.0005539132297879574,
"loss": 2.497,
"step": 21600
},
{
"epoch": 6.824133301745242,
"grad_norm": 0.051509753550531084,
"learning_rate": 0.0005534197583868366,
"loss": 2.417,
"step": 21605
},
{
"epoch": 6.825712706309721,
"grad_norm": 0.051051927926556565,
"learning_rate": 0.0005529264227879134,
"loss": 2.4835,
"step": 21610
},
{
"epoch": 6.8272921108742,
"grad_norm": 0.047564532608101465,
"learning_rate": 0.000552433223141209,
"loss": 2.4317,
"step": 21615
},
{
"epoch": 6.82887151543868,
"grad_norm": 0.05734664728392672,
"learning_rate": 0.0005519401595967021,
"loss": 2.3542,
"step": 21620
},
{
"epoch": 6.830450920003159,
"grad_norm": 0.0672852984035047,
"learning_rate": 0.0005514472323043294,
"loss": 2.466,
"step": 21625
},
{
"epoch": 6.832030324567638,
"grad_norm": 0.05435036293623683,
"learning_rate": 0.0005509544414139878,
"loss": 2.3773,
"step": 21630
},
{
"epoch": 6.833609729132117,
"grad_norm": 0.06367650525566754,
"learning_rate": 0.0005504617870755313,
"loss": 2.4725,
"step": 21635
},
{
"epoch": 6.835189133696597,
"grad_norm": 0.06673691388756689,
"learning_rate": 0.0005499692694387735,
"loss": 2.4828,
"step": 21640
},
{
"epoch": 6.836768538261076,
"grad_norm": 0.05827313896785369,
"learning_rate": 0.0005494768886534858,
"loss": 2.3298,
"step": 21645
},
{
"epoch": 6.838347942825555,
"grad_norm": 0.0536158853716576,
"learning_rate": 0.0005489846448693971,
"loss": 2.4936,
"step": 21650
},
{
"epoch": 6.839927347390034,
"grad_norm": 0.05404584286993024,
"learning_rate": 0.0005484925382361967,
"loss": 2.3928,
"step": 21655
},
{
"epoch": 6.841506751954514,
"grad_norm": 0.05579403311119814,
"learning_rate": 0.0005480005689035303,
"loss": 2.3985,
"step": 21660
},
{
"epoch": 6.843086156518992,
"grad_norm": 0.04999571171378245,
"learning_rate": 0.0005475087370210032,
"loss": 2.4438,
"step": 21665
},
{
"epoch": 6.844665561083471,
"grad_norm": 0.05422807783375237,
"learning_rate": 0.0005470170427381782,
"loss": 2.4761,
"step": 21670
},
{
"epoch": 6.8462449656479505,
"grad_norm": 0.05942475947479741,
"learning_rate": 0.0005465254862045761,
"loss": 2.4057,
"step": 21675
},
{
"epoch": 6.84782437021243,
"grad_norm": 0.06514406946071337,
"learning_rate": 0.0005460340675696766,
"loss": 2.4166,
"step": 21680
},
{
"epoch": 6.849403774776909,
"grad_norm": 0.05360057779921819,
"learning_rate": 0.0005455427869829166,
"loss": 2.5401,
"step": 21685
},
{
"epoch": 6.850983179341388,
"grad_norm": 0.06089804877013015,
"learning_rate": 0.0005450516445936915,
"loss": 2.3797,
"step": 21690
},
{
"epoch": 6.8525625839058675,
"grad_norm": 0.05693908928925304,
"learning_rate": 0.0005445606405513546,
"loss": 2.4941,
"step": 21695
},
{
"epoch": 6.854141988470347,
"grad_norm": 0.0707655381654669,
"learning_rate": 0.0005440697750052166,
"loss": 2.357,
"step": 21700
},
{
"epoch": 6.855721393034826,
"grad_norm": 0.05577309405341037,
"learning_rate": 0.0005435790481045473,
"loss": 2.5022,
"step": 21705
},
{
"epoch": 6.857300797599305,
"grad_norm": 0.05945130649107957,
"learning_rate": 0.0005430884599985731,
"loss": 2.4243,
"step": 21710
},
{
"epoch": 6.8588802021637845,
"grad_norm": 0.059499421774360954,
"learning_rate": 0.0005425980108364793,
"loss": 2.4342,
"step": 21715
},
{
"epoch": 6.860459606728264,
"grad_norm": 0.06518694486381847,
"learning_rate": 0.0005421077007674079,
"loss": 2.4241,
"step": 21720
},
{
"epoch": 6.862039011292743,
"grad_norm": 0.051394730635343784,
"learning_rate": 0.0005416175299404588,
"loss": 2.4587,
"step": 21725
},
{
"epoch": 6.863618415857222,
"grad_norm": 0.045408359711962304,
"learning_rate": 0.0005411274985046905,
"loss": 2.3355,
"step": 21730
},
{
"epoch": 6.865197820421701,
"grad_norm": 0.0513470116736704,
"learning_rate": 0.0005406376066091186,
"loss": 2.4024,
"step": 21735
},
{
"epoch": 6.86677722498618,
"grad_norm": 0.05063925633979523,
"learning_rate": 0.0005401478544027145,
"loss": 2.4745,
"step": 21740
},
{
"epoch": 6.868356629550659,
"grad_norm": 0.07104281738141084,
"learning_rate": 0.0005396582420344105,
"loss": 2.343,
"step": 21745
},
{
"epoch": 6.869936034115138,
"grad_norm": 0.059455373567313806,
"learning_rate": 0.0005391687696530933,
"loss": 2.4954,
"step": 21750
},
{
"epoch": 6.871515438679618,
"grad_norm": 0.06096793387466837,
"learning_rate": 0.0005386794374076095,
"loss": 2.3438,
"step": 21755
},
{
"epoch": 6.873094843244097,
"grad_norm": 0.054379783576155465,
"learning_rate": 0.0005381902454467612,
"loss": 2.4693,
"step": 21760
},
{
"epoch": 6.874674247808576,
"grad_norm": 0.05652207840644949,
"learning_rate": 0.0005377011939193084,
"loss": 2.5034,
"step": 21765
},
{
"epoch": 6.876253652373055,
"grad_norm": 0.06233177406755634,
"learning_rate": 0.0005372122829739689,
"loss": 2.3965,
"step": 21770
},
{
"epoch": 6.8778330569375346,
"grad_norm": 0.06609802159000204,
"learning_rate": 0.0005367235127594176,
"loss": 2.4501,
"step": 21775
},
{
"epoch": 6.879412461502014,
"grad_norm": 0.06125662370101596,
"learning_rate": 0.0005362348834242861,
"loss": 2.3855,
"step": 21780
},
{
"epoch": 6.880991866066493,
"grad_norm": 0.06099320925783692,
"learning_rate": 0.0005357463951171635,
"loss": 2.3108,
"step": 21785
},
{
"epoch": 6.882571270630972,
"grad_norm": 0.058472540308738864,
"learning_rate": 0.0005352580479865954,
"loss": 2.3968,
"step": 21790
},
{
"epoch": 6.8841506751954515,
"grad_norm": 0.06232962054138482,
"learning_rate": 0.0005347698421810861,
"loss": 2.3888,
"step": 21795
},
{
"epoch": 6.885730079759931,
"grad_norm": 0.06477288485131953,
"learning_rate": 0.000534281777849095,
"loss": 2.4474,
"step": 21800
},
{
"epoch": 6.88730948432441,
"grad_norm": 0.05873390973067887,
"learning_rate": 0.0005337938551390398,
"loss": 2.4788,
"step": 21805
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.060654604539577196,
"learning_rate": 0.0005333060741992949,
"loss": 2.4519,
"step": 21810
},
{
"epoch": 6.8904682934533685,
"grad_norm": 0.05639692992085628,
"learning_rate": 0.0005328184351781905,
"loss": 2.3953,
"step": 21815
},
{
"epoch": 6.892047698017847,
"grad_norm": 0.053266466338713084,
"learning_rate": 0.0005323309382240155,
"loss": 2.3888,
"step": 21820
},
{
"epoch": 6.893627102582326,
"grad_norm": 0.05800361122482948,
"learning_rate": 0.0005318435834850142,
"loss": 2.3813,
"step": 21825
},
{
"epoch": 6.895206507146805,
"grad_norm": 0.06586588164894695,
"learning_rate": 0.000531356371109388,
"loss": 2.3707,
"step": 21830
},
{
"epoch": 6.896785911711285,
"grad_norm": 0.05851260838880374,
"learning_rate": 0.000530869301245295,
"loss": 2.424,
"step": 21835
},
{
"epoch": 6.898365316275764,
"grad_norm": 0.06209200802669711,
"learning_rate": 0.0005303823740408499,
"loss": 2.3685,
"step": 21840
},
{
"epoch": 6.899944720840243,
"grad_norm": 0.05286685692262011,
"learning_rate": 0.0005298955896441246,
"loss": 2.2661,
"step": 21845
},
{
"epoch": 6.901524125404722,
"grad_norm": 0.05713877322038269,
"learning_rate": 0.0005294089482031471,
"loss": 2.3312,
"step": 21850
},
{
"epoch": 6.903103529969202,
"grad_norm": 0.05528321789315224,
"learning_rate": 0.0005289224498659013,
"loss": 2.4055,
"step": 21855
},
{
"epoch": 6.904682934533681,
"grad_norm": 0.06110530953581327,
"learning_rate": 0.0005284360947803291,
"loss": 2.3902,
"step": 21860
},
{
"epoch": 6.90626233909816,
"grad_norm": 0.05159472647040717,
"learning_rate": 0.0005279498830943275,
"loss": 2.3989,
"step": 21865
},
{
"epoch": 6.907841743662639,
"grad_norm": 0.05335318785652728,
"learning_rate": 0.0005274638149557505,
"loss": 2.5193,
"step": 21870
},
{
"epoch": 6.909421148227119,
"grad_norm": 0.056920508642124584,
"learning_rate": 0.0005269778905124082,
"loss": 2.485,
"step": 21875
},
{
"epoch": 6.911000552791598,
"grad_norm": 0.06303317804323633,
"learning_rate": 0.0005264921099120668,
"loss": 2.3776,
"step": 21880
},
{
"epoch": 6.912579957356077,
"grad_norm": 0.059305948269867445,
"learning_rate": 0.0005260064733024498,
"loss": 2.4129,
"step": 21885
},
{
"epoch": 6.9141593619205555,
"grad_norm": 0.058581759702154644,
"learning_rate": 0.0005255209808312356,
"loss": 2.4401,
"step": 21890
},
{
"epoch": 6.915738766485035,
"grad_norm": 0.05312168898445783,
"learning_rate": 0.0005250356326460599,
"loss": 2.4257,
"step": 21895
},
{
"epoch": 6.917318171049514,
"grad_norm": 0.05630831885265081,
"learning_rate": 0.0005245504288945137,
"loss": 2.4652,
"step": 21900
},
{
"epoch": 6.918897575613993,
"grad_norm": 0.06559146434257056,
"learning_rate": 0.0005240653697241439,
"loss": 2.4492,
"step": 21905
},
{
"epoch": 6.9204769801784725,
"grad_norm": 0.06018418584465903,
"learning_rate": 0.0005235804552824548,
"loss": 2.3373,
"step": 21910
},
{
"epoch": 6.922056384742952,
"grad_norm": 0.05290805452225749,
"learning_rate": 0.0005230956857169051,
"loss": 2.3979,
"step": 21915
},
{
"epoch": 6.923635789307431,
"grad_norm": 0.05688352863619254,
"learning_rate": 0.0005226110611749106,
"loss": 2.4009,
"step": 21920
},
{
"epoch": 6.92521519387191,
"grad_norm": 0.062045953503586715,
"learning_rate": 0.0005221265818038422,
"loss": 2.5034,
"step": 21925
},
{
"epoch": 6.9267945984363894,
"grad_norm": 0.06701488553948802,
"learning_rate": 0.0005216422477510266,
"loss": 2.4982,
"step": 21930
},
{
"epoch": 6.928374003000869,
"grad_norm": 0.06656038450198463,
"learning_rate": 0.0005211580591637477,
"loss": 2.4535,
"step": 21935
},
{
"epoch": 6.929953407565348,
"grad_norm": 0.05927357694315514,
"learning_rate": 0.0005206740161892431,
"loss": 2.4145,
"step": 21940
},
{
"epoch": 6.931532812129827,
"grad_norm": 0.06324530362062546,
"learning_rate": 0.000520190118974708,
"loss": 2.4619,
"step": 21945
},
{
"epoch": 6.933112216694306,
"grad_norm": 0.06475746568052063,
"learning_rate": 0.0005197063676672922,
"loss": 2.4084,
"step": 21950
},
{
"epoch": 6.934691621258786,
"grad_norm": 0.05339050369592582,
"learning_rate": 0.0005192227624141014,
"loss": 2.4382,
"step": 21955
},
{
"epoch": 6.936271025823265,
"grad_norm": 0.05058452813668629,
"learning_rate": 0.0005187393033621966,
"loss": 2.4678,
"step": 21960
},
{
"epoch": 6.937850430387744,
"grad_norm": 0.06376407480731965,
"learning_rate": 0.000518255990658595,
"loss": 2.5248,
"step": 21965
},
{
"epoch": 6.939429834952223,
"grad_norm": 0.06208593674054495,
"learning_rate": 0.0005177728244502681,
"loss": 2.3976,
"step": 21970
},
{
"epoch": 6.941009239516703,
"grad_norm": 0.05444347232961063,
"learning_rate": 0.0005172898048841448,
"loss": 2.4416,
"step": 21975
},
{
"epoch": 6.942588644081181,
"grad_norm": 0.05412349409442446,
"learning_rate": 0.0005168069321071072,
"loss": 2.4072,
"step": 21980
},
{
"epoch": 6.94416804864566,
"grad_norm": 0.061800130782726656,
"learning_rate": 0.0005163242062659947,
"loss": 2.4059,
"step": 21985
},
{
"epoch": 6.9457474532101395,
"grad_norm": 0.06225518916268915,
"learning_rate": 0.000515841627507601,
"loss": 2.4214,
"step": 21990
},
{
"epoch": 6.947326857774619,
"grad_norm": 0.05028043840864827,
"learning_rate": 0.0005153591959786744,
"loss": 2.3807,
"step": 21995
},
{
"epoch": 6.948906262339098,
"grad_norm": 0.06280098731901271,
"learning_rate": 0.0005148769118259204,
"loss": 2.4558,
"step": 22000
},
{
"epoch": 6.950485666903577,
"grad_norm": 0.0545790632896203,
"learning_rate": 0.0005143947751959978,
"loss": 2.3941,
"step": 22005
},
{
"epoch": 6.9520650714680565,
"grad_norm": 0.05423863822796326,
"learning_rate": 0.0005139127862355215,
"loss": 2.4085,
"step": 22010
},
{
"epoch": 6.953644476032536,
"grad_norm": 0.05854777490309802,
"learning_rate": 0.0005134309450910612,
"loss": 2.3779,
"step": 22015
},
{
"epoch": 6.955223880597015,
"grad_norm": 0.06203241290016519,
"learning_rate": 0.0005129492519091414,
"loss": 2.3778,
"step": 22020
},
{
"epoch": 6.956803285161494,
"grad_norm": 0.06108661630820922,
"learning_rate": 0.0005124677068362427,
"loss": 2.3823,
"step": 22025
},
{
"epoch": 6.9583826897259735,
"grad_norm": 0.051174743117047304,
"learning_rate": 0.0005119863100187989,
"loss": 2.3984,
"step": 22030
},
{
"epoch": 6.959962094290453,
"grad_norm": 0.0583678329545391,
"learning_rate": 0.0005115050616032006,
"loss": 2.4062,
"step": 22035
},
{
"epoch": 6.961541498854932,
"grad_norm": 0.050732585539663234,
"learning_rate": 0.0005110239617357921,
"loss": 2.4515,
"step": 22040
},
{
"epoch": 6.963120903419411,
"grad_norm": 0.06316805828329164,
"learning_rate": 0.0005105430105628725,
"loss": 2.415,
"step": 22045
},
{
"epoch": 6.96470030798389,
"grad_norm": 0.05826522378664329,
"learning_rate": 0.0005100622082306964,
"loss": 2.4071,
"step": 22050
},
{
"epoch": 6.966279712548369,
"grad_norm": 0.06024407186568456,
"learning_rate": 0.0005095815548854718,
"loss": 2.4042,
"step": 22055
},
{
"epoch": 6.967859117112848,
"grad_norm": 0.05240525392795767,
"learning_rate": 0.0005091010506733637,
"loss": 2.3804,
"step": 22060
},
{
"epoch": 6.969438521677327,
"grad_norm": 0.050460358002704216,
"learning_rate": 0.0005086206957404895,
"loss": 2.3951,
"step": 22065
},
{
"epoch": 6.971017926241807,
"grad_norm": 0.059198426627816145,
"learning_rate": 0.0005081404902329219,
"loss": 2.4172,
"step": 22070
},
{
"epoch": 6.972597330806286,
"grad_norm": 0.07203729921772287,
"learning_rate": 0.0005076604342966888,
"loss": 2.4817,
"step": 22075
},
{
"epoch": 6.974176735370765,
"grad_norm": 0.054764474869055016,
"learning_rate": 0.0005071805280777721,
"loss": 2.4372,
"step": 22080
},
{
"epoch": 6.975756139935244,
"grad_norm": 0.06859694147601073,
"learning_rate": 0.0005067007717221078,
"loss": 2.4975,
"step": 22085
},
{
"epoch": 6.977335544499724,
"grad_norm": 0.05521051080090544,
"learning_rate": 0.0005062211653755874,
"loss": 2.4881,
"step": 22090
},
{
"epoch": 6.978914949064203,
"grad_norm": 0.0640812527980952,
"learning_rate": 0.0005057417091840558,
"loss": 2.405,
"step": 22095
},
{
"epoch": 6.980494353628682,
"grad_norm": 0.0802579141078725,
"learning_rate": 0.0005052624032933124,
"loss": 2.3667,
"step": 22100
},
{
"epoch": 6.982073758193161,
"grad_norm": 0.0676106499145331,
"learning_rate": 0.0005047832478491112,
"loss": 2.4645,
"step": 22105
},
{
"epoch": 6.983653162757641,
"grad_norm": 0.057582598854102246,
"learning_rate": 0.0005043042429971601,
"loss": 2.4041,
"step": 22110
},
{
"epoch": 6.98523256732212,
"grad_norm": 0.0642687758267521,
"learning_rate": 0.000503825388883122,
"loss": 2.5206,
"step": 22115
},
{
"epoch": 6.986811971886599,
"grad_norm": 0.05943948002624905,
"learning_rate": 0.0005033466856526123,
"loss": 2.5057,
"step": 22120
},
{
"epoch": 6.988391376451078,
"grad_norm": 0.061296324638803454,
"learning_rate": 0.0005028681334512028,
"loss": 2.5147,
"step": 22125
},
{
"epoch": 6.989970781015558,
"grad_norm": 0.06645640402884086,
"learning_rate": 0.0005023897324244178,
"loss": 2.4797,
"step": 22130
},
{
"epoch": 6.991550185580037,
"grad_norm": 0.05908400342848221,
"learning_rate": 0.0005019114827177358,
"loss": 2.3979,
"step": 22135
},
{
"epoch": 6.993129590144515,
"grad_norm": 0.051582964000261286,
"learning_rate": 0.0005014333844765895,
"loss": 2.4004,
"step": 22140
},
{
"epoch": 6.994708994708994,
"grad_norm": 0.058922640607268637,
"learning_rate": 0.0005009554378463653,
"loss": 2.4946,
"step": 22145
},
{
"epoch": 6.996288399273474,
"grad_norm": 0.05257449063779885,
"learning_rate": 0.0005004776429724041,
"loss": 2.5856,
"step": 22150
},
{
"epoch": 6.997867803837953,
"grad_norm": 0.05202237087799441,
"learning_rate": 0.0005000000000000002,
"loss": 2.4001,
"step": 22155
},
{
"epoch": 6.999447208402432,
"grad_norm": 0.06504574622504392,
"learning_rate": 0.0004995225090744013,
"loss": 2.3885,
"step": 22160
},
{
"epoch": 7.0,
"eval_loss": 2.4251391887664795,
"eval_runtime": 118.8395,
"eval_samples_per_second": 22.291,
"eval_steps_per_second": 5.579,
"step": 22162
},
{
"epoch": 7.000947642738687,
"grad_norm": 0.062346395048555915,
"learning_rate": 0.0004990451703408103,
"loss": 2.4199,
"step": 22165
},
{
"epoch": 7.002527047303166,
"grad_norm": 0.07037473965441808,
"learning_rate": 0.0004985679839443818,
"loss": 2.4788,
"step": 22170
},
{
"epoch": 7.004106451867646,
"grad_norm": 0.06285878195523269,
"learning_rate": 0.0004980909500302261,
"loss": 2.4508,
"step": 22175
},
{
"epoch": 7.005685856432125,
"grad_norm": 0.05917472863936181,
"learning_rate": 0.0004976140687434057,
"loss": 2.3731,
"step": 22180
},
{
"epoch": 7.007265260996604,
"grad_norm": 0.048792163797312715,
"learning_rate": 0.0004971373402289371,
"loss": 2.3495,
"step": 22185
},
{
"epoch": 7.008844665561083,
"grad_norm": 0.05213109528685697,
"learning_rate": 0.0004966607646317905,
"loss": 2.3474,
"step": 22190
},
{
"epoch": 7.010424070125563,
"grad_norm": 0.0781050242157072,
"learning_rate": 0.0004961843420968894,
"loss": 2.4575,
"step": 22195
},
{
"epoch": 7.012003474690042,
"grad_norm": 0.06626379813972594,
"learning_rate": 0.0004957080727691107,
"loss": 2.4007,
"step": 22200
},
{
"epoch": 7.013582879254521,
"grad_norm": 0.06270721111555298,
"learning_rate": 0.0004952319567932853,
"loss": 2.3663,
"step": 22205
},
{
"epoch": 7.015162283819,
"grad_norm": 0.06311121786415576,
"learning_rate": 0.0004947559943141963,
"loss": 2.4322,
"step": 22210
},
{
"epoch": 7.01674168838348,
"grad_norm": 0.06659528685910747,
"learning_rate": 0.000494280185476582,
"loss": 2.4943,
"step": 22215
},
{
"epoch": 7.018321092947959,
"grad_norm": 0.06299817520987319,
"learning_rate": 0.0004938045304251318,
"loss": 2.4519,
"step": 22220
},
{
"epoch": 7.019900497512438,
"grad_norm": 0.05606478423175714,
"learning_rate": 0.00049332902930449,
"loss": 2.4653,
"step": 22225
},
{
"epoch": 7.021479902076917,
"grad_norm": 0.07214472192138412,
"learning_rate": 0.0004928536822592531,
"loss": 2.3667,
"step": 22230
},
{
"epoch": 7.023059306641396,
"grad_norm": 0.07541747642426318,
"learning_rate": 0.0004923784894339708,
"loss": 2.3852,
"step": 22235
},
{
"epoch": 7.024638711205875,
"grad_norm": 0.08278790926232245,
"learning_rate": 0.000491903450973147,
"loss": 2.4173,
"step": 22240
},
{
"epoch": 7.026218115770354,
"grad_norm": 0.058936614736605816,
"learning_rate": 0.0004914285670212374,
"loss": 2.447,
"step": 22245
},
{
"epoch": 7.0277975203348335,
"grad_norm": 0.06837255690973654,
"learning_rate": 0.0004909538377226508,
"loss": 2.4236,
"step": 22250
},
{
"epoch": 7.029376924899313,
"grad_norm": 0.0650063841790395,
"learning_rate": 0.0004904792632217502,
"loss": 2.3878,
"step": 22255
},
{
"epoch": 7.030956329463792,
"grad_norm": 0.06832521717430635,
"learning_rate": 0.0004900048436628498,
"loss": 2.371,
"step": 22260
},
{
"epoch": 7.032535734028271,
"grad_norm": 0.06550638970535379,
"learning_rate": 0.0004895305791902184,
"loss": 2.4162,
"step": 22265
},
{
"epoch": 7.0341151385927505,
"grad_norm": 0.051004928798066757,
"learning_rate": 0.0004890564699480764,
"loss": 2.3918,
"step": 22270
},
{
"epoch": 7.03569454315723,
"grad_norm": 0.0599442831633633,
"learning_rate": 0.0004885825160805973,
"loss": 2.4745,
"step": 22275
},
{
"epoch": 7.037273947721709,
"grad_norm": 0.05159778302570732,
"learning_rate": 0.00048810871773190766,
"loss": 2.4326,
"step": 22280
},
{
"epoch": 7.038853352286188,
"grad_norm": 0.054053460100690724,
"learning_rate": 0.0004876350750460859,
"loss": 2.3592,
"step": 22285
},
{
"epoch": 7.0404327568506675,
"grad_norm": 0.06618818877064908,
"learning_rate": 0.0004871615881671647,
"loss": 2.4047,
"step": 22290
},
{
"epoch": 7.042012161415147,
"grad_norm": 0.08335182568401928,
"learning_rate": 0.00048668825723912793,
"loss": 2.3465,
"step": 22295
},
{
"epoch": 7.043591565979626,
"grad_norm": 0.07955512465075823,
"learning_rate": 0.0004862150824059119,
"loss": 2.4591,
"step": 22300
},
{
"epoch": 7.045170970544105,
"grad_norm": 0.053631802016637725,
"learning_rate": 0.0004857420638114073,
"loss": 2.3582,
"step": 22305
},
{
"epoch": 7.0467503751085845,
"grad_norm": 0.05721351045966907,
"learning_rate": 0.0004852692015994553,
"loss": 2.33,
"step": 22310
},
{
"epoch": 7.048329779673063,
"grad_norm": 0.04913175962272203,
"learning_rate": 0.0004847964959138503,
"loss": 2.4515,
"step": 22315
},
{
"epoch": 7.049909184237542,
"grad_norm": 0.060385583661990504,
"learning_rate": 0.00048432394689833935,
"loss": 2.3377,
"step": 22320
},
{
"epoch": 7.051488588802021,
"grad_norm": 0.053631885417950824,
"learning_rate": 0.0004838515546966209,
"loss": 2.3595,
"step": 22325
},
{
"epoch": 7.053067993366501,
"grad_norm": 0.06077489506721232,
"learning_rate": 0.00048337931945234726,
"loss": 2.3561,
"step": 22330
},
{
"epoch": 7.05464739793098,
"grad_norm": 0.06373677010752224,
"learning_rate": 0.0004829072413091219,
"loss": 2.4001,
"step": 22335
},
{
"epoch": 7.056226802495459,
"grad_norm": 0.061147150243221884,
"learning_rate": 0.0004824353204105002,
"loss": 2.4548,
"step": 22340
},
{
"epoch": 7.057806207059938,
"grad_norm": 0.05918448000188891,
"learning_rate": 0.00048196355689999115,
"loss": 2.3816,
"step": 22345
},
{
"epoch": 7.059385611624418,
"grad_norm": 0.05668442100891457,
"learning_rate": 0.00048149195092105426,
"loss": 2.4345,
"step": 22350
},
{
"epoch": 7.060965016188897,
"grad_norm": 0.05821659007163679,
"learning_rate": 0.00048102050261710264,
"loss": 2.4405,
"step": 22355
},
{
"epoch": 7.062544420753376,
"grad_norm": 0.05508250643037957,
"learning_rate": 0.0004805492121315003,
"loss": 2.3619,
"step": 22360
},
{
"epoch": 7.064123825317855,
"grad_norm": 0.05474253066945977,
"learning_rate": 0.00048007807960756364,
"loss": 2.488,
"step": 22365
},
{
"epoch": 7.065703229882335,
"grad_norm": 0.05590081357760849,
"learning_rate": 0.0004796071051885611,
"loss": 2.3588,
"step": 22370
},
{
"epoch": 7.067282634446814,
"grad_norm": 0.05252395755875348,
"learning_rate": 0.00047913628901771266,
"loss": 2.3175,
"step": 22375
},
{
"epoch": 7.068862039011293,
"grad_norm": 0.05640428770084165,
"learning_rate": 0.0004786656312381913,
"loss": 2.31,
"step": 22380
},
{
"epoch": 7.070441443575772,
"grad_norm": 0.059343346775562744,
"learning_rate": 0.0004781951319931205,
"loss": 2.5275,
"step": 22385
},
{
"epoch": 7.072020848140252,
"grad_norm": 0.04973075828705303,
"learning_rate": 0.0004777247914255757,
"loss": 2.4719,
"step": 22390
},
{
"epoch": 7.07360025270473,
"grad_norm": 0.05497162427367808,
"learning_rate": 0.0004772546096785854,
"loss": 2.4166,
"step": 22395
},
{
"epoch": 7.075179657269209,
"grad_norm": 0.04481162000314791,
"learning_rate": 0.00047678458689512837,
"loss": 2.3846,
"step": 22400
},
{
"epoch": 7.076759061833688,
"grad_norm": 0.0672072530987208,
"learning_rate": 0.00047631472321813553,
"loss": 2.3995,
"step": 22405
},
{
"epoch": 7.078338466398168,
"grad_norm": 0.06484850193073702,
"learning_rate": 0.0004758450187904895,
"loss": 2.3144,
"step": 22410
},
{
"epoch": 7.079917870962647,
"grad_norm": 0.05459059841054011,
"learning_rate": 0.00047537547375502387,
"loss": 2.3782,
"step": 22415
},
{
"epoch": 7.081497275527126,
"grad_norm": 0.05610420016170933,
"learning_rate": 0.0004749060882545251,
"loss": 2.4242,
"step": 22420
},
{
"epoch": 7.083076680091605,
"grad_norm": 0.05104265391982312,
"learning_rate": 0.0004744368624317301,
"loss": 2.4518,
"step": 22425
},
{
"epoch": 7.084656084656085,
"grad_norm": 0.056969806267277344,
"learning_rate": 0.00047396779642932684,
"loss": 2.3904,
"step": 22430
},
{
"epoch": 7.086235489220564,
"grad_norm": 0.04882437057050647,
"learning_rate": 0.0004734988903899562,
"loss": 2.4577,
"step": 22435
},
{
"epoch": 7.087814893785043,
"grad_norm": 0.06609464678169245,
"learning_rate": 0.00047303014445620876,
"loss": 2.5109,
"step": 22440
},
{
"epoch": 7.089394298349522,
"grad_norm": 0.05103370170970565,
"learning_rate": 0.0004725615587706278,
"loss": 2.3865,
"step": 22445
},
{
"epoch": 7.090973702914002,
"grad_norm": 0.048548993258957965,
"learning_rate": 0.0004720931334757068,
"loss": 2.3628,
"step": 22450
},
{
"epoch": 7.092553107478481,
"grad_norm": 0.0461957856742399,
"learning_rate": 0.0004716248687138912,
"loss": 2.5196,
"step": 22455
},
{
"epoch": 7.09413251204296,
"grad_norm": 0.047518760666497656,
"learning_rate": 0.00047115676462757705,
"loss": 2.3432,
"step": 22460
},
{
"epoch": 7.095711916607439,
"grad_norm": 0.053828768359855776,
"learning_rate": 0.0004706888213591116,
"loss": 2.5968,
"step": 22465
},
{
"epoch": 7.097291321171918,
"grad_norm": 0.058677777840975644,
"learning_rate": 0.00047022103905079406,
"loss": 2.4534,
"step": 22470
},
{
"epoch": 7.098870725736397,
"grad_norm": 0.04953095577710977,
"learning_rate": 0.00046975341784487366,
"loss": 2.3614,
"step": 22475
},
{
"epoch": 7.100450130300876,
"grad_norm": 0.04926369173320298,
"learning_rate": 0.00046928595788355064,
"loss": 2.4417,
"step": 22480
},
{
"epoch": 7.1020295348653555,
"grad_norm": 0.05298044492282592,
"learning_rate": 0.0004688186593089775,
"loss": 2.396,
"step": 22485
},
{
"epoch": 7.103608939429835,
"grad_norm": 0.06768544449259989,
"learning_rate": 0.0004683515222632562,
"loss": 2.3874,
"step": 22490
},
{
"epoch": 7.105188343994314,
"grad_norm": 0.053853648723050415,
"learning_rate": 0.0004678845468884402,
"loss": 2.4692,
"step": 22495
},
{
"epoch": 7.106767748558793,
"grad_norm": 0.04628813353443334,
"learning_rate": 0.0004674177333265336,
"loss": 2.3937,
"step": 22500
},
{
"epoch": 7.1083471531232725,
"grad_norm": 0.05178483320090351,
"learning_rate": 0.0004669510817194913,
"loss": 2.4853,
"step": 22505
},
{
"epoch": 7.109926557687752,
"grad_norm": 0.0561434246840523,
"learning_rate": 0.00046648459220921957,
"loss": 2.4582,
"step": 22510
},
{
"epoch": 7.111505962252231,
"grad_norm": 0.07180099332547474,
"learning_rate": 0.0004660182649375747,
"loss": 2.3706,
"step": 22515
},
{
"epoch": 7.11308536681671,
"grad_norm": 0.054949950862126395,
"learning_rate": 0.0004655521000463633,
"loss": 2.5135,
"step": 22520
},
{
"epoch": 7.1146647713811895,
"grad_norm": 0.06792203229744449,
"learning_rate": 0.0004650860976773441,
"loss": 2.4077,
"step": 22525
},
{
"epoch": 7.116244175945669,
"grad_norm": 0.061213820334894885,
"learning_rate": 0.0004646202579722244,
"loss": 2.438,
"step": 22530
},
{
"epoch": 7.117823580510148,
"grad_norm": 0.05984185122345596,
"learning_rate": 0.00046415458107266415,
"loss": 2.4789,
"step": 22535
},
{
"epoch": 7.119402985074627,
"grad_norm": 0.057070456339471824,
"learning_rate": 0.0004636890671202725,
"loss": 2.411,
"step": 22540
},
{
"epoch": 7.1209823896391065,
"grad_norm": 0.05791212251947189,
"learning_rate": 0.0004632237162566082,
"loss": 2.3306,
"step": 22545
},
{
"epoch": 7.122561794203585,
"grad_norm": 0.046891382606143776,
"learning_rate": 0.00046275852862318257,
"loss": 2.4943,
"step": 22550
},
{
"epoch": 7.124141198768064,
"grad_norm": 0.051774975874388104,
"learning_rate": 0.00046229350436145545,
"loss": 2.3686,
"step": 22555
},
{
"epoch": 7.125720603332543,
"grad_norm": 0.04929627229554973,
"learning_rate": 0.0004618286436128386,
"loss": 2.4466,
"step": 22560
},
{
"epoch": 7.127300007897023,
"grad_norm": 0.046790097185457694,
"learning_rate": 0.00046136394651869275,
"loss": 2.3244,
"step": 22565
},
{
"epoch": 7.128879412461502,
"grad_norm": 0.05793253072507003,
"learning_rate": 0.0004608994132203289,
"loss": 2.3569,
"step": 22570
},
{
"epoch": 7.130458817025981,
"grad_norm": 0.05394842571129561,
"learning_rate": 0.00046043504385900945,
"loss": 2.4881,
"step": 22575
},
{
"epoch": 7.13203822159046,
"grad_norm": 0.05816959393361919,
"learning_rate": 0.0004599708385759459,
"loss": 2.4091,
"step": 22580
},
{
"epoch": 7.1336176261549396,
"grad_norm": 0.054256171347107855,
"learning_rate": 0.00045950679751229984,
"loss": 2.5072,
"step": 22585
},
{
"epoch": 7.135197030719419,
"grad_norm": 0.057231813913574105,
"learning_rate": 0.0004590429208091835,
"loss": 2.4793,
"step": 22590
},
{
"epoch": 7.136776435283898,
"grad_norm": 0.05707810619562981,
"learning_rate": 0.00045857920860765825,
"loss": 2.4329,
"step": 22595
},
{
"epoch": 7.138355839848377,
"grad_norm": 0.06690668435662243,
"learning_rate": 0.0004581156610487367,
"loss": 2.4606,
"step": 22600
},
{
"epoch": 7.1399352444128565,
"grad_norm": 0.056773576894885235,
"learning_rate": 0.0004576522782733802,
"loss": 2.4883,
"step": 22605
},
{
"epoch": 7.141514648977336,
"grad_norm": 0.06528025298970015,
"learning_rate": 0.000457189060422501,
"loss": 2.4328,
"step": 22610
},
{
"epoch": 7.143094053541815,
"grad_norm": 0.054281522678375084,
"learning_rate": 0.00045672600763696047,
"loss": 2.399,
"step": 22615
},
{
"epoch": 7.144673458106294,
"grad_norm": 0.05406865499378434,
"learning_rate": 0.0004562631200575695,
"loss": 2.5182,
"step": 22620
},
{
"epoch": 7.1462528626707735,
"grad_norm": 0.054878683283738044,
"learning_rate": 0.0004558003978250901,
"loss": 2.4772,
"step": 22625
},
{
"epoch": 7.147832267235252,
"grad_norm": 0.05291823031041038,
"learning_rate": 0.0004553378410802331,
"loss": 2.4818,
"step": 22630
},
{
"epoch": 7.149411671799731,
"grad_norm": 0.05799495413724737,
"learning_rate": 0.00045487544996365795,
"loss": 2.3644,
"step": 22635
},
{
"epoch": 7.15099107636421,
"grad_norm": 0.05379204462085772,
"learning_rate": 0.000454413224615976,
"loss": 2.3865,
"step": 22640
},
{
"epoch": 7.15257048092869,
"grad_norm": 0.05035893136911191,
"learning_rate": 0.0004539511651777462,
"loss": 2.4255,
"step": 22645
},
{
"epoch": 7.154149885493169,
"grad_norm": 0.0535544461589526,
"learning_rate": 0.0004534892717894785,
"loss": 2.4239,
"step": 22650
},
{
"epoch": 7.155729290057648,
"grad_norm": 0.049231465561476694,
"learning_rate": 0.00045302754459163166,
"loss": 2.4034,
"step": 22655
},
{
"epoch": 7.157308694622127,
"grad_norm": 0.05450897661295551,
"learning_rate": 0.0004525659837246133,
"loss": 2.4025,
"step": 22660
},
{
"epoch": 7.158888099186607,
"grad_norm": 0.0463752158817942,
"learning_rate": 0.00045210458932878206,
"loss": 2.4036,
"step": 22665
},
{
"epoch": 7.160467503751086,
"grad_norm": 0.05032980803941639,
"learning_rate": 0.0004516433615444446,
"loss": 2.3412,
"step": 22670
},
{
"epoch": 7.162046908315565,
"grad_norm": 0.05756411469882938,
"learning_rate": 0.0004511823005118574,
"loss": 2.3714,
"step": 22675
},
{
"epoch": 7.163626312880044,
"grad_norm": 0.04763318582796287,
"learning_rate": 0.0004507214063712262,
"loss": 2.3937,
"step": 22680
},
{
"epoch": 7.165205717444524,
"grad_norm": 0.04743784396114025,
"learning_rate": 0.0004502606792627053,
"loss": 2.4057,
"step": 22685
},
{
"epoch": 7.166785122009003,
"grad_norm": 0.05082335261559461,
"learning_rate": 0.0004498001193264,
"loss": 2.3993,
"step": 22690
},
{
"epoch": 7.168364526573482,
"grad_norm": 0.0668975121732276,
"learning_rate": 0.00044933972670236255,
"loss": 2.354,
"step": 22695
},
{
"epoch": 7.169943931137961,
"grad_norm": 0.0627127176365516,
"learning_rate": 0.0004488795015305964,
"loss": 2.4189,
"step": 22700
},
{
"epoch": 7.171523335702441,
"grad_norm": 0.0530348548292881,
"learning_rate": 0.0004484194439510527,
"loss": 2.4288,
"step": 22705
},
{
"epoch": 7.173102740266919,
"grad_norm": 0.05489002488624982,
"learning_rate": 0.0004479595541036315,
"loss": 2.4122,
"step": 22710
},
{
"epoch": 7.174682144831398,
"grad_norm": 0.0554180249954323,
"learning_rate": 0.0004474998321281832,
"loss": 2.5061,
"step": 22715
},
{
"epoch": 7.1762615493958775,
"grad_norm": 0.05960033631635437,
"learning_rate": 0.00044704027816450586,
"loss": 2.4847,
"step": 22720
},
{
"epoch": 7.177840953960357,
"grad_norm": 0.061594201159924485,
"learning_rate": 0.0004465808923523471,
"loss": 2.4432,
"step": 22725
},
{
"epoch": 7.179420358524836,
"grad_norm": 0.06446780684025662,
"learning_rate": 0.000446121674831403,
"loss": 2.4772,
"step": 22730
},
{
"epoch": 7.180999763089315,
"grad_norm": 0.046521565333944,
"learning_rate": 0.00044566262574131845,
"loss": 2.3783,
"step": 22735
},
{
"epoch": 7.1825791676537944,
"grad_norm": 0.058597223164450145,
"learning_rate": 0.00044520374522168793,
"loss": 2.3534,
"step": 22740
},
{
"epoch": 7.184158572218274,
"grad_norm": 0.05327155937517266,
"learning_rate": 0.00044474503341205386,
"loss": 2.3408,
"step": 22745
},
{
"epoch": 7.185737976782753,
"grad_norm": 0.06332922236815533,
"learning_rate": 0.0004442864904519072,
"loss": 2.3159,
"step": 22750
},
{
"epoch": 7.187317381347232,
"grad_norm": 0.0537193747896481,
"learning_rate": 0.00044382811648068844,
"loss": 2.4244,
"step": 22755
},
{
"epoch": 7.188896785911711,
"grad_norm": 0.054447157903122496,
"learning_rate": 0.0004433699116377861,
"loss": 2.443,
"step": 22760
},
{
"epoch": 7.190476190476191,
"grad_norm": 0.07686127379391965,
"learning_rate": 0.0004429118760625372,
"loss": 2.4494,
"step": 22765
},
{
"epoch": 7.19205559504067,
"grad_norm": 0.06038874503558769,
"learning_rate": 0.0004424540098942275,
"loss": 2.4226,
"step": 22770
},
{
"epoch": 7.193634999605149,
"grad_norm": 0.048657210149132386,
"learning_rate": 0.00044199631327209067,
"loss": 2.5111,
"step": 22775
},
{
"epoch": 7.195214404169628,
"grad_norm": 0.05548145875073198,
"learning_rate": 0.0004415387863353102,
"loss": 2.3155,
"step": 22780
},
{
"epoch": 7.196793808734107,
"grad_norm": 0.05473862703122515,
"learning_rate": 0.0004410814292230163,
"loss": 2.3674,
"step": 22785
},
{
"epoch": 7.198373213298586,
"grad_norm": 0.05505864233866162,
"learning_rate": 0.0004406242420742892,
"loss": 2.4172,
"step": 22790
},
{
"epoch": 7.199952617863065,
"grad_norm": 0.05150335407360049,
"learning_rate": 0.0004401672250281561,
"loss": 2.3785,
"step": 22795
},
{
"epoch": 7.2015320224275445,
"grad_norm": 0.055842220396361636,
"learning_rate": 0.0004397103782235925,
"loss": 2.441,
"step": 22800
},
{
"epoch": 7.203111426992024,
"grad_norm": 0.06516738120800779,
"learning_rate": 0.0004392537017995236,
"loss": 2.4836,
"step": 22805
},
{
"epoch": 7.204690831556503,
"grad_norm": 0.05294395080085389,
"learning_rate": 0.00043879719589482125,
"loss": 2.541,
"step": 22810
},
{
"epoch": 7.206270236120982,
"grad_norm": 0.05353233354000687,
"learning_rate": 0.00043834086064830605,
"loss": 2.4319,
"step": 22815
},
{
"epoch": 7.2078496406854615,
"grad_norm": 0.05492647814378925,
"learning_rate": 0.0004378846961987465,
"loss": 2.4915,
"step": 22820
},
{
"epoch": 7.209429045249941,
"grad_norm": 0.05632475820237248,
"learning_rate": 0.000437428702684859,
"loss": 2.472,
"step": 22825
},
{
"epoch": 7.21100844981442,
"grad_norm": 0.05901269744287303,
"learning_rate": 0.00043697288024530914,
"loss": 2.3883,
"step": 22830
},
{
"epoch": 7.212587854378899,
"grad_norm": 0.052741292663270635,
"learning_rate": 0.0004365172290187086,
"loss": 2.3636,
"step": 22835
},
{
"epoch": 7.2141672589433785,
"grad_norm": 0.05175073096790014,
"learning_rate": 0.00043606174914361895,
"loss": 2.4367,
"step": 22840
},
{
"epoch": 7.215746663507858,
"grad_norm": 0.05257913321958104,
"learning_rate": 0.00043560644075854837,
"loss": 2.3495,
"step": 22845
},
{
"epoch": 7.217326068072337,
"grad_norm": 0.0541049786157875,
"learning_rate": 0.000435151304001953,
"loss": 2.3664,
"step": 22850
},
{
"epoch": 7.218905472636816,
"grad_norm": 0.05929024156226885,
"learning_rate": 0.00043469633901223727,
"loss": 2.4205,
"step": 22855
},
{
"epoch": 7.2204848772012955,
"grad_norm": 0.05692855564895376,
"learning_rate": 0.000434241545927753,
"loss": 2.45,
"step": 22860
},
{
"epoch": 7.222064281765775,
"grad_norm": 0.0529476590454288,
"learning_rate": 0.0004337869248867995,
"loss": 2.4011,
"step": 22865
},
{
"epoch": 7.223643686330253,
"grad_norm": 0.05387521337145956,
"learning_rate": 0.00043333247602762485,
"loss": 2.4672,
"step": 22870
},
{
"epoch": 7.225223090894732,
"grad_norm": 0.05035042347303634,
"learning_rate": 0.0004328781994884233,
"loss": 2.4697,
"step": 22875
},
{
"epoch": 7.226802495459212,
"grad_norm": 0.06711635883800816,
"learning_rate": 0.00043242409540733827,
"loss": 2.3636,
"step": 22880
},
{
"epoch": 7.228381900023691,
"grad_norm": 0.050208177847224374,
"learning_rate": 0.0004319701639224596,
"loss": 2.4149,
"step": 22885
},
{
"epoch": 7.22996130458817,
"grad_norm": 0.057904178053437116,
"learning_rate": 0.0004315164051718243,
"loss": 2.5331,
"step": 22890
},
{
"epoch": 7.231540709152649,
"grad_norm": 0.050249222969918045,
"learning_rate": 0.0004310628192934185,
"loss": 2.3641,
"step": 22895
},
{
"epoch": 7.233120113717129,
"grad_norm": 0.06476074688114011,
"learning_rate": 0.0004306094064251742,
"loss": 2.4901,
"step": 22900
},
{
"epoch": 7.234699518281608,
"grad_norm": 0.056320733281801316,
"learning_rate": 0.0004301561667049716,
"loss": 2.3834,
"step": 22905
},
{
"epoch": 7.236278922846087,
"grad_norm": 0.05354949264797148,
"learning_rate": 0.00042970310027063774,
"loss": 2.3642,
"step": 22910
},
{
"epoch": 7.237858327410566,
"grad_norm": 0.06256453961923768,
"learning_rate": 0.0004292502072599471,
"loss": 2.492,
"step": 22915
},
{
"epoch": 7.239437731975046,
"grad_norm": 0.08469470347265333,
"learning_rate": 0.0004287974878106222,
"loss": 2.425,
"step": 22920
},
{
"epoch": 7.241017136539525,
"grad_norm": 0.0585997124074025,
"learning_rate": 0.00042834494206033126,
"loss": 2.3965,
"step": 22925
},
{
"epoch": 7.242596541104004,
"grad_norm": 0.06056931894800767,
"learning_rate": 0.0004278925701466915,
"loss": 2.4136,
"step": 22930
},
{
"epoch": 7.244175945668483,
"grad_norm": 0.057340188750008424,
"learning_rate": 0.00042744037220726584,
"loss": 2.4731,
"step": 22935
},
{
"epoch": 7.245755350232963,
"grad_norm": 0.05359526035066487,
"learning_rate": 0.0004269883483795648,
"loss": 2.4574,
"step": 22940
},
{
"epoch": 7.247334754797441,
"grad_norm": 0.05859497430126532,
"learning_rate": 0.00042653649880104597,
"loss": 2.4454,
"step": 22945
},
{
"epoch": 7.24891415936192,
"grad_norm": 0.05675932201342108,
"learning_rate": 0.0004260848236091135,
"loss": 2.2968,
"step": 22950
},
{
"epoch": 7.250493563926399,
"grad_norm": 0.05368356817280291,
"learning_rate": 0.00042563332294111967,
"loss": 2.4265,
"step": 22955
},
{
"epoch": 7.252072968490879,
"grad_norm": 0.06035120629280444,
"learning_rate": 0.00042518199693436254,
"loss": 2.4724,
"step": 22960
},
{
"epoch": 7.253652373055358,
"grad_norm": 0.05070637372168603,
"learning_rate": 0.0004247308457260873,
"loss": 2.4114,
"step": 22965
},
{
"epoch": 7.255231777619837,
"grad_norm": 0.06691780641000095,
"learning_rate": 0.00042427986945348665,
"loss": 2.438,
"step": 22970
},
{
"epoch": 7.256811182184316,
"grad_norm": 0.07254358776207415,
"learning_rate": 0.0004238290682536994,
"loss": 2.3952,
"step": 22975
},
{
"epoch": 7.258390586748796,
"grad_norm": 0.045487663001971275,
"learning_rate": 0.00042337844226381083,
"loss": 2.4653,
"step": 22980
},
{
"epoch": 7.259969991313275,
"grad_norm": 0.06331602162173694,
"learning_rate": 0.00042292799162085414,
"loss": 2.3607,
"step": 22985
},
{
"epoch": 7.261549395877754,
"grad_norm": 0.06316826413889758,
"learning_rate": 0.0004224777164618083,
"loss": 2.4936,
"step": 22990
},
{
"epoch": 7.263128800442233,
"grad_norm": 0.056114953613693536,
"learning_rate": 0.0004220276169235989,
"loss": 2.4358,
"step": 22995
},
{
"epoch": 7.264708205006713,
"grad_norm": 0.05009092501700465,
"learning_rate": 0.00042157769314309844,
"loss": 2.3407,
"step": 23000
},
{
"epoch": 7.266287609571192,
"grad_norm": 0.05042828197747411,
"learning_rate": 0.0004211279452571255,
"loss": 2.4556,
"step": 23005
},
{
"epoch": 7.267867014135671,
"grad_norm": 0.04901947494880478,
"learning_rate": 0.0004206783734024463,
"loss": 2.4552,
"step": 23010
},
{
"epoch": 7.26944641870015,
"grad_norm": 0.11856151065206234,
"learning_rate": 0.000420228977715772,
"loss": 2.459,
"step": 23015
},
{
"epoch": 7.27102582326463,
"grad_norm": 0.10180627702415497,
"learning_rate": 0.00041977975833376157,
"loss": 2.4152,
"step": 23020
},
{
"epoch": 7.272605227829108,
"grad_norm": 0.07026541127932902,
"learning_rate": 0.0004193307153930196,
"loss": 2.5014,
"step": 23025
},
{
"epoch": 7.274184632393587,
"grad_norm": 0.07214665623560222,
"learning_rate": 0.00041888184903009695,
"loss": 2.5648,
"step": 23030
},
{
"epoch": 7.2757640369580665,
"grad_norm": 0.06659079098589442,
"learning_rate": 0.0004184331593814913,
"loss": 2.3869,
"step": 23035
},
{
"epoch": 7.277343441522546,
"grad_norm": 0.07462660970105818,
"learning_rate": 0.00041798464658364566,
"loss": 2.3619,
"step": 23040
},
{
"epoch": 7.278922846087025,
"grad_norm": 0.05553604278264133,
"learning_rate": 0.00041753631077295087,
"loss": 2.4243,
"step": 23045
},
{
"epoch": 7.280502250651504,
"grad_norm": 0.053494717136432286,
"learning_rate": 0.00041708815208574247,
"loss": 2.4105,
"step": 23050
},
{
"epoch": 7.2820816552159835,
"grad_norm": 0.04740050351874312,
"learning_rate": 0.0004166401706583023,
"loss": 2.3979,
"step": 23055
},
{
"epoch": 7.283661059780463,
"grad_norm": 0.06554420156354972,
"learning_rate": 0.0004161923666268594,
"loss": 2.3907,
"step": 23060
},
{
"epoch": 7.285240464344942,
"grad_norm": 0.051215205180764174,
"learning_rate": 0.00041574474012758743,
"loss": 2.5091,
"step": 23065
},
{
"epoch": 7.286819868909421,
"grad_norm": 0.08415768441672569,
"learning_rate": 0.0004152972912966074,
"loss": 2.3129,
"step": 23070
},
{
"epoch": 7.2883992734739005,
"grad_norm": 0.07255028857627768,
"learning_rate": 0.0004148500202699854,
"loss": 2.4459,
"step": 23075
},
{
"epoch": 7.28997867803838,
"grad_norm": 0.06586873948536655,
"learning_rate": 0.0004144029271837336,
"loss": 2.3013,
"step": 23080
},
{
"epoch": 7.291558082602859,
"grad_norm": 0.07231511332265027,
"learning_rate": 0.0004139560121738101,
"loss": 2.4013,
"step": 23085
},
{
"epoch": 7.293137487167338,
"grad_norm": 0.04599805937834209,
"learning_rate": 0.00041350927537611894,
"loss": 2.3289,
"step": 23090
},
{
"epoch": 7.2947168917318175,
"grad_norm": 0.04781180375675834,
"learning_rate": 0.00041306271692650965,
"loss": 2.4017,
"step": 23095
},
{
"epoch": 7.296296296296296,
"grad_norm": 0.057149503805875405,
"learning_rate": 0.0004126163369607784,
"loss": 2.4368,
"step": 23100
},
{
"epoch": 7.297875700860775,
"grad_norm": 0.062140652199933,
"learning_rate": 0.0004121701356146659,
"loss": 2.4677,
"step": 23105
},
{
"epoch": 7.299455105425254,
"grad_norm": 0.07945387282626987,
"learning_rate": 0.0004117241130238597,
"loss": 2.2999,
"step": 23110
},
{
"epoch": 7.301034509989734,
"grad_norm": 0.05837365773633515,
"learning_rate": 0.00041127826932399215,
"loss": 2.4515,
"step": 23115
},
{
"epoch": 7.302613914554213,
"grad_norm": 0.04855213009161075,
"learning_rate": 0.00041083260465064143,
"loss": 2.4669,
"step": 23120
},
{
"epoch": 7.304193319118692,
"grad_norm": 0.06528173832108752,
"learning_rate": 0.00041038711913933133,
"loss": 2.4565,
"step": 23125
},
{
"epoch": 7.305772723683171,
"grad_norm": 0.06008851527567248,
"learning_rate": 0.0004099418129255309,
"loss": 2.4633,
"step": 23130
},
{
"epoch": 7.307352128247651,
"grad_norm": 0.05146277616116244,
"learning_rate": 0.0004094966861446554,
"loss": 2.4149,
"step": 23135
},
{
"epoch": 7.30893153281213,
"grad_norm": 0.06195290842146711,
"learning_rate": 0.0004090517389320649,
"loss": 2.4968,
"step": 23140
},
{
"epoch": 7.310510937376609,
"grad_norm": 0.07761875245487002,
"learning_rate": 0.0004086069714230646,
"loss": 2.3864,
"step": 23145
},
{
"epoch": 7.312090341941088,
"grad_norm": 0.06865512619791067,
"learning_rate": 0.000408162383752906,
"loss": 2.4866,
"step": 23150
},
{
"epoch": 7.313669746505568,
"grad_norm": 0.06956410081498506,
"learning_rate": 0.00040771797605678486,
"loss": 2.4979,
"step": 23155
},
{
"epoch": 7.315249151070047,
"grad_norm": 0.07317747601786594,
"learning_rate": 0.00040727374846984344,
"loss": 2.4307,
"step": 23160
},
{
"epoch": 7.316828555634526,
"grad_norm": 0.08954038886989313,
"learning_rate": 0.000406829701127168,
"loss": 2.3915,
"step": 23165
},
{
"epoch": 7.318407960199005,
"grad_norm": 0.060376807629495,
"learning_rate": 0.0004063858341637905,
"loss": 2.4324,
"step": 23170
},
{
"epoch": 7.3199873647634845,
"grad_norm": 0.06896215610563826,
"learning_rate": 0.0004059421477146882,
"loss": 2.3749,
"step": 23175
},
{
"epoch": 7.321566769327964,
"grad_norm": 0.059302264793621884,
"learning_rate": 0.0004054986419147829,
"loss": 2.4397,
"step": 23180
},
{
"epoch": 7.323146173892442,
"grad_norm": 0.07616637023410781,
"learning_rate": 0.0004050553168989426,
"loss": 2.4457,
"step": 23185
},
{
"epoch": 7.324725578456921,
"grad_norm": 0.08542210106270082,
"learning_rate": 0.00040461217280197915,
"loss": 2.4341,
"step": 23190
},
{
"epoch": 7.326304983021401,
"grad_norm": 0.0676050006964208,
"learning_rate": 0.0004041692097586496,
"loss": 2.3246,
"step": 23195
},
{
"epoch": 7.32788438758588,
"grad_norm": 0.06667913623813965,
"learning_rate": 0.00040372642790365677,
"loss": 2.424,
"step": 23200
},
{
"epoch": 7.329463792150359,
"grad_norm": 0.06937850763951874,
"learning_rate": 0.0004032838273716476,
"loss": 2.3171,
"step": 23205
},
{
"epoch": 7.331043196714838,
"grad_norm": 0.07332911567740688,
"learning_rate": 0.00040284140829721405,
"loss": 2.3978,
"step": 23210
},
{
"epoch": 7.332622601279318,
"grad_norm": 0.08111694144155136,
"learning_rate": 0.00040239917081489273,
"loss": 2.3357,
"step": 23215
},
{
"epoch": 7.334202005843797,
"grad_norm": 0.083413955118876,
"learning_rate": 0.0004019571150591652,
"loss": 2.4407,
"step": 23220
},
{
"epoch": 7.335781410408276,
"grad_norm": 0.05697293108711861,
"learning_rate": 0.00040151524116445827,
"loss": 2.3873,
"step": 23225
},
{
"epoch": 7.337360814972755,
"grad_norm": 0.05309836851489041,
"learning_rate": 0.0004010735492651426,
"loss": 2.3351,
"step": 23230
},
{
"epoch": 7.338940219537235,
"grad_norm": 0.07439624739886878,
"learning_rate": 0.00040063203949553374,
"loss": 2.4385,
"step": 23235
},
{
"epoch": 7.340519624101714,
"grad_norm": 0.07690879262708751,
"learning_rate": 0.0004001907119898924,
"loss": 2.4637,
"step": 23240
},
{
"epoch": 7.342099028666193,
"grad_norm": 0.08223749279640362,
"learning_rate": 0.000399749566882423,
"loss": 2.418,
"step": 23245
},
{
"epoch": 7.343678433230672,
"grad_norm": 0.06549252347156347,
"learning_rate": 0.00039930860430727557,
"loss": 2.4514,
"step": 23250
},
{
"epoch": 7.345257837795152,
"grad_norm": 0.0636998425829546,
"learning_rate": 0.00039886782439854364,
"loss": 2.376,
"step": 23255
},
{
"epoch": 7.34683724235963,
"grad_norm": 0.07918039158244089,
"learning_rate": 0.00039842722729026546,
"loss": 2.4957,
"step": 23260
},
{
"epoch": 7.348416646924109,
"grad_norm": 0.07465679171250583,
"learning_rate": 0.00039798681311642404,
"loss": 2.4803,
"step": 23265
},
{
"epoch": 7.3499960514885885,
"grad_norm": 0.06743300010437357,
"learning_rate": 0.000397546582010946,
"loss": 2.4106,
"step": 23270
},
{
"epoch": 7.351575456053068,
"grad_norm": 0.06227504892752251,
"learning_rate": 0.0003971065341077035,
"loss": 2.4441,
"step": 23275
},
{
"epoch": 7.353154860617547,
"grad_norm": 0.0487941241422176,
"learning_rate": 0.000396666669540512,
"loss": 2.4093,
"step": 23280
},
{
"epoch": 7.354734265182026,
"grad_norm": 0.064214288880335,
"learning_rate": 0.0003962269884431311,
"loss": 2.3302,
"step": 23285
},
{
"epoch": 7.3563136697465055,
"grad_norm": 0.058324830021285086,
"learning_rate": 0.0003957874909492658,
"loss": 2.4446,
"step": 23290
},
{
"epoch": 7.357893074310985,
"grad_norm": 0.07221663354459125,
"learning_rate": 0.0003953481771925641,
"loss": 2.3911,
"step": 23295
},
{
"epoch": 7.359472478875464,
"grad_norm": 0.060054117637791414,
"learning_rate": 0.00039490904730661846,
"loss": 2.4816,
"step": 23300
},
{
"epoch": 7.361051883439943,
"grad_norm": 0.0818000514125703,
"learning_rate": 0.00039447010142496555,
"loss": 2.4044,
"step": 23305
},
{
"epoch": 7.3626312880044225,
"grad_norm": 0.0568783557612923,
"learning_rate": 0.0003940313396810855,
"loss": 2.3786,
"step": 23310
},
{
"epoch": 7.364210692568902,
"grad_norm": 0.05672571981004285,
"learning_rate": 0.00039359276220840377,
"loss": 2.4056,
"step": 23315
},
{
"epoch": 7.365790097133381,
"grad_norm": 0.06653408816348595,
"learning_rate": 0.0003931543691402887,
"loss": 2.411,
"step": 23320
},
{
"epoch": 7.36736950169786,
"grad_norm": 0.06543603909140704,
"learning_rate": 0.0003927161606100523,
"loss": 2.4427,
"step": 23325
},
{
"epoch": 7.3689489062623394,
"grad_norm": 0.05664764631069919,
"learning_rate": 0.0003922781367509519,
"loss": 2.462,
"step": 23330
},
{
"epoch": 7.370528310826819,
"grad_norm": 0.06610941691709837,
"learning_rate": 0.0003918402976961868,
"loss": 2.3808,
"step": 23335
},
{
"epoch": 7.372107715391298,
"grad_norm": 0.08353822829197813,
"learning_rate": 0.00039140264357890187,
"loss": 2.4553,
"step": 23340
},
{
"epoch": 7.373687119955776,
"grad_norm": 0.06463230908761662,
"learning_rate": 0.0003909651745321847,
"loss": 2.3985,
"step": 23345
},
{
"epoch": 7.3752665245202556,
"grad_norm": 0.059977760365744925,
"learning_rate": 0.00039052789068906655,
"loss": 2.3647,
"step": 23350
},
{
"epoch": 7.376845929084735,
"grad_norm": 0.051553343673477116,
"learning_rate": 0.000390090792182523,
"loss": 2.4085,
"step": 23355
},
{
"epoch": 7.378425333649214,
"grad_norm": 0.05368864357172616,
"learning_rate": 0.00038965387914547235,
"loss": 2.3228,
"step": 23360
},
{
"epoch": 7.380004738213693,
"grad_norm": 0.07794239984230016,
"learning_rate": 0.00038921715171077765,
"loss": 2.4189,
"step": 23365
},
{
"epoch": 7.3815841427781725,
"grad_norm": 0.061574805112702094,
"learning_rate": 0.0003887806100112449,
"loss": 2.4475,
"step": 23370
},
{
"epoch": 7.383163547342652,
"grad_norm": 0.05258619548552006,
"learning_rate": 0.0003883442541796229,
"loss": 2.4548,
"step": 23375
},
{
"epoch": 7.384742951907131,
"grad_norm": 0.06368643485947272,
"learning_rate": 0.0003879080843486057,
"loss": 2.5035,
"step": 23380
},
{
"epoch": 7.38632235647161,
"grad_norm": 0.04948048503103337,
"learning_rate": 0.0003874721006508293,
"loss": 2.3889,
"step": 23385
},
{
"epoch": 7.3879017610360895,
"grad_norm": 0.049377374469387274,
"learning_rate": 0.0003870363032188735,
"loss": 2.3956,
"step": 23390
},
{
"epoch": 7.389481165600569,
"grad_norm": 0.055281587869871605,
"learning_rate": 0.0003866006921852616,
"loss": 2.4195,
"step": 23395
},
{
"epoch": 7.391060570165048,
"grad_norm": 0.058033724803978066,
"learning_rate": 0.00038616526768245975,
"loss": 2.4196,
"step": 23400
},
{
"epoch": 7.392639974729527,
"grad_norm": 0.05428967106481442,
"learning_rate": 0.0003857300298428784,
"loss": 2.4819,
"step": 23405
},
{
"epoch": 7.3942193792940065,
"grad_norm": 0.04879265886815176,
"learning_rate": 0.0003852949787988703,
"loss": 2.2827,
"step": 23410
},
{
"epoch": 7.395798783858485,
"grad_norm": 0.048160402312052424,
"learning_rate": 0.0003848601146827314,
"loss": 2.4231,
"step": 23415
},
{
"epoch": 7.397378188422964,
"grad_norm": 0.051898838198475634,
"learning_rate": 0.0003844254376267017,
"loss": 2.4422,
"step": 23420
},
{
"epoch": 7.398957592987443,
"grad_norm": 0.05260687555507956,
"learning_rate": 0.00038399094776296296,
"loss": 2.3719,
"step": 23425
},
{
"epoch": 7.400536997551923,
"grad_norm": 0.053159802639681904,
"learning_rate": 0.0003835566452236416,
"loss": 2.3793,
"step": 23430
},
{
"epoch": 7.402116402116402,
"grad_norm": 0.05620518586086027,
"learning_rate": 0.000383122530140806,
"loss": 2.4135,
"step": 23435
},
{
"epoch": 7.403695806680881,
"grad_norm": 0.06692891799390799,
"learning_rate": 0.00038268860264646757,
"loss": 2.4662,
"step": 23440
},
{
"epoch": 7.40527521124536,
"grad_norm": 0.05887652203823664,
"learning_rate": 0.00038225486287258095,
"loss": 2.3633,
"step": 23445
},
{
"epoch": 7.40685461580984,
"grad_norm": 0.05943560239549316,
"learning_rate": 0.0003818213109510432,
"loss": 2.4586,
"step": 23450
},
{
"epoch": 7.408434020374319,
"grad_norm": 0.0472467899939956,
"learning_rate": 0.0003813879470136956,
"loss": 2.3515,
"step": 23455
},
{
"epoch": 7.410013424938798,
"grad_norm": 0.053760128516914814,
"learning_rate": 0.0003809547711923209,
"loss": 2.3773,
"step": 23460
},
{
"epoch": 7.411592829503277,
"grad_norm": 0.051162254755005174,
"learning_rate": 0.0003805217836186446,
"loss": 2.4469,
"step": 23465
},
{
"epoch": 7.413172234067757,
"grad_norm": 0.05668412599755234,
"learning_rate": 0.0003800889844243365,
"loss": 2.4433,
"step": 23470
},
{
"epoch": 7.414751638632236,
"grad_norm": 0.05112718426244128,
"learning_rate": 0.00037965637374100735,
"loss": 2.4079,
"step": 23475
},
{
"epoch": 7.416331043196715,
"grad_norm": 0.05007729851939131,
"learning_rate": 0.0003792239517002116,
"loss": 2.3804,
"step": 23480
},
{
"epoch": 7.417910447761194,
"grad_norm": 0.057342576874044066,
"learning_rate": 0.0003787917184334457,
"loss": 2.4456,
"step": 23485
},
{
"epoch": 7.419489852325674,
"grad_norm": 0.04923906008979115,
"learning_rate": 0.0003783596740721491,
"loss": 2.3102,
"step": 23490
},
{
"epoch": 7.421069256890153,
"grad_norm": 0.05895303400065207,
"learning_rate": 0.000377927818747704,
"loss": 2.4249,
"step": 23495
},
{
"epoch": 7.422648661454631,
"grad_norm": 0.05242232639144367,
"learning_rate": 0.00037749615259143445,
"loss": 2.3691,
"step": 23500
},
{
"epoch": 7.4242280660191105,
"grad_norm": 0.053844882394087655,
"learning_rate": 0.0003770646757346079,
"loss": 2.3882,
"step": 23505
},
{
"epoch": 7.42580747058359,
"grad_norm": 0.05970724681811083,
"learning_rate": 0.0003766333883084335,
"loss": 2.4231,
"step": 23510
},
{
"epoch": 7.427386875148069,
"grad_norm": 0.058663310720548756,
"learning_rate": 0.00037620229044406253,
"loss": 2.3752,
"step": 23515
},
{
"epoch": 7.428966279712548,
"grad_norm": 0.06418413673406441,
"learning_rate": 0.0003757713822725898,
"loss": 2.4039,
"step": 23520
},
{
"epoch": 7.430545684277027,
"grad_norm": 0.06134913787884254,
"learning_rate": 0.0003753406639250514,
"loss": 2.4186,
"step": 23525
},
{
"epoch": 7.432125088841507,
"grad_norm": 0.058718245540661414,
"learning_rate": 0.00037491013553242605,
"loss": 2.3649,
"step": 23530
},
{
"epoch": 7.433704493405986,
"grad_norm": 0.06665366330747048,
"learning_rate": 0.0003744797972256346,
"loss": 2.4242,
"step": 23535
},
{
"epoch": 7.435283897970465,
"grad_norm": 0.06256570473124856,
"learning_rate": 0.0003740496491355401,
"loss": 2.4853,
"step": 23540
},
{
"epoch": 7.436863302534944,
"grad_norm": 0.07020851693596936,
"learning_rate": 0.00037361969139294816,
"loss": 2.4186,
"step": 23545
},
{
"epoch": 7.438442707099424,
"grad_norm": 0.057256852093966595,
"learning_rate": 0.00037318992412860606,
"loss": 2.4861,
"step": 23550
},
{
"epoch": 7.440022111663903,
"grad_norm": 0.04847339601890821,
"learning_rate": 0.00037276034747320296,
"loss": 2.3127,
"step": 23555
},
{
"epoch": 7.441601516228382,
"grad_norm": 0.06457386012177314,
"learning_rate": 0.00037233096155737087,
"loss": 2.4756,
"step": 23560
},
{
"epoch": 7.443180920792861,
"grad_norm": 0.05290248046859747,
"learning_rate": 0.0003719017665116833,
"loss": 2.3679,
"step": 23565
},
{
"epoch": 7.444760325357341,
"grad_norm": 0.047604859231034254,
"learning_rate": 0.00037147276246665527,
"loss": 2.3873,
"step": 23570
},
{
"epoch": 7.446339729921819,
"grad_norm": 0.06168118905027415,
"learning_rate": 0.0003710439495527446,
"loss": 2.4328,
"step": 23575
},
{
"epoch": 7.447919134486298,
"grad_norm": 0.051595459108080534,
"learning_rate": 0.0003706153279003498,
"loss": 2.3076,
"step": 23580
},
{
"epoch": 7.4494985390507775,
"grad_norm": 0.05230454156519212,
"learning_rate": 0.00037018689763981295,
"loss": 2.4704,
"step": 23585
},
{
"epoch": 7.451077943615257,
"grad_norm": 0.04841876288423717,
"learning_rate": 0.00036975865890141626,
"loss": 2.3298,
"step": 23590
},
{
"epoch": 7.452657348179736,
"grad_norm": 0.0652579610408843,
"learning_rate": 0.000369330611815385,
"loss": 2.5174,
"step": 23595
},
{
"epoch": 7.454236752744215,
"grad_norm": 0.0634467381622313,
"learning_rate": 0.0003689027565118852,
"loss": 2.3438,
"step": 23600
},
{
"epoch": 7.4558161573086945,
"grad_norm": 0.04628284636995493,
"learning_rate": 0.00036847509312102467,
"loss": 2.5966,
"step": 23605
},
{
"epoch": 7.457395561873174,
"grad_norm": 0.05251325655453334,
"learning_rate": 0.00036804762177285367,
"loss": 2.3418,
"step": 23610
},
{
"epoch": 7.458974966437653,
"grad_norm": 0.07722348403670708,
"learning_rate": 0.0003676203425973632,
"loss": 2.4356,
"step": 23615
},
{
"epoch": 7.460554371002132,
"grad_norm": 0.04889864050952568,
"learning_rate": 0.00036719325572448627,
"loss": 2.3873,
"step": 23620
},
{
"epoch": 7.4621337755666115,
"grad_norm": 0.06170912673919736,
"learning_rate": 0.0003667663612840971,
"loss": 2.3894,
"step": 23625
},
{
"epoch": 7.463713180131091,
"grad_norm": 0.059139982586257196,
"learning_rate": 0.0003663396594060113,
"loss": 2.4052,
"step": 23630
},
{
"epoch": 7.46529258469557,
"grad_norm": 0.050779417722003305,
"learning_rate": 0.00036591315021998683,
"loss": 2.3199,
"step": 23635
},
{
"epoch": 7.466871989260049,
"grad_norm": 0.052582832535869733,
"learning_rate": 0.00036548683385572215,
"loss": 2.3601,
"step": 23640
},
{
"epoch": 7.4684513938245285,
"grad_norm": 0.05032154022496104,
"learning_rate": 0.00036506071044285684,
"loss": 2.4505,
"step": 23645
},
{
"epoch": 7.470030798389008,
"grad_norm": 0.05777376740018471,
"learning_rate": 0.00036463478011097307,
"loss": 2.3916,
"step": 23650
},
{
"epoch": 7.471610202953487,
"grad_norm": 0.0602882474269405,
"learning_rate": 0.0003642090429895933,
"loss": 2.4156,
"step": 23655
},
{
"epoch": 7.473189607517965,
"grad_norm": 0.056972774488076666,
"learning_rate": 0.0003637834992081813,
"loss": 2.4732,
"step": 23660
},
{
"epoch": 7.474769012082445,
"grad_norm": 0.06404474027264716,
"learning_rate": 0.00036335814889614236,
"loss": 2.3455,
"step": 23665
},
{
"epoch": 7.476348416646924,
"grad_norm": 0.053151870447874665,
"learning_rate": 0.0003629329921828224,
"loss": 2.3919,
"step": 23670
},
{
"epoch": 7.477927821211403,
"grad_norm": 0.050179990744870624,
"learning_rate": 0.0003625080291975095,
"loss": 2.3638,
"step": 23675
},
{
"epoch": 7.479507225775882,
"grad_norm": 0.05344170008854404,
"learning_rate": 0.0003620832600694314,
"loss": 2.4184,
"step": 23680
},
{
"epoch": 7.481086630340362,
"grad_norm": 0.05880039006582616,
"learning_rate": 0.00036165868492775866,
"loss": 2.5056,
"step": 23685
},
{
"epoch": 7.482666034904841,
"grad_norm": 0.0555886248543761,
"learning_rate": 0.0003612343039016013,
"loss": 2.4771,
"step": 23690
},
{
"epoch": 7.48424543946932,
"grad_norm": 0.053690628524088346,
"learning_rate": 0.00036081011712001056,
"loss": 2.429,
"step": 23695
},
{
"epoch": 7.485824844033799,
"grad_norm": 0.04431564004406769,
"learning_rate": 0.00036038612471197965,
"loss": 2.389,
"step": 23700
},
{
"epoch": 7.487404248598279,
"grad_norm": 0.04878347140061997,
"learning_rate": 0.0003599623268064416,
"loss": 2.4411,
"step": 23705
},
{
"epoch": 7.488983653162758,
"grad_norm": 0.05236608966606008,
"learning_rate": 0.0003595387235322707,
"loss": 2.5116,
"step": 23710
},
{
"epoch": 7.490563057727237,
"grad_norm": 0.05100673243720879,
"learning_rate": 0.000359115315018282,
"loss": 2.4688,
"step": 23715
},
{
"epoch": 7.492142462291716,
"grad_norm": 0.05411154070919953,
"learning_rate": 0.0003586921013932308,
"loss": 2.4296,
"step": 23720
},
{
"epoch": 7.493721866856196,
"grad_norm": 0.052071339596594705,
"learning_rate": 0.0003582690827858146,
"loss": 2.3595,
"step": 23725
},
{
"epoch": 7.495301271420675,
"grad_norm": 0.05677860791346205,
"learning_rate": 0.00035784625932466975,
"loss": 2.4711,
"step": 23730
},
{
"epoch": 7.496880675985153,
"grad_norm": 0.04987279541605942,
"learning_rate": 0.00035742363113837507,
"loss": 2.4433,
"step": 23735
},
{
"epoch": 7.498460080549632,
"grad_norm": 0.050881308693398304,
"learning_rate": 0.0003570011983554485,
"loss": 2.3913,
"step": 23740
},
{
"epoch": 7.500039485114112,
"grad_norm": 0.050137608515738584,
"learning_rate": 0.0003565789611043494,
"loss": 2.451,
"step": 23745
},
{
"epoch": 7.501618889678591,
"grad_norm": 0.055709883463010215,
"learning_rate": 0.0003561569195134772,
"loss": 2.4377,
"step": 23750
},
{
"epoch": 7.50319829424307,
"grad_norm": 0.05561732872868846,
"learning_rate": 0.0003557350737111722,
"loss": 2.4005,
"step": 23755
},
{
"epoch": 7.504777698807549,
"grad_norm": 0.0528833894614215,
"learning_rate": 0.00035531342382571465,
"loss": 2.3904,
"step": 23760
},
{
"epoch": 7.506357103372029,
"grad_norm": 0.06357337200410884,
"learning_rate": 0.00035489196998532614,
"loss": 2.3614,
"step": 23765
},
{
"epoch": 7.507936507936508,
"grad_norm": 0.05040209838574259,
"learning_rate": 0.0003544707123181675,
"loss": 2.3819,
"step": 23770
},
{
"epoch": 7.509515912500987,
"grad_norm": 0.06469235887929314,
"learning_rate": 0.00035404965095234134,
"loss": 2.4137,
"step": 23775
},
{
"epoch": 7.511095317065466,
"grad_norm": 0.058756076336860794,
"learning_rate": 0.0003536287860158891,
"loss": 2.4377,
"step": 23780
},
{
"epoch": 7.512674721629946,
"grad_norm": 0.05427048344633684,
"learning_rate": 0.0003532081176367929,
"loss": 2.4334,
"step": 23785
},
{
"epoch": 7.514254126194425,
"grad_norm": 0.05445023053867659,
"learning_rate": 0.00035278764594297605,
"loss": 2.402,
"step": 23790
},
{
"epoch": 7.515833530758904,
"grad_norm": 0.05572921923027817,
"learning_rate": 0.00035236737106230086,
"loss": 2.3982,
"step": 23795
},
{
"epoch": 7.517412935323383,
"grad_norm": 0.05097853575965763,
"learning_rate": 0.00035194729312257035,
"loss": 2.3,
"step": 23800
},
{
"epoch": 7.518992339887863,
"grad_norm": 0.05610154877951178,
"learning_rate": 0.00035152741225152754,
"loss": 2.4078,
"step": 23805
},
{
"epoch": 7.520571744452342,
"grad_norm": 0.054313889130966415,
"learning_rate": 0.000351107728576855,
"loss": 2.492,
"step": 23810
},
{
"epoch": 7.522151149016821,
"grad_norm": 0.05448783726055358,
"learning_rate": 0.00035068824222617666,
"loss": 2.4035,
"step": 23815
},
{
"epoch": 7.5237305535812995,
"grad_norm": 0.05203520227656932,
"learning_rate": 0.000350268953327055,
"loss": 2.3478,
"step": 23820
},
{
"epoch": 7.525309958145779,
"grad_norm": 0.05930874635571445,
"learning_rate": 0.0003498498620069938,
"loss": 2.3964,
"step": 23825
},
{
"epoch": 7.526889362710258,
"grad_norm": 0.058976632389174895,
"learning_rate": 0.00034943096839343545,
"loss": 2.4033,
"step": 23830
},
{
"epoch": 7.528468767274737,
"grad_norm": 0.05484249479120815,
"learning_rate": 0.0003490122726137632,
"loss": 2.4029,
"step": 23835
},
{
"epoch": 7.5300481718392165,
"grad_norm": 0.05927657326723422,
"learning_rate": 0.0003485937747952994,
"loss": 2.3705,
"step": 23840
},
{
"epoch": 7.531627576403696,
"grad_norm": 0.0510950825555915,
"learning_rate": 0.00034817547506530653,
"loss": 2.4689,
"step": 23845
},
{
"epoch": 7.533206980968175,
"grad_norm": 0.0582258861901962,
"learning_rate": 0.00034775737355098737,
"loss": 2.3584,
"step": 23850
},
{
"epoch": 7.534786385532654,
"grad_norm": 0.06160566988929809,
"learning_rate": 0.0003473394703794837,
"loss": 2.4319,
"step": 23855
},
{
"epoch": 7.5363657900971335,
"grad_norm": 0.05308567369648612,
"learning_rate": 0.0003469217656778766,
"loss": 2.4249,
"step": 23860
},
{
"epoch": 7.537945194661613,
"grad_norm": 0.06369662464028052,
"learning_rate": 0.00034650425957318844,
"loss": 2.4027,
"step": 23865
},
{
"epoch": 7.539524599226092,
"grad_norm": 0.06550185521725853,
"learning_rate": 0.00034608695219237953,
"loss": 2.4746,
"step": 23870
},
{
"epoch": 7.541104003790571,
"grad_norm": 0.04797082265967074,
"learning_rate": 0.0003456698436623502,
"loss": 2.339,
"step": 23875
},
{
"epoch": 7.5426834083550505,
"grad_norm": 0.051517689382158276,
"learning_rate": 0.0003452529341099411,
"loss": 2.355,
"step": 23880
},
{
"epoch": 7.54426281291953,
"grad_norm": 0.05197438536779773,
"learning_rate": 0.0003448362236619315,
"loss": 2.3605,
"step": 23885
},
{
"epoch": 7.545842217484008,
"grad_norm": 0.05083741389939934,
"learning_rate": 0.00034441971244504024,
"loss": 2.4584,
"step": 23890
},
{
"epoch": 7.547421622048487,
"grad_norm": 0.05823221757407544,
"learning_rate": 0.0003440034005859258,
"loss": 2.3936,
"step": 23895
},
{
"epoch": 7.549001026612967,
"grad_norm": 0.06091531540814351,
"learning_rate": 0.0003435872882111857,
"loss": 2.4418,
"step": 23900
},
{
"epoch": 7.550580431177446,
"grad_norm": 0.04708894240854059,
"learning_rate": 0.00034317137544735753,
"loss": 2.3962,
"step": 23905
},
{
"epoch": 7.552159835741925,
"grad_norm": 0.07520886685119499,
"learning_rate": 0.00034275566242091725,
"loss": 2.3792,
"step": 23910
},
{
"epoch": 7.553739240306404,
"grad_norm": 0.0847747471839153,
"learning_rate": 0.00034234014925828114,
"loss": 2.5207,
"step": 23915
},
{
"epoch": 7.555318644870884,
"grad_norm": 0.04947878947689636,
"learning_rate": 0.00034192483608580374,
"loss": 2.429,
"step": 23920
},
{
"epoch": 7.556898049435363,
"grad_norm": 0.05002392750223998,
"learning_rate": 0.0003415097230297791,
"loss": 2.3927,
"step": 23925
},
{
"epoch": 7.558477453999842,
"grad_norm": 0.049763788116632214,
"learning_rate": 0.0003410948102164404,
"loss": 2.4146,
"step": 23930
},
{
"epoch": 7.560056858564321,
"grad_norm": 0.06408432685825667,
"learning_rate": 0.00034068009777195985,
"loss": 2.4863,
"step": 23935
},
{
"epoch": 7.5616362631288006,
"grad_norm": 0.04508639165527899,
"learning_rate": 0.0003402655858224493,
"loss": 2.371,
"step": 23940
},
{
"epoch": 7.56321566769328,
"grad_norm": 0.05422277716588939,
"learning_rate": 0.00033985127449395893,
"loss": 2.441,
"step": 23945
},
{
"epoch": 7.564795072257759,
"grad_norm": 0.06150907765634088,
"learning_rate": 0.00033943716391247793,
"loss": 2.4413,
"step": 23950
},
{
"epoch": 7.566374476822238,
"grad_norm": 0.05127590635874652,
"learning_rate": 0.00033902325420393523,
"loss": 2.3932,
"step": 23955
},
{
"epoch": 7.5679538813867175,
"grad_norm": 0.057672367901127446,
"learning_rate": 0.0003386095454941974,
"loss": 2.3839,
"step": 23960
},
{
"epoch": 7.569533285951197,
"grad_norm": 0.05146168485962654,
"learning_rate": 0.00033819603790907147,
"loss": 2.4863,
"step": 23965
},
{
"epoch": 7.571112690515676,
"grad_norm": 0.0535378903359561,
"learning_rate": 0.00033778273157430207,
"loss": 2.339,
"step": 23970
},
{
"epoch": 7.572692095080154,
"grad_norm": 0.05588125048196558,
"learning_rate": 0.0003373696266155729,
"loss": 2.4713,
"step": 23975
},
{
"epoch": 7.574271499644634,
"grad_norm": 0.053273495265892444,
"learning_rate": 0.0003369567231585067,
"loss": 2.3334,
"step": 23980
},
{
"epoch": 7.575850904209113,
"grad_norm": 0.052236979555313555,
"learning_rate": 0.00033654402132866456,
"loss": 2.3322,
"step": 23985
},
{
"epoch": 7.577430308773592,
"grad_norm": 0.04650182802860371,
"learning_rate": 0.00033613152125154636,
"loss": 2.3798,
"step": 23990
},
{
"epoch": 7.579009713338071,
"grad_norm": 0.05176251861636686,
"learning_rate": 0.00033571922305259126,
"loss": 2.4699,
"step": 23995
},
{
"epoch": 7.580589117902551,
"grad_norm": 0.05006150490263106,
"learning_rate": 0.0003353071268571759,
"loss": 2.2849,
"step": 24000
},
{
"epoch": 7.58216852246703,
"grad_norm": 0.047048389474652993,
"learning_rate": 0.00033489523279061674,
"loss": 2.3827,
"step": 24005
},
{
"epoch": 7.583747927031509,
"grad_norm": 0.0557043619859469,
"learning_rate": 0.0003344835409781679,
"loss": 2.3429,
"step": 24010
},
{
"epoch": 7.585327331595988,
"grad_norm": 0.05172011245331398,
"learning_rate": 0.0003340720515450221,
"loss": 2.3527,
"step": 24015
},
{
"epoch": 7.586906736160468,
"grad_norm": 0.0491888915760231,
"learning_rate": 0.0003336607646163106,
"loss": 2.4425,
"step": 24020
},
{
"epoch": 7.588486140724947,
"grad_norm": 0.05737364984818568,
"learning_rate": 0.00033324968031710303,
"loss": 2.3395,
"step": 24025
},
{
"epoch": 7.590065545289426,
"grad_norm": 0.05841923422913503,
"learning_rate": 0.0003328387987724079,
"loss": 2.3915,
"step": 24030
},
{
"epoch": 7.591644949853905,
"grad_norm": 0.05107266580284771,
"learning_rate": 0.0003324281201071715,
"loss": 2.4611,
"step": 24035
},
{
"epoch": 7.593224354418385,
"grad_norm": 0.05129232000800214,
"learning_rate": 0.00033201764444627823,
"loss": 2.4628,
"step": 24040
},
{
"epoch": 7.594803758982863,
"grad_norm": 0.07500145964807096,
"learning_rate": 0.0003316073719145517,
"loss": 2.3735,
"step": 24045
},
{
"epoch": 7.596383163547342,
"grad_norm": 0.06367210474236257,
"learning_rate": 0.0003311973026367526,
"loss": 2.4049,
"step": 24050
},
{
"epoch": 7.5979625681118215,
"grad_norm": 0.05791689448404274,
"learning_rate": 0.000330787436737581,
"loss": 2.4212,
"step": 24055
},
{
"epoch": 7.599541972676301,
"grad_norm": 0.06148429526004428,
"learning_rate": 0.00033037777434167414,
"loss": 2.3598,
"step": 24060
},
{
"epoch": 7.60112137724078,
"grad_norm": 0.0632806955715157,
"learning_rate": 0.00032996831557360786,
"loss": 2.3784,
"step": 24065
},
{
"epoch": 7.602700781805259,
"grad_norm": 0.04967935712466749,
"learning_rate": 0.0003295590605578959,
"loss": 2.4446,
"step": 24070
},
{
"epoch": 7.6042801863697385,
"grad_norm": 0.053187262168445174,
"learning_rate": 0.0003291500094189895,
"loss": 2.3772,
"step": 24075
},
{
"epoch": 7.605859590934218,
"grad_norm": 0.051516797227960875,
"learning_rate": 0.0003287411622812796,
"loss": 2.4098,
"step": 24080
},
{
"epoch": 7.607438995498697,
"grad_norm": 0.04717329615090927,
"learning_rate": 0.00032833251926909335,
"loss": 2.388,
"step": 24085
},
{
"epoch": 7.609018400063176,
"grad_norm": 0.04961962009053395,
"learning_rate": 0.0003279240805066963,
"loss": 2.4496,
"step": 24090
},
{
"epoch": 7.6105978046276554,
"grad_norm": 0.0508932227695937,
"learning_rate": 0.0003275158461182927,
"loss": 2.3906,
"step": 24095
},
{
"epoch": 7.612177209192135,
"grad_norm": 0.047386740219747786,
"learning_rate": 0.0003271078162280235,
"loss": 2.3996,
"step": 24100
},
{
"epoch": 7.613756613756614,
"grad_norm": 0.051347605182556225,
"learning_rate": 0.0003266999909599684,
"loss": 2.3488,
"step": 24105
},
{
"epoch": 7.615336018321093,
"grad_norm": 0.05207196085327182,
"learning_rate": 0.0003262923704381441,
"loss": 2.4164,
"step": 24110
},
{
"epoch": 7.616915422885572,
"grad_norm": 0.051807947784807226,
"learning_rate": 0.00032588495478650515,
"loss": 2.3612,
"step": 24115
},
{
"epoch": 7.618494827450052,
"grad_norm": 0.05301360880767392,
"learning_rate": 0.00032547774412894484,
"loss": 2.4818,
"step": 24120
},
{
"epoch": 7.620074232014531,
"grad_norm": 0.058436238645943585,
"learning_rate": 0.0003250707385892928,
"loss": 2.4381,
"step": 24125
},
{
"epoch": 7.62165363657901,
"grad_norm": 0.05822939906037066,
"learning_rate": 0.0003246639382913167,
"loss": 2.4283,
"step": 24130
},
{
"epoch": 7.6232330411434885,
"grad_norm": 0.0479691770704822,
"learning_rate": 0.00032425734335872236,
"loss": 2.4451,
"step": 24135
},
{
"epoch": 7.624812445707968,
"grad_norm": 0.059090968265937245,
"learning_rate": 0.0003238509539151522,
"loss": 2.5042,
"step": 24140
},
{
"epoch": 7.626391850272447,
"grad_norm": 0.07224711007320013,
"learning_rate": 0.00032344477008418716,
"loss": 2.3966,
"step": 24145
},
{
"epoch": 7.627971254836926,
"grad_norm": 0.051380954135107085,
"learning_rate": 0.0003230387919893449,
"loss": 2.4038,
"step": 24150
},
{
"epoch": 7.6295506594014055,
"grad_norm": 0.06859372282080024,
"learning_rate": 0.00032263301975408087,
"loss": 2.4316,
"step": 24155
},
{
"epoch": 7.631130063965885,
"grad_norm": 0.05318137975594264,
"learning_rate": 0.00032222745350178773,
"loss": 2.366,
"step": 24160
},
{
"epoch": 7.632709468530364,
"grad_norm": 0.05478988503412728,
"learning_rate": 0.00032182209335579514,
"loss": 2.4576,
"step": 24165
},
{
"epoch": 7.634288873094843,
"grad_norm": 0.05459819331696542,
"learning_rate": 0.00032141693943937133,
"loss": 2.3714,
"step": 24170
},
{
"epoch": 7.6358682776593225,
"grad_norm": 0.0590688189521774,
"learning_rate": 0.0003210119918757206,
"loss": 2.3416,
"step": 24175
},
{
"epoch": 7.637447682223802,
"grad_norm": 0.057447038920899106,
"learning_rate": 0.0003206072507879847,
"loss": 2.4195,
"step": 24180
},
{
"epoch": 7.639027086788281,
"grad_norm": 0.04830826723505675,
"learning_rate": 0.00032020271629924345,
"loss": 2.4337,
"step": 24185
},
{
"epoch": 7.64060649135276,
"grad_norm": 0.062425324790732116,
"learning_rate": 0.00031979838853251274,
"loss": 2.4195,
"step": 24190
},
{
"epoch": 7.6421858959172395,
"grad_norm": 0.05325699262898386,
"learning_rate": 0.0003193942676107462,
"loss": 2.3648,
"step": 24195
},
{
"epoch": 7.643765300481719,
"grad_norm": 0.05871712109832676,
"learning_rate": 0.00031899035365683424,
"loss": 2.4513,
"step": 24200
},
{
"epoch": 7.645344705046197,
"grad_norm": 0.060790001080061655,
"learning_rate": 0.0003185866467936045,
"loss": 2.3598,
"step": 24205
},
{
"epoch": 7.646924109610676,
"grad_norm": 0.056202444771544735,
"learning_rate": 0.000318183147143822,
"loss": 2.3693,
"step": 24210
},
{
"epoch": 7.648503514175156,
"grad_norm": 0.0644776336201835,
"learning_rate": 0.0003177798548301883,
"loss": 2.4006,
"step": 24215
},
{
"epoch": 7.650082918739635,
"grad_norm": 0.04997202841729112,
"learning_rate": 0.0003173767699753416,
"loss": 2.4625,
"step": 24220
},
{
"epoch": 7.651662323304114,
"grad_norm": 0.051209189224240705,
"learning_rate": 0.0003169738927018579,
"loss": 2.4392,
"step": 24225
},
{
"epoch": 7.653241727868593,
"grad_norm": 0.05000245242432989,
"learning_rate": 0.0003165712231322493,
"loss": 2.4012,
"step": 24230
},
{
"epoch": 7.654821132433073,
"grad_norm": 0.05428174890846462,
"learning_rate": 0.00031616876138896547,
"loss": 2.4287,
"step": 24235
},
{
"epoch": 7.656400536997552,
"grad_norm": 0.05278200777192172,
"learning_rate": 0.0003157665075943922,
"loss": 2.4078,
"step": 24240
},
{
"epoch": 7.657979941562031,
"grad_norm": 0.051568768887383196,
"learning_rate": 0.0003153644618708523,
"loss": 2.4421,
"step": 24245
},
{
"epoch": 7.65955934612651,
"grad_norm": 0.054085679211000194,
"learning_rate": 0.00031496262434060516,
"loss": 2.3234,
"step": 24250
},
{
"epoch": 7.66113875069099,
"grad_norm": 0.05481383232982134,
"learning_rate": 0.00031456099512584704,
"loss": 2.5246,
"step": 24255
},
{
"epoch": 7.662718155255469,
"grad_norm": 0.059426566868512096,
"learning_rate": 0.000314159574348711,
"loss": 2.5082,
"step": 24260
},
{
"epoch": 7.664297559819948,
"grad_norm": 0.04820355582047649,
"learning_rate": 0.0003137583621312665,
"loss": 2.4273,
"step": 24265
},
{
"epoch": 7.665876964384427,
"grad_norm": 0.05263363368035431,
"learning_rate": 0.0003133573585955194,
"loss": 2.4454,
"step": 24270
},
{
"epoch": 7.667456368948907,
"grad_norm": 0.06668088972823961,
"learning_rate": 0.00031295656386341264,
"loss": 2.3927,
"step": 24275
},
{
"epoch": 7.669035773513386,
"grad_norm": 0.049805678787756505,
"learning_rate": 0.0003125559780568251,
"loss": 2.4756,
"step": 24280
},
{
"epoch": 7.670615178077865,
"grad_norm": 0.05144318724607636,
"learning_rate": 0.0003121556012975726,
"loss": 2.4335,
"step": 24285
},
{
"epoch": 7.672194582642344,
"grad_norm": 0.06315689642117019,
"learning_rate": 0.0003117554337074069,
"loss": 2.5088,
"step": 24290
},
{
"epoch": 7.673773987206823,
"grad_norm": 0.044208247141836916,
"learning_rate": 0.0003113554754080162,
"loss": 2.3341,
"step": 24295
},
{
"epoch": 7.675353391771302,
"grad_norm": 0.05554919550651169,
"learning_rate": 0.00031095572652102587,
"loss": 2.4467,
"step": 24300
},
{
"epoch": 7.676932796335781,
"grad_norm": 0.059080134993804094,
"learning_rate": 0.0003105561871679966,
"loss": 2.5429,
"step": 24305
},
{
"epoch": 7.67851220090026,
"grad_norm": 0.04417541045997235,
"learning_rate": 0.0003101568574704257,
"loss": 2.3606,
"step": 24310
},
{
"epoch": 7.68009160546474,
"grad_norm": 0.05869619463869087,
"learning_rate": 0.000309757737549747,
"loss": 2.3615,
"step": 24315
},
{
"epoch": 7.681671010029219,
"grad_norm": 0.050441644302818466,
"learning_rate": 0.00030935882752733,
"loss": 2.4206,
"step": 24320
},
{
"epoch": 7.683250414593698,
"grad_norm": 0.05476799175365138,
"learning_rate": 0.0003089601275244813,
"loss": 2.3986,
"step": 24325
},
{
"epoch": 7.684829819158177,
"grad_norm": 0.05157746693117757,
"learning_rate": 0.0003085616376624426,
"loss": 2.4136,
"step": 24330
},
{
"epoch": 7.686409223722657,
"grad_norm": 0.05541127015732504,
"learning_rate": 0.00030816335806239226,
"loss": 2.4145,
"step": 24335
},
{
"epoch": 7.687988628287136,
"grad_norm": 0.053786340179167384,
"learning_rate": 0.0003077652888454443,
"loss": 2.4215,
"step": 24340
},
{
"epoch": 7.689568032851615,
"grad_norm": 0.053708516642199,
"learning_rate": 0.000307367430132649,
"loss": 2.4234,
"step": 24345
},
{
"epoch": 7.691147437416094,
"grad_norm": 0.04735017320728578,
"learning_rate": 0.00030696978204499314,
"loss": 2.422,
"step": 24350
},
{
"epoch": 7.692726841980574,
"grad_norm": 0.05885430027941452,
"learning_rate": 0.00030657234470339866,
"loss": 2.4398,
"step": 24355
},
{
"epoch": 7.694306246545052,
"grad_norm": 0.05120891673299278,
"learning_rate": 0.00030617511822872336,
"loss": 2.4177,
"step": 24360
},
{
"epoch": 7.695885651109531,
"grad_norm": 0.04667274468962752,
"learning_rate": 0.00030577810274176197,
"loss": 2.3767,
"step": 24365
},
{
"epoch": 7.6974650556740105,
"grad_norm": 0.05273872070151963,
"learning_rate": 0.000305381298363244,
"loss": 2.4298,
"step": 24370
},
{
"epoch": 7.69904446023849,
"grad_norm": 0.05372805449575675,
"learning_rate": 0.00030498470521383525,
"loss": 2.3437,
"step": 24375
},
{
"epoch": 7.700623864802969,
"grad_norm": 0.05486851348986952,
"learning_rate": 0.000304588323414137,
"loss": 2.428,
"step": 24380
},
{
"epoch": 7.702203269367448,
"grad_norm": 0.061315818652699604,
"learning_rate": 0.00030419215308468615,
"loss": 2.468,
"step": 24385
},
{
"epoch": 7.7037826739319275,
"grad_norm": 0.05713437637146786,
"learning_rate": 0.00030379619434595627,
"loss": 2.4225,
"step": 24390
},
{
"epoch": 7.705362078496407,
"grad_norm": 0.04373827826896389,
"learning_rate": 0.00030340044731835526,
"loss": 2.3179,
"step": 24395
},
{
"epoch": 7.706941483060886,
"grad_norm": 0.05433521324153791,
"learning_rate": 0.0003030049121222278,
"loss": 2.4587,
"step": 24400
},
{
"epoch": 7.708520887625365,
"grad_norm": 0.0503021980471803,
"learning_rate": 0.0003026095888778533,
"loss": 2.4756,
"step": 24405
},
{
"epoch": 7.7101002921898445,
"grad_norm": 0.04998658543639774,
"learning_rate": 0.00030221447770544674,
"loss": 2.326,
"step": 24410
},
{
"epoch": 7.711679696754324,
"grad_norm": 0.05355613379832616,
"learning_rate": 0.00030181957872515964,
"loss": 2.3546,
"step": 24415
},
{
"epoch": 7.713259101318803,
"grad_norm": 0.04639426212008117,
"learning_rate": 0.0003014248920570778,
"loss": 2.4354,
"step": 24420
},
{
"epoch": 7.714838505883282,
"grad_norm": 0.054239389335720374,
"learning_rate": 0.00030103041782122286,
"loss": 2.3484,
"step": 24425
},
{
"epoch": 7.7164179104477615,
"grad_norm": 0.053889348363275934,
"learning_rate": 0.0003006361561375521,
"loss": 2.3737,
"step": 24430
},
{
"epoch": 7.717997315012241,
"grad_norm": 0.05224091854696833,
"learning_rate": 0.00030024210712595767,
"loss": 2.4265,
"step": 24435
},
{
"epoch": 7.71957671957672,
"grad_norm": 0.050474388133450517,
"learning_rate": 0.00029984827090626787,
"loss": 2.4149,
"step": 24440
},
{
"epoch": 7.721156124141199,
"grad_norm": 0.05000111897800184,
"learning_rate": 0.0002994546475982455,
"loss": 2.4561,
"step": 24445
},
{
"epoch": 7.722735528705678,
"grad_norm": 0.059446531671769685,
"learning_rate": 0.0002990612373215884,
"loss": 2.3574,
"step": 24450
},
{
"epoch": 7.724314933270157,
"grad_norm": 0.05997617920522711,
"learning_rate": 0.0002986680401959311,
"loss": 2.416,
"step": 24455
},
{
"epoch": 7.725894337834636,
"grad_norm": 0.050078519265076,
"learning_rate": 0.00029827505634084185,
"loss": 2.3241,
"step": 24460
},
{
"epoch": 7.727473742399115,
"grad_norm": 0.07124440721657235,
"learning_rate": 0.00029788228587582444,
"loss": 2.4418,
"step": 24465
},
{
"epoch": 7.729053146963595,
"grad_norm": 0.05454655571950861,
"learning_rate": 0.000297489728920318,
"loss": 2.4175,
"step": 24470
},
{
"epoch": 7.730632551528074,
"grad_norm": 0.05623786767915883,
"learning_rate": 0.00029709738559369615,
"loss": 2.3985,
"step": 24475
},
{
"epoch": 7.732211956092553,
"grad_norm": 0.05884938855594215,
"learning_rate": 0.00029670525601526864,
"loss": 2.4452,
"step": 24480
},
{
"epoch": 7.733791360657032,
"grad_norm": 0.05042583690627841,
"learning_rate": 0.00029631334030427915,
"loss": 2.4225,
"step": 24485
},
{
"epoch": 7.735370765221512,
"grad_norm": 0.04991076638423713,
"learning_rate": 0.00029592163857990704,
"loss": 2.3464,
"step": 24490
},
{
"epoch": 7.736950169785991,
"grad_norm": 0.062727062149512,
"learning_rate": 0.00029553015096126634,
"loss": 2.4083,
"step": 24495
},
{
"epoch": 7.73852957435047,
"grad_norm": 0.04802178819470241,
"learning_rate": 0.0002951388775674053,
"loss": 2.3909,
"step": 24500
},
{
"epoch": 7.740108978914949,
"grad_norm": 0.05606591642278618,
"learning_rate": 0.0002947478185173085,
"loss": 2.438,
"step": 24505
},
{
"epoch": 7.741688383479429,
"grad_norm": 0.04736506859581747,
"learning_rate": 0.00029435697392989405,
"loss": 2.4206,
"step": 24510
},
{
"epoch": 7.743267788043908,
"grad_norm": 0.051938138115853495,
"learning_rate": 0.00029396634392401535,
"loss": 2.4396,
"step": 24515
},
{
"epoch": 7.744847192608386,
"grad_norm": 0.049781245083747594,
"learning_rate": 0.0002935759286184605,
"loss": 2.29,
"step": 24520
},
{
"epoch": 7.746426597172865,
"grad_norm": 0.04703249847453405,
"learning_rate": 0.000293185728131952,
"loss": 2.3294,
"step": 24525
},
{
"epoch": 7.748006001737345,
"grad_norm": 0.04840887139950567,
"learning_rate": 0.0002927957425831479,
"loss": 2.4352,
"step": 24530
},
{
"epoch": 7.749585406301824,
"grad_norm": 0.05414582291796758,
"learning_rate": 0.00029240597209064,
"loss": 2.4122,
"step": 24535
},
{
"epoch": 7.751164810866303,
"grad_norm": 0.05452155431449084,
"learning_rate": 0.0002920164167729548,
"loss": 2.4029,
"step": 24540
},
{
"epoch": 7.752744215430782,
"grad_norm": 0.0537875525333651,
"learning_rate": 0.00029162707674855416,
"loss": 2.3363,
"step": 24545
},
{
"epoch": 7.754323619995262,
"grad_norm": 0.057189373852712784,
"learning_rate": 0.00029123795213583346,
"loss": 2.4146,
"step": 24550
},
{
"epoch": 7.755903024559741,
"grad_norm": 0.04624352059819483,
"learning_rate": 0.0002908490430531232,
"loss": 2.5916,
"step": 24555
},
{
"epoch": 7.75748242912422,
"grad_norm": 0.04924444013165075,
"learning_rate": 0.00029046034961868793,
"loss": 2.3289,
"step": 24560
},
{
"epoch": 7.759061833688699,
"grad_norm": 0.0494404459697529,
"learning_rate": 0.0002900718719507268,
"loss": 2.4075,
"step": 24565
},
{
"epoch": 7.760641238253179,
"grad_norm": 0.049554309353999876,
"learning_rate": 0.00028968361016737376,
"loss": 2.3909,
"step": 24570
},
{
"epoch": 7.762220642817658,
"grad_norm": 0.050877082667088167,
"learning_rate": 0.00028929556438669625,
"loss": 2.5629,
"step": 24575
},
{
"epoch": 7.763800047382137,
"grad_norm": 0.052260925315823034,
"learning_rate": 0.00028890773472669716,
"loss": 2.3817,
"step": 24580
},
{
"epoch": 7.765379451946616,
"grad_norm": 0.05061621254808778,
"learning_rate": 0.0002885201213053126,
"loss": 2.4837,
"step": 24585
},
{
"epoch": 7.766958856511096,
"grad_norm": 0.04983143637760769,
"learning_rate": 0.00028813272424041306,
"loss": 2.4145,
"step": 24590
},
{
"epoch": 7.768538261075575,
"grad_norm": 0.04506057127475669,
"learning_rate": 0.0002877455436498041,
"loss": 2.4351,
"step": 24595
},
{
"epoch": 7.770117665640054,
"grad_norm": 0.05247532758561306,
"learning_rate": 0.0002873585796512247,
"loss": 2.3988,
"step": 24600
},
{
"epoch": 7.771697070204533,
"grad_norm": 0.05439760141095109,
"learning_rate": 0.000286971832362348,
"loss": 2.3268,
"step": 24605
},
{
"epoch": 7.773276474769012,
"grad_norm": 0.05627223068340628,
"learning_rate": 0.00028658530190078135,
"loss": 2.3374,
"step": 24610
},
{
"epoch": 7.774855879333491,
"grad_norm": 0.062242832355261256,
"learning_rate": 0.000286198988384066,
"loss": 2.5214,
"step": 24615
},
{
"epoch": 7.77643528389797,
"grad_norm": 0.05252140392419768,
"learning_rate": 0.0002858128919296781,
"loss": 2.4114,
"step": 24620
},
{
"epoch": 7.7780146884624495,
"grad_norm": 0.04564201244557298,
"learning_rate": 0.00028542701265502627,
"loss": 2.3817,
"step": 24625
},
{
"epoch": 7.779594093026929,
"grad_norm": 0.06126100422801149,
"learning_rate": 0.0002850413506774546,
"loss": 2.4307,
"step": 24630
},
{
"epoch": 7.781173497591408,
"grad_norm": 0.04501233507567427,
"learning_rate": 0.0002846559061142403,
"loss": 2.3552,
"step": 24635
},
{
"epoch": 7.782752902155887,
"grad_norm": 0.045411464782780374,
"learning_rate": 0.0002842706790825944,
"loss": 2.3638,
"step": 24640
},
{
"epoch": 7.7843323067203665,
"grad_norm": 0.05683693980555025,
"learning_rate": 0.0002838856696996621,
"loss": 2.429,
"step": 24645
},
{
"epoch": 7.785911711284846,
"grad_norm": 0.05565246689756234,
"learning_rate": 0.00028350087808252234,
"loss": 2.4186,
"step": 24650
},
{
"epoch": 7.787491115849325,
"grad_norm": 0.06675077757834104,
"learning_rate": 0.00028311630434818736,
"loss": 2.4561,
"step": 24655
},
{
"epoch": 7.789070520413804,
"grad_norm": 0.06868642711368238,
"learning_rate": 0.00028273194861360416,
"loss": 2.4313,
"step": 24660
},
{
"epoch": 7.7906499249782835,
"grad_norm": 0.06557858906849585,
"learning_rate": 0.00028234781099565243,
"loss": 2.3944,
"step": 24665
},
{
"epoch": 7.792229329542763,
"grad_norm": 0.04551304143974813,
"learning_rate": 0.00028196389161114644,
"loss": 2.4348,
"step": 24670
},
{
"epoch": 7.793808734107242,
"grad_norm": 0.04838277501568106,
"learning_rate": 0.00028158019057683336,
"loss": 2.4776,
"step": 24675
},
{
"epoch": 7.79538813867172,
"grad_norm": 0.053328847150971935,
"learning_rate": 0.0002811967080093939,
"loss": 2.34,
"step": 24680
},
{
"epoch": 7.7969675432362,
"grad_norm": 0.05287452436354166,
"learning_rate": 0.0002808134440254433,
"loss": 2.4273,
"step": 24685
},
{
"epoch": 7.798546947800679,
"grad_norm": 0.047372276973564276,
"learning_rate": 0.0002804303987415294,
"loss": 2.4362,
"step": 24690
},
{
"epoch": 7.800126352365158,
"grad_norm": 0.053263680184324806,
"learning_rate": 0.0002800475722741337,
"loss": 2.4867,
"step": 24695
},
{
"epoch": 7.801705756929637,
"grad_norm": 0.058508417205338135,
"learning_rate": 0.0002796649647396714,
"loss": 2.4478,
"step": 24700
},
{
"epoch": 7.8032851614941166,
"grad_norm": 0.06229616365010954,
"learning_rate": 0.00027928257625449074,
"loss": 2.3928,
"step": 24705
},
{
"epoch": 7.804864566058596,
"grad_norm": 0.06458054579349871,
"learning_rate": 0.00027890040693487404,
"loss": 2.356,
"step": 24710
},
{
"epoch": 7.806443970623075,
"grad_norm": 0.05575505674396327,
"learning_rate": 0.00027851845689703605,
"loss": 2.4082,
"step": 24715
},
{
"epoch": 7.808023375187554,
"grad_norm": 0.06741414204191258,
"learning_rate": 0.00027813672625712606,
"loss": 2.507,
"step": 24720
},
{
"epoch": 7.8096027797520335,
"grad_norm": 0.05717411663408411,
"learning_rate": 0.00027775521513122536,
"loss": 2.4304,
"step": 24725
},
{
"epoch": 7.811182184316513,
"grad_norm": 0.06218825335938903,
"learning_rate": 0.0002773739236353493,
"loss": 2.4569,
"step": 24730
},
{
"epoch": 7.812761588880992,
"grad_norm": 0.06296445268481235,
"learning_rate": 0.00027699285188544597,
"loss": 2.4216,
"step": 24735
},
{
"epoch": 7.814340993445471,
"grad_norm": 0.049533919825785235,
"learning_rate": 0.00027661199999739686,
"loss": 2.389,
"step": 24740
},
{
"epoch": 7.8159203980099505,
"grad_norm": 0.0484112010932451,
"learning_rate": 0.00027623136808701675,
"loss": 2.4212,
"step": 24745
},
{
"epoch": 7.81749980257443,
"grad_norm": 0.04682500088046376,
"learning_rate": 0.0002758509562700535,
"loss": 2.4025,
"step": 24750
},
{
"epoch": 7.819079207138909,
"grad_norm": 0.06106433282246892,
"learning_rate": 0.00027547076466218735,
"loss": 2.4387,
"step": 24755
},
{
"epoch": 7.820658611703388,
"grad_norm": 0.08171965748716478,
"learning_rate": 0.00027509079337903285,
"loss": 2.4323,
"step": 24760
},
{
"epoch": 7.822238016267867,
"grad_norm": 0.05081140541838431,
"learning_rate": 0.0002747110425361364,
"loss": 2.4346,
"step": 24765
},
{
"epoch": 7.823817420832346,
"grad_norm": 0.06443489431328071,
"learning_rate": 0.00027433151224897777,
"loss": 2.4403,
"step": 24770
},
{
"epoch": 7.825396825396825,
"grad_norm": 0.05517541734537136,
"learning_rate": 0.0002739522026329702,
"loss": 2.4178,
"step": 24775
},
{
"epoch": 7.826976229961304,
"grad_norm": 0.05212450955006681,
"learning_rate": 0.0002735731138034587,
"loss": 2.3864,
"step": 24780
},
{
"epoch": 7.828555634525784,
"grad_norm": 0.05516649716867154,
"learning_rate": 0.0002731942458757223,
"loss": 2.3397,
"step": 24785
},
{
"epoch": 7.830135039090263,
"grad_norm": 0.058408999564183946,
"learning_rate": 0.0002728155989649719,
"loss": 2.5221,
"step": 24790
},
{
"epoch": 7.831714443654742,
"grad_norm": 0.05498459475623652,
"learning_rate": 0.00027243717318635143,
"loss": 2.4406,
"step": 24795
},
{
"epoch": 7.833293848219221,
"grad_norm": 0.057419232996713934,
"learning_rate": 0.0002720589686549383,
"loss": 2.3542,
"step": 24800
},
{
"epoch": 7.834873252783701,
"grad_norm": 0.05003476615573564,
"learning_rate": 0.00027168098548574173,
"loss": 2.4578,
"step": 24805
},
{
"epoch": 7.83645265734818,
"grad_norm": 0.05293468218628365,
"learning_rate": 0.0002713032237937043,
"loss": 2.2924,
"step": 24810
},
{
"epoch": 7.838032061912659,
"grad_norm": 0.04996153521631309,
"learning_rate": 0.00027092568369370076,
"loss": 2.3176,
"step": 24815
},
{
"epoch": 7.839611466477138,
"grad_norm": 0.04739933322752104,
"learning_rate": 0.00027054836530053864,
"loss": 2.4019,
"step": 24820
},
{
"epoch": 7.841190871041618,
"grad_norm": 0.04479465097747017,
"learning_rate": 0.000270171268728958,
"loss": 2.4003,
"step": 24825
},
{
"epoch": 7.842770275606097,
"grad_norm": 0.05562015652094908,
"learning_rate": 0.0002697943940936313,
"loss": 2.3836,
"step": 24830
},
{
"epoch": 7.844349680170575,
"grad_norm": 0.04564832353498086,
"learning_rate": 0.0002694177415091642,
"loss": 2.2812,
"step": 24835
},
{
"epoch": 7.8459290847350545,
"grad_norm": 0.04795302510671516,
"learning_rate": 0.0002690413110900941,
"loss": 2.3891,
"step": 24840
},
{
"epoch": 7.847508489299534,
"grad_norm": 0.04774434859499396,
"learning_rate": 0.0002686651029508908,
"loss": 2.461,
"step": 24845
},
{
"epoch": 7.849087893864013,
"grad_norm": 0.054342846436838,
"learning_rate": 0.0002682891172059573,
"loss": 2.3694,
"step": 24850
},
{
"epoch": 7.850667298428492,
"grad_norm": 0.051858819495668396,
"learning_rate": 0.0002679133539696279,
"loss": 2.4877,
"step": 24855
},
{
"epoch": 7.8522467029929714,
"grad_norm": 0.04923559555748598,
"learning_rate": 0.00026753781335617054,
"loss": 2.4505,
"step": 24860
},
{
"epoch": 7.853826107557451,
"grad_norm": 0.053118258304910675,
"learning_rate": 0.0002671624954797842,
"loss": 2.3934,
"step": 24865
},
{
"epoch": 7.85540551212193,
"grad_norm": 0.055486172400068534,
"learning_rate": 0.00026678740045460084,
"loss": 2.3927,
"step": 24870
},
{
"epoch": 7.856984916686409,
"grad_norm": 0.0451498304811942,
"learning_rate": 0.00026641252839468434,
"loss": 2.4313,
"step": 24875
},
{
"epoch": 7.858564321250888,
"grad_norm": 0.0578150705886433,
"learning_rate": 0.0002660378794140309,
"loss": 2.3502,
"step": 24880
},
{
"epoch": 7.860143725815368,
"grad_norm": 0.05501558259689232,
"learning_rate": 0.00026566345362656873,
"loss": 2.4126,
"step": 24885
},
{
"epoch": 7.861723130379847,
"grad_norm": 0.04988050009494289,
"learning_rate": 0.00026528925114615876,
"loss": 2.3797,
"step": 24890
},
{
"epoch": 7.863302534944326,
"grad_norm": 0.04668678481382195,
"learning_rate": 0.00026491527208659296,
"loss": 2.4605,
"step": 24895
},
{
"epoch": 7.864881939508805,
"grad_norm": 0.05149872000751331,
"learning_rate": 0.00026454151656159664,
"loss": 2.4877,
"step": 24900
},
{
"epoch": 7.866461344073285,
"grad_norm": 0.060029637050715186,
"learning_rate": 0.0002641679846848262,
"loss": 2.4654,
"step": 24905
},
{
"epoch": 7.868040748637764,
"grad_norm": 0.05459942097404522,
"learning_rate": 0.0002637946765698702,
"loss": 2.4399,
"step": 24910
},
{
"epoch": 7.869620153202243,
"grad_norm": 0.05018770004469524,
"learning_rate": 0.0002634215923302494,
"loss": 2.361,
"step": 24915
},
{
"epoch": 7.871199557766722,
"grad_norm": 0.049283660791538106,
"learning_rate": 0.0002630487320794158,
"loss": 2.5835,
"step": 24920
},
{
"epoch": 7.872778962331201,
"grad_norm": 0.04853385403241942,
"learning_rate": 0.0002626760959307547,
"loss": 2.3399,
"step": 24925
},
{
"epoch": 7.87435836689568,
"grad_norm": 0.05341415347819443,
"learning_rate": 0.00026230368399758185,
"loss": 2.3583,
"step": 24930
},
{
"epoch": 7.875937771460159,
"grad_norm": 0.050805421265106745,
"learning_rate": 0.0002619314963931452,
"loss": 2.3894,
"step": 24935
},
{
"epoch": 7.8775171760246385,
"grad_norm": 0.05226530326416026,
"learning_rate": 0.0002615595332306251,
"loss": 2.3752,
"step": 24940
},
{
"epoch": 7.879096580589118,
"grad_norm": 0.055823263173015,
"learning_rate": 0.00026118779462313267,
"loss": 2.4041,
"step": 24945
},
{
"epoch": 7.880675985153597,
"grad_norm": 0.04994407446763293,
"learning_rate": 0.00026081628068371176,
"loss": 2.3953,
"step": 24950
},
{
"epoch": 7.882255389718076,
"grad_norm": 0.053867741991867404,
"learning_rate": 0.00026044499152533707,
"loss": 2.4296,
"step": 24955
},
{
"epoch": 7.8838347942825555,
"grad_norm": 0.05339463529783473,
"learning_rate": 0.0002600739272609154,
"loss": 2.3963,
"step": 24960
},
{
"epoch": 7.885414198847035,
"grad_norm": 0.04730043850167995,
"learning_rate": 0.0002597030880032848,
"loss": 2.4159,
"step": 24965
},
{
"epoch": 7.886993603411514,
"grad_norm": 0.04781844530164919,
"learning_rate": 0.00025933247386521506,
"loss": 2.4254,
"step": 24970
},
{
"epoch": 7.888573007975993,
"grad_norm": 0.04725599956693539,
"learning_rate": 0.000258962084959408,
"loss": 2.3778,
"step": 24975
},
{
"epoch": 7.8901524125404725,
"grad_norm": 0.04833537867243059,
"learning_rate": 0.0002585919213984963,
"loss": 2.4309,
"step": 24980
},
{
"epoch": 7.891731817104952,
"grad_norm": 0.05215626198944377,
"learning_rate": 0.00025822198329504407,
"loss": 2.347,
"step": 24985
},
{
"epoch": 7.893311221669431,
"grad_norm": 0.06425961455609468,
"learning_rate": 0.0002578522707615476,
"loss": 2.4576,
"step": 24990
},
{
"epoch": 7.894890626233909,
"grad_norm": 0.04813500797824529,
"learning_rate": 0.0002574827839104339,
"loss": 2.4278,
"step": 24995
},
{
"epoch": 7.896470030798389,
"grad_norm": 0.08809223004983643,
"learning_rate": 0.00025711352285406154,
"loss": 2.3338,
"step": 25000
},
{
"epoch": 7.898049435362868,
"grad_norm": 0.05086475811689399,
"learning_rate": 0.00025674448770472046,
"loss": 2.4051,
"step": 25005
},
{
"epoch": 7.899628839927347,
"grad_norm": 0.05044024640626117,
"learning_rate": 0.00025637567857463153,
"loss": 2.4012,
"step": 25010
},
{
"epoch": 7.901208244491826,
"grad_norm": 0.06111290308917735,
"learning_rate": 0.0002560070955759479,
"loss": 2.4507,
"step": 25015
},
{
"epoch": 7.902787649056306,
"grad_norm": 0.04877300072310503,
"learning_rate": 0.00025563873882075304,
"loss": 2.4372,
"step": 25020
},
{
"epoch": 7.904367053620785,
"grad_norm": 0.05316787985740273,
"learning_rate": 0.0002552706084210615,
"loss": 2.4281,
"step": 25025
},
{
"epoch": 7.905946458185264,
"grad_norm": 0.05566724684639508,
"learning_rate": 0.00025490270448882014,
"loss": 2.3855,
"step": 25030
},
{
"epoch": 7.907525862749743,
"grad_norm": 0.04554384966620676,
"learning_rate": 0.00025453502713590546,
"loss": 2.3956,
"step": 25035
},
{
"epoch": 7.909105267314223,
"grad_norm": 0.052073954247616266,
"learning_rate": 0.0002541675764741264,
"loss": 2.2434,
"step": 25040
},
{
"epoch": 7.910684671878702,
"grad_norm": 0.05025299714211612,
"learning_rate": 0.00025380035261522206,
"loss": 2.3467,
"step": 25045
},
{
"epoch": 7.912264076443181,
"grad_norm": 0.04376413793424298,
"learning_rate": 0.0002534333556708628,
"loss": 2.5498,
"step": 25050
},
{
"epoch": 7.91384348100766,
"grad_norm": 0.05257378214001336,
"learning_rate": 0.0002530665857526503,
"loss": 2.3848,
"step": 25055
},
{
"epoch": 7.91542288557214,
"grad_norm": 0.050520794839413734,
"learning_rate": 0.00025270004297211633,
"loss": 2.5111,
"step": 25060
},
{
"epoch": 7.917002290136619,
"grad_norm": 0.05498350326076211,
"learning_rate": 0.00025233372744072505,
"loss": 2.419,
"step": 25065
},
{
"epoch": 7.918581694701098,
"grad_norm": 0.05073618067020773,
"learning_rate": 0.0002519676392698703,
"loss": 2.3899,
"step": 25070
},
{
"epoch": 7.920161099265577,
"grad_norm": 0.06477949618644933,
"learning_rate": 0.0002516017785708767,
"loss": 2.5233,
"step": 25075
},
{
"epoch": 7.921740503830057,
"grad_norm": 0.04964483217453487,
"learning_rate": 0.0002512361454550011,
"loss": 2.3623,
"step": 25080
},
{
"epoch": 7.923319908394535,
"grad_norm": 0.05638298112505006,
"learning_rate": 0.0002508707400334296,
"loss": 2.4291,
"step": 25085
},
{
"epoch": 7.924899312959014,
"grad_norm": 0.04650636475469773,
"learning_rate": 0.0002505055624172796,
"loss": 2.3508,
"step": 25090
},
{
"epoch": 7.926478717523493,
"grad_norm": 0.04639131336961897,
"learning_rate": 0.00025014061271759957,
"loss": 2.3451,
"step": 25095
},
{
"epoch": 7.928058122087973,
"grad_norm": 0.05580224658566145,
"learning_rate": 0.0002497758910453679,
"loss": 2.3823,
"step": 25100
},
{
"epoch": 7.929637526652452,
"grad_norm": 0.049781739134590074,
"learning_rate": 0.00024941139751149464,
"loss": 2.3503,
"step": 25105
},
{
"epoch": 7.931216931216931,
"grad_norm": 0.044625547079489805,
"learning_rate": 0.00024904713222681995,
"loss": 2.3956,
"step": 25110
},
{
"epoch": 7.93279633578141,
"grad_norm": 0.05238646039736108,
"learning_rate": 0.000248683095302114,
"loss": 2.4294,
"step": 25115
},
{
"epoch": 7.93437574034589,
"grad_norm": 0.044664781114429127,
"learning_rate": 0.0002483192868480787,
"loss": 2.3869,
"step": 25120
},
{
"epoch": 7.935955144910369,
"grad_norm": 0.04908312164203827,
"learning_rate": 0.0002479557069753454,
"loss": 2.3733,
"step": 25125
},
{
"epoch": 7.937534549474848,
"grad_norm": 0.052091218263155596,
"learning_rate": 0.0002475923557944769,
"loss": 2.4041,
"step": 25130
},
{
"epoch": 7.939113954039327,
"grad_norm": 0.04951036878663111,
"learning_rate": 0.0002472292334159658,
"loss": 2.2851,
"step": 25135
},
{
"epoch": 7.940693358603807,
"grad_norm": 0.04310144923803069,
"learning_rate": 0.0002468663399502352,
"loss": 2.3429,
"step": 25140
},
{
"epoch": 7.942272763168286,
"grad_norm": 0.0797289602218377,
"learning_rate": 0.0002465036755076387,
"loss": 2.4893,
"step": 25145
},
{
"epoch": 7.943852167732764,
"grad_norm": 0.052700644932645854,
"learning_rate": 0.0002461412401984601,
"loss": 2.3395,
"step": 25150
},
{
"epoch": 7.9454315722972435,
"grad_norm": 0.051567074343615994,
"learning_rate": 0.000245779034132914,
"loss": 2.3475,
"step": 25155
},
{
"epoch": 7.947010976861723,
"grad_norm": 0.04728199495954336,
"learning_rate": 0.0002454170574211448,
"loss": 2.4107,
"step": 25160
},
{
"epoch": 7.948590381426202,
"grad_norm": 0.04754404914104074,
"learning_rate": 0.00024505531017322705,
"loss": 2.3486,
"step": 25165
},
{
"epoch": 7.950169785990681,
"grad_norm": 0.05207347438832702,
"learning_rate": 0.0002446937924991661,
"loss": 2.3652,
"step": 25170
},
{
"epoch": 7.9517491905551605,
"grad_norm": 0.05252867657084638,
"learning_rate": 0.0002443325045088972,
"loss": 2.425,
"step": 25175
},
{
"epoch": 7.95332859511964,
"grad_norm": 0.05328886315468783,
"learning_rate": 0.00024397144631228552,
"loss": 2.4787,
"step": 25180
},
{
"epoch": 7.954907999684119,
"grad_norm": 0.06955063626861216,
"learning_rate": 0.00024361061801912666,
"loss": 2.5098,
"step": 25185
},
{
"epoch": 7.956487404248598,
"grad_norm": 0.050174054653189894,
"learning_rate": 0.00024325001973914584,
"loss": 2.4256,
"step": 25190
},
{
"epoch": 7.9580668088130775,
"grad_norm": 0.061649303236535384,
"learning_rate": 0.00024288965158199939,
"loss": 2.3466,
"step": 25195
},
{
"epoch": 7.959646213377557,
"grad_norm": 0.058243615639442824,
"learning_rate": 0.00024252951365727216,
"loss": 2.4067,
"step": 25200
},
{
"epoch": 7.961225617942036,
"grad_norm": 0.0539283275757395,
"learning_rate": 0.00024216960607448057,
"loss": 2.4036,
"step": 25205
},
{
"epoch": 7.962805022506515,
"grad_norm": 0.05536339290923945,
"learning_rate": 0.00024180992894306985,
"loss": 2.4155,
"step": 25210
},
{
"epoch": 7.9643844270709945,
"grad_norm": 0.047689861652926116,
"learning_rate": 0.0002414504823724153,
"loss": 2.44,
"step": 25215
},
{
"epoch": 7.965963831635474,
"grad_norm": 0.04345815695402367,
"learning_rate": 0.00024109126647182277,
"loss": 2.3307,
"step": 25220
},
{
"epoch": 7.967543236199953,
"grad_norm": 0.04900328132371538,
"learning_rate": 0.00024073228135052728,
"loss": 2.4016,
"step": 25225
},
{
"epoch": 7.969122640764432,
"grad_norm": 0.05385073368305318,
"learning_rate": 0.000240373527117694,
"loss": 2.4529,
"step": 25230
},
{
"epoch": 7.9707020453289115,
"grad_norm": 0.0456740677467157,
"learning_rate": 0.00024001500388241771,
"loss": 2.3638,
"step": 25235
},
{
"epoch": 7.97228144989339,
"grad_norm": 0.04980594664486062,
"learning_rate": 0.00023965671175372273,
"loss": 2.3583,
"step": 25240
},
{
"epoch": 7.973860854457869,
"grad_norm": 0.05093312227061897,
"learning_rate": 0.00023929865084056413,
"loss": 2.4135,
"step": 25245
},
{
"epoch": 7.975440259022348,
"grad_norm": 0.04249090971671767,
"learning_rate": 0.00023894082125182548,
"loss": 2.302,
"step": 25250
},
{
"epoch": 7.977019663586828,
"grad_norm": 0.05758831539328352,
"learning_rate": 0.0002385832230963203,
"loss": 2.4207,
"step": 25255
},
{
"epoch": 7.978599068151307,
"grad_norm": 0.051999097443805445,
"learning_rate": 0.00023822585648279238,
"loss": 2.3582,
"step": 25260
},
{
"epoch": 7.980178472715786,
"grad_norm": 0.048981107909608225,
"learning_rate": 0.00023786872151991434,
"loss": 2.3321,
"step": 25265
},
{
"epoch": 7.981757877280265,
"grad_norm": 0.06360018661785359,
"learning_rate": 0.00023751181831628887,
"loss": 2.4279,
"step": 25270
},
{
"epoch": 7.983337281844745,
"grad_norm": 0.051195794290684445,
"learning_rate": 0.0002371551469804476,
"loss": 2.3816,
"step": 25275
},
{
"epoch": 7.984916686409224,
"grad_norm": 0.04329785290808799,
"learning_rate": 0.00023679870762085197,
"loss": 2.4835,
"step": 25280
},
{
"epoch": 7.986496090973703,
"grad_norm": 0.05308426307057489,
"learning_rate": 0.00023644250034589342,
"loss": 2.3839,
"step": 25285
},
{
"epoch": 7.988075495538182,
"grad_norm": 0.051379493670288746,
"learning_rate": 0.00023608652526389175,
"loss": 2.3268,
"step": 25290
},
{
"epoch": 7.9896549001026615,
"grad_norm": 0.05996063664513099,
"learning_rate": 0.00023573078248309722,
"loss": 2.3677,
"step": 25295
},
{
"epoch": 7.991234304667141,
"grad_norm": 0.05129741989969305,
"learning_rate": 0.00023537527211168875,
"loss": 2.3758,
"step": 25300
},
{
"epoch": 7.99281370923162,
"grad_norm": 0.055209849143472586,
"learning_rate": 0.0002350199942577743,
"loss": 2.4039,
"step": 25305
},
{
"epoch": 7.994393113796098,
"grad_norm": 0.05022509106975794,
"learning_rate": 0.00023466494902939239,
"loss": 2.47,
"step": 25310
},
{
"epoch": 7.995972518360578,
"grad_norm": 0.05338175583422317,
"learning_rate": 0.0002343101365345095,
"loss": 2.3362,
"step": 25315
},
{
"epoch": 7.997551922925057,
"grad_norm": 0.049694422459919,
"learning_rate": 0.0002339555568810221,
"loss": 2.4244,
"step": 25320
},
{
"epoch": 7.999131327489536,
"grad_norm": 0.05052937977693835,
"learning_rate": 0.0002336012101767554,
"loss": 2.3527,
"step": 25325
},
{
"epoch": 8.0,
"eval_loss": 2.408498525619507,
"eval_runtime": 118.6136,
"eval_samples_per_second": 22.333,
"eval_steps_per_second": 5.59,
"step": 25328
},
{
"epoch": 8.000631761825792,
"grad_norm": 0.05432150860401315,
"learning_rate": 0.00023324709652946374,
"loss": 2.4273,
"step": 25330
},
{
"epoch": 8.002211166390271,
"grad_norm": 0.06070758772782719,
"learning_rate": 0.00023289321604683133,
"loss": 2.4141,
"step": 25335
},
{
"epoch": 8.00379057095475,
"grad_norm": 0.053364434963253105,
"learning_rate": 0.00023253956883647088,
"loss": 2.3626,
"step": 25340
},
{
"epoch": 8.00536997551923,
"grad_norm": 0.04694053188876907,
"learning_rate": 0.00023218615500592376,
"loss": 2.3654,
"step": 25345
},
{
"epoch": 8.006949380083709,
"grad_norm": 0.05142106576940761,
"learning_rate": 0.0002318329746626614,
"loss": 2.41,
"step": 25350
},
{
"epoch": 8.008528784648188,
"grad_norm": 0.058656523389992464,
"learning_rate": 0.00023148002791408361,
"loss": 2.391,
"step": 25355
},
{
"epoch": 8.010108189212668,
"grad_norm": 0.05123322096394153,
"learning_rate": 0.00023112731486751905,
"loss": 2.361,
"step": 25360
},
{
"epoch": 8.011687593777147,
"grad_norm": 0.06560000338501384,
"learning_rate": 0.0002307748356302256,
"loss": 2.3499,
"step": 25365
},
{
"epoch": 8.013266998341626,
"grad_norm": 0.05016785247315961,
"learning_rate": 0.00023042259030938962,
"loss": 2.4791,
"step": 25370
},
{
"epoch": 8.014846402906105,
"grad_norm": 0.04777312402978052,
"learning_rate": 0.00023007057901212725,
"loss": 2.3446,
"step": 25375
},
{
"epoch": 8.016425807470583,
"grad_norm": 0.05049010453414375,
"learning_rate": 0.00022971880184548233,
"loss": 2.4549,
"step": 25380
},
{
"epoch": 8.018005212035062,
"grad_norm": 0.055384014176131036,
"learning_rate": 0.00022936725891642862,
"loss": 2.3611,
"step": 25385
},
{
"epoch": 8.019584616599541,
"grad_norm": 0.055221474558915615,
"learning_rate": 0.00022901595033186762,
"loss": 2.4506,
"step": 25390
},
{
"epoch": 8.02116402116402,
"grad_norm": 0.0463917278982015,
"learning_rate": 0.00022866487619862996,
"loss": 2.3758,
"step": 25395
},
{
"epoch": 8.0227434257285,
"grad_norm": 0.05158858897446619,
"learning_rate": 0.0002283140366234756,
"loss": 2.4999,
"step": 25400
},
{
"epoch": 8.024322830292979,
"grad_norm": 0.04720736132104705,
"learning_rate": 0.0002279634317130922,
"loss": 2.3788,
"step": 25405
},
{
"epoch": 8.025902234857458,
"grad_norm": 0.04877113220273165,
"learning_rate": 0.00022761306157409656,
"loss": 2.4509,
"step": 25410
},
{
"epoch": 8.027481639421937,
"grad_norm": 0.04710731190705331,
"learning_rate": 0.00022726292631303403,
"loss": 2.4476,
"step": 25415
},
{
"epoch": 8.029061043986417,
"grad_norm": 0.049172626607559375,
"learning_rate": 0.00022691302603637808,
"loss": 2.3511,
"step": 25420
},
{
"epoch": 8.030640448550896,
"grad_norm": 0.06282722754478427,
"learning_rate": 0.00022656336085053187,
"loss": 2.4699,
"step": 25425
},
{
"epoch": 8.032219853115375,
"grad_norm": 0.04587951545285191,
"learning_rate": 0.00022621393086182595,
"loss": 2.3529,
"step": 25430
},
{
"epoch": 8.033799257679854,
"grad_norm": 0.05540063205346335,
"learning_rate": 0.00022586473617651958,
"loss": 2.4312,
"step": 25435
},
{
"epoch": 8.035378662244334,
"grad_norm": 0.05586037835539668,
"learning_rate": 0.00022551577690080104,
"loss": 2.3557,
"step": 25440
},
{
"epoch": 8.036958066808813,
"grad_norm": 0.04641694018306748,
"learning_rate": 0.00022516705314078644,
"loss": 2.3362,
"step": 25445
},
{
"epoch": 8.038537471373292,
"grad_norm": 0.0499358415566031,
"learning_rate": 0.00022481856500252052,
"loss": 2.3701,
"step": 25450
},
{
"epoch": 8.040116875937771,
"grad_norm": 0.06463046342707425,
"learning_rate": 0.00022447031259197615,
"loss": 2.3637,
"step": 25455
},
{
"epoch": 8.04169628050225,
"grad_norm": 0.05002064721781355,
"learning_rate": 0.00022412229601505453,
"loss": 2.3874,
"step": 25460
},
{
"epoch": 8.04327568506673,
"grad_norm": 0.05458736400973159,
"learning_rate": 0.00022377451537758565,
"loss": 2.3834,
"step": 25465
},
{
"epoch": 8.04485508963121,
"grad_norm": 0.050519490608758155,
"learning_rate": 0.00022342697078532692,
"loss": 2.4172,
"step": 25470
},
{
"epoch": 8.046434494195688,
"grad_norm": 0.053622718118899106,
"learning_rate": 0.00022307966234396504,
"loss": 2.3652,
"step": 25475
},
{
"epoch": 8.048013898760168,
"grad_norm": 0.06334614813487868,
"learning_rate": 0.00022273259015911397,
"loss": 2.4569,
"step": 25480
},
{
"epoch": 8.049593303324647,
"grad_norm": 0.06045506352015708,
"learning_rate": 0.00022238575433631582,
"loss": 2.4382,
"step": 25485
},
{
"epoch": 8.051172707889126,
"grad_norm": 0.044523799208584534,
"learning_rate": 0.00022203915498104177,
"loss": 2.386,
"step": 25490
},
{
"epoch": 8.052752112453605,
"grad_norm": 0.043731210286509156,
"learning_rate": 0.0002216927921986901,
"loss": 2.3943,
"step": 25495
},
{
"epoch": 8.054331517018085,
"grad_norm": 0.049095155104685415,
"learning_rate": 0.00022134666609458764,
"loss": 2.3977,
"step": 25500
},
{
"epoch": 8.055910921582564,
"grad_norm": 0.047218796790150126,
"learning_rate": 0.000221000776773989,
"loss": 2.3586,
"step": 25505
},
{
"epoch": 8.057490326147043,
"grad_norm": 0.04712898469271991,
"learning_rate": 0.0002206551243420768,
"loss": 2.4194,
"step": 25510
},
{
"epoch": 8.059069730711522,
"grad_norm": 0.05288130067659473,
"learning_rate": 0.00022030970890396206,
"loss": 2.467,
"step": 25515
},
{
"epoch": 8.060649135276002,
"grad_norm": 0.0467928302266336,
"learning_rate": 0.00021996453056468313,
"loss": 2.4048,
"step": 25520
},
{
"epoch": 8.062228539840481,
"grad_norm": 0.04743439795377022,
"learning_rate": 0.00021961958942920678,
"loss": 2.4559,
"step": 25525
},
{
"epoch": 8.06380794440496,
"grad_norm": 0.05871105138921706,
"learning_rate": 0.00021927488560242748,
"loss": 2.3977,
"step": 25530
},
{
"epoch": 8.065387348969438,
"grad_norm": 0.04618683270318365,
"learning_rate": 0.00021893041918916712,
"loss": 2.3794,
"step": 25535
},
{
"epoch": 8.066966753533917,
"grad_norm": 0.060264806794645034,
"learning_rate": 0.00021858619029417603,
"loss": 2.5053,
"step": 25540
},
{
"epoch": 8.068546158098396,
"grad_norm": 0.05920399847235902,
"learning_rate": 0.00021824219902213184,
"loss": 2.3823,
"step": 25545
},
{
"epoch": 8.070125562662875,
"grad_norm": 0.05817741487690848,
"learning_rate": 0.0002178984454776398,
"loss": 2.4137,
"step": 25550
},
{
"epoch": 8.071704967227355,
"grad_norm": 0.058443901562272596,
"learning_rate": 0.00021755492976523384,
"loss": 2.4134,
"step": 25555
},
{
"epoch": 8.073284371791834,
"grad_norm": 0.06539243429105443,
"learning_rate": 0.0002172116519893742,
"loss": 2.3694,
"step": 25560
},
{
"epoch": 8.074863776356313,
"grad_norm": 0.056757002916980895,
"learning_rate": 0.0002168686122544502,
"loss": 2.3464,
"step": 25565
},
{
"epoch": 8.076443180920792,
"grad_norm": 0.048507458667812024,
"learning_rate": 0.00021652581066477762,
"loss": 2.4113,
"step": 25570
},
{
"epoch": 8.078022585485272,
"grad_norm": 0.04977853610079591,
"learning_rate": 0.00021618324732459993,
"loss": 2.3516,
"step": 25575
},
{
"epoch": 8.07960199004975,
"grad_norm": 0.05706451027447465,
"learning_rate": 0.00021584092233808906,
"loss": 2.4199,
"step": 25580
},
{
"epoch": 8.08118139461423,
"grad_norm": 0.04735352241412206,
"learning_rate": 0.0002154988358093437,
"loss": 2.423,
"step": 25585
},
{
"epoch": 8.08276079917871,
"grad_norm": 0.04404759919677923,
"learning_rate": 0.0002151569878423899,
"loss": 2.3359,
"step": 25590
},
{
"epoch": 8.084340203743189,
"grad_norm": 0.05186351457297536,
"learning_rate": 0.00021481537854118173,
"loss": 2.412,
"step": 25595
},
{
"epoch": 8.085919608307668,
"grad_norm": 0.04819742089184298,
"learning_rate": 0.00021447400800959993,
"loss": 2.4001,
"step": 25600
},
{
"epoch": 8.087499012872147,
"grad_norm": 0.05038219497467566,
"learning_rate": 0.00021413287635145363,
"loss": 2.444,
"step": 25605
},
{
"epoch": 8.089078417436626,
"grad_norm": 0.05904454470900991,
"learning_rate": 0.00021379198367047836,
"loss": 2.3723,
"step": 25610
},
{
"epoch": 8.090657822001106,
"grad_norm": 0.04749538015256275,
"learning_rate": 0.0002134513300703379,
"loss": 2.5017,
"step": 25615
},
{
"epoch": 8.092237226565585,
"grad_norm": 0.050329947502558675,
"learning_rate": 0.0002131109156546226,
"loss": 2.4175,
"step": 25620
},
{
"epoch": 8.093816631130064,
"grad_norm": 0.043354768292465345,
"learning_rate": 0.0002127707405268503,
"loss": 2.3622,
"step": 25625
},
{
"epoch": 8.095396035694543,
"grad_norm": 0.05339115726289078,
"learning_rate": 0.00021243080479046606,
"loss": 2.3798,
"step": 25630
},
{
"epoch": 8.096975440259023,
"grad_norm": 0.04280629673605756,
"learning_rate": 0.00021209110854884184,
"loss": 2.4827,
"step": 25635
},
{
"epoch": 8.098554844823502,
"grad_norm": 0.046150900293078764,
"learning_rate": 0.00021175165190527768,
"loss": 2.4778,
"step": 25640
},
{
"epoch": 8.100134249387981,
"grad_norm": 0.04435862477833089,
"learning_rate": 0.000211412434963,
"loss": 2.4524,
"step": 25645
},
{
"epoch": 8.10171365395246,
"grad_norm": 0.06701759717447257,
"learning_rate": 0.00021107345782516208,
"loss": 2.4495,
"step": 25650
},
{
"epoch": 8.10329305851694,
"grad_norm": 0.05576458550765147,
"learning_rate": 0.00021073472059484534,
"loss": 2.4342,
"step": 25655
},
{
"epoch": 8.104872463081419,
"grad_norm": 0.053718621623195476,
"learning_rate": 0.00021039622337505726,
"loss": 2.4701,
"step": 25660
},
{
"epoch": 8.106451867645898,
"grad_norm": 0.04767905473302163,
"learning_rate": 0.00021005796626873252,
"loss": 2.4262,
"step": 25665
},
{
"epoch": 8.108031272210377,
"grad_norm": 0.05318583148470195,
"learning_rate": 0.0002097199493787334,
"loss": 2.4528,
"step": 25670
},
{
"epoch": 8.109610676774857,
"grad_norm": 0.05349608089719971,
"learning_rate": 0.00020938217280784844,
"loss": 2.4751,
"step": 25675
},
{
"epoch": 8.111190081339336,
"grad_norm": 0.052268340128226724,
"learning_rate": 0.00020904463665879337,
"loss": 2.4085,
"step": 25680
},
{
"epoch": 8.112769485903815,
"grad_norm": 0.04659946604710442,
"learning_rate": 0.00020870734103421075,
"loss": 2.4501,
"step": 25685
},
{
"epoch": 8.114348890468294,
"grad_norm": 0.04613703886202508,
"learning_rate": 0.00020837028603666962,
"loss": 2.3732,
"step": 25690
},
{
"epoch": 8.115928295032772,
"grad_norm": 0.04325557682887605,
"learning_rate": 0.00020803347176866704,
"loss": 2.3152,
"step": 25695
},
{
"epoch": 8.117507699597251,
"grad_norm": 0.04412483063282996,
"learning_rate": 0.00020769689833262527,
"loss": 2.4872,
"step": 25700
},
{
"epoch": 8.11908710416173,
"grad_norm": 0.05533062549270656,
"learning_rate": 0.00020736056583089502,
"loss": 2.4282,
"step": 25705
},
{
"epoch": 8.12066650872621,
"grad_norm": 0.049017971929669675,
"learning_rate": 0.00020702447436575223,
"loss": 2.4769,
"step": 25710
},
{
"epoch": 8.122245913290689,
"grad_norm": 0.0457970432883676,
"learning_rate": 0.00020668862403940035,
"loss": 2.3733,
"step": 25715
},
{
"epoch": 8.123825317855168,
"grad_norm": 0.05341815120981854,
"learning_rate": 0.0002063530149539694,
"loss": 2.4267,
"step": 25720
},
{
"epoch": 8.125404722419647,
"grad_norm": 0.04123828694903196,
"learning_rate": 0.0002060176472115155,
"loss": 2.2966,
"step": 25725
},
{
"epoch": 8.126984126984127,
"grad_norm": 0.04893357789625497,
"learning_rate": 0.0002056825209140224,
"loss": 2.4087,
"step": 25730
},
{
"epoch": 8.128563531548606,
"grad_norm": 0.053472140914025544,
"learning_rate": 0.0002053476361633997,
"loss": 2.3427,
"step": 25735
},
{
"epoch": 8.130142936113085,
"grad_norm": 0.048952037330800856,
"learning_rate": 0.00020501299306148346,
"loss": 2.4165,
"step": 25740
},
{
"epoch": 8.131722340677564,
"grad_norm": 0.06182886325902322,
"learning_rate": 0.0002046785917100369,
"loss": 2.5146,
"step": 25745
},
{
"epoch": 8.133301745242044,
"grad_norm": 0.04157143444076313,
"learning_rate": 0.00020434443221074896,
"loss": 2.3853,
"step": 25750
},
{
"epoch": 8.134881149806523,
"grad_norm": 0.0698682218208614,
"learning_rate": 0.0002040105146652358,
"loss": 2.4076,
"step": 25755
},
{
"epoch": 8.136460554371002,
"grad_norm": 0.0451942558429116,
"learning_rate": 0.00020367683917503943,
"loss": 2.3913,
"step": 25760
},
{
"epoch": 8.138039958935481,
"grad_norm": 0.06147551527667643,
"learning_rate": 0.00020334340584162846,
"loss": 2.3917,
"step": 25765
},
{
"epoch": 8.13961936349996,
"grad_norm": 0.05748052007054688,
"learning_rate": 0.0002030102147663978,
"loss": 2.3703,
"step": 25770
},
{
"epoch": 8.14119876806444,
"grad_norm": 0.0501296109555413,
"learning_rate": 0.0002026772660506686,
"loss": 2.3558,
"step": 25775
},
{
"epoch": 8.142778172628919,
"grad_norm": 0.04664935132387147,
"learning_rate": 0.00020234455979568845,
"loss": 2.3723,
"step": 25780
},
{
"epoch": 8.144357577193398,
"grad_norm": 0.05187910422285325,
"learning_rate": 0.0002020120961026315,
"loss": 2.288,
"step": 25785
},
{
"epoch": 8.145936981757878,
"grad_norm": 0.05553071987652418,
"learning_rate": 0.00020167987507259733,
"loss": 2.4434,
"step": 25790
},
{
"epoch": 8.147516386322357,
"grad_norm": 0.04495139111319613,
"learning_rate": 0.0002013478968066128,
"loss": 2.4154,
"step": 25795
},
{
"epoch": 8.149095790886836,
"grad_norm": 0.052812069657348096,
"learning_rate": 0.00020101616140563017,
"loss": 2.4956,
"step": 25800
},
{
"epoch": 8.150675195451315,
"grad_norm": 0.04455981980348406,
"learning_rate": 0.00020068466897052805,
"loss": 2.4061,
"step": 25805
},
{
"epoch": 8.152254600015794,
"grad_norm": 0.044133623429162515,
"learning_rate": 0.00020035341960211107,
"loss": 2.3777,
"step": 25810
},
{
"epoch": 8.153834004580274,
"grad_norm": 0.052513258384821355,
"learning_rate": 0.00020002241340110982,
"loss": 2.3417,
"step": 25815
},
{
"epoch": 8.155413409144753,
"grad_norm": 0.05292469001695762,
"learning_rate": 0.00019969165046818184,
"loss": 2.3839,
"step": 25820
},
{
"epoch": 8.156992813709232,
"grad_norm": 0.0544564283114912,
"learning_rate": 0.00019936113090390952,
"loss": 2.4386,
"step": 25825
},
{
"epoch": 8.158572218273711,
"grad_norm": 0.053988393971699265,
"learning_rate": 0.00019903085480880167,
"loss": 2.4039,
"step": 25830
},
{
"epoch": 8.16015162283819,
"grad_norm": 0.057185268861558815,
"learning_rate": 0.00019870082228329357,
"loss": 2.3546,
"step": 25835
},
{
"epoch": 8.16173102740267,
"grad_norm": 0.04791400379030568,
"learning_rate": 0.00019837103342774544,
"loss": 2.4683,
"step": 25840
},
{
"epoch": 8.16331043196715,
"grad_norm": 0.05226965451943714,
"learning_rate": 0.00019804148834244462,
"loss": 2.4214,
"step": 25845
},
{
"epoch": 8.164889836531628,
"grad_norm": 0.04203611427973354,
"learning_rate": 0.0001977121871276034,
"loss": 2.4867,
"step": 25850
},
{
"epoch": 8.166469241096106,
"grad_norm": 0.04737734236925939,
"learning_rate": 0.00019738312988336004,
"loss": 2.4974,
"step": 25855
},
{
"epoch": 8.168048645660585,
"grad_norm": 0.044843187250530374,
"learning_rate": 0.0001970543167097789,
"loss": 2.3521,
"step": 25860
},
{
"epoch": 8.169628050225064,
"grad_norm": 0.0451266796780657,
"learning_rate": 0.00019672574770684948,
"loss": 2.4727,
"step": 25865
},
{
"epoch": 8.171207454789544,
"grad_norm": 0.04518404767506238,
"learning_rate": 0.00019639742297448837,
"loss": 2.4196,
"step": 25870
},
{
"epoch": 8.172786859354023,
"grad_norm": 0.049498519036167025,
"learning_rate": 0.0001960693426125364,
"loss": 2.4917,
"step": 25875
},
{
"epoch": 8.174366263918502,
"grad_norm": 0.05481636143947064,
"learning_rate": 0.00019574150672076074,
"loss": 2.3747,
"step": 25880
},
{
"epoch": 8.175945668482981,
"grad_norm": 0.048659749608224355,
"learning_rate": 0.00019541391539885456,
"loss": 2.3617,
"step": 25885
},
{
"epoch": 8.17752507304746,
"grad_norm": 0.04726710591130207,
"learning_rate": 0.00019508656874643604,
"loss": 2.3334,
"step": 25890
},
{
"epoch": 8.17910447761194,
"grad_norm": 0.060859310735491265,
"learning_rate": 0.00019475946686304925,
"loss": 2.3829,
"step": 25895
},
{
"epoch": 8.18068388217642,
"grad_norm": 0.05521704128431342,
"learning_rate": 0.0001944326098481638,
"loss": 2.3431,
"step": 25900
},
{
"epoch": 8.182263286740898,
"grad_norm": 0.05295496588973379,
"learning_rate": 0.00019410599780117445,
"loss": 2.3398,
"step": 25905
},
{
"epoch": 8.183842691305378,
"grad_norm": 0.05094092691924014,
"learning_rate": 0.00019377963082140248,
"loss": 2.4341,
"step": 25910
},
{
"epoch": 8.185422095869857,
"grad_norm": 0.05532205320005589,
"learning_rate": 0.00019345350900809366,
"loss": 2.4228,
"step": 25915
},
{
"epoch": 8.187001500434336,
"grad_norm": 0.05141666122768391,
"learning_rate": 0.00019312763246041932,
"loss": 2.4069,
"step": 25920
},
{
"epoch": 8.188580904998815,
"grad_norm": 0.050508472337427286,
"learning_rate": 0.00019280200127747704,
"loss": 2.3717,
"step": 25925
},
{
"epoch": 8.190160309563295,
"grad_norm": 0.046726584367368186,
"learning_rate": 0.00019247661555828844,
"loss": 2.3031,
"step": 25930
},
{
"epoch": 8.191739714127774,
"grad_norm": 0.056439828014490004,
"learning_rate": 0.000192151475401802,
"loss": 2.4581,
"step": 25935
},
{
"epoch": 8.193319118692253,
"grad_norm": 0.053176267180560934,
"learning_rate": 0.00019182658090689044,
"loss": 2.3222,
"step": 25940
},
{
"epoch": 8.194898523256732,
"grad_norm": 0.05059043390210798,
"learning_rate": 0.0001915019321723519,
"loss": 2.3839,
"step": 25945
},
{
"epoch": 8.196477927821212,
"grad_norm": 0.05642321753023156,
"learning_rate": 0.00019117752929691034,
"loss": 2.3712,
"step": 25950
},
{
"epoch": 8.198057332385691,
"grad_norm": 0.05491345197222136,
"learning_rate": 0.00019085337237921397,
"loss": 2.3938,
"step": 25955
},
{
"epoch": 8.19963673695017,
"grad_norm": 0.056572885344928915,
"learning_rate": 0.00019052946151783766,
"loss": 2.5006,
"step": 25960
},
{
"epoch": 8.20121614151465,
"grad_norm": 0.04636740498907759,
"learning_rate": 0.00019020579681128025,
"loss": 2.3715,
"step": 25965
},
{
"epoch": 8.202795546079129,
"grad_norm": 0.04934308888482995,
"learning_rate": 0.00018988237835796586,
"loss": 2.3747,
"step": 25970
},
{
"epoch": 8.204374950643608,
"grad_norm": 0.04696054508788191,
"learning_rate": 0.00018955920625624435,
"loss": 2.3561,
"step": 25975
},
{
"epoch": 8.205954355208087,
"grad_norm": 0.04097225071526321,
"learning_rate": 0.00018923628060439035,
"loss": 2.3523,
"step": 25980
},
{
"epoch": 8.207533759772566,
"grad_norm": 0.04850041083520627,
"learning_rate": 0.0001889136015006032,
"loss": 2.3739,
"step": 25985
},
{
"epoch": 8.209113164337046,
"grad_norm": 0.04378582333150282,
"learning_rate": 0.00018859116904300767,
"loss": 2.3547,
"step": 25990
},
{
"epoch": 8.210692568901525,
"grad_norm": 0.048930957978835087,
"learning_rate": 0.00018826898332965314,
"loss": 2.452,
"step": 25995
},
{
"epoch": 8.212271973466004,
"grad_norm": 0.048049063084902095,
"learning_rate": 0.00018794704445851475,
"loss": 2.3665,
"step": 26000
},
{
"epoch": 8.213851378030483,
"grad_norm": 0.06494982882472712,
"learning_rate": 0.0001876253525274918,
"loss": 2.3977,
"step": 26005
},
{
"epoch": 8.21543078259496,
"grad_norm": 0.04749173325529246,
"learning_rate": 0.00018730390763440851,
"loss": 2.457,
"step": 26010
},
{
"epoch": 8.21701018715944,
"grad_norm": 0.05242256228335583,
"learning_rate": 0.0001869827098770146,
"loss": 2.3897,
"step": 26015
},
{
"epoch": 8.21858959172392,
"grad_norm": 0.05671602284570124,
"learning_rate": 0.00018666175935298391,
"loss": 2.4221,
"step": 26020
},
{
"epoch": 8.220168996288399,
"grad_norm": 0.05300174129660964,
"learning_rate": 0.00018634105615991593,
"loss": 2.3662,
"step": 26025
},
{
"epoch": 8.221748400852878,
"grad_norm": 0.049436010634950366,
"learning_rate": 0.00018602060039533418,
"loss": 2.3879,
"step": 26030
},
{
"epoch": 8.223327805417357,
"grad_norm": 0.053023278772629315,
"learning_rate": 0.0001857003921566871,
"loss": 2.4243,
"step": 26035
},
{
"epoch": 8.224907209981836,
"grad_norm": 0.0468439537196731,
"learning_rate": 0.00018538043154134808,
"loss": 2.4004,
"step": 26040
},
{
"epoch": 8.226486614546316,
"grad_norm": 0.052684392610237045,
"learning_rate": 0.0001850607186466149,
"loss": 2.4019,
"step": 26045
},
{
"epoch": 8.228066019110795,
"grad_norm": 0.05042324780054584,
"learning_rate": 0.0001847412535697106,
"loss": 2.4142,
"step": 26050
},
{
"epoch": 8.229645423675274,
"grad_norm": 0.05305724449911546,
"learning_rate": 0.0001844220364077822,
"loss": 2.3344,
"step": 26055
},
{
"epoch": 8.231224828239753,
"grad_norm": 0.06049144424858691,
"learning_rate": 0.0001841030672579015,
"loss": 2.3801,
"step": 26060
},
{
"epoch": 8.232804232804233,
"grad_norm": 0.06369255413270646,
"learning_rate": 0.00018378434621706542,
"loss": 2.4561,
"step": 26065
},
{
"epoch": 8.234383637368712,
"grad_norm": 0.056604618448458786,
"learning_rate": 0.00018346587338219456,
"loss": 2.4035,
"step": 26070
},
{
"epoch": 8.235963041933191,
"grad_norm": 0.050889888783988424,
"learning_rate": 0.00018314764885013469,
"loss": 2.3944,
"step": 26075
},
{
"epoch": 8.23754244649767,
"grad_norm": 0.055367532026650776,
"learning_rate": 0.00018282967271765583,
"loss": 2.43,
"step": 26080
},
{
"epoch": 8.23912185106215,
"grad_norm": 0.05550179993088914,
"learning_rate": 0.0001825119450814522,
"loss": 2.4264,
"step": 26085
},
{
"epoch": 8.240701255626629,
"grad_norm": 0.06828840646708353,
"learning_rate": 0.00018219446603814316,
"loss": 2.3253,
"step": 26090
},
{
"epoch": 8.242280660191108,
"grad_norm": 0.0471356896770975,
"learning_rate": 0.00018187723568427173,
"loss": 2.2869,
"step": 26095
},
{
"epoch": 8.243860064755587,
"grad_norm": 0.043796165252091464,
"learning_rate": 0.00018156025411630595,
"loss": 2.4641,
"step": 26100
},
{
"epoch": 8.245439469320067,
"grad_norm": 0.04534146642314097,
"learning_rate": 0.00018124352143063783,
"loss": 2.3954,
"step": 26105
},
{
"epoch": 8.247018873884546,
"grad_norm": 0.0480378942889025,
"learning_rate": 0.00018092703772358342,
"loss": 2.3197,
"step": 26110
},
{
"epoch": 8.248598278449025,
"grad_norm": 0.054109348837150635,
"learning_rate": 0.00018061080309138378,
"loss": 2.4383,
"step": 26115
},
{
"epoch": 8.250177683013504,
"grad_norm": 0.057228962014287385,
"learning_rate": 0.00018029481763020384,
"loss": 2.356,
"step": 26120
},
{
"epoch": 8.251757087577984,
"grad_norm": 0.048818000998402444,
"learning_rate": 0.0001799790814361325,
"loss": 2.4628,
"step": 26125
},
{
"epoch": 8.253336492142463,
"grad_norm": 0.04608977632661902,
"learning_rate": 0.00017966359460518322,
"loss": 2.333,
"step": 26130
},
{
"epoch": 8.254915896706942,
"grad_norm": 0.052682672464735784,
"learning_rate": 0.00017934835723329345,
"loss": 2.4297,
"step": 26135
},
{
"epoch": 8.256495301271421,
"grad_norm": 0.04704706442557295,
"learning_rate": 0.00017903336941632508,
"loss": 2.4017,
"step": 26140
},
{
"epoch": 8.2580747058359,
"grad_norm": 0.048189627762573205,
"learning_rate": 0.00017871863125006382,
"loss": 2.3934,
"step": 26145
},
{
"epoch": 8.25965411040038,
"grad_norm": 0.06717330151488347,
"learning_rate": 0.00017840414283021923,
"loss": 2.3163,
"step": 26150
},
{
"epoch": 8.261233514964859,
"grad_norm": 0.05286030927555884,
"learning_rate": 0.00017808990425242566,
"loss": 2.4227,
"step": 26155
},
{
"epoch": 8.262812919529338,
"grad_norm": 0.0505713728802993,
"learning_rate": 0.00017777591561224094,
"loss": 2.4652,
"step": 26160
},
{
"epoch": 8.264392324093816,
"grad_norm": 0.05267430455837137,
"learning_rate": 0.000177462177005147,
"loss": 2.4026,
"step": 26165
},
{
"epoch": 8.265971728658295,
"grad_norm": 0.0979418781490772,
"learning_rate": 0.00017714868852654954,
"loss": 2.4923,
"step": 26170
},
{
"epoch": 8.267551133222774,
"grad_norm": 0.05397587881496513,
"learning_rate": 0.00017683545027177838,
"loss": 2.3589,
"step": 26175
},
{
"epoch": 8.269130537787253,
"grad_norm": 0.04991240018750876,
"learning_rate": 0.00017652246233608782,
"loss": 2.3441,
"step": 26180
},
{
"epoch": 8.270709942351733,
"grad_norm": 0.051440642685498895,
"learning_rate": 0.0001762097248146547,
"loss": 2.3838,
"step": 26185
},
{
"epoch": 8.272289346916212,
"grad_norm": 0.05483117147502908,
"learning_rate": 0.00017589723780258126,
"loss": 2.3475,
"step": 26190
},
{
"epoch": 8.273868751480691,
"grad_norm": 0.05169302339631226,
"learning_rate": 0.00017558500139489241,
"loss": 2.4143,
"step": 26195
},
{
"epoch": 8.27544815604517,
"grad_norm": 0.047668104522567596,
"learning_rate": 0.0001752730156865371,
"loss": 2.361,
"step": 26200
},
{
"epoch": 8.27702756060965,
"grad_norm": 0.04749377812354923,
"learning_rate": 0.00017496128077238872,
"loss": 2.3622,
"step": 26205
},
{
"epoch": 8.278606965174129,
"grad_norm": 0.04503358060922985,
"learning_rate": 0.00017464979674724335,
"loss": 2.3464,
"step": 26210
},
{
"epoch": 8.280186369738608,
"grad_norm": 0.04909581956112759,
"learning_rate": 0.00017433856370582156,
"loss": 2.4573,
"step": 26215
},
{
"epoch": 8.281765774303087,
"grad_norm": 0.04132899506214487,
"learning_rate": 0.00017402758174276734,
"loss": 2.4487,
"step": 26220
},
{
"epoch": 8.283345178867567,
"grad_norm": 0.047925626721803186,
"learning_rate": 0.0001737168509526479,
"loss": 2.416,
"step": 26225
},
{
"epoch": 8.284924583432046,
"grad_norm": 0.053581647556568454,
"learning_rate": 0.00017340637142995507,
"loss": 2.4093,
"step": 26230
},
{
"epoch": 8.286503987996525,
"grad_norm": 0.043515914265809134,
"learning_rate": 0.0001730961432691034,
"loss": 2.3549,
"step": 26235
},
{
"epoch": 8.288083392561004,
"grad_norm": 0.04861740291279733,
"learning_rate": 0.00017278616656443113,
"loss": 2.3499,
"step": 26240
},
{
"epoch": 8.289662797125484,
"grad_norm": 0.054873344786989145,
"learning_rate": 0.0001724764414102007,
"loss": 2.4345,
"step": 26245
},
{
"epoch": 8.291242201689963,
"grad_norm": 0.050911740028379296,
"learning_rate": 0.00017216696790059717,
"loss": 2.4058,
"step": 26250
},
{
"epoch": 8.292821606254442,
"grad_norm": 0.049689682996031846,
"learning_rate": 0.00017185774612972948,
"loss": 2.3661,
"step": 26255
},
{
"epoch": 8.294401010818921,
"grad_norm": 0.04654305378222502,
"learning_rate": 0.0001715487761916301,
"loss": 2.4523,
"step": 26260
},
{
"epoch": 8.2959804153834,
"grad_norm": 0.05244234404666711,
"learning_rate": 0.00017124005818025444,
"loss": 2.3811,
"step": 26265
},
{
"epoch": 8.29755981994788,
"grad_norm": 0.054568257048876516,
"learning_rate": 0.0001709315921894823,
"loss": 2.4299,
"step": 26270
},
{
"epoch": 8.29913922451236,
"grad_norm": 0.05397607350166474,
"learning_rate": 0.0001706233783131157,
"loss": 2.3978,
"step": 26275
},
{
"epoch": 8.300718629076838,
"grad_norm": 0.04837043789766192,
"learning_rate": 0.00017031541664488093,
"loss": 2.3639,
"step": 26280
},
{
"epoch": 8.302298033641318,
"grad_norm": 0.043583664778450076,
"learning_rate": 0.00017000770727842695,
"loss": 2.3316,
"step": 26285
},
{
"epoch": 8.303877438205797,
"grad_norm": 0.04609596817627926,
"learning_rate": 0.00016970025030732606,
"loss": 2.2548,
"step": 26290
},
{
"epoch": 8.305456842770276,
"grad_norm": 0.04325355512380385,
"learning_rate": 0.0001693930458250742,
"loss": 2.4832,
"step": 26295
},
{
"epoch": 8.307036247334755,
"grad_norm": 0.044709319564823075,
"learning_rate": 0.00016908609392509032,
"loss": 2.4069,
"step": 26300
},
{
"epoch": 8.308615651899235,
"grad_norm": 0.049610329987342446,
"learning_rate": 0.00016877939470071645,
"loss": 2.5391,
"step": 26305
},
{
"epoch": 8.310195056463714,
"grad_norm": 0.06240617850116144,
"learning_rate": 0.00016847294824521775,
"loss": 2.3962,
"step": 26310
},
{
"epoch": 8.311774461028193,
"grad_norm": 0.05002596125229355,
"learning_rate": 0.00016816675465178255,
"loss": 2.4464,
"step": 26315
},
{
"epoch": 8.313353865592672,
"grad_norm": 0.052546192058887144,
"learning_rate": 0.00016786081401352272,
"loss": 2.3425,
"step": 26320
},
{
"epoch": 8.314933270157152,
"grad_norm": 0.04967183298894618,
"learning_rate": 0.00016755512642347258,
"loss": 2.4558,
"step": 26325
},
{
"epoch": 8.31651267472163,
"grad_norm": 0.04421521494190073,
"learning_rate": 0.00016724969197458973,
"loss": 2.4106,
"step": 26330
},
{
"epoch": 8.318092079286108,
"grad_norm": 0.04687791030191425,
"learning_rate": 0.00016694451075975524,
"loss": 2.4344,
"step": 26335
},
{
"epoch": 8.319671483850588,
"grad_norm": 0.049095717835493755,
"learning_rate": 0.0001666395828717724,
"loss": 2.464,
"step": 26340
},
{
"epoch": 8.321250888415067,
"grad_norm": 0.04378104564721021,
"learning_rate": 0.00016633490840336796,
"loss": 2.3461,
"step": 26345
},
{
"epoch": 8.322830292979546,
"grad_norm": 0.047078014371276226,
"learning_rate": 0.0001660304874471914,
"loss": 2.4314,
"step": 26350
},
{
"epoch": 8.324409697544025,
"grad_norm": 0.0518549059272024,
"learning_rate": 0.000165726320095815,
"loss": 2.4218,
"step": 26355
},
{
"epoch": 8.325989102108505,
"grad_norm": 0.04456247783804972,
"learning_rate": 0.00016542240644173468,
"loss": 2.4152,
"step": 26360
},
{
"epoch": 8.327568506672984,
"grad_norm": 0.0490036060868725,
"learning_rate": 0.00016511874657736792,
"loss": 2.4491,
"step": 26365
},
{
"epoch": 8.329147911237463,
"grad_norm": 0.049685845519469106,
"learning_rate": 0.00016481534059505643,
"loss": 2.314,
"step": 26370
},
{
"epoch": 8.330727315801942,
"grad_norm": 0.0486307703124003,
"learning_rate": 0.00016451218858706373,
"loss": 2.5204,
"step": 26375
},
{
"epoch": 8.332306720366422,
"grad_norm": 0.05557163707955135,
"learning_rate": 0.00016420929064557611,
"loss": 2.4001,
"step": 26380
},
{
"epoch": 8.3338861249309,
"grad_norm": 0.04972702754574487,
"learning_rate": 0.0001639066468627034,
"loss": 2.2729,
"step": 26385
},
{
"epoch": 8.33546552949538,
"grad_norm": 0.046247847588770534,
"learning_rate": 0.00016360425733047757,
"loss": 2.2791,
"step": 26390
},
{
"epoch": 8.33704493405986,
"grad_norm": 0.0545684220382701,
"learning_rate": 0.00016330212214085306,
"loss": 2.3244,
"step": 26395
},
{
"epoch": 8.338624338624339,
"grad_norm": 0.0462319555311167,
"learning_rate": 0.00016300024138570746,
"loss": 2.4117,
"step": 26400
},
{
"epoch": 8.340203743188818,
"grad_norm": 0.04850344264398392,
"learning_rate": 0.00016269861515684047,
"loss": 2.3817,
"step": 26405
},
{
"epoch": 8.341783147753297,
"grad_norm": 0.05035687863061325,
"learning_rate": 0.00016239724354597519,
"loss": 2.3797,
"step": 26410
},
{
"epoch": 8.343362552317776,
"grad_norm": 0.05707521474873975,
"learning_rate": 0.00016209612664475637,
"loss": 2.461,
"step": 26415
},
{
"epoch": 8.344941956882256,
"grad_norm": 0.04699302116060703,
"learning_rate": 0.00016179526454475202,
"loss": 2.4187,
"step": 26420
},
{
"epoch": 8.346521361446735,
"grad_norm": 0.053523996291635755,
"learning_rate": 0.00016149465733745238,
"loss": 2.3255,
"step": 26425
},
{
"epoch": 8.348100766011214,
"grad_norm": 0.05194690202934393,
"learning_rate": 0.00016119430511427014,
"loss": 2.4033,
"step": 26430
},
{
"epoch": 8.349680170575693,
"grad_norm": 0.054657962419011275,
"learning_rate": 0.0001608942079665403,
"loss": 2.4114,
"step": 26435
},
{
"epoch": 8.351259575140173,
"grad_norm": 0.055225413042288445,
"learning_rate": 0.00016059436598552069,
"loss": 2.3767,
"step": 26440
},
{
"epoch": 8.352838979704652,
"grad_norm": 0.05268677918775418,
"learning_rate": 0.000160294779262391,
"loss": 2.3768,
"step": 26445
},
{
"epoch": 8.354418384269131,
"grad_norm": 0.05632519112731206,
"learning_rate": 0.00015999544788825425,
"loss": 2.413,
"step": 26450
},
{
"epoch": 8.35599778883361,
"grad_norm": 0.049034219555129196,
"learning_rate": 0.00015969637195413456,
"loss": 2.3094,
"step": 26455
},
{
"epoch": 8.35757719339809,
"grad_norm": 0.05535212452768547,
"learning_rate": 0.00015939755155097945,
"loss": 2.3666,
"step": 26460
},
{
"epoch": 8.359156597962569,
"grad_norm": 0.04734654037385947,
"learning_rate": 0.0001590989867696583,
"loss": 2.3512,
"step": 26465
},
{
"epoch": 8.360736002527048,
"grad_norm": 0.0435967203440791,
"learning_rate": 0.00015880067770096228,
"loss": 2.3269,
"step": 26470
},
{
"epoch": 8.362315407091527,
"grad_norm": 0.04425811481071514,
"learning_rate": 0.00015850262443560593,
"loss": 2.3649,
"step": 26475
},
{
"epoch": 8.363894811656007,
"grad_norm": 0.04438345363301554,
"learning_rate": 0.0001582048270642249,
"loss": 2.357,
"step": 26480
},
{
"epoch": 8.365474216220484,
"grad_norm": 0.048318944753731725,
"learning_rate": 0.00015790728567737766,
"loss": 2.3863,
"step": 26485
},
{
"epoch": 8.367053620784963,
"grad_norm": 0.05420293787173252,
"learning_rate": 0.0001576100003655445,
"loss": 2.3693,
"step": 26490
},
{
"epoch": 8.368633025349443,
"grad_norm": 0.04559022718920658,
"learning_rate": 0.0001573129712191279,
"loss": 2.3431,
"step": 26495
},
{
"epoch": 8.370212429913922,
"grad_norm": 0.04762563613401282,
"learning_rate": 0.0001570161983284528,
"loss": 2.3648,
"step": 26500
},
{
"epoch": 8.371791834478401,
"grad_norm": 0.05777111844522066,
"learning_rate": 0.00015671968178376572,
"loss": 2.3841,
"step": 26505
},
{
"epoch": 8.37337123904288,
"grad_norm": 0.049095515126832724,
"learning_rate": 0.0001564234216752357,
"loss": 2.4302,
"step": 26510
},
{
"epoch": 8.37495064360736,
"grad_norm": 0.04579683805403143,
"learning_rate": 0.0001561274180929534,
"loss": 2.3392,
"step": 26515
},
{
"epoch": 8.376530048171839,
"grad_norm": 0.048248477670352924,
"learning_rate": 0.00015583167112693153,
"loss": 2.4126,
"step": 26520
},
{
"epoch": 8.378109452736318,
"grad_norm": 0.04552189737279042,
"learning_rate": 0.00015553618086710508,
"loss": 2.4136,
"step": 26525
},
{
"epoch": 8.379688857300797,
"grad_norm": 0.050940319294448574,
"learning_rate": 0.00015524094740333028,
"loss": 2.2568,
"step": 26530
},
{
"epoch": 8.381268261865277,
"grad_norm": 0.05291216628105431,
"learning_rate": 0.0001549459708253863,
"loss": 2.3679,
"step": 26535
},
{
"epoch": 8.382847666429756,
"grad_norm": 0.04821052566882871,
"learning_rate": 0.00015465125122297342,
"loss": 2.3571,
"step": 26540
},
{
"epoch": 8.384427070994235,
"grad_norm": 0.04638352697588534,
"learning_rate": 0.00015435678868571369,
"loss": 2.3476,
"step": 26545
},
{
"epoch": 8.386006475558714,
"grad_norm": 0.046550515327761256,
"learning_rate": 0.00015406258330315171,
"loss": 2.3701,
"step": 26550
},
{
"epoch": 8.387585880123194,
"grad_norm": 0.04523603637288149,
"learning_rate": 0.00015376863516475338,
"loss": 2.4565,
"step": 26555
},
{
"epoch": 8.389165284687673,
"grad_norm": 0.04638226636861543,
"learning_rate": 0.00015347494435990615,
"loss": 2.3883,
"step": 26560
},
{
"epoch": 8.390744689252152,
"grad_norm": 0.055369001795188055,
"learning_rate": 0.00015318151097791998,
"loss": 2.4628,
"step": 26565
},
{
"epoch": 8.392324093816631,
"grad_norm": 0.05640221039565341,
"learning_rate": 0.0001528883351080259,
"loss": 2.4476,
"step": 26570
},
{
"epoch": 8.39390349838111,
"grad_norm": 0.05197482558497645,
"learning_rate": 0.00015259541683937673,
"loss": 2.3666,
"step": 26575
},
{
"epoch": 8.39548290294559,
"grad_norm": 0.046953061317044895,
"learning_rate": 0.00015230275626104705,
"loss": 2.4272,
"step": 26580
},
{
"epoch": 8.397062307510069,
"grad_norm": 0.04310952477961555,
"learning_rate": 0.00015201035346203284,
"loss": 2.3836,
"step": 26585
},
{
"epoch": 8.398641712074548,
"grad_norm": 0.05835831448667237,
"learning_rate": 0.0001517182085312524,
"loss": 2.3522,
"step": 26590
},
{
"epoch": 8.400221116639027,
"grad_norm": 0.048975691541835814,
"learning_rate": 0.00015142632155754478,
"loss": 2.3551,
"step": 26595
},
{
"epoch": 8.401800521203507,
"grad_norm": 0.056033320488535765,
"learning_rate": 0.0001511346926296713,
"loss": 2.4217,
"step": 26600
},
{
"epoch": 8.403379925767986,
"grad_norm": 0.06094107723061698,
"learning_rate": 0.00015084332183631422,
"loss": 2.4006,
"step": 26605
},
{
"epoch": 8.404959330332465,
"grad_norm": 0.05229380726945292,
"learning_rate": 0.0001505522092660776,
"loss": 2.4538,
"step": 26610
},
{
"epoch": 8.406538734896944,
"grad_norm": 0.05219638857585323,
"learning_rate": 0.00015026135500748684,
"loss": 2.427,
"step": 26615
},
{
"epoch": 8.408118139461424,
"grad_norm": 0.04864854277850766,
"learning_rate": 0.0001499707591489886,
"loss": 2.3032,
"step": 26620
},
{
"epoch": 8.409697544025903,
"grad_norm": 0.05130633586960696,
"learning_rate": 0.00014968042177895182,
"loss": 2.3415,
"step": 26625
},
{
"epoch": 8.411276948590382,
"grad_norm": 0.04961022287215968,
"learning_rate": 0.0001493903429856659,
"loss": 2.3595,
"step": 26630
},
{
"epoch": 8.412856353154861,
"grad_norm": 0.05268348341405079,
"learning_rate": 0.00014910052285734178,
"loss": 2.388,
"step": 26635
},
{
"epoch": 8.414435757719339,
"grad_norm": 0.05251257408511266,
"learning_rate": 0.00014881096148211239,
"loss": 2.367,
"step": 26640
},
{
"epoch": 8.416015162283818,
"grad_norm": 0.04519350339044876,
"learning_rate": 0.00014852165894803083,
"loss": 2.5289,
"step": 26645
},
{
"epoch": 8.417594566848297,
"grad_norm": 0.052478266597039615,
"learning_rate": 0.00014823261534307287,
"loss": 2.3678,
"step": 26650
},
{
"epoch": 8.419173971412777,
"grad_norm": 0.045857352778371906,
"learning_rate": 0.00014794383075513451,
"loss": 2.377,
"step": 26655
},
{
"epoch": 8.420753375977256,
"grad_norm": 0.05592977801746524,
"learning_rate": 0.0001476553052720333,
"loss": 2.4182,
"step": 26660
},
{
"epoch": 8.422332780541735,
"grad_norm": 0.04213014073549877,
"learning_rate": 0.00014736703898150794,
"loss": 2.4226,
"step": 26665
},
{
"epoch": 8.423912185106214,
"grad_norm": 0.04737225206610175,
"learning_rate": 0.0001470790319712183,
"loss": 2.3809,
"step": 26670
},
{
"epoch": 8.425491589670694,
"grad_norm": 0.04591930059151109,
"learning_rate": 0.00014679128432874546,
"loss": 2.3618,
"step": 26675
},
{
"epoch": 8.427070994235173,
"grad_norm": 0.0490800680706064,
"learning_rate": 0.00014650379614159192,
"loss": 2.3649,
"step": 26680
},
{
"epoch": 8.428650398799652,
"grad_norm": 0.04682291738342209,
"learning_rate": 0.00014621656749718071,
"loss": 2.3481,
"step": 26685
},
{
"epoch": 8.430229803364131,
"grad_norm": 0.04462638581380862,
"learning_rate": 0.00014592959848285647,
"loss": 2.3209,
"step": 26690
},
{
"epoch": 8.43180920792861,
"grad_norm": 0.050556366675289864,
"learning_rate": 0.00014564288918588464,
"loss": 2.3737,
"step": 26695
},
{
"epoch": 8.43338861249309,
"grad_norm": 0.04678957148115946,
"learning_rate": 0.00014535643969345146,
"loss": 2.4973,
"step": 26700
},
{
"epoch": 8.43496801705757,
"grad_norm": 0.0850072426060503,
"learning_rate": 0.0001450702500926645,
"loss": 2.4133,
"step": 26705
},
{
"epoch": 8.436547421622048,
"grad_norm": 0.04691358996576342,
"learning_rate": 0.00014478432047055202,
"loss": 2.4485,
"step": 26710
},
{
"epoch": 8.438126826186528,
"grad_norm": 0.046276320227182303,
"learning_rate": 0.0001444986509140638,
"loss": 2.3055,
"step": 26715
},
{
"epoch": 8.439706230751007,
"grad_norm": 0.04898850587615209,
"learning_rate": 0.00014421324151006986,
"loss": 2.3767,
"step": 26720
},
{
"epoch": 8.441285635315486,
"grad_norm": 0.04747495705557107,
"learning_rate": 0.00014392809234536118,
"loss": 2.331,
"step": 26725
},
{
"epoch": 8.442865039879965,
"grad_norm": 0.05078741604035361,
"learning_rate": 0.00014364320350665016,
"loss": 2.3616,
"step": 26730
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.04340690204052605,
"learning_rate": 0.0001433585750805695,
"loss": 2.4084,
"step": 26735
},
{
"epoch": 8.446023849008924,
"grad_norm": 0.04199834629885642,
"learning_rate": 0.00014307420715367302,
"loss": 2.3922,
"step": 26740
},
{
"epoch": 8.447603253573403,
"grad_norm": 0.044490710119333865,
"learning_rate": 0.00014279009981243507,
"loss": 2.3083,
"step": 26745
},
{
"epoch": 8.449182658137882,
"grad_norm": 0.040045586116287835,
"learning_rate": 0.00014250625314325094,
"loss": 2.3888,
"step": 26750
},
{
"epoch": 8.450762062702362,
"grad_norm": 0.04978781860839173,
"learning_rate": 0.0001422226672324366,
"loss": 2.3644,
"step": 26755
},
{
"epoch": 8.45234146726684,
"grad_norm": 0.04652874495738621,
"learning_rate": 0.0001419393421662284,
"loss": 2.4269,
"step": 26760
},
{
"epoch": 8.45392087183132,
"grad_norm": 0.050196459481046826,
"learning_rate": 0.00014165627803078417,
"loss": 2.4805,
"step": 26765
},
{
"epoch": 8.4555002763958,
"grad_norm": 0.04059194596613595,
"learning_rate": 0.00014137347491218166,
"loss": 2.385,
"step": 26770
},
{
"epoch": 8.457079680960279,
"grad_norm": 0.052167719274323225,
"learning_rate": 0.0001410909328964193,
"loss": 2.3675,
"step": 26775
},
{
"epoch": 8.458659085524758,
"grad_norm": 0.043088470669438834,
"learning_rate": 0.00014080865206941674,
"loss": 2.3721,
"step": 26780
},
{
"epoch": 8.460238490089237,
"grad_norm": 0.043145268513182636,
"learning_rate": 0.0001405266325170136,
"loss": 2.3419,
"step": 26785
},
{
"epoch": 8.461817894653716,
"grad_norm": 0.04893717640972426,
"learning_rate": 0.00014024487432497012,
"loss": 2.4063,
"step": 26790
},
{
"epoch": 8.463397299218194,
"grad_norm": 0.046200764762835744,
"learning_rate": 0.00013996337757896725,
"loss": 2.3645,
"step": 26795
},
{
"epoch": 8.464976703782673,
"grad_norm": 0.05843755466813624,
"learning_rate": 0.00013968214236460618,
"loss": 2.3927,
"step": 26800
},
{
"epoch": 8.466556108347152,
"grad_norm": 0.05640577883199117,
"learning_rate": 0.00013940116876740905,
"loss": 2.4421,
"step": 26805
},
{
"epoch": 8.468135512911632,
"grad_norm": 0.04517329373689541,
"learning_rate": 0.00013912045687281793,
"loss": 2.2847,
"step": 26810
},
{
"epoch": 8.46971491747611,
"grad_norm": 0.04739957033206674,
"learning_rate": 0.00013884000676619545,
"loss": 2.3114,
"step": 26815
},
{
"epoch": 8.47129432204059,
"grad_norm": 0.043223701891898524,
"learning_rate": 0.00013855981853282495,
"loss": 2.3103,
"step": 26820
},
{
"epoch": 8.47287372660507,
"grad_norm": 0.04433846046078752,
"learning_rate": 0.0001382798922579096,
"loss": 2.3672,
"step": 26825
},
{
"epoch": 8.474453131169549,
"grad_norm": 0.04629210415232134,
"learning_rate": 0.00013800022802657342,
"loss": 2.553,
"step": 26830
},
{
"epoch": 8.476032535734028,
"grad_norm": 0.05006461672492424,
"learning_rate": 0.00013772082592386058,
"loss": 2.4208,
"step": 26835
},
{
"epoch": 8.477611940298507,
"grad_norm": 0.05271271109362467,
"learning_rate": 0.00013744168603473518,
"loss": 2.3943,
"step": 26840
},
{
"epoch": 8.479191344862986,
"grad_norm": 0.04604968648561785,
"learning_rate": 0.00013716280844408213,
"loss": 2.4582,
"step": 26845
},
{
"epoch": 8.480770749427466,
"grad_norm": 0.04884660446778068,
"learning_rate": 0.00013688419323670597,
"loss": 2.4209,
"step": 26850
},
{
"epoch": 8.482350153991945,
"grad_norm": 0.05080444628439175,
"learning_rate": 0.00013660584049733228,
"loss": 2.3041,
"step": 26855
},
{
"epoch": 8.483929558556424,
"grad_norm": 0.04919265879582246,
"learning_rate": 0.00013632775031060607,
"loss": 2.4452,
"step": 26860
},
{
"epoch": 8.485508963120903,
"grad_norm": 0.045609570661221,
"learning_rate": 0.00013604992276109262,
"loss": 2.2835,
"step": 26865
},
{
"epoch": 8.487088367685383,
"grad_norm": 0.05316964828168353,
"learning_rate": 0.00013577235793327792,
"loss": 2.4214,
"step": 26870
},
{
"epoch": 8.488667772249862,
"grad_norm": 0.063021037748091,
"learning_rate": 0.0001354950559115673,
"loss": 2.4789,
"step": 26875
},
{
"epoch": 8.490247176814341,
"grad_norm": 0.04980120465793191,
"learning_rate": 0.0001352180167802871,
"loss": 2.4388,
"step": 26880
},
{
"epoch": 8.49182658137882,
"grad_norm": 0.05399731781967924,
"learning_rate": 0.00013494124062368262,
"loss": 2.3671,
"step": 26885
},
{
"epoch": 8.4934059859433,
"grad_norm": 0.04841454391886285,
"learning_rate": 0.00013466472752591952,
"loss": 2.3689,
"step": 26890
},
{
"epoch": 8.494985390507779,
"grad_norm": 0.03998517085400193,
"learning_rate": 0.0001343884775710843,
"loss": 2.3471,
"step": 26895
},
{
"epoch": 8.496564795072258,
"grad_norm": 0.04565734698974977,
"learning_rate": 0.00013411249084318246,
"loss": 2.3439,
"step": 26900
},
{
"epoch": 8.498144199636737,
"grad_norm": 0.04601453715272313,
"learning_rate": 0.0001338367674261397,
"loss": 2.3804,
"step": 26905
},
{
"epoch": 8.499723604201217,
"grad_norm": 0.11609018795950421,
"learning_rate": 0.00013356130740380202,
"loss": 2.439,
"step": 26910
},
{
"epoch": 8.501303008765696,
"grad_norm": 0.045572490502842455,
"learning_rate": 0.0001332861108599348,
"loss": 2.3025,
"step": 26915
},
{
"epoch": 8.502882413330175,
"grad_norm": 0.05861871484665386,
"learning_rate": 0.0001330111778782238,
"loss": 2.4143,
"step": 26920
},
{
"epoch": 8.504461817894654,
"grad_norm": 0.04635822578408009,
"learning_rate": 0.00013273650854227437,
"loss": 2.4446,
"step": 26925
},
{
"epoch": 8.506041222459134,
"grad_norm": 0.053073309477083,
"learning_rate": 0.00013246210293561144,
"loss": 2.4323,
"step": 26930
},
{
"epoch": 8.507620627023613,
"grad_norm": 0.04600629854742416,
"learning_rate": 0.0001321879611416803,
"loss": 2.4094,
"step": 26935
},
{
"epoch": 8.509200031588092,
"grad_norm": 0.05531048033616932,
"learning_rate": 0.00013191408324384523,
"loss": 2.4388,
"step": 26940
},
{
"epoch": 8.510779436152571,
"grad_norm": 0.04913565330415894,
"learning_rate": 0.0001316404693253914,
"loss": 2.4353,
"step": 26945
},
{
"epoch": 8.512358840717049,
"grad_norm": 0.05304389817746647,
"learning_rate": 0.00013136711946952273,
"loss": 2.4191,
"step": 26950
},
{
"epoch": 8.51393824528153,
"grad_norm": 0.04397066264998692,
"learning_rate": 0.000131094033759363,
"loss": 2.3114,
"step": 26955
},
{
"epoch": 8.515517649846007,
"grad_norm": 0.04816854374254131,
"learning_rate": 0.00013082121227795619,
"loss": 2.3708,
"step": 26960
},
{
"epoch": 8.517097054410486,
"grad_norm": 0.04685898216009314,
"learning_rate": 0.00013054865510826508,
"loss": 2.4369,
"step": 26965
},
{
"epoch": 8.518676458974966,
"grad_norm": 0.05330126054735967,
"learning_rate": 0.00013027636233317342,
"loss": 2.3684,
"step": 26970
},
{
"epoch": 8.520255863539445,
"grad_norm": 0.04876095077472937,
"learning_rate": 0.00013000433403548295,
"loss": 2.4019,
"step": 26975
},
{
"epoch": 8.521835268103924,
"grad_norm": 0.047721082130814954,
"learning_rate": 0.00012973257029791563,
"loss": 2.4544,
"step": 26980
},
{
"epoch": 8.523414672668403,
"grad_norm": 0.05133114655764637,
"learning_rate": 0.00012946107120311368,
"loss": 2.4207,
"step": 26985
},
{
"epoch": 8.524994077232883,
"grad_norm": 0.04997269340629122,
"learning_rate": 0.00012918983683363772,
"loss": 2.46,
"step": 26990
},
{
"epoch": 8.526573481797362,
"grad_norm": 0.048481489112942415,
"learning_rate": 0.0001289188672719689,
"loss": 2.3359,
"step": 26995
},
{
"epoch": 8.528152886361841,
"grad_norm": 0.0496135719248878,
"learning_rate": 0.00012864816260050693,
"loss": 2.3773,
"step": 27000
},
{
"epoch": 8.52973229092632,
"grad_norm": 0.04479886542411035,
"learning_rate": 0.00012837772290157133,
"loss": 2.4574,
"step": 27005
},
{
"epoch": 8.5313116954908,
"grad_norm": 0.07687684708137763,
"learning_rate": 0.00012810754825740144,
"loss": 2.3684,
"step": 27010
},
{
"epoch": 8.532891100055279,
"grad_norm": 0.04561187302439505,
"learning_rate": 0.00012783763875015542,
"loss": 2.3255,
"step": 27015
},
{
"epoch": 8.534470504619758,
"grad_norm": 0.04867719139249393,
"learning_rate": 0.00012756799446191113,
"loss": 2.515,
"step": 27020
},
{
"epoch": 8.536049909184237,
"grad_norm": 0.04745768862703271,
"learning_rate": 0.0001272986154746656,
"loss": 2.3639,
"step": 27025
},
{
"epoch": 8.537629313748717,
"grad_norm": 0.049244802139073084,
"learning_rate": 0.00012702950187033502,
"loss": 2.3926,
"step": 27030
},
{
"epoch": 8.539208718313196,
"grad_norm": 0.04750145665638371,
"learning_rate": 0.00012676065373075552,
"loss": 2.4717,
"step": 27035
},
{
"epoch": 8.540788122877675,
"grad_norm": 0.04806728344499754,
"learning_rate": 0.00012649207113768203,
"loss": 2.363,
"step": 27040
},
{
"epoch": 8.542367527442154,
"grad_norm": 0.04496497615356363,
"learning_rate": 0.00012622375417278842,
"loss": 2.3572,
"step": 27045
},
{
"epoch": 8.543946932006634,
"grad_norm": 0.04310426247596021,
"learning_rate": 0.00012595570291766878,
"loss": 2.36,
"step": 27050
},
{
"epoch": 8.545526336571113,
"grad_norm": 0.04670895850589334,
"learning_rate": 0.00012568791745383513,
"loss": 2.3836,
"step": 27055
},
{
"epoch": 8.547105741135592,
"grad_norm": 0.04226904425227385,
"learning_rate": 0.0001254203978627201,
"loss": 2.3702,
"step": 27060
},
{
"epoch": 8.548685145700071,
"grad_norm": 0.04512397132344977,
"learning_rate": 0.00012515314422567402,
"loss": 2.3634,
"step": 27065
},
{
"epoch": 8.55026455026455,
"grad_norm": 0.04799706807324663,
"learning_rate": 0.00012488615662396707,
"loss": 2.3858,
"step": 27070
},
{
"epoch": 8.55184395482903,
"grad_norm": 0.055867285713436535,
"learning_rate": 0.00012461943513878882,
"loss": 2.5932,
"step": 27075
},
{
"epoch": 8.55342335939351,
"grad_norm": 0.043694439654936905,
"learning_rate": 0.00012435297985124717,
"loss": 2.5261,
"step": 27080
},
{
"epoch": 8.555002763957988,
"grad_norm": 0.05598575873544264,
"learning_rate": 0.00012408679084236984,
"loss": 2.4595,
"step": 27085
},
{
"epoch": 8.556582168522468,
"grad_norm": 0.047626173937539366,
"learning_rate": 0.00012382086819310312,
"loss": 2.3446,
"step": 27090
},
{
"epoch": 8.558161573086947,
"grad_norm": 0.04451372643455489,
"learning_rate": 0.00012355521198431207,
"loss": 2.4342,
"step": 27095
},
{
"epoch": 8.559740977651426,
"grad_norm": 0.046612085781107815,
"learning_rate": 0.00012328982229678153,
"loss": 2.4306,
"step": 27100
},
{
"epoch": 8.561320382215905,
"grad_norm": 0.05063744861099408,
"learning_rate": 0.00012302469921121462,
"loss": 2.4076,
"step": 27105
},
{
"epoch": 8.562899786780385,
"grad_norm": 0.04704547000760339,
"learning_rate": 0.0001227598428082335,
"loss": 2.359,
"step": 27110
},
{
"epoch": 8.564479191344862,
"grad_norm": 0.054329836868169466,
"learning_rate": 0.00012249525316837927,
"loss": 2.4404,
"step": 27115
},
{
"epoch": 8.566058595909341,
"grad_norm": 0.05317607530238786,
"learning_rate": 0.00012223093037211187,
"loss": 2.3496,
"step": 27120
},
{
"epoch": 8.56763800047382,
"grad_norm": 0.04136473952865533,
"learning_rate": 0.00012196687449981047,
"loss": 2.431,
"step": 27125
},
{
"epoch": 8.5692174050383,
"grad_norm": 0.043318270853819764,
"learning_rate": 0.00012170308563177268,
"loss": 2.3985,
"step": 27130
},
{
"epoch": 8.57079680960278,
"grad_norm": 0.04751826112051017,
"learning_rate": 0.00012143956384821476,
"loss": 2.4557,
"step": 27135
},
{
"epoch": 8.572376214167258,
"grad_norm": 0.042756837797052694,
"learning_rate": 0.00012117630922927236,
"loss": 2.3584,
"step": 27140
},
{
"epoch": 8.573955618731738,
"grad_norm": 0.04847605148089578,
"learning_rate": 0.00012091332185499915,
"loss": 2.3624,
"step": 27145
},
{
"epoch": 8.575535023296217,
"grad_norm": 0.04483130882326042,
"learning_rate": 0.00012065060180536858,
"loss": 2.5897,
"step": 27150
},
{
"epoch": 8.577114427860696,
"grad_norm": 0.048787954003955754,
"learning_rate": 0.00012038814916027141,
"loss": 2.4003,
"step": 27155
},
{
"epoch": 8.578693832425175,
"grad_norm": 0.05199760419375112,
"learning_rate": 0.00012012596399951791,
"loss": 2.3965,
"step": 27160
},
{
"epoch": 8.580273236989655,
"grad_norm": 0.044039090468451725,
"learning_rate": 0.00011986404640283732,
"loss": 2.365,
"step": 27165
},
{
"epoch": 8.581852641554134,
"grad_norm": 0.052250016311700974,
"learning_rate": 0.0001196023964498767,
"loss": 2.3715,
"step": 27170
},
{
"epoch": 8.583432046118613,
"grad_norm": 0.04163668584358668,
"learning_rate": 0.00011934101422020238,
"loss": 2.3493,
"step": 27175
},
{
"epoch": 8.585011450683092,
"grad_norm": 0.04326699889371583,
"learning_rate": 0.00011907989979329904,
"loss": 2.3631,
"step": 27180
},
{
"epoch": 8.586590855247572,
"grad_norm": 0.04830817975399211,
"learning_rate": 0.00011881905324856967,
"loss": 2.3557,
"step": 27185
},
{
"epoch": 8.58817025981205,
"grad_norm": 0.042318808544985625,
"learning_rate": 0.00011855847466533632,
"loss": 2.353,
"step": 27190
},
{
"epoch": 8.58974966437653,
"grad_norm": 0.049810077206891894,
"learning_rate": 0.00011829816412283911,
"loss": 2.3398,
"step": 27195
},
{
"epoch": 8.59132906894101,
"grad_norm": 0.04226567704517314,
"learning_rate": 0.00011803812170023687,
"loss": 2.4034,
"step": 27200
},
{
"epoch": 8.592908473505489,
"grad_norm": 0.058126044736796,
"learning_rate": 0.00011777834747660676,
"loss": 2.4602,
"step": 27205
},
{
"epoch": 8.594487878069968,
"grad_norm": 0.04136207040910559,
"learning_rate": 0.00011751884153094438,
"loss": 2.3869,
"step": 27210
},
{
"epoch": 8.596067282634447,
"grad_norm": 0.053233027417794544,
"learning_rate": 0.00011725960394216418,
"loss": 2.4308,
"step": 27215
},
{
"epoch": 8.597646687198926,
"grad_norm": 0.04607223624915819,
"learning_rate": 0.00011700063478909817,
"loss": 2.2557,
"step": 27220
},
{
"epoch": 8.599226091763406,
"grad_norm": 0.04288248737709212,
"learning_rate": 0.00011674193415049772,
"loss": 2.443,
"step": 27225
},
{
"epoch": 8.600805496327885,
"grad_norm": 0.057320515749170395,
"learning_rate": 0.00011648350210503178,
"loss": 2.4144,
"step": 27230
},
{
"epoch": 8.602384900892364,
"grad_norm": 0.04733134712064031,
"learning_rate": 0.00011622533873128771,
"loss": 2.344,
"step": 27235
},
{
"epoch": 8.603964305456843,
"grad_norm": 0.04635166920196346,
"learning_rate": 0.00011596744410777205,
"loss": 2.3624,
"step": 27240
},
{
"epoch": 8.605543710021323,
"grad_norm": 0.0441457813955413,
"learning_rate": 0.00011570981831290805,
"loss": 2.4046,
"step": 27245
},
{
"epoch": 8.607123114585802,
"grad_norm": 0.05363622936563738,
"learning_rate": 0.0001154524614250383,
"loss": 2.3747,
"step": 27250
},
{
"epoch": 8.608702519150281,
"grad_norm": 0.0578032377866284,
"learning_rate": 0.0001151953735224236,
"loss": 2.4532,
"step": 27255
},
{
"epoch": 8.61028192371476,
"grad_norm": 0.04490790333327239,
"learning_rate": 0.00011493855468324255,
"loss": 2.5246,
"step": 27260
},
{
"epoch": 8.61186132827924,
"grad_norm": 0.05386919061929367,
"learning_rate": 0.00011468200498559234,
"loss": 2.3122,
"step": 27265
},
{
"epoch": 8.613440732843717,
"grad_norm": 0.04117088259884821,
"learning_rate": 0.00011442572450748801,
"loss": 2.3539,
"step": 27270
},
{
"epoch": 8.615020137408198,
"grad_norm": 0.04948064248550165,
"learning_rate": 0.00011416971332686243,
"loss": 2.4723,
"step": 27275
},
{
"epoch": 8.616599541972676,
"grad_norm": 0.04401771619064978,
"learning_rate": 0.00011391397152156768,
"loss": 2.3534,
"step": 27280
},
{
"epoch": 8.618178946537155,
"grad_norm": 0.07132376617845432,
"learning_rate": 0.00011365849916937276,
"loss": 2.4067,
"step": 27285
},
{
"epoch": 8.619758351101634,
"grad_norm": 0.052906899995070006,
"learning_rate": 0.0001134032963479652,
"loss": 2.3107,
"step": 27290
},
{
"epoch": 8.621337755666113,
"grad_norm": 0.04631271081941863,
"learning_rate": 0.00011314836313495069,
"loss": 2.4194,
"step": 27295
},
{
"epoch": 8.622917160230593,
"grad_norm": 0.055879728433552676,
"learning_rate": 0.00011289369960785234,
"loss": 2.422,
"step": 27300
},
{
"epoch": 8.624496564795072,
"grad_norm": 0.045166759824474614,
"learning_rate": 0.00011263930584411242,
"loss": 2.3003,
"step": 27305
},
{
"epoch": 8.626075969359551,
"grad_norm": 0.0438466195263903,
"learning_rate": 0.00011238518192108982,
"loss": 2.437,
"step": 27310
},
{
"epoch": 8.62765537392403,
"grad_norm": 0.050246720491075134,
"learning_rate": 0.00011213132791606251,
"loss": 2.3956,
"step": 27315
},
{
"epoch": 8.62923477848851,
"grad_norm": 0.04891591637877597,
"learning_rate": 0.00011187774390622563,
"loss": 2.4418,
"step": 27320
},
{
"epoch": 8.630814183052989,
"grad_norm": 0.05314097459143172,
"learning_rate": 0.00011162442996869215,
"loss": 2.3765,
"step": 27325
},
{
"epoch": 8.632393587617468,
"grad_norm": 0.04941419266303433,
"learning_rate": 0.00011137138618049402,
"loss": 2.4047,
"step": 27330
},
{
"epoch": 8.633972992181947,
"grad_norm": 0.04342886656314699,
"learning_rate": 0.00011111861261857958,
"loss": 2.3687,
"step": 27335
},
{
"epoch": 8.635552396746426,
"grad_norm": 0.042425583545525086,
"learning_rate": 0.00011086610935981556,
"loss": 2.3883,
"step": 27340
},
{
"epoch": 8.637131801310906,
"grad_norm": 0.048788257732269476,
"learning_rate": 0.00011061387648098708,
"loss": 2.426,
"step": 27345
},
{
"epoch": 8.638711205875385,
"grad_norm": 0.054371736603684495,
"learning_rate": 0.00011036191405879614,
"loss": 2.4437,
"step": 27350
},
{
"epoch": 8.640290610439864,
"grad_norm": 0.046904955329134176,
"learning_rate": 0.00011011022216986322,
"loss": 2.5424,
"step": 27355
},
{
"epoch": 8.641870015004343,
"grad_norm": 0.0442531954142581,
"learning_rate": 0.00010985880089072608,
"loss": 2.3574,
"step": 27360
},
{
"epoch": 8.643449419568823,
"grad_norm": 0.04775785352655346,
"learning_rate": 0.00010960765029784015,
"loss": 2.382,
"step": 27365
},
{
"epoch": 8.645028824133302,
"grad_norm": 0.04706402749975755,
"learning_rate": 0.00010935677046757907,
"loss": 2.3971,
"step": 27370
},
{
"epoch": 8.646608228697781,
"grad_norm": 0.05756185716173598,
"learning_rate": 0.00010910616147623365,
"loss": 2.3354,
"step": 27375
},
{
"epoch": 8.64818763326226,
"grad_norm": 0.04188256178005546,
"learning_rate": 0.00010885582340001243,
"loss": 2.3628,
"step": 27380
},
{
"epoch": 8.64976703782674,
"grad_norm": 0.04607030317523363,
"learning_rate": 0.00010860575631504155,
"loss": 2.3739,
"step": 27385
},
{
"epoch": 8.651346442391219,
"grad_norm": 0.043954050005782654,
"learning_rate": 0.00010835596029736484,
"loss": 2.3688,
"step": 27390
},
{
"epoch": 8.652925846955698,
"grad_norm": 0.047380854160769884,
"learning_rate": 0.00010810643542294385,
"loss": 2.3996,
"step": 27395
},
{
"epoch": 8.654505251520177,
"grad_norm": 0.05130458026043142,
"learning_rate": 0.00010785718176765713,
"loss": 2.3342,
"step": 27400
},
{
"epoch": 8.656084656084657,
"grad_norm": 0.04409781789315218,
"learning_rate": 0.00010760819940730171,
"loss": 2.317,
"step": 27405
},
{
"epoch": 8.657664060649136,
"grad_norm": 0.04764750498622033,
"learning_rate": 0.00010735948841759113,
"loss": 2.4534,
"step": 27410
},
{
"epoch": 8.659243465213615,
"grad_norm": 0.04451771256488658,
"learning_rate": 0.00010711104887415669,
"loss": 2.3672,
"step": 27415
},
{
"epoch": 8.660822869778094,
"grad_norm": 0.039178296466845196,
"learning_rate": 0.00010686288085254781,
"loss": 2.2947,
"step": 27420
},
{
"epoch": 8.662402274342572,
"grad_norm": 0.054099678943141875,
"learning_rate": 0.00010661498442823014,
"loss": 2.5264,
"step": 27425
},
{
"epoch": 8.663981678907053,
"grad_norm": 0.04901490624167286,
"learning_rate": 0.00010636735967658784,
"loss": 2.3274,
"step": 27430
},
{
"epoch": 8.66556108347153,
"grad_norm": 0.044666537972616924,
"learning_rate": 0.00010612000667292188,
"loss": 2.3824,
"step": 27435
},
{
"epoch": 8.66714048803601,
"grad_norm": 0.04843337262954962,
"learning_rate": 0.00010587292549245064,
"loss": 2.3741,
"step": 27440
},
{
"epoch": 8.668719892600489,
"grad_norm": 0.04634279589793078,
"learning_rate": 0.00010562611621031015,
"loss": 2.3499,
"step": 27445
},
{
"epoch": 8.670299297164968,
"grad_norm": 0.04298136264392165,
"learning_rate": 0.00010537957890155336,
"loss": 2.4299,
"step": 27450
},
{
"epoch": 8.671878701729447,
"grad_norm": 0.04636239404178666,
"learning_rate": 0.00010513331364115052,
"loss": 2.4993,
"step": 27455
},
{
"epoch": 8.673458106293927,
"grad_norm": 0.041895038972217795,
"learning_rate": 0.00010488732050398986,
"loss": 2.3574,
"step": 27460
},
{
"epoch": 8.675037510858406,
"grad_norm": 0.06384282552993045,
"learning_rate": 0.00010464159956487595,
"loss": 2.46,
"step": 27465
},
{
"epoch": 8.676616915422885,
"grad_norm": 0.04789644020746819,
"learning_rate": 0.00010439615089853094,
"loss": 2.4163,
"step": 27470
},
{
"epoch": 8.678196319987364,
"grad_norm": 0.04561918769867514,
"learning_rate": 0.00010415097457959432,
"loss": 2.5047,
"step": 27475
},
{
"epoch": 8.679775724551844,
"grad_norm": 0.05114824193321859,
"learning_rate": 0.00010390607068262248,
"loss": 2.4126,
"step": 27480
},
{
"epoch": 8.681355129116323,
"grad_norm": 0.05406059632662854,
"learning_rate": 0.00010366143928208938,
"loss": 2.3904,
"step": 27485
},
{
"epoch": 8.682934533680802,
"grad_norm": 0.049908841207682936,
"learning_rate": 0.00010341708045238552,
"loss": 2.3692,
"step": 27490
},
{
"epoch": 8.684513938245281,
"grad_norm": 0.04737961789122741,
"learning_rate": 0.00010317299426781923,
"loss": 2.3752,
"step": 27495
},
{
"epoch": 8.68609334280976,
"grad_norm": 0.04405464190125366,
"learning_rate": 0.00010292918080261537,
"loss": 2.4539,
"step": 27500
},
{
"epoch": 8.68767274737424,
"grad_norm": 0.04905899822998163,
"learning_rate": 0.00010268564013091596,
"loss": 2.4646,
"step": 27505
},
{
"epoch": 8.68925215193872,
"grad_norm": 0.06104583394465677,
"learning_rate": 0.00010244237232678066,
"loss": 2.4228,
"step": 27510
},
{
"epoch": 8.690831556503198,
"grad_norm": 0.0513316924495129,
"learning_rate": 0.00010219937746418495,
"loss": 2.4072,
"step": 27515
},
{
"epoch": 8.692410961067678,
"grad_norm": 0.04766972525437872,
"learning_rate": 0.0001019566556170225,
"loss": 2.4783,
"step": 27520
},
{
"epoch": 8.693990365632157,
"grad_norm": 0.054019712486918475,
"learning_rate": 0.00010171420685910326,
"loss": 2.4326,
"step": 27525
},
{
"epoch": 8.695569770196636,
"grad_norm": 0.041842413914295176,
"learning_rate": 0.00010147203126415428,
"loss": 2.329,
"step": 27530
},
{
"epoch": 8.697149174761115,
"grad_norm": 0.049855000006414546,
"learning_rate": 0.00010123012890581983,
"loss": 2.4639,
"step": 27535
},
{
"epoch": 8.698728579325595,
"grad_norm": 0.05850871822413731,
"learning_rate": 0.00010098849985766068,
"loss": 2.3149,
"step": 27540
},
{
"epoch": 8.700307983890074,
"grad_norm": 0.05523500580028637,
"learning_rate": 0.000100747144193155,
"loss": 2.4291,
"step": 27545
},
{
"epoch": 8.701887388454553,
"grad_norm": 0.04170201774340248,
"learning_rate": 0.00010050606198569723,
"loss": 2.4053,
"step": 27550
},
{
"epoch": 8.703466793019032,
"grad_norm": 0.05083914755449448,
"learning_rate": 0.00010026525330859903,
"loss": 2.3439,
"step": 27555
},
{
"epoch": 8.705046197583512,
"grad_norm": 0.047049094641974425,
"learning_rate": 0.00010002471823508864,
"loss": 2.441,
"step": 27560
},
{
"epoch": 8.70662560214799,
"grad_norm": 0.049698552700892414,
"learning_rate": 9.97844568383114e-05,
"loss": 2.3274,
"step": 27565
},
{
"epoch": 8.70820500671247,
"grad_norm": 0.04427067432949184,
"learning_rate": 9.954446919132899e-05,
"loss": 2.429,
"step": 27570
},
{
"epoch": 8.70978441127695,
"grad_norm": 0.04320417937956996,
"learning_rate": 9.930475536712057e-05,
"loss": 2.4198,
"step": 27575
},
{
"epoch": 8.711363815841429,
"grad_norm": 0.041329497032012934,
"learning_rate": 9.90653154385811e-05,
"loss": 2.4125,
"step": 27580
},
{
"epoch": 8.712943220405908,
"grad_norm": 0.04029998989493044,
"learning_rate": 9.882614947852319e-05,
"loss": 2.3716,
"step": 27585
},
{
"epoch": 8.714522624970385,
"grad_norm": 0.04127556580376296,
"learning_rate": 9.858725755967546e-05,
"loss": 2.3834,
"step": 27590
},
{
"epoch": 8.716102029534865,
"grad_norm": 0.04684041591857787,
"learning_rate": 9.834863975468322e-05,
"loss": 2.5174,
"step": 27595
},
{
"epoch": 8.717681434099344,
"grad_norm": 0.04554218645399579,
"learning_rate": 9.811029613610912e-05,
"loss": 2.4288,
"step": 27600
},
{
"epoch": 8.719260838663823,
"grad_norm": 0.044368508354788026,
"learning_rate": 9.787222677643137e-05,
"loss": 2.3462,
"step": 27605
},
{
"epoch": 8.720840243228302,
"grad_norm": 0.05312336467108467,
"learning_rate": 9.763443174804576e-05,
"loss": 2.411,
"step": 27610
},
{
"epoch": 8.722419647792782,
"grad_norm": 0.041609962188932766,
"learning_rate": 9.73969111232641e-05,
"loss": 2.4051,
"step": 27615
},
{
"epoch": 8.72399905235726,
"grad_norm": 0.04773125756521095,
"learning_rate": 9.715966497431461e-05,
"loss": 2.2886,
"step": 27620
},
{
"epoch": 8.72557845692174,
"grad_norm": 0.04503554984633216,
"learning_rate": 9.692269337334281e-05,
"loss": 2.3965,
"step": 27625
},
{
"epoch": 8.72715786148622,
"grad_norm": 0.05132758840218605,
"learning_rate": 9.668599639240993e-05,
"loss": 2.3531,
"step": 27630
},
{
"epoch": 8.728737266050699,
"grad_norm": 0.04774843040892377,
"learning_rate": 9.64495741034942e-05,
"loss": 2.502,
"step": 27635
},
{
"epoch": 8.730316670615178,
"grad_norm": 0.04329382389998516,
"learning_rate": 9.621342657849008e-05,
"loss": 2.3854,
"step": 27640
},
{
"epoch": 8.731896075179657,
"grad_norm": 0.04246665809880522,
"learning_rate": 9.597755388920849e-05,
"loss": 2.3339,
"step": 27645
},
{
"epoch": 8.733475479744136,
"grad_norm": 0.043336216890366264,
"learning_rate": 9.574195610737679e-05,
"loss": 2.3294,
"step": 27650
},
{
"epoch": 8.735054884308616,
"grad_norm": 0.04949076809099762,
"learning_rate": 9.55066333046386e-05,
"loss": 2.42,
"step": 27655
},
{
"epoch": 8.736634288873095,
"grad_norm": 0.044252555737269855,
"learning_rate": 9.527158555255445e-05,
"loss": 2.3788,
"step": 27660
},
{
"epoch": 8.738213693437574,
"grad_norm": 0.04390254514457231,
"learning_rate": 9.503681292260068e-05,
"loss": 2.4204,
"step": 27665
},
{
"epoch": 8.739793098002053,
"grad_norm": 0.039219814813633305,
"learning_rate": 9.480231548616991e-05,
"loss": 2.4288,
"step": 27670
},
{
"epoch": 8.741372502566533,
"grad_norm": 0.039131792602412445,
"learning_rate": 9.456809331457172e-05,
"loss": 2.3258,
"step": 27675
},
{
"epoch": 8.742951907131012,
"grad_norm": 0.04887100846216087,
"learning_rate": 9.433414647903137e-05,
"loss": 2.4431,
"step": 27680
},
{
"epoch": 8.744531311695491,
"grad_norm": 0.04164886010131534,
"learning_rate": 9.410047505069042e-05,
"loss": 2.386,
"step": 27685
},
{
"epoch": 8.74611071625997,
"grad_norm": 0.048611792433111846,
"learning_rate": 9.386707910060755e-05,
"loss": 2.3307,
"step": 27690
},
{
"epoch": 8.74769012082445,
"grad_norm": 0.04580083384677626,
"learning_rate": 9.363395869975599e-05,
"loss": 2.3363,
"step": 27695
},
{
"epoch": 8.749269525388929,
"grad_norm": 0.04694393619135054,
"learning_rate": 9.340111391902684e-05,
"loss": 2.3043,
"step": 27700
},
{
"epoch": 8.750848929953408,
"grad_norm": 0.047667781871471394,
"learning_rate": 9.316854482922655e-05,
"loss": 2.3623,
"step": 27705
},
{
"epoch": 8.752428334517887,
"grad_norm": 0.04368853260378372,
"learning_rate": 9.293625150107765e-05,
"loss": 2.4077,
"step": 27710
},
{
"epoch": 8.754007739082367,
"grad_norm": 0.046375364667146195,
"learning_rate": 9.270423400521955e-05,
"loss": 2.358,
"step": 27715
},
{
"epoch": 8.755587143646846,
"grad_norm": 0.044457278881758974,
"learning_rate": 9.247249241220679e-05,
"loss": 2.4747,
"step": 27720
},
{
"epoch": 8.757166548211325,
"grad_norm": 0.046833651263446595,
"learning_rate": 9.224102679251089e-05,
"loss": 2.3934,
"step": 27725
},
{
"epoch": 8.758745952775804,
"grad_norm": 0.050935973942569567,
"learning_rate": 9.20098372165189e-05,
"loss": 2.3344,
"step": 27730
},
{
"epoch": 8.760325357340284,
"grad_norm": 0.04390985059357109,
"learning_rate": 9.177892375453412e-05,
"loss": 2.3983,
"step": 27735
},
{
"epoch": 8.761904761904763,
"grad_norm": 0.04954885808261568,
"learning_rate": 9.154828647677593e-05,
"loss": 2.4317,
"step": 27740
},
{
"epoch": 8.76348416646924,
"grad_norm": 0.05344633123620102,
"learning_rate": 9.131792545337925e-05,
"loss": 2.3044,
"step": 27745
},
{
"epoch": 8.76506357103372,
"grad_norm": 0.047837756495592146,
"learning_rate": 9.108784075439603e-05,
"loss": 2.4257,
"step": 27750
},
{
"epoch": 8.766642975598199,
"grad_norm": 0.04092377494775665,
"learning_rate": 9.085803244979307e-05,
"loss": 2.387,
"step": 27755
},
{
"epoch": 8.768222380162678,
"grad_norm": 0.047610008045174436,
"learning_rate": 9.062850060945371e-05,
"loss": 2.4132,
"step": 27760
},
{
"epoch": 8.769801784727157,
"grad_norm": 0.05302693388087465,
"learning_rate": 9.039924530317733e-05,
"loss": 2.341,
"step": 27765
},
{
"epoch": 8.771381189291636,
"grad_norm": 0.04229024510916769,
"learning_rate": 9.017026660067863e-05,
"loss": 2.4872,
"step": 27770
},
{
"epoch": 8.772960593856116,
"grad_norm": 0.041842382305429464,
"learning_rate": 8.994156457158897e-05,
"loss": 2.317,
"step": 27775
},
{
"epoch": 8.774539998420595,
"grad_norm": 0.05620726232188397,
"learning_rate": 8.971313928545521e-05,
"loss": 2.4053,
"step": 27780
},
{
"epoch": 8.776119402985074,
"grad_norm": 0.04416747257295702,
"learning_rate": 8.948499081173955e-05,
"loss": 2.4268,
"step": 27785
},
{
"epoch": 8.777698807549553,
"grad_norm": 0.043428373042804086,
"learning_rate": 8.925711921982083e-05,
"loss": 2.3584,
"step": 27790
},
{
"epoch": 8.779278212114033,
"grad_norm": 0.04610945382687839,
"learning_rate": 8.902952457899316e-05,
"loss": 2.314,
"step": 27795
},
{
"epoch": 8.780857616678512,
"grad_norm": 0.046303902360458664,
"learning_rate": 8.880220695846663e-05,
"loss": 2.381,
"step": 27800
},
{
"epoch": 8.782437021242991,
"grad_norm": 0.04338916205988012,
"learning_rate": 8.857516642736741e-05,
"loss": 2.3796,
"step": 27805
},
{
"epoch": 8.78401642580747,
"grad_norm": 0.04301418542589073,
"learning_rate": 8.834840305473657e-05,
"loss": 2.2911,
"step": 27810
},
{
"epoch": 8.78559583037195,
"grad_norm": 0.043566278340293474,
"learning_rate": 8.812191690953187e-05,
"loss": 2.3303,
"step": 27815
},
{
"epoch": 8.787175234936429,
"grad_norm": 0.04314661109931861,
"learning_rate": 8.789570806062597e-05,
"loss": 2.3549,
"step": 27820
},
{
"epoch": 8.788754639500908,
"grad_norm": 0.04198275517691545,
"learning_rate": 8.766977657680775e-05,
"loss": 2.4167,
"step": 27825
},
{
"epoch": 8.790334044065387,
"grad_norm": 0.04406552289381544,
"learning_rate": 8.744412252678147e-05,
"loss": 2.6015,
"step": 27830
},
{
"epoch": 8.791913448629867,
"grad_norm": 0.053622602101489454,
"learning_rate": 8.721874597916679e-05,
"loss": 2.4954,
"step": 27835
},
{
"epoch": 8.793492853194346,
"grad_norm": 0.052635623372226624,
"learning_rate": 8.699364700249979e-05,
"loss": 2.3696,
"step": 27840
},
{
"epoch": 8.795072257758825,
"grad_norm": 0.045562967529298753,
"learning_rate": 8.676882566523137e-05,
"loss": 2.4215,
"step": 27845
},
{
"epoch": 8.796651662323304,
"grad_norm": 0.043320603795194146,
"learning_rate": 8.654428203572795e-05,
"loss": 2.4264,
"step": 27850
},
{
"epoch": 8.798231066887784,
"grad_norm": 0.04302390073292323,
"learning_rate": 8.632001618227248e-05,
"loss": 2.4053,
"step": 27855
},
{
"epoch": 8.799810471452263,
"grad_norm": 0.0418577135978061,
"learning_rate": 8.609602817306217e-05,
"loss": 2.3199,
"step": 27860
},
{
"epoch": 8.801389876016742,
"grad_norm": 0.04727817390643634,
"learning_rate": 8.587231807621098e-05,
"loss": 2.3408,
"step": 27865
},
{
"epoch": 8.802969280581221,
"grad_norm": 0.0423515591106379,
"learning_rate": 8.564888595974718e-05,
"loss": 2.4128,
"step": 27870
},
{
"epoch": 8.8045486851457,
"grad_norm": 0.052276271649834685,
"learning_rate": 8.542573189161496e-05,
"loss": 2.3171,
"step": 27875
},
{
"epoch": 8.80612808971018,
"grad_norm": 0.05429324170374112,
"learning_rate": 8.520285593967447e-05,
"loss": 2.4736,
"step": 27880
},
{
"epoch": 8.80770749427466,
"grad_norm": 0.053387068558818716,
"learning_rate": 8.498025817170063e-05,
"loss": 2.4531,
"step": 27885
},
{
"epoch": 8.809286898839138,
"grad_norm": 0.046713980444878955,
"learning_rate": 8.475793865538417e-05,
"loss": 2.4184,
"step": 27890
},
{
"epoch": 8.810866303403618,
"grad_norm": 0.044394572356236055,
"learning_rate": 8.45358974583309e-05,
"loss": 2.4898,
"step": 27895
},
{
"epoch": 8.812445707968095,
"grad_norm": 0.047160557226061856,
"learning_rate": 8.431413464806193e-05,
"loss": 2.4044,
"step": 27900
},
{
"epoch": 8.814025112532576,
"grad_norm": 0.04377217525018816,
"learning_rate": 8.40926502920144e-05,
"loss": 2.4242,
"step": 27905
},
{
"epoch": 8.815604517097054,
"grad_norm": 0.050799956400435595,
"learning_rate": 8.387144445753992e-05,
"loss": 2.373,
"step": 27910
},
{
"epoch": 8.817183921661533,
"grad_norm": 0.04693361828675467,
"learning_rate": 8.365051721190598e-05,
"loss": 2.4543,
"step": 27915
},
{
"epoch": 8.818763326226012,
"grad_norm": 0.04940313629484455,
"learning_rate": 8.342986862229496e-05,
"loss": 2.4479,
"step": 27920
},
{
"epoch": 8.820342730790491,
"grad_norm": 0.03822568584026386,
"learning_rate": 8.320949875580464e-05,
"loss": 2.4455,
"step": 27925
},
{
"epoch": 8.82192213535497,
"grad_norm": 0.040691554610554286,
"learning_rate": 8.29894076794484e-05,
"loss": 2.393,
"step": 27930
},
{
"epoch": 8.82350153991945,
"grad_norm": 0.044593275922070776,
"learning_rate": 8.276959546015428e-05,
"loss": 2.4724,
"step": 27935
},
{
"epoch": 8.825080944483929,
"grad_norm": 0.04400811097919304,
"learning_rate": 8.255006216476569e-05,
"loss": 2.4454,
"step": 27940
},
{
"epoch": 8.826660349048408,
"grad_norm": 0.05607438124327106,
"learning_rate": 8.233080786004166e-05,
"loss": 2.4234,
"step": 27945
},
{
"epoch": 8.828239753612888,
"grad_norm": 0.04947045921122129,
"learning_rate": 8.211183261265554e-05,
"loss": 2.3173,
"step": 27950
},
{
"epoch": 8.829819158177367,
"grad_norm": 0.04898501933533458,
"learning_rate": 8.189313648919694e-05,
"loss": 2.3083,
"step": 27955
},
{
"epoch": 8.831398562741846,
"grad_norm": 0.0503493787919402,
"learning_rate": 8.167471955616945e-05,
"loss": 2.3711,
"step": 27960
},
{
"epoch": 8.832977967306325,
"grad_norm": 0.04810358661506592,
"learning_rate": 8.145658187999227e-05,
"loss": 2.3185,
"step": 27965
},
{
"epoch": 8.834557371870805,
"grad_norm": 0.04761398760448387,
"learning_rate": 8.12387235269999e-05,
"loss": 2.3879,
"step": 27970
},
{
"epoch": 8.836136776435284,
"grad_norm": 0.04509133436808405,
"learning_rate": 8.102114456344145e-05,
"loss": 2.4121,
"step": 27975
},
{
"epoch": 8.837716180999763,
"grad_norm": 0.046277123819228494,
"learning_rate": 8.080384505548156e-05,
"loss": 2.4586,
"step": 27980
},
{
"epoch": 8.839295585564242,
"grad_norm": 0.04643289229968356,
"learning_rate": 8.058682506919945e-05,
"loss": 2.3555,
"step": 27985
},
{
"epoch": 8.840874990128722,
"grad_norm": 0.043840976748217575,
"learning_rate": 8.037008467058949e-05,
"loss": 2.389,
"step": 27990
},
{
"epoch": 8.8424543946932,
"grad_norm": 0.04452475438728594,
"learning_rate": 8.015362392556114e-05,
"loss": 2.4102,
"step": 27995
},
{
"epoch": 8.84403379925768,
"grad_norm": 0.04296832445148172,
"learning_rate": 7.993744289993876e-05,
"loss": 2.399,
"step": 28000
},
{
"epoch": 8.84561320382216,
"grad_norm": 0.042633473559069476,
"learning_rate": 7.972154165946155e-05,
"loss": 2.3855,
"step": 28005
},
{
"epoch": 8.847192608386639,
"grad_norm": 0.04437673113217061,
"learning_rate": 7.950592026978376e-05,
"loss": 2.2616,
"step": 28010
},
{
"epoch": 8.848772012951118,
"grad_norm": 0.04398465278826395,
"learning_rate": 7.929057879647416e-05,
"loss": 2.3888,
"step": 28015
},
{
"epoch": 8.850351417515597,
"grad_norm": 0.042152276865596444,
"learning_rate": 7.907551730501717e-05,
"loss": 2.4197,
"step": 28020
},
{
"epoch": 8.851930822080076,
"grad_norm": 0.046600110064270915,
"learning_rate": 7.886073586081133e-05,
"loss": 2.3751,
"step": 28025
},
{
"epoch": 8.853510226644556,
"grad_norm": 0.04043326347735244,
"learning_rate": 7.86462345291703e-05,
"loss": 2.3082,
"step": 28030
},
{
"epoch": 8.855089631209035,
"grad_norm": 0.041788048853306614,
"learning_rate": 7.843201337532291e-05,
"loss": 2.3259,
"step": 28035
},
{
"epoch": 8.856669035773514,
"grad_norm": 0.047200211239369405,
"learning_rate": 7.821807246441193e-05,
"loss": 2.3561,
"step": 28040
},
{
"epoch": 8.858248440337993,
"grad_norm": 0.05771240570091253,
"learning_rate": 7.800441186149598e-05,
"loss": 2.467,
"step": 28045
},
{
"epoch": 8.859827844902473,
"grad_norm": 0.0444103647842252,
"learning_rate": 7.779103163154755e-05,
"loss": 2.3739,
"step": 28050
},
{
"epoch": 8.86140724946695,
"grad_norm": 0.048050966507214264,
"learning_rate": 7.757793183945394e-05,
"loss": 2.3261,
"step": 28055
},
{
"epoch": 8.862986654031431,
"grad_norm": 0.053344791542482436,
"learning_rate": 7.736511255001799e-05,
"loss": 2.3812,
"step": 28060
},
{
"epoch": 8.864566058595909,
"grad_norm": 0.04263266640138856,
"learning_rate": 7.715257382795626e-05,
"loss": 2.4577,
"step": 28065
},
{
"epoch": 8.866145463160388,
"grad_norm": 0.04611157933668599,
"learning_rate": 7.694031573790073e-05,
"loss": 2.3725,
"step": 28070
},
{
"epoch": 8.867724867724867,
"grad_norm": 0.04356758179219666,
"learning_rate": 7.672833834439763e-05,
"loss": 2.3687,
"step": 28075
},
{
"epoch": 8.869304272289346,
"grad_norm": 0.04474931155565027,
"learning_rate": 7.651664171190764e-05,
"loss": 2.3559,
"step": 28080
},
{
"epoch": 8.870883676853826,
"grad_norm": 0.0457923641571006,
"learning_rate": 7.630522590480693e-05,
"loss": 2.4139,
"step": 28085
},
{
"epoch": 8.872463081418305,
"grad_norm": 0.04370572522376028,
"learning_rate": 7.609409098738518e-05,
"loss": 2.3599,
"step": 28090
},
{
"epoch": 8.874042485982784,
"grad_norm": 0.042834939875349386,
"learning_rate": 7.588323702384747e-05,
"loss": 2.3194,
"step": 28095
},
{
"epoch": 8.875621890547263,
"grad_norm": 0.045218227911846244,
"learning_rate": 7.567266407831308e-05,
"loss": 2.3007,
"step": 28100
},
{
"epoch": 8.877201295111742,
"grad_norm": 0.04422452601325681,
"learning_rate": 7.546237221481567e-05,
"loss": 2.324,
"step": 28105
},
{
"epoch": 8.878780699676222,
"grad_norm": 0.041829744755636056,
"learning_rate": 7.525236149730396e-05,
"loss": 2.459,
"step": 28110
},
{
"epoch": 8.880360104240701,
"grad_norm": 0.04451605381016142,
"learning_rate": 7.504263198964057e-05,
"loss": 2.4524,
"step": 28115
},
{
"epoch": 8.88193950880518,
"grad_norm": 0.04376098243257381,
"learning_rate": 7.483318375560322e-05,
"loss": 2.3056,
"step": 28120
},
{
"epoch": 8.88351891336966,
"grad_norm": 0.04964806600970924,
"learning_rate": 7.462401685888364e-05,
"loss": 2.4674,
"step": 28125
},
{
"epoch": 8.885098317934139,
"grad_norm": 0.04661754849539569,
"learning_rate": 7.441513136308809e-05,
"loss": 2.4871,
"step": 28130
},
{
"epoch": 8.886677722498618,
"grad_norm": 0.05142264274006399,
"learning_rate": 7.42065273317376e-05,
"loss": 2.3739,
"step": 28135
},
{
"epoch": 8.888257127063097,
"grad_norm": 0.04710796769977028,
"learning_rate": 7.399820482826692e-05,
"loss": 2.3984,
"step": 28140
},
{
"epoch": 8.889836531627576,
"grad_norm": 0.039886449765171465,
"learning_rate": 7.379016391602555e-05,
"loss": 2.3704,
"step": 28145
},
{
"epoch": 8.891415936192056,
"grad_norm": 0.04349712073760303,
"learning_rate": 7.358240465827793e-05,
"loss": 2.4033,
"step": 28150
},
{
"epoch": 8.892995340756535,
"grad_norm": 0.044988138574963435,
"learning_rate": 7.33749271182017e-05,
"loss": 2.3507,
"step": 28155
},
{
"epoch": 8.894574745321014,
"grad_norm": 0.05141158532945691,
"learning_rate": 7.316773135888999e-05,
"loss": 2.448,
"step": 28160
},
{
"epoch": 8.896154149885493,
"grad_norm": 0.04263475877598905,
"learning_rate": 7.296081744334948e-05,
"loss": 2.354,
"step": 28165
},
{
"epoch": 8.897733554449973,
"grad_norm": 0.04428641248965364,
"learning_rate": 7.275418543450118e-05,
"loss": 2.4402,
"step": 28170
},
{
"epoch": 8.899312959014452,
"grad_norm": 0.0430606727234692,
"learning_rate": 7.254783539518095e-05,
"loss": 2.3231,
"step": 28175
},
{
"epoch": 8.900892363578931,
"grad_norm": 0.04384537450340839,
"learning_rate": 7.234176738813824e-05,
"loss": 2.4522,
"step": 28180
},
{
"epoch": 8.90247176814341,
"grad_norm": 0.04473594645809847,
"learning_rate": 7.213598147603717e-05,
"loss": 2.3859,
"step": 28185
},
{
"epoch": 8.90405117270789,
"grad_norm": 0.047074046223589794,
"learning_rate": 7.193047772145588e-05,
"loss": 2.3877,
"step": 28190
},
{
"epoch": 8.905630577272369,
"grad_norm": 0.04426029183417958,
"learning_rate": 7.172525618688641e-05,
"loss": 2.4467,
"step": 28195
},
{
"epoch": 8.907209981836848,
"grad_norm": 0.049948508073831714,
"learning_rate": 7.152031693473594e-05,
"loss": 2.3481,
"step": 28200
},
{
"epoch": 8.908789386401327,
"grad_norm": 0.03971916438871055,
"learning_rate": 7.131566002732459e-05,
"loss": 2.4169,
"step": 28205
},
{
"epoch": 8.910368790965807,
"grad_norm": 0.04655908358862637,
"learning_rate": 7.111128552688773e-05,
"loss": 2.3672,
"step": 28210
},
{
"epoch": 8.911948195530286,
"grad_norm": 0.041045892957907094,
"learning_rate": 7.090719349557406e-05,
"loss": 2.5533,
"step": 28215
},
{
"epoch": 8.913527600094763,
"grad_norm": 0.04753375969385832,
"learning_rate": 7.070338399544662e-05,
"loss": 2.3752,
"step": 28220
},
{
"epoch": 8.915107004659243,
"grad_norm": 0.04521840246370948,
"learning_rate": 7.049985708848294e-05,
"loss": 2.3799,
"step": 28225
},
{
"epoch": 8.916686409223722,
"grad_norm": 0.04107290690489171,
"learning_rate": 7.029661283657385e-05,
"loss": 2.3567,
"step": 28230
},
{
"epoch": 8.918265813788201,
"grad_norm": 0.039786480821288654,
"learning_rate": 7.009365130152456e-05,
"loss": 2.3903,
"step": 28235
},
{
"epoch": 8.91984521835268,
"grad_norm": 0.04278558574535202,
"learning_rate": 6.989097254505473e-05,
"loss": 2.3218,
"step": 28240
},
{
"epoch": 8.92142462291716,
"grad_norm": 0.04404789760974957,
"learning_rate": 6.968857662879735e-05,
"loss": 2.34,
"step": 28245
},
{
"epoch": 8.923004027481639,
"grad_norm": 0.04698598034068341,
"learning_rate": 6.948646361430011e-05,
"loss": 2.4482,
"step": 28250
},
{
"epoch": 8.924583432046118,
"grad_norm": 0.04197192304493057,
"learning_rate": 6.928463356302395e-05,
"loss": 2.4799,
"step": 28255
},
{
"epoch": 8.926162836610597,
"grad_norm": 0.04842250969939549,
"learning_rate": 6.908308653634421e-05,
"loss": 2.3636,
"step": 28260
},
{
"epoch": 8.927742241175077,
"grad_norm": 0.04541276885376044,
"learning_rate": 6.888182259555009e-05,
"loss": 2.3819,
"step": 28265
},
{
"epoch": 8.929321645739556,
"grad_norm": 0.0468123822660515,
"learning_rate": 6.868084180184476e-05,
"loss": 2.3585,
"step": 28270
},
{
"epoch": 8.930901050304035,
"grad_norm": 0.0465213220810895,
"learning_rate": 6.848014421634497e-05,
"loss": 2.4015,
"step": 28275
},
{
"epoch": 8.932480454868514,
"grad_norm": 0.043897267262942054,
"learning_rate": 6.827972990008169e-05,
"loss": 2.4203,
"step": 28280
},
{
"epoch": 8.934059859432994,
"grad_norm": 0.040792885485446226,
"learning_rate": 6.807959891399951e-05,
"loss": 2.266,
"step": 28285
},
{
"epoch": 8.935639263997473,
"grad_norm": 0.042875993149876494,
"learning_rate": 6.787975131895718e-05,
"loss": 2.3575,
"step": 28290
},
{
"epoch": 8.937218668561952,
"grad_norm": 0.045745682590700325,
"learning_rate": 6.768018717572699e-05,
"loss": 2.4595,
"step": 28295
},
{
"epoch": 8.938798073126431,
"grad_norm": 0.04342424753009981,
"learning_rate": 6.748090654499517e-05,
"loss": 2.5196,
"step": 28300
},
{
"epoch": 8.94037747769091,
"grad_norm": 0.04334245849721706,
"learning_rate": 6.728190948736157e-05,
"loss": 2.408,
"step": 28305
},
{
"epoch": 8.94195688225539,
"grad_norm": 0.05159437317808072,
"learning_rate": 6.708319606334001e-05,
"loss": 2.4762,
"step": 28310
},
{
"epoch": 8.94353628681987,
"grad_norm": 0.043809806637989064,
"learning_rate": 6.688476633335816e-05,
"loss": 2.4506,
"step": 28315
},
{
"epoch": 8.945115691384348,
"grad_norm": 0.04417766741026222,
"learning_rate": 6.668662035775675e-05,
"loss": 2.4299,
"step": 28320
},
{
"epoch": 8.946695095948828,
"grad_norm": 0.04224320060040361,
"learning_rate": 6.648875819679112e-05,
"loss": 2.4565,
"step": 28325
},
{
"epoch": 8.948274500513307,
"grad_norm": 0.060494669613537645,
"learning_rate": 6.629117991062972e-05,
"loss": 2.4756,
"step": 28330
},
{
"epoch": 8.949853905077786,
"grad_norm": 0.047902803502330005,
"learning_rate": 6.60938855593548e-05,
"loss": 2.4002,
"step": 28335
},
{
"epoch": 8.951433309642265,
"grad_norm": 0.047155739020529386,
"learning_rate": 6.58968752029625e-05,
"loss": 2.3659,
"step": 28340
},
{
"epoch": 8.953012714206745,
"grad_norm": 0.04418604212975609,
"learning_rate": 6.570014890136223e-05,
"loss": 2.4233,
"step": 28345
},
{
"epoch": 8.954592118771224,
"grad_norm": 0.04267468340802014,
"learning_rate": 6.550370671437722e-05,
"loss": 2.3731,
"step": 28350
},
{
"epoch": 8.956171523335703,
"grad_norm": 0.04434042419174162,
"learning_rate": 6.530754870174448e-05,
"loss": 2.4518,
"step": 28355
},
{
"epoch": 8.957750927900182,
"grad_norm": 0.03954702637722497,
"learning_rate": 6.51116749231142e-05,
"loss": 2.3943,
"step": 28360
},
{
"epoch": 8.959330332464662,
"grad_norm": 0.04657808267518389,
"learning_rate": 6.49160854380505e-05,
"loss": 2.3881,
"step": 28365
},
{
"epoch": 8.96090973702914,
"grad_norm": 0.04487094095565116,
"learning_rate": 6.472078030603079e-05,
"loss": 2.3503,
"step": 28370
},
{
"epoch": 8.962489141593618,
"grad_norm": 0.043540370969267606,
"learning_rate": 6.45257595864459e-05,
"loss": 2.3871,
"step": 28375
},
{
"epoch": 8.9640685461581,
"grad_norm": 0.04566018129093167,
"learning_rate": 6.433102333860075e-05,
"loss": 2.4074,
"step": 28380
},
{
"epoch": 8.965647950722577,
"grad_norm": 0.04158628696039653,
"learning_rate": 6.413657162171316e-05,
"loss": 2.3664,
"step": 28385
},
{
"epoch": 8.967227355287056,
"grad_norm": 0.055477619760494895,
"learning_rate": 6.394240449491496e-05,
"loss": 2.3784,
"step": 28390
},
{
"epoch": 8.968806759851535,
"grad_norm": 0.041267318230141375,
"learning_rate": 6.374852201725078e-05,
"loss": 2.451,
"step": 28395
},
{
"epoch": 8.970386164416015,
"grad_norm": 0.04433084927731949,
"learning_rate": 6.355492424767906e-05,
"loss": 2.3834,
"step": 28400
},
{
"epoch": 8.971965568980494,
"grad_norm": 0.042990007247864004,
"learning_rate": 6.336161124507211e-05,
"loss": 2.3596,
"step": 28405
},
{
"epoch": 8.973544973544973,
"grad_norm": 0.04170281731718802,
"learning_rate": 6.31685830682145e-05,
"loss": 2.3039,
"step": 28410
},
{
"epoch": 8.975124378109452,
"grad_norm": 0.04574421635384774,
"learning_rate": 6.297583977580534e-05,
"loss": 2.3449,
"step": 28415
},
{
"epoch": 8.976703782673932,
"grad_norm": 0.04389535434415798,
"learning_rate": 6.278338142645657e-05,
"loss": 2.4148,
"step": 28420
},
{
"epoch": 8.97828318723841,
"grad_norm": 0.04570332578078057,
"learning_rate": 6.259120807869323e-05,
"loss": 2.408,
"step": 28425
},
{
"epoch": 8.97986259180289,
"grad_norm": 0.03850813327755201,
"learning_rate": 6.239931979095436e-05,
"loss": 2.3837,
"step": 28430
},
{
"epoch": 8.98144199636737,
"grad_norm": 0.053194674215291324,
"learning_rate": 6.220771662159175e-05,
"loss": 2.4011,
"step": 28435
},
{
"epoch": 8.983021400931849,
"grad_norm": 0.04013136467677291,
"learning_rate": 6.201639862887098e-05,
"loss": 2.3831,
"step": 28440
},
{
"epoch": 8.984600805496328,
"grad_norm": 0.05028889901043422,
"learning_rate": 6.182536587097043e-05,
"loss": 2.3901,
"step": 28445
},
{
"epoch": 8.986180210060807,
"grad_norm": 0.04601368636372297,
"learning_rate": 6.163461840598183e-05,
"loss": 2.4495,
"step": 28450
},
{
"epoch": 8.987759614625286,
"grad_norm": 0.04871419340899612,
"learning_rate": 6.144415629191058e-05,
"loss": 2.4751,
"step": 28455
},
{
"epoch": 8.989339019189766,
"grad_norm": 0.04013116759332213,
"learning_rate": 6.125397958667467e-05,
"loss": 2.3162,
"step": 28460
},
{
"epoch": 8.990918423754245,
"grad_norm": 0.043904232486768474,
"learning_rate": 6.106408834810562e-05,
"loss": 2.3792,
"step": 28465
},
{
"epoch": 8.992497828318724,
"grad_norm": 0.04647239091746081,
"learning_rate": 6.087448263394846e-05,
"loss": 2.3962,
"step": 28470
},
{
"epoch": 8.994077232883203,
"grad_norm": 0.04164440679482371,
"learning_rate": 6.0685162501860735e-05,
"loss": 2.3796,
"step": 28475
},
{
"epoch": 8.995656637447683,
"grad_norm": 0.04287185734386562,
"learning_rate": 6.0496128009413845e-05,
"loss": 2.3395,
"step": 28480
},
{
"epoch": 8.997236042012162,
"grad_norm": 0.04134370498230083,
"learning_rate": 6.0307379214091684e-05,
"loss": 2.3704,
"step": 28485
},
{
"epoch": 8.998815446576641,
"grad_norm": 0.041497695897780615,
"learning_rate": 6.011891617329146e-05,
"loss": 2.3183,
"step": 28490
},
{
"epoch": 9.0,
"eval_loss": 2.398218870162964,
"eval_runtime": 118.6226,
"eval_samples_per_second": 22.331,
"eval_steps_per_second": 5.589,
"step": 28494
},
{
"epoch": 9.000315880912895,
"grad_norm": 0.05539895210854539,
"learning_rate": 5.993073894432421e-05,
"loss": 2.3754,
"step": 28495
},
{
"epoch": 9.001895285477374,
"grad_norm": 0.04123702262548509,
"learning_rate": 5.9742847584412505e-05,
"loss": 2.3419,
"step": 28500
},
{
"epoch": 9.003474690041854,
"grad_norm": 0.04271805382790392,
"learning_rate": 5.9555242150693636e-05,
"loss": 2.4621,
"step": 28505
},
{
"epoch": 9.005054094606333,
"grad_norm": 0.040627830008919785,
"learning_rate": 5.936792270021696e-05,
"loss": 2.4283,
"step": 28510
},
{
"epoch": 9.006633499170812,
"grad_norm": 0.04332750417326001,
"learning_rate": 5.918088928994492e-05,
"loss": 2.3038,
"step": 28515
},
{
"epoch": 9.008212903735291,
"grad_norm": 0.04885440830855184,
"learning_rate": 5.899414197675357e-05,
"loss": 2.4495,
"step": 28520
},
{
"epoch": 9.00979230829977,
"grad_norm": 0.044867524974600684,
"learning_rate": 5.880768081743126e-05,
"loss": 2.4239,
"step": 28525
},
{
"epoch": 9.01137171286425,
"grad_norm": 0.04523618610822175,
"learning_rate": 5.862150586867998e-05,
"loss": 2.3923,
"step": 28530
},
{
"epoch": 9.01295111742873,
"grad_norm": 0.04445363859267257,
"learning_rate": 5.843561718711399e-05,
"loss": 2.4436,
"step": 28535
},
{
"epoch": 9.014530521993208,
"grad_norm": 0.03926001759058027,
"learning_rate": 5.825001482926107e-05,
"loss": 2.3301,
"step": 28540
},
{
"epoch": 9.016109926557688,
"grad_norm": 0.04244760575594229,
"learning_rate": 5.806469885156163e-05,
"loss": 2.367,
"step": 28545
},
{
"epoch": 9.017689331122167,
"grad_norm": 0.043163584829729325,
"learning_rate": 5.787966931036892e-05,
"loss": 2.4273,
"step": 28550
},
{
"epoch": 9.019268735686646,
"grad_norm": 0.04168211336664651,
"learning_rate": 5.76949262619495e-05,
"loss": 2.3911,
"step": 28555
},
{
"epoch": 9.020848140251125,
"grad_norm": 0.04123821904881108,
"learning_rate": 5.751046976248253e-05,
"loss": 2.4215,
"step": 28560
},
{
"epoch": 9.022427544815605,
"grad_norm": 0.0478115894593452,
"learning_rate": 5.732629986805982e-05,
"loss": 2.3722,
"step": 28565
},
{
"epoch": 9.024006949380084,
"grad_norm": 0.049719193458654574,
"learning_rate": 5.7142416634686443e-05,
"loss": 2.3503,
"step": 28570
},
{
"epoch": 9.025586353944563,
"grad_norm": 0.03781963748589037,
"learning_rate": 5.695882011828024e-05,
"loss": 2.4532,
"step": 28575
},
{
"epoch": 9.027165758509042,
"grad_norm": 0.03952041594418926,
"learning_rate": 5.677551037467132e-05,
"loss": 2.3267,
"step": 28580
},
{
"epoch": 9.028745163073522,
"grad_norm": 0.042776444291402475,
"learning_rate": 5.659248745960366e-05,
"loss": 2.4848,
"step": 28585
},
{
"epoch": 9.030324567638,
"grad_norm": 0.04458128838563757,
"learning_rate": 5.6409751428732613e-05,
"loss": 2.362,
"step": 28590
},
{
"epoch": 9.03190397220248,
"grad_norm": 0.040324798745327996,
"learning_rate": 5.622730233762752e-05,
"loss": 2.375,
"step": 28595
},
{
"epoch": 9.03348337676696,
"grad_norm": 0.03821938744796752,
"learning_rate": 5.6045140241769874e-05,
"loss": 2.446,
"step": 28600
},
{
"epoch": 9.035062781331439,
"grad_norm": 0.04090154449247851,
"learning_rate": 5.586326519655383e-05,
"loss": 2.401,
"step": 28605
},
{
"epoch": 9.036642185895918,
"grad_norm": 0.04416521736060769,
"learning_rate": 5.568167725728679e-05,
"loss": 2.3723,
"step": 28610
},
{
"epoch": 9.038221590460397,
"grad_norm": 0.03951361589992286,
"learning_rate": 5.550037647918804e-05,
"loss": 2.2889,
"step": 28615
},
{
"epoch": 9.039800995024876,
"grad_norm": 0.04508963694263608,
"learning_rate": 5.531936291739037e-05,
"loss": 2.3452,
"step": 28620
},
{
"epoch": 9.041380399589356,
"grad_norm": 0.04172239277099709,
"learning_rate": 5.513863662693874e-05,
"loss": 2.4078,
"step": 28625
},
{
"epoch": 9.042959804153835,
"grad_norm": 0.04164906108837666,
"learning_rate": 5.4958197662790864e-05,
"loss": 2.3829,
"step": 28630
},
{
"epoch": 9.044539208718314,
"grad_norm": 0.04207127374846404,
"learning_rate": 5.477804607981707e-05,
"loss": 2.3591,
"step": 28635
},
{
"epoch": 9.046118613282792,
"grad_norm": 0.03840138408108005,
"learning_rate": 5.4598181932799976e-05,
"loss": 2.4897,
"step": 28640
},
{
"epoch": 9.04769801784727,
"grad_norm": 0.0426254394099393,
"learning_rate": 5.4418605276435716e-05,
"loss": 2.3705,
"step": 28645
},
{
"epoch": 9.04927742241175,
"grad_norm": 0.04431027410454712,
"learning_rate": 5.423931616533207e-05,
"loss": 2.3392,
"step": 28650
},
{
"epoch": 9.05085682697623,
"grad_norm": 0.0400975127964107,
"learning_rate": 5.4060314654009514e-05,
"loss": 2.3702,
"step": 28655
},
{
"epoch": 9.052436231540709,
"grad_norm": 0.049244319688171286,
"learning_rate": 5.388160079690174e-05,
"loss": 2.4115,
"step": 28660
},
{
"epoch": 9.054015636105188,
"grad_norm": 0.042487722963328124,
"learning_rate": 5.370317464835406e-05,
"loss": 2.3407,
"step": 28665
},
{
"epoch": 9.055595040669667,
"grad_norm": 0.0418498746620096,
"learning_rate": 5.352503626262506e-05,
"loss": 2.4438,
"step": 28670
},
{
"epoch": 9.057174445234146,
"grad_norm": 0.03910125985395159,
"learning_rate": 5.3347185693885415e-05,
"loss": 2.3736,
"step": 28675
},
{
"epoch": 9.058753849798626,
"grad_norm": 0.04285573909389075,
"learning_rate": 5.316962299621808e-05,
"loss": 2.3383,
"step": 28680
},
{
"epoch": 9.060333254363105,
"grad_norm": 0.04284186987031212,
"learning_rate": 5.299234822361898e-05,
"loss": 2.3175,
"step": 28685
},
{
"epoch": 9.061912658927584,
"grad_norm": 0.04475347292548298,
"learning_rate": 5.281536142999621e-05,
"loss": 2.3799,
"step": 28690
},
{
"epoch": 9.063492063492063,
"grad_norm": 0.05419028033988098,
"learning_rate": 5.2638662669170276e-05,
"loss": 2.4104,
"step": 28695
},
{
"epoch": 9.065071468056543,
"grad_norm": 0.04581772934670987,
"learning_rate": 5.24622519948742e-05,
"loss": 2.3238,
"step": 28700
},
{
"epoch": 9.066650872621022,
"grad_norm": 0.044180637089044926,
"learning_rate": 5.2286129460753174e-05,
"loss": 2.4376,
"step": 28705
},
{
"epoch": 9.068230277185501,
"grad_norm": 0.04129274745161341,
"learning_rate": 5.211029512036514e-05,
"loss": 2.3849,
"step": 28710
},
{
"epoch": 9.06980968174998,
"grad_norm": 0.04136017192879712,
"learning_rate": 5.1934749027180206e-05,
"loss": 2.3897,
"step": 28715
},
{
"epoch": 9.07138908631446,
"grad_norm": 0.04171840858660408,
"learning_rate": 5.1759491234580794e-05,
"loss": 2.4476,
"step": 28720
},
{
"epoch": 9.072968490878939,
"grad_norm": 0.04140480744152659,
"learning_rate": 5.158452179586148e-05,
"loss": 2.3982,
"step": 28725
},
{
"epoch": 9.074547895443418,
"grad_norm": 0.046919517800492944,
"learning_rate": 5.1409840764229385e-05,
"loss": 2.3654,
"step": 28730
},
{
"epoch": 9.076127300007897,
"grad_norm": 0.045881106496820825,
"learning_rate": 5.1235448192804233e-05,
"loss": 2.4369,
"step": 28735
},
{
"epoch": 9.077706704572376,
"grad_norm": 0.04218174046617552,
"learning_rate": 5.106134413461738e-05,
"loss": 2.3918,
"step": 28740
},
{
"epoch": 9.079286109136856,
"grad_norm": 0.03948596879165074,
"learning_rate": 5.08875286426127e-05,
"loss": 2.3829,
"step": 28745
},
{
"epoch": 9.080865513701335,
"grad_norm": 0.054065235496467405,
"learning_rate": 5.071400176964669e-05,
"loss": 2.3362,
"step": 28750
},
{
"epoch": 9.082444918265814,
"grad_norm": 0.050348006713228734,
"learning_rate": 5.054076356848747e-05,
"loss": 2.2797,
"step": 28755
},
{
"epoch": 9.084024322830293,
"grad_norm": 0.039850146641120986,
"learning_rate": 5.03678140918159e-05,
"loss": 2.3972,
"step": 28760
},
{
"epoch": 9.085603727394773,
"grad_norm": 0.04094970802271905,
"learning_rate": 5.01951533922248e-05,
"loss": 2.3674,
"step": 28765
},
{
"epoch": 9.087183131959252,
"grad_norm": 0.044132106892893985,
"learning_rate": 5.0022781522218844e-05,
"loss": 2.432,
"step": 28770
},
{
"epoch": 9.088762536523731,
"grad_norm": 0.042469894371006324,
"learning_rate": 4.985069853421553e-05,
"loss": 2.4111,
"step": 28775
},
{
"epoch": 9.09034194108821,
"grad_norm": 0.04143817516641654,
"learning_rate": 4.9678904480544126e-05,
"loss": 2.3979,
"step": 28780
},
{
"epoch": 9.09192134565269,
"grad_norm": 0.04923096244123163,
"learning_rate": 4.950739941344606e-05,
"loss": 2.3604,
"step": 28785
},
{
"epoch": 9.093500750217169,
"grad_norm": 0.06095904883104004,
"learning_rate": 4.933618338507506e-05,
"loss": 2.3528,
"step": 28790
},
{
"epoch": 9.095080154781646,
"grad_norm": 0.04419883584848066,
"learning_rate": 4.916525644749659e-05,
"loss": 2.455,
"step": 28795
},
{
"epoch": 9.096659559346126,
"grad_norm": 0.04314787308438026,
"learning_rate": 4.899461865268873e-05,
"loss": 2.3835,
"step": 28800
},
{
"epoch": 9.098238963910605,
"grad_norm": 0.042573988537507504,
"learning_rate": 4.88242700525412e-05,
"loss": 2.3775,
"step": 28805
},
{
"epoch": 9.099818368475084,
"grad_norm": 0.04207029225332564,
"learning_rate": 4.86542106988559e-05,
"loss": 2.3409,
"step": 28810
},
{
"epoch": 9.101397773039563,
"grad_norm": 0.0471923165839834,
"learning_rate": 4.848444064334678e-05,
"loss": 2.3384,
"step": 28815
},
{
"epoch": 9.102977177604043,
"grad_norm": 0.04401213089781764,
"learning_rate": 4.831495993763968e-05,
"loss": 2.4127,
"step": 28820
},
{
"epoch": 9.104556582168522,
"grad_norm": 0.049263828477959626,
"learning_rate": 4.8145768633273024e-05,
"loss": 2.3605,
"step": 28825
},
{
"epoch": 9.106135986733001,
"grad_norm": 0.03914847392450638,
"learning_rate": 4.797686678169655e-05,
"loss": 2.4509,
"step": 28830
},
{
"epoch": 9.10771539129748,
"grad_norm": 0.044789539216387265,
"learning_rate": 4.780825443427206e-05,
"loss": 2.3187,
"step": 28835
},
{
"epoch": 9.10929479586196,
"grad_norm": 0.047710012261827764,
"learning_rate": 4.763993164227387e-05,
"loss": 2.3875,
"step": 28840
},
{
"epoch": 9.110874200426439,
"grad_norm": 0.042945195674350796,
"learning_rate": 4.747189845688749e-05,
"loss": 2.3657,
"step": 28845
},
{
"epoch": 9.112453604990918,
"grad_norm": 0.03912386907019885,
"learning_rate": 4.730415492921103e-05,
"loss": 2.4348,
"step": 28850
},
{
"epoch": 9.114033009555397,
"grad_norm": 0.047292153040255004,
"learning_rate": 4.7136701110254255e-05,
"loss": 2.3042,
"step": 28855
},
{
"epoch": 9.115612414119877,
"grad_norm": 0.04365479795989776,
"learning_rate": 4.6969537050938426e-05,
"loss": 2.3098,
"step": 28860
},
{
"epoch": 9.117191818684356,
"grad_norm": 0.040473229546279604,
"learning_rate": 4.680266280209744e-05,
"loss": 2.3938,
"step": 28865
},
{
"epoch": 9.118771223248835,
"grad_norm": 0.04031988130048521,
"learning_rate": 4.663607841447637e-05,
"loss": 2.3472,
"step": 28870
},
{
"epoch": 9.120350627813314,
"grad_norm": 0.040675601722479415,
"learning_rate": 4.64697839387328e-05,
"loss": 2.493,
"step": 28875
},
{
"epoch": 9.121930032377794,
"grad_norm": 0.04107817503894481,
"learning_rate": 4.6303779425435625e-05,
"loss": 2.3484,
"step": 28880
},
{
"epoch": 9.123509436942273,
"grad_norm": 0.04263272398412405,
"learning_rate": 4.613806492506567e-05,
"loss": 2.3788,
"step": 28885
},
{
"epoch": 9.125088841506752,
"grad_norm": 0.04711185197888553,
"learning_rate": 4.597264048801597e-05,
"loss": 2.5292,
"step": 28890
},
{
"epoch": 9.126668246071231,
"grad_norm": 0.0535459189393864,
"learning_rate": 4.580750616459084e-05,
"loss": 2.5002,
"step": 28895
},
{
"epoch": 9.12824765063571,
"grad_norm": 0.052533700604335064,
"learning_rate": 4.564266200500655e-05,
"loss": 2.3703,
"step": 28900
},
{
"epoch": 9.12982705520019,
"grad_norm": 0.05082432610328768,
"learning_rate": 4.547810805939112e-05,
"loss": 2.3579,
"step": 28905
},
{
"epoch": 9.13140645976467,
"grad_norm": 0.04111080342238679,
"learning_rate": 4.5313844377784406e-05,
"loss": 2.382,
"step": 28910
},
{
"epoch": 9.132985864329148,
"grad_norm": 0.03809082206329682,
"learning_rate": 4.514987101013801e-05,
"loss": 2.3631,
"step": 28915
},
{
"epoch": 9.134565268893628,
"grad_norm": 0.043926550953605356,
"learning_rate": 4.498618800631515e-05,
"loss": 2.3854,
"step": 28920
},
{
"epoch": 9.136144673458107,
"grad_norm": 0.045335341408915215,
"learning_rate": 4.482279541609069e-05,
"loss": 2.4797,
"step": 28925
},
{
"epoch": 9.137724078022586,
"grad_norm": 0.0399260035243842,
"learning_rate": 4.465969328915142e-05,
"loss": 2.3646,
"step": 28930
},
{
"epoch": 9.139303482587065,
"grad_norm": 0.0453549885770251,
"learning_rate": 4.449688167509547e-05,
"loss": 2.4305,
"step": 28935
},
{
"epoch": 9.140882887151545,
"grad_norm": 0.048491122344655156,
"learning_rate": 4.433436062343299e-05,
"loss": 2.3452,
"step": 28940
},
{
"epoch": 9.142462291716024,
"grad_norm": 0.10868317438250054,
"learning_rate": 4.417213018358579e-05,
"loss": 2.3084,
"step": 28945
},
{
"epoch": 9.144041696280503,
"grad_norm": 0.050474003875059764,
"learning_rate": 4.401019040488652e-05,
"loss": 2.3653,
"step": 28950
},
{
"epoch": 9.14562110084498,
"grad_norm": 0.043321815901163206,
"learning_rate": 4.384854133658045e-05,
"loss": 2.3629,
"step": 28955
},
{
"epoch": 9.14720050540946,
"grad_norm": 0.05295842618436736,
"learning_rate": 4.368718302782382e-05,
"loss": 2.4341,
"step": 28960
},
{
"epoch": 9.148779909973939,
"grad_norm": 0.0497329088476317,
"learning_rate": 4.352611552768493e-05,
"loss": 2.3811,
"step": 28965
},
{
"epoch": 9.150359314538418,
"grad_norm": 0.06417418384325152,
"learning_rate": 4.336533888514327e-05,
"loss": 2.3073,
"step": 28970
},
{
"epoch": 9.151938719102898,
"grad_norm": 0.04820864210283732,
"learning_rate": 4.320485314908973e-05,
"loss": 2.3611,
"step": 28975
},
{
"epoch": 9.153518123667377,
"grad_norm": 0.04434132639117964,
"learning_rate": 4.304465836832738e-05,
"loss": 2.3873,
"step": 28980
},
{
"epoch": 9.155097528231856,
"grad_norm": 0.04215533152988047,
"learning_rate": 4.2884754591570264e-05,
"loss": 2.3061,
"step": 28985
},
{
"epoch": 9.156676932796335,
"grad_norm": 0.04633299675860233,
"learning_rate": 4.272514186744414e-05,
"loss": 2.4238,
"step": 28990
},
{
"epoch": 9.158256337360815,
"grad_norm": 0.04476402273132824,
"learning_rate": 4.256582024448608e-05,
"loss": 2.5064,
"step": 28995
},
{
"epoch": 9.159835741925294,
"grad_norm": 0.04526344305115905,
"learning_rate": 4.240678977114487e-05,
"loss": 2.3035,
"step": 29000
},
{
"epoch": 9.161415146489773,
"grad_norm": 0.041130237054635116,
"learning_rate": 4.224805049578073e-05,
"loss": 2.4096,
"step": 29005
},
{
"epoch": 9.162994551054252,
"grad_norm": 0.04590940137847429,
"learning_rate": 4.208960246666505e-05,
"loss": 2.3711,
"step": 29010
},
{
"epoch": 9.164573955618732,
"grad_norm": 0.044620038541742534,
"learning_rate": 4.1931445731981044e-05,
"loss": 2.4342,
"step": 29015
},
{
"epoch": 9.16615336018321,
"grad_norm": 0.04449040968754132,
"learning_rate": 4.177358033982326e-05,
"loss": 2.3963,
"step": 29020
},
{
"epoch": 9.16773276474769,
"grad_norm": 0.04514262158130095,
"learning_rate": 4.1616006338197175e-05,
"loss": 2.4084,
"step": 29025
},
{
"epoch": 9.16931216931217,
"grad_norm": 0.04054822044176869,
"learning_rate": 4.145872377502047e-05,
"loss": 2.3895,
"step": 29030
},
{
"epoch": 9.170891573876649,
"grad_norm": 0.039526620752448054,
"learning_rate": 4.1301732698121654e-05,
"loss": 2.3139,
"step": 29035
},
{
"epoch": 9.172470978441128,
"grad_norm": 0.03928310439499334,
"learning_rate": 4.114503315524043e-05,
"loss": 2.4264,
"step": 29040
},
{
"epoch": 9.174050383005607,
"grad_norm": 0.045551796789566175,
"learning_rate": 4.098862519402846e-05,
"loss": 2.2744,
"step": 29045
},
{
"epoch": 9.175629787570086,
"grad_norm": 0.0399422783979937,
"learning_rate": 4.0832508862048145e-05,
"loss": 2.3409,
"step": 29050
},
{
"epoch": 9.177209192134566,
"grad_norm": 0.04924591331646857,
"learning_rate": 4.067668420677373e-05,
"loss": 2.3838,
"step": 29055
},
{
"epoch": 9.178788596699045,
"grad_norm": 0.044755165729311416,
"learning_rate": 4.052115127559031e-05,
"loss": 2.3797,
"step": 29060
},
{
"epoch": 9.180368001263524,
"grad_norm": 0.04045754005435543,
"learning_rate": 4.036591011579438e-05,
"loss": 2.3068,
"step": 29065
},
{
"epoch": 9.181947405828003,
"grad_norm": 0.04275806283767011,
"learning_rate": 4.0210960774594075e-05,
"loss": 2.4139,
"step": 29070
},
{
"epoch": 9.183526810392483,
"grad_norm": 0.04585594079134387,
"learning_rate": 4.005630329910825e-05,
"loss": 2.4098,
"step": 29075
},
{
"epoch": 9.185106214956962,
"grad_norm": 0.0638290729672531,
"learning_rate": 3.990193773636752e-05,
"loss": 2.3453,
"step": 29080
},
{
"epoch": 9.186685619521441,
"grad_norm": 0.04527464135500821,
"learning_rate": 3.974786413331311e-05,
"loss": 2.4506,
"step": 29085
},
{
"epoch": 9.18826502408592,
"grad_norm": 0.039989221818523775,
"learning_rate": 3.9594082536797974e-05,
"loss": 2.3661,
"step": 29090
},
{
"epoch": 9.1898444286504,
"grad_norm": 0.043090513322247545,
"learning_rate": 3.9440592993586264e-05,
"loss": 2.336,
"step": 29095
},
{
"epoch": 9.191423833214879,
"grad_norm": 0.04200432774320849,
"learning_rate": 3.928739555035288e-05,
"loss": 2.3503,
"step": 29100
},
{
"epoch": 9.193003237779358,
"grad_norm": 0.03999808765273191,
"learning_rate": 3.913449025368443e-05,
"loss": 2.3326,
"step": 29105
},
{
"epoch": 9.194582642343835,
"grad_norm": 0.03907144195784045,
"learning_rate": 3.898187715007839e-05,
"loss": 2.3648,
"step": 29110
},
{
"epoch": 9.196162046908315,
"grad_norm": 0.04386079230797623,
"learning_rate": 3.882955628594331e-05,
"loss": 2.4088,
"step": 29115
},
{
"epoch": 9.197741451472794,
"grad_norm": 0.053581209390653715,
"learning_rate": 3.867752770759914e-05,
"loss": 2.3855,
"step": 29120
},
{
"epoch": 9.199320856037273,
"grad_norm": 0.05346550104521183,
"learning_rate": 3.8525791461276774e-05,
"loss": 2.4107,
"step": 29125
},
{
"epoch": 9.200900260601752,
"grad_norm": 0.04310942027466358,
"learning_rate": 3.837434759311809e-05,
"loss": 2.3684,
"step": 29130
},
{
"epoch": 9.202479665166232,
"grad_norm": 0.04058309001451409,
"learning_rate": 3.822319614917647e-05,
"loss": 2.3326,
"step": 29135
},
{
"epoch": 9.204059069730711,
"grad_norm": 0.04102981134601642,
"learning_rate": 3.807233717541569e-05,
"loss": 2.3551,
"step": 29140
},
{
"epoch": 9.20563847429519,
"grad_norm": 0.047155817064088196,
"learning_rate": 3.792177071771141e-05,
"loss": 2.318,
"step": 29145
},
{
"epoch": 9.20721787885967,
"grad_norm": 0.04273745785676265,
"learning_rate": 3.777149682184977e-05,
"loss": 2.3615,
"step": 29150
},
{
"epoch": 9.208797283424149,
"grad_norm": 0.0430157506013178,
"learning_rate": 3.7621515533527995e-05,
"loss": 2.3555,
"step": 29155
},
{
"epoch": 9.210376687988628,
"grad_norm": 0.05222183607969487,
"learning_rate": 3.747182689835471e-05,
"loss": 2.3054,
"step": 29160
},
{
"epoch": 9.211956092553107,
"grad_norm": 0.04307376017354491,
"learning_rate": 3.732243096184895e-05,
"loss": 2.272,
"step": 29165
},
{
"epoch": 9.213535497117586,
"grad_norm": 0.044373358610198824,
"learning_rate": 3.717332776944138e-05,
"loss": 2.3317,
"step": 29170
},
{
"epoch": 9.215114901682066,
"grad_norm": 0.039858621168155556,
"learning_rate": 3.702451736647305e-05,
"loss": 2.4248,
"step": 29175
},
{
"epoch": 9.216694306246545,
"grad_norm": 0.04569387957148044,
"learning_rate": 3.6875999798196336e-05,
"loss": 2.4086,
"step": 29180
},
{
"epoch": 9.218273710811024,
"grad_norm": 0.03971019782674811,
"learning_rate": 3.6727775109774544e-05,
"loss": 2.3534,
"step": 29185
},
{
"epoch": 9.219853115375503,
"grad_norm": 0.04636364959204572,
"learning_rate": 3.657984334628173e-05,
"loss": 2.4084,
"step": 29190
},
{
"epoch": 9.221432519939983,
"grad_norm": 0.04392572368215338,
"learning_rate": 3.643220455270324e-05,
"loss": 2.5082,
"step": 29195
},
{
"epoch": 9.223011924504462,
"grad_norm": 0.03935989812368449,
"learning_rate": 3.6284858773934946e-05,
"loss": 2.3977,
"step": 29200
},
{
"epoch": 9.224591329068941,
"grad_norm": 0.04332327640570466,
"learning_rate": 3.613780605478367e-05,
"loss": 2.2912,
"step": 29205
},
{
"epoch": 9.22617073363342,
"grad_norm": 0.04855215380149932,
"learning_rate": 3.599104643996731e-05,
"loss": 2.389,
"step": 29210
},
{
"epoch": 9.2277501381979,
"grad_norm": 0.03795310592202636,
"learning_rate": 3.58445799741145e-05,
"loss": 2.3961,
"step": 29215
},
{
"epoch": 9.229329542762379,
"grad_norm": 0.05191156694604205,
"learning_rate": 3.569840670176483e-05,
"loss": 2.4241,
"step": 29220
},
{
"epoch": 9.230908947326858,
"grad_norm": 0.038684792584600135,
"learning_rate": 3.555252666736863e-05,
"loss": 2.4226,
"step": 29225
},
{
"epoch": 9.232488351891337,
"grad_norm": 0.04123486576408677,
"learning_rate": 3.540693991528676e-05,
"loss": 2.4001,
"step": 29230
},
{
"epoch": 9.234067756455817,
"grad_norm": 0.04405148701945233,
"learning_rate": 3.52616464897918e-05,
"loss": 2.4158,
"step": 29235
},
{
"epoch": 9.235647161020296,
"grad_norm": 0.04464129511567758,
"learning_rate": 3.511664643506618e-05,
"loss": 2.4193,
"step": 29240
},
{
"epoch": 9.237226565584775,
"grad_norm": 0.03677218014352938,
"learning_rate": 3.497193979520341e-05,
"loss": 2.3093,
"step": 29245
},
{
"epoch": 9.238805970149254,
"grad_norm": 0.04171266312585504,
"learning_rate": 3.4827526614208184e-05,
"loss": 2.4566,
"step": 29250
},
{
"epoch": 9.240385374713734,
"grad_norm": 0.04169852827156441,
"learning_rate": 3.468340693599547e-05,
"loss": 2.3478,
"step": 29255
},
{
"epoch": 9.241964779278213,
"grad_norm": 0.04334946368855338,
"learning_rate": 3.453958080439112e-05,
"loss": 2.3112,
"step": 29260
},
{
"epoch": 9.243544183842692,
"grad_norm": 0.03922094500882477,
"learning_rate": 3.43960482631317e-05,
"loss": 2.3882,
"step": 29265
},
{
"epoch": 9.24512358840717,
"grad_norm": 0.04519921017609258,
"learning_rate": 3.42528093558645e-05,
"loss": 2.316,
"step": 29270
},
{
"epoch": 9.246702992971649,
"grad_norm": 0.04997435216087838,
"learning_rate": 3.41098641261478e-05,
"loss": 2.3285,
"step": 29275
},
{
"epoch": 9.248282397536128,
"grad_norm": 0.039028010205086464,
"learning_rate": 3.396721261744995e-05,
"loss": 2.3983,
"step": 29280
},
{
"epoch": 9.249861802100607,
"grad_norm": 0.05505879845239007,
"learning_rate": 3.382485487315079e-05,
"loss": 2.4632,
"step": 29285
},
{
"epoch": 9.251441206665087,
"grad_norm": 0.04380356142493805,
"learning_rate": 3.3682790936540255e-05,
"loss": 2.3833,
"step": 29290
},
{
"epoch": 9.253020611229566,
"grad_norm": 0.04071937099385057,
"learning_rate": 3.354102085081878e-05,
"loss": 2.3134,
"step": 29295
},
{
"epoch": 9.254600015794045,
"grad_norm": 0.04483331538338372,
"learning_rate": 3.339954465909822e-05,
"loss": 2.4023,
"step": 29300
},
{
"epoch": 9.256179420358524,
"grad_norm": 0.05433343268438282,
"learning_rate": 3.325836240440028e-05,
"loss": 2.3841,
"step": 29305
},
{
"epoch": 9.257758824923004,
"grad_norm": 0.06271809829545318,
"learning_rate": 3.31174741296576e-05,
"loss": 2.3909,
"step": 29310
},
{
"epoch": 9.259338229487483,
"grad_norm": 0.055331123105307564,
"learning_rate": 3.297687987771359e-05,
"loss": 2.3095,
"step": 29315
},
{
"epoch": 9.260917634051962,
"grad_norm": 0.06144956409274985,
"learning_rate": 3.28365796913217e-05,
"loss": 2.41,
"step": 29320
},
{
"epoch": 9.262497038616441,
"grad_norm": 0.049214960201155206,
"learning_rate": 3.269657361314671e-05,
"loss": 2.4013,
"step": 29325
},
{
"epoch": 9.26407644318092,
"grad_norm": 0.042137070485659386,
"learning_rate": 3.255686168576333e-05,
"loss": 2.4245,
"step": 29330
},
{
"epoch": 9.2656558477454,
"grad_norm": 0.03725081325722173,
"learning_rate": 3.241744395165713e-05,
"loss": 2.3898,
"step": 29335
},
{
"epoch": 9.267235252309879,
"grad_norm": 0.045664163663368236,
"learning_rate": 3.227832045322432e-05,
"loss": 2.3903,
"step": 29340
},
{
"epoch": 9.268814656874358,
"grad_norm": 0.04055654534671299,
"learning_rate": 3.213949123277127e-05,
"loss": 2.4562,
"step": 29345
},
{
"epoch": 9.270394061438838,
"grad_norm": 0.0421780986676158,
"learning_rate": 3.200095633251499e-05,
"loss": 2.4214,
"step": 29350
},
{
"epoch": 9.271973466003317,
"grad_norm": 0.040088304665845904,
"learning_rate": 3.186271579458333e-05,
"loss": 2.3567,
"step": 29355
},
{
"epoch": 9.273552870567796,
"grad_norm": 0.03988299510997113,
"learning_rate": 3.172476966101401e-05,
"loss": 2.4537,
"step": 29360
},
{
"epoch": 9.275132275132275,
"grad_norm": 0.03951477475300885,
"learning_rate": 3.15871179737558e-05,
"loss": 2.4638,
"step": 29365
},
{
"epoch": 9.276711679696755,
"grad_norm": 0.03652875104171294,
"learning_rate": 3.144976077466766e-05,
"loss": 2.3528,
"step": 29370
},
{
"epoch": 9.278291084261234,
"grad_norm": 0.042400024464408444,
"learning_rate": 3.1312698105519065e-05,
"loss": 2.4876,
"step": 29375
},
{
"epoch": 9.279870488825713,
"grad_norm": 0.06535280970340411,
"learning_rate": 3.117593000798991e-05,
"loss": 2.4615,
"step": 29380
},
{
"epoch": 9.281449893390192,
"grad_norm": 0.06965542532897992,
"learning_rate": 3.1039456523670354e-05,
"loss": 2.394,
"step": 29385
},
{
"epoch": 9.283029297954672,
"grad_norm": 0.04933183325778829,
"learning_rate": 3.0903277694061206e-05,
"loss": 2.5109,
"step": 29390
},
{
"epoch": 9.28460870251915,
"grad_norm": 0.07329883949860261,
"learning_rate": 3.0767393560573676e-05,
"loss": 2.4093,
"step": 29395
},
{
"epoch": 9.28618810708363,
"grad_norm": 0.043019979665248294,
"learning_rate": 3.0631804164529155e-05,
"loss": 2.3237,
"step": 29400
},
{
"epoch": 9.28776751164811,
"grad_norm": 0.04286309010098627,
"learning_rate": 3.0496509547159546e-05,
"loss": 2.3888,
"step": 29405
},
{
"epoch": 9.289346916212589,
"grad_norm": 0.05463231724209813,
"learning_rate": 3.0361509749606942e-05,
"loss": 2.4998,
"step": 29410
},
{
"epoch": 9.290926320777068,
"grad_norm": 0.04574377988326568,
"learning_rate": 3.022680481292406e-05,
"loss": 2.5221,
"step": 29415
},
{
"epoch": 9.292505725341547,
"grad_norm": 0.055234541516584426,
"learning_rate": 3.0092394778073796e-05,
"loss": 2.361,
"step": 29420
},
{
"epoch": 9.294085129906026,
"grad_norm": 0.04377149950342692,
"learning_rate": 2.9958279685929347e-05,
"loss": 2.3823,
"step": 29425
},
{
"epoch": 9.295664534470504,
"grad_norm": 0.04641116740681584,
"learning_rate": 2.982445957727431e-05,
"loss": 2.3346,
"step": 29430
},
{
"epoch": 9.297243939034983,
"grad_norm": 0.056468153318240545,
"learning_rate": 2.969093449280258e-05,
"loss": 2.4519,
"step": 29435
},
{
"epoch": 9.298823343599462,
"grad_norm": 0.05573138933765895,
"learning_rate": 2.9557704473118117e-05,
"loss": 2.4091,
"step": 29440
},
{
"epoch": 9.300402748163942,
"grad_norm": 0.04510163972553918,
"learning_rate": 2.9424769558735297e-05,
"loss": 2.3855,
"step": 29445
},
{
"epoch": 9.30198215272842,
"grad_norm": 0.040779117496225474,
"learning_rate": 2.9292129790079004e-05,
"loss": 2.3881,
"step": 29450
},
{
"epoch": 9.3035615572929,
"grad_norm": 0.05090618230006451,
"learning_rate": 2.915978520748397e-05,
"loss": 2.3817,
"step": 29455
},
{
"epoch": 9.30514096185738,
"grad_norm": 0.07942568741459298,
"learning_rate": 2.9027735851195337e-05,
"loss": 2.3227,
"step": 29460
},
{
"epoch": 9.306720366421859,
"grad_norm": 0.04441821545230185,
"learning_rate": 2.889598176136865e-05,
"loss": 2.3473,
"step": 29465
},
{
"epoch": 9.308299770986338,
"grad_norm": 0.0642102432950859,
"learning_rate": 2.8764522978069197e-05,
"loss": 2.3395,
"step": 29470
},
{
"epoch": 9.309879175550817,
"grad_norm": 0.04734207881890283,
"learning_rate": 2.8633359541272997e-05,
"loss": 2.4116,
"step": 29475
},
{
"epoch": 9.311458580115296,
"grad_norm": 0.06115612128620437,
"learning_rate": 2.8502491490865922e-05,
"loss": 2.4584,
"step": 29480
},
{
"epoch": 9.313037984679776,
"grad_norm": 0.060420910808885335,
"learning_rate": 2.8371918866644143e-05,
"loss": 2.4345,
"step": 29485
},
{
"epoch": 9.314617389244255,
"grad_norm": 0.05433250871228697,
"learning_rate": 2.8241641708313894e-05,
"loss": 2.3294,
"step": 29490
},
{
"epoch": 9.316196793808734,
"grad_norm": 0.03906077732226694,
"learning_rate": 2.8111660055491705e-05,
"loss": 2.4747,
"step": 29495
},
{
"epoch": 9.317776198373213,
"grad_norm": 0.04805529604126894,
"learning_rate": 2.7981973947704077e-05,
"loss": 2.2703,
"step": 29500
},
{
"epoch": 9.319355602937692,
"grad_norm": 0.039849712288317914,
"learning_rate": 2.785258342438779e-05,
"loss": 2.3317,
"step": 29505
},
{
"epoch": 9.320935007502172,
"grad_norm": 0.04634429218917224,
"learning_rate": 2.7723488524889594e-05,
"loss": 2.3926,
"step": 29510
},
{
"epoch": 9.322514412066651,
"grad_norm": 0.03917190970641901,
"learning_rate": 2.7594689288466535e-05,
"loss": 2.4679,
"step": 29515
},
{
"epoch": 9.32409381663113,
"grad_norm": 0.06671558758633714,
"learning_rate": 2.7466185754285723e-05,
"loss": 2.4398,
"step": 29520
},
{
"epoch": 9.32567322119561,
"grad_norm": 0.04498801475803708,
"learning_rate": 2.733797796142401e-05,
"loss": 2.428,
"step": 29525
},
{
"epoch": 9.327252625760089,
"grad_norm": 0.045553744363332226,
"learning_rate": 2.7210065948868767e-05,
"loss": 2.3575,
"step": 29530
},
{
"epoch": 9.328832030324568,
"grad_norm": 0.04375299099707137,
"learning_rate": 2.708244975551699e-05,
"loss": 2.5259,
"step": 29535
},
{
"epoch": 9.330411434889047,
"grad_norm": 0.05064363637638876,
"learning_rate": 2.6955129420176194e-05,
"loss": 2.3011,
"step": 29540
},
{
"epoch": 9.331990839453526,
"grad_norm": 0.04416609196413431,
"learning_rate": 2.682810498156363e-05,
"loss": 2.3921,
"step": 29545
},
{
"epoch": 9.333570244018006,
"grad_norm": 0.09097704405726886,
"learning_rate": 2.6701376478306392e-05,
"loss": 2.4306,
"step": 29550
},
{
"epoch": 9.335149648582485,
"grad_norm": 0.04292206993219541,
"learning_rate": 2.6574943948942222e-05,
"loss": 2.3514,
"step": 29555
},
{
"epoch": 9.336729053146964,
"grad_norm": 0.04708392703843058,
"learning_rate": 2.644880743191802e-05,
"loss": 2.3165,
"step": 29560
},
{
"epoch": 9.338308457711443,
"grad_norm": 0.049826577755894406,
"learning_rate": 2.6322966965591443e-05,
"loss": 2.4197,
"step": 29565
},
{
"epoch": 9.339887862275923,
"grad_norm": 0.041587359271011315,
"learning_rate": 2.6197422588229546e-05,
"loss": 2.3115,
"step": 29570
},
{
"epoch": 9.341467266840402,
"grad_norm": 0.044478761158850064,
"learning_rate": 2.607217433800968e-05,
"loss": 2.3174,
"step": 29575
},
{
"epoch": 9.343046671404881,
"grad_norm": 0.05454092939613832,
"learning_rate": 2.594722225301893e-05,
"loss": 2.3909,
"step": 29580
},
{
"epoch": 9.344626075969359,
"grad_norm": 0.045706366906300894,
"learning_rate": 2.5822566371254574e-05,
"loss": 2.3744,
"step": 29585
},
{
"epoch": 9.346205480533838,
"grad_norm": 0.05969173282860256,
"learning_rate": 2.569820673062351e-05,
"loss": 2.3694,
"step": 29590
},
{
"epoch": 9.347784885098317,
"grad_norm": 0.04288615088936334,
"learning_rate": 2.5574143368942816e-05,
"loss": 2.4734,
"step": 29595
},
{
"epoch": 9.349364289662796,
"grad_norm": 0.04541704867615316,
"learning_rate": 2.5450376323939318e-05,
"loss": 2.4614,
"step": 29600
},
{
"epoch": 9.350943694227276,
"grad_norm": 0.056044125697114734,
"learning_rate": 2.532690563324991e-05,
"loss": 2.325,
"step": 29605
},
{
"epoch": 9.352523098791755,
"grad_norm": 0.04623682450196402,
"learning_rate": 2.520373133442111e-05,
"loss": 2.2482,
"step": 29610
},
{
"epoch": 9.354102503356234,
"grad_norm": 0.04593035270856986,
"learning_rate": 2.5080853464909514e-05,
"loss": 2.3643,
"step": 29615
},
{
"epoch": 9.355681907920713,
"grad_norm": 0.05682904007841541,
"learning_rate": 2.4958272062081343e-05,
"loss": 2.3752,
"step": 29620
},
{
"epoch": 9.357261312485193,
"grad_norm": 0.046275985250694325,
"learning_rate": 2.4835987163212893e-05,
"loss": 2.3326,
"step": 29625
},
{
"epoch": 9.358840717049672,
"grad_norm": 0.0429234631915372,
"learning_rate": 2.47139988054903e-05,
"loss": 2.3498,
"step": 29630
},
{
"epoch": 9.360420121614151,
"grad_norm": 0.04821878012848691,
"learning_rate": 2.4592307026009452e-05,
"loss": 2.3521,
"step": 29635
},
{
"epoch": 9.36199952617863,
"grad_norm": 0.054843210641088026,
"learning_rate": 2.4470911861775857e-05,
"loss": 2.4585,
"step": 29640
},
{
"epoch": 9.36357893074311,
"grad_norm": 0.057313655789485635,
"learning_rate": 2.434981334970532e-05,
"loss": 2.3287,
"step": 29645
},
{
"epoch": 9.365158335307589,
"grad_norm": 0.04453559734930796,
"learning_rate": 2.4229011526622712e-05,
"loss": 2.3613,
"step": 29650
},
{
"epoch": 9.366737739872068,
"grad_norm": 0.05319782786638664,
"learning_rate": 2.4108506429263542e-05,
"loss": 2.394,
"step": 29655
},
{
"epoch": 9.368317144436547,
"grad_norm": 0.04118834226833598,
"learning_rate": 2.3988298094272277e-05,
"loss": 2.3939,
"step": 29660
},
{
"epoch": 9.369896549001027,
"grad_norm": 0.08576726726639043,
"learning_rate": 2.386838655820378e-05,
"loss": 2.3874,
"step": 29665
},
{
"epoch": 9.371475953565506,
"grad_norm": 0.044162918232095824,
"learning_rate": 2.3748771857522223e-05,
"loss": 2.4165,
"step": 29670
},
{
"epoch": 9.373055358129985,
"grad_norm": 0.04094173941087508,
"learning_rate": 2.3629454028601615e-05,
"loss": 2.3924,
"step": 29675
},
{
"epoch": 9.374634762694464,
"grad_norm": 0.05818302960770649,
"learning_rate": 2.3510433107725824e-05,
"loss": 2.362,
"step": 29680
},
{
"epoch": 9.376214167258944,
"grad_norm": 0.042235338314351285,
"learning_rate": 2.3391709131088455e-05,
"loss": 2.3209,
"step": 29685
},
{
"epoch": 9.377793571823423,
"grad_norm": 0.04954815858018991,
"learning_rate": 2.3273282134792517e-05,
"loss": 2.431,
"step": 29690
},
{
"epoch": 9.379372976387902,
"grad_norm": 0.03960092325835107,
"learning_rate": 2.3155152154851087e-05,
"loss": 2.2968,
"step": 29695
},
{
"epoch": 9.380952380952381,
"grad_norm": 0.05437119338882939,
"learning_rate": 2.303731922718666e-05,
"loss": 2.4298,
"step": 29700
},
{
"epoch": 9.38253178551686,
"grad_norm": 0.043076222301554494,
"learning_rate": 2.2919783387631456e-05,
"loss": 2.4017,
"step": 29705
},
{
"epoch": 9.38411119008134,
"grad_norm": 0.04124777112591901,
"learning_rate": 2.280254467192744e-05,
"loss": 2.4128,
"step": 29710
},
{
"epoch": 9.38569059464582,
"grad_norm": 0.04907795423437361,
"learning_rate": 2.2685603115725873e-05,
"loss": 2.4163,
"step": 29715
},
{
"epoch": 9.387269999210298,
"grad_norm": 0.06137536486453631,
"learning_rate": 2.256895875458831e-05,
"loss": 2.3463,
"step": 29720
},
{
"epoch": 9.388849403774778,
"grad_norm": 0.05648848069899127,
"learning_rate": 2.2452611623985485e-05,
"loss": 2.3269,
"step": 29725
},
{
"epoch": 9.390428808339257,
"grad_norm": 0.045017933640088825,
"learning_rate": 2.2336561759297656e-05,
"loss": 2.3745,
"step": 29730
},
{
"epoch": 9.392008212903736,
"grad_norm": 0.05507048607068748,
"learning_rate": 2.222080919581493e-05,
"loss": 2.3204,
"step": 29735
},
{
"epoch": 9.393587617468214,
"grad_norm": 0.043162768313010934,
"learning_rate": 2.2105353968736808e-05,
"loss": 2.3629,
"step": 29740
},
{
"epoch": 9.395167022032693,
"grad_norm": 0.043935727744640586,
"learning_rate": 2.1990196113172767e-05,
"loss": 2.446,
"step": 29745
},
{
"epoch": 9.396746426597172,
"grad_norm": 0.048094083771353345,
"learning_rate": 2.187533566414146e-05,
"loss": 2.3781,
"step": 29750
},
{
"epoch": 9.398325831161651,
"grad_norm": 0.04365144321456516,
"learning_rate": 2.176077265657106e-05,
"loss": 2.3454,
"step": 29755
},
{
"epoch": 9.39990523572613,
"grad_norm": 0.045399127811420394,
"learning_rate": 2.1646507125299588e-05,
"loss": 2.2808,
"step": 29760
},
{
"epoch": 9.40148464029061,
"grad_norm": 0.04207657274447779,
"learning_rate": 2.1532539105074357e-05,
"loss": 2.3258,
"step": 29765
},
{
"epoch": 9.403064044855089,
"grad_norm": 0.051074072509045616,
"learning_rate": 2.1418868630552426e-05,
"loss": 2.3527,
"step": 29770
},
{
"epoch": 9.404643449419568,
"grad_norm": 0.04342401351013079,
"learning_rate": 2.130549573630025e-05,
"loss": 2.3091,
"step": 29775
},
{
"epoch": 9.406222853984048,
"grad_norm": 0.044606635309908146,
"learning_rate": 2.1192420456793703e-05,
"loss": 2.3136,
"step": 29780
},
{
"epoch": 9.407802258548527,
"grad_norm": 0.04151635614695825,
"learning_rate": 2.1079642826418387e-05,
"loss": 2.4504,
"step": 29785
},
{
"epoch": 9.409381663113006,
"grad_norm": 0.04797609619965564,
"learning_rate": 2.0967162879469204e-05,
"loss": 2.452,
"step": 29790
},
{
"epoch": 9.410961067677485,
"grad_norm": 0.03965412747029897,
"learning_rate": 2.085498065015057e-05,
"loss": 2.473,
"step": 29795
},
{
"epoch": 9.412540472241965,
"grad_norm": 0.04644627128071307,
"learning_rate": 2.0743096172576414e-05,
"loss": 2.3485,
"step": 29800
},
{
"epoch": 9.414119876806444,
"grad_norm": 0.04676942187692444,
"learning_rate": 2.0631509480769862e-05,
"loss": 2.3853,
"step": 29805
},
{
"epoch": 9.415699281370923,
"grad_norm": 0.04158178213914722,
"learning_rate": 2.0520220608664098e-05,
"loss": 2.3735,
"step": 29810
},
{
"epoch": 9.417278685935402,
"grad_norm": 0.03858380553527941,
"learning_rate": 2.0409229590101163e-05,
"loss": 2.3907,
"step": 29815
},
{
"epoch": 9.418858090499882,
"grad_norm": 0.0731203280795725,
"learning_rate": 2.029853645883262e-05,
"loss": 2.5312,
"step": 29820
},
{
"epoch": 9.42043749506436,
"grad_norm": 0.03981733529130424,
"learning_rate": 2.0188141248519754e-05,
"loss": 2.2781,
"step": 29825
},
{
"epoch": 9.42201689962884,
"grad_norm": 0.04781066089100794,
"learning_rate": 2.0078043992732942e-05,
"loss": 2.458,
"step": 29830
},
{
"epoch": 9.42359630419332,
"grad_norm": 0.056305636259483774,
"learning_rate": 1.9968244724952067e-05,
"loss": 2.4044,
"step": 29835
},
{
"epoch": 9.425175708757799,
"grad_norm": 0.04500037877631552,
"learning_rate": 1.985874347856631e-05,
"loss": 2.3375,
"step": 29840
},
{
"epoch": 9.426755113322278,
"grad_norm": 0.03801018917288479,
"learning_rate": 1.9749540286874478e-05,
"loss": 2.4163,
"step": 29845
},
{
"epoch": 9.428334517886757,
"grad_norm": 0.03964796905422109,
"learning_rate": 1.9640635183084344e-05,
"loss": 2.3131,
"step": 29850
},
{
"epoch": 9.429913922451236,
"grad_norm": 0.04935019034574686,
"learning_rate": 1.953202820031341e-05,
"loss": 2.4528,
"step": 29855
},
{
"epoch": 9.431493327015716,
"grad_norm": 0.04734727391081135,
"learning_rate": 1.9423719371588265e-05,
"loss": 2.4603,
"step": 29860
},
{
"epoch": 9.433072731580195,
"grad_norm": 0.04823994120007205,
"learning_rate": 1.9315708729845116e-05,
"loss": 2.2735,
"step": 29865
},
{
"epoch": 9.434652136144674,
"grad_norm": 0.05545676647956338,
"learning_rate": 1.920799630792902e-05,
"loss": 2.3343,
"step": 29870
},
{
"epoch": 9.436231540709153,
"grad_norm": 0.04708308106014793,
"learning_rate": 1.910058213859489e-05,
"loss": 2.3726,
"step": 29875
},
{
"epoch": 9.437810945273633,
"grad_norm": 0.051773996714407994,
"learning_rate": 1.8993466254506486e-05,
"loss": 2.4484,
"step": 29880
},
{
"epoch": 9.439390349838112,
"grad_norm": 0.055360447135438084,
"learning_rate": 1.8886648688237307e-05,
"loss": 2.5286,
"step": 29885
},
{
"epoch": 9.440969754402591,
"grad_norm": 0.04011192475212031,
"learning_rate": 1.8780129472269704e-05,
"loss": 2.3998,
"step": 29890
},
{
"epoch": 9.44254915896707,
"grad_norm": 0.040613211830941014,
"learning_rate": 1.867390863899543e-05,
"loss": 2.3915,
"step": 29895
},
{
"epoch": 9.44412856353155,
"grad_norm": 0.050149446685664095,
"learning_rate": 1.8567986220715872e-05,
"loss": 2.3799,
"step": 29900
},
{
"epoch": 9.445707968096027,
"grad_norm": 0.04017383635390211,
"learning_rate": 1.846236224964093e-05,
"loss": 2.4791,
"step": 29905
},
{
"epoch": 9.447287372660506,
"grad_norm": 0.04618957395879647,
"learning_rate": 1.835703675789058e-05,
"loss": 2.4316,
"step": 29910
},
{
"epoch": 9.448866777224985,
"grad_norm": 0.039771449300195334,
"learning_rate": 1.8252009777493418e-05,
"loss": 2.3222,
"step": 29915
},
{
"epoch": 9.450446181789465,
"grad_norm": 0.04138208104877937,
"learning_rate": 1.8147281340387457e-05,
"loss": 2.3547,
"step": 29920
},
{
"epoch": 9.452025586353944,
"grad_norm": 0.04225855510943169,
"learning_rate": 1.8042851478420108e-05,
"loss": 2.4435,
"step": 29925
},
{
"epoch": 9.453604990918423,
"grad_norm": 0.042701158580847276,
"learning_rate": 1.7938720223347748e-05,
"loss": 2.4467,
"step": 29930
},
{
"epoch": 9.455184395482902,
"grad_norm": 0.05047940706705218,
"learning_rate": 1.7834887606835937e-05,
"loss": 2.4121,
"step": 29935
},
{
"epoch": 9.456763800047382,
"grad_norm": 0.044368724827039276,
"learning_rate": 1.773135366045964e-05,
"loss": 2.356,
"step": 29940
},
{
"epoch": 9.458343204611861,
"grad_norm": 0.04474235502172375,
"learning_rate": 1.7628118415702667e-05,
"loss": 2.2713,
"step": 29945
},
{
"epoch": 9.45992260917634,
"grad_norm": 0.04537155012918435,
"learning_rate": 1.7525181903958465e-05,
"loss": 2.3115,
"step": 29950
},
{
"epoch": 9.46150201374082,
"grad_norm": 0.045396921077155275,
"learning_rate": 1.7422544156529217e-05,
"loss": 2.3988,
"step": 29955
},
{
"epoch": 9.463081418305299,
"grad_norm": 0.04569536491545306,
"learning_rate": 1.7320205204626295e-05,
"loss": 2.4154,
"step": 29960
},
{
"epoch": 9.464660822869778,
"grad_norm": 0.04503139905374363,
"learning_rate": 1.7218165079370573e-05,
"loss": 2.3998,
"step": 29965
},
{
"epoch": 9.466240227434257,
"grad_norm": 0.04357632442196037,
"learning_rate": 1.7116423811791793e-05,
"loss": 2.3596,
"step": 29970
},
{
"epoch": 9.467819631998736,
"grad_norm": 0.0477858930960453,
"learning_rate": 1.7014981432828537e-05,
"loss": 2.4124,
"step": 29975
},
{
"epoch": 9.469399036563216,
"grad_norm": 0.050867574355163044,
"learning_rate": 1.6913837973329126e-05,
"loss": 2.5338,
"step": 29980
},
{
"epoch": 9.470978441127695,
"grad_norm": 0.04198565363155485,
"learning_rate": 1.6812993464050297e-05,
"loss": 2.3959,
"step": 29985
},
{
"epoch": 9.472557845692174,
"grad_norm": 0.041113056332670775,
"learning_rate": 1.6712447935658514e-05,
"loss": 2.4635,
"step": 29990
},
{
"epoch": 9.474137250256653,
"grad_norm": 0.041309350679928164,
"learning_rate": 1.661220141872877e-05,
"loss": 2.3399,
"step": 29995
},
{
"epoch": 9.475716654821133,
"grad_norm": 0.05830406789875503,
"learning_rate": 1.651225394374567e-05,
"loss": 2.4523,
"step": 30000
},
{
"epoch": 9.477296059385612,
"grad_norm": 0.05305332294843227,
"learning_rate": 1.6412605541102465e-05,
"loss": 2.3259,
"step": 30005
},
{
"epoch": 9.478875463950091,
"grad_norm": 0.0456174472825058,
"learning_rate": 1.631325624110158e-05,
"loss": 2.5383,
"step": 30010
},
{
"epoch": 9.48045486851457,
"grad_norm": 0.04176300384571954,
"learning_rate": 1.621420607395452e-05,
"loss": 2.4388,
"step": 30015
},
{
"epoch": 9.48203427307905,
"grad_norm": 0.04386143968655976,
"learning_rate": 1.611545506978185e-05,
"loss": 2.3695,
"step": 30020
},
{
"epoch": 9.483613677643529,
"grad_norm": 0.05561908224519766,
"learning_rate": 1.6017003258612993e-05,
"loss": 2.4921,
"step": 30025
},
{
"epoch": 9.485193082208008,
"grad_norm": 0.05537260677822884,
"learning_rate": 1.5918850670386677e-05,
"loss": 2.3573,
"step": 30030
},
{
"epoch": 9.486772486772487,
"grad_norm": 0.04378169445858685,
"learning_rate": 1.5820997334950348e-05,
"loss": 2.3794,
"step": 30035
},
{
"epoch": 9.488351891336967,
"grad_norm": 0.04942199234857001,
"learning_rate": 1.5723443282060657e-05,
"loss": 2.4008,
"step": 30040
},
{
"epoch": 9.489931295901446,
"grad_norm": 0.04756229203870128,
"learning_rate": 1.5626188541383202e-05,
"loss": 2.4094,
"step": 30045
},
{
"epoch": 9.491510700465925,
"grad_norm": 0.04332502222276905,
"learning_rate": 1.5529233142492437e-05,
"loss": 2.4333,
"step": 30050
},
{
"epoch": 9.493090105030404,
"grad_norm": 0.04305266382009812,
"learning_rate": 1.5432577114871893e-05,
"loss": 2.3677,
"step": 30055
},
{
"epoch": 9.494669509594882,
"grad_norm": 0.04441467026906564,
"learning_rate": 1.5336220487914053e-05,
"loss": 2.4181,
"step": 30060
},
{
"epoch": 9.496248914159361,
"grad_norm": 0.05774432747502679,
"learning_rate": 1.5240163290920483e-05,
"loss": 2.4699,
"step": 30065
},
{
"epoch": 9.49782831872384,
"grad_norm": 0.04548534785304647,
"learning_rate": 1.514440555310137e-05,
"loss": 2.4159,
"step": 30070
},
{
"epoch": 9.49940772328832,
"grad_norm": 0.04934305339107333,
"learning_rate": 1.5048947303576088e-05,
"loss": 2.2676,
"step": 30075
},
{
"epoch": 9.500987127852799,
"grad_norm": 0.04429559617729792,
"learning_rate": 1.4953788571372862e-05,
"loss": 2.4057,
"step": 30080
},
{
"epoch": 9.502566532417278,
"grad_norm": 0.03998151373318868,
"learning_rate": 1.4858929385428987e-05,
"loss": 2.4556,
"step": 30085
},
{
"epoch": 9.504145936981757,
"grad_norm": 0.05214201915349131,
"learning_rate": 1.476436977459039e-05,
"loss": 2.4301,
"step": 30090
},
{
"epoch": 9.505725341546237,
"grad_norm": 0.039208856656101625,
"learning_rate": 1.4670109767612184e-05,
"loss": 2.4322,
"step": 30095
},
{
"epoch": 9.507304746110716,
"grad_norm": 0.04334137454919302,
"learning_rate": 1.457614939315799e-05,
"loss": 2.3992,
"step": 30100
},
{
"epoch": 9.508884150675195,
"grad_norm": 0.041758109790759806,
"learning_rate": 1.4482488679800843e-05,
"loss": 2.3598,
"step": 30105
},
{
"epoch": 9.510463555239674,
"grad_norm": 0.042498191826742984,
"learning_rate": 1.4389127656022294e-05,
"loss": 2.4085,
"step": 30110
},
{
"epoch": 9.512042959804154,
"grad_norm": 0.0507269881752584,
"learning_rate": 1.4296066350212744e-05,
"loss": 2.4762,
"step": 30115
},
{
"epoch": 9.513622364368633,
"grad_norm": 0.03973664673653034,
"learning_rate": 1.4203304790671556e-05,
"loss": 2.3195,
"step": 30120
},
{
"epoch": 9.515201768933112,
"grad_norm": 0.039955040633668434,
"learning_rate": 1.4110843005606833e-05,
"loss": 2.3229,
"step": 30125
},
{
"epoch": 9.516781173497591,
"grad_norm": 0.05790493422855318,
"learning_rate": 1.401868102313586e-05,
"loss": 2.396,
"step": 30130
},
{
"epoch": 9.51836057806207,
"grad_norm": 0.0601991017071839,
"learning_rate": 1.392681887128433e-05,
"loss": 2.4457,
"step": 30135
},
{
"epoch": 9.51993998262655,
"grad_norm": 0.05021823124552835,
"learning_rate": 1.383525657798701e-05,
"loss": 2.3837,
"step": 30140
},
{
"epoch": 9.521519387191029,
"grad_norm": 0.04775413593498285,
"learning_rate": 1.3743994171087404e-05,
"loss": 2.4212,
"step": 30145
},
{
"epoch": 9.523098791755508,
"grad_norm": 0.04553946922627249,
"learning_rate": 1.3653031678337868e-05,
"loss": 2.2845,
"step": 30150
},
{
"epoch": 9.524678196319988,
"grad_norm": 0.05062828533969865,
"learning_rate": 1.3562369127399387e-05,
"loss": 2.4671,
"step": 30155
},
{
"epoch": 9.526257600884467,
"grad_norm": 0.046298147695143774,
"learning_rate": 1.3472006545841908e-05,
"loss": 2.4893,
"step": 30160
},
{
"epoch": 9.527837005448946,
"grad_norm": 0.04729088554594134,
"learning_rate": 1.3381943961144117e-05,
"loss": 2.4226,
"step": 30165
},
{
"epoch": 9.529416410013425,
"grad_norm": 0.048787859332196835,
"learning_rate": 1.3292181400693548e-05,
"loss": 2.3341,
"step": 30170
},
{
"epoch": 9.530995814577905,
"grad_norm": 0.04796829554852364,
"learning_rate": 1.3202718891786259e-05,
"loss": 2.2858,
"step": 30175
},
{
"epoch": 9.532575219142384,
"grad_norm": 0.041277624080704595,
"learning_rate": 1.3113556461627485e-05,
"loss": 2.3621,
"step": 30180
},
{
"epoch": 9.534154623706863,
"grad_norm": 0.03873505231492884,
"learning_rate": 1.302469413733065e-05,
"loss": 2.4327,
"step": 30185
},
{
"epoch": 9.535734028271342,
"grad_norm": 0.045494772722792585,
"learning_rate": 1.2936131945918472e-05,
"loss": 2.3949,
"step": 30190
},
{
"epoch": 9.537313432835822,
"grad_norm": 0.04382388795010529,
"learning_rate": 1.2847869914321964e-05,
"loss": 2.3093,
"step": 30195
},
{
"epoch": 9.5388928374003,
"grad_norm": 0.04484146571483914,
"learning_rate": 1.275990806938121e-05,
"loss": 2.3497,
"step": 30200
},
{
"epoch": 9.54047224196478,
"grad_norm": 0.039862199131951076,
"learning_rate": 1.2672246437844703e-05,
"loss": 2.4191,
"step": 30205
},
{
"epoch": 9.54205164652926,
"grad_norm": 0.04359029508207695,
"learning_rate": 1.2584885046369898e-05,
"loss": 2.417,
"step": 30210
},
{
"epoch": 9.543631051093737,
"grad_norm": 0.04438023129957551,
"learning_rate": 1.2497823921522767e-05,
"loss": 2.286,
"step": 30215
},
{
"epoch": 9.545210455658216,
"grad_norm": 0.04096978438925655,
"learning_rate": 1.2411063089778019e-05,
"loss": 2.3608,
"step": 30220
},
{
"epoch": 9.546789860222695,
"grad_norm": 0.05192133645429705,
"learning_rate": 1.2324602577518996e-05,
"loss": 2.3916,
"step": 30225
},
{
"epoch": 9.548369264787175,
"grad_norm": 0.0514436329176969,
"learning_rate": 1.2238442411038109e-05,
"loss": 2.4901,
"step": 30230
},
{
"epoch": 9.549948669351654,
"grad_norm": 0.050780672753172856,
"learning_rate": 1.2152582616535845e-05,
"loss": 2.3917,
"step": 30235
},
{
"epoch": 9.551528073916133,
"grad_norm": 0.05512754865263124,
"learning_rate": 1.2067023220121653e-05,
"loss": 2.4749,
"step": 30240
},
{
"epoch": 9.553107478480612,
"grad_norm": 0.04586744410439057,
"learning_rate": 1.198176424781361e-05,
"loss": 2.3557,
"step": 30245
},
{
"epoch": 9.554686883045092,
"grad_norm": 0.04129686388116584,
"learning_rate": 1.1896805725538417e-05,
"loss": 2.3429,
"step": 30250
},
{
"epoch": 9.55626628760957,
"grad_norm": 0.04261799093173591,
"learning_rate": 1.1812147679131414e-05,
"loss": 2.4092,
"step": 30255
},
{
"epoch": 9.55784569217405,
"grad_norm": 0.03968786044092743,
"learning_rate": 1.1727790134336668e-05,
"loss": 2.3737,
"step": 30260
},
{
"epoch": 9.55942509673853,
"grad_norm": 0.03949974284061107,
"learning_rate": 1.1643733116806554e-05,
"loss": 2.3981,
"step": 30265
},
{
"epoch": 9.561004501303008,
"grad_norm": 0.05787206772249031,
"learning_rate": 1.1559976652102621e-05,
"loss": 2.3627,
"step": 30270
},
{
"epoch": 9.562583905867488,
"grad_norm": 0.04210911987426291,
"learning_rate": 1.1476520765694387e-05,
"loss": 2.3844,
"step": 30275
},
{
"epoch": 9.564163310431967,
"grad_norm": 0.03923730653481024,
"learning_rate": 1.1393365482960217e-05,
"loss": 2.4123,
"step": 30280
},
{
"epoch": 9.565742714996446,
"grad_norm": 0.04447443451239888,
"learning_rate": 1.1310510829187325e-05,
"loss": 2.3568,
"step": 30285
},
{
"epoch": 9.567322119560925,
"grad_norm": 0.04632371230474168,
"learning_rate": 1.1227956829571229e-05,
"loss": 2.4679,
"step": 30290
},
{
"epoch": 9.568901524125405,
"grad_norm": 0.04289668157296784,
"learning_rate": 1.1145703509215954e-05,
"loss": 2.3484,
"step": 30295
},
{
"epoch": 9.570480928689884,
"grad_norm": 0.04683699577472539,
"learning_rate": 1.1063750893134273e-05,
"loss": 2.4411,
"step": 30300
},
{
"epoch": 9.572060333254363,
"grad_norm": 0.04195129324963458,
"learning_rate": 1.098209900624747e-05,
"loss": 2.3848,
"step": 30305
},
{
"epoch": 9.573639737818842,
"grad_norm": 0.043437481591911235,
"learning_rate": 1.0900747873385353e-05,
"loss": 2.3015,
"step": 30310
},
{
"epoch": 9.575219142383322,
"grad_norm": 0.04696243927648497,
"learning_rate": 1.0819697519286243e-05,
"loss": 2.4201,
"step": 30315
},
{
"epoch": 9.576798546947801,
"grad_norm": 0.054747008086861344,
"learning_rate": 1.073894796859709e-05,
"loss": 2.3748,
"step": 30320
},
{
"epoch": 9.57837795151228,
"grad_norm": 0.045182127280545464,
"learning_rate": 1.0658499245873365e-05,
"loss": 2.2968,
"step": 30325
},
{
"epoch": 9.57995735607676,
"grad_norm": 0.03951587981848332,
"learning_rate": 1.0578351375578943e-05,
"loss": 2.4699,
"step": 30330
},
{
"epoch": 9.581536760641239,
"grad_norm": 0.05834058521159146,
"learning_rate": 1.0498504382086216e-05,
"loss": 2.4349,
"step": 30335
},
{
"epoch": 9.583116165205718,
"grad_norm": 0.041251491689309985,
"learning_rate": 1.0418958289676094e-05,
"loss": 2.4767,
"step": 30340
},
{
"epoch": 9.584695569770197,
"grad_norm": 0.07170533104500992,
"learning_rate": 1.0339713122538341e-05,
"loss": 2.3626,
"step": 30345
},
{
"epoch": 9.586274974334676,
"grad_norm": 0.04260019416686983,
"learning_rate": 1.0260768904770678e-05,
"loss": 2.3644,
"step": 30350
},
{
"epoch": 9.587854378899156,
"grad_norm": 0.0487208252713447,
"learning_rate": 1.018212566037946e-05,
"loss": 2.3559,
"step": 30355
},
{
"epoch": 9.589433783463635,
"grad_norm": 0.040201645318378686,
"learning_rate": 1.0103783413279777e-05,
"loss": 2.4521,
"step": 30360
},
{
"epoch": 9.591013188028114,
"grad_norm": 0.04248018611799858,
"learning_rate": 1.0025742187294907e-05,
"loss": 2.362,
"step": 30365
},
{
"epoch": 9.592592592592592,
"grad_norm": 0.0421978492946494,
"learning_rate": 9.948002006156753e-06,
"loss": 2.3562,
"step": 30370
},
{
"epoch": 9.594171997157073,
"grad_norm": 0.04136724537884408,
"learning_rate": 9.87056289350552e-06,
"loss": 2.4004,
"step": 30375
},
{
"epoch": 9.59575140172155,
"grad_norm": 0.03960262386614813,
"learning_rate": 9.793424872890033e-06,
"loss": 2.4496,
"step": 30380
},
{
"epoch": 9.59733080628603,
"grad_norm": 0.04137355502171963,
"learning_rate": 9.716587967767532e-06,
"loss": 2.3325,
"step": 30385
},
{
"epoch": 9.598910210850509,
"grad_norm": 0.04095316967018649,
"learning_rate": 9.640052201503436e-06,
"loss": 2.3791,
"step": 30390
},
{
"epoch": 9.600489615414988,
"grad_norm": 0.039896460435274506,
"learning_rate": 9.563817597371793e-06,
"loss": 2.4157,
"step": 30395
},
{
"epoch": 9.602069019979467,
"grad_norm": 0.04526125496782704,
"learning_rate": 9.487884178555285e-06,
"loss": 2.311,
"step": 30400
},
{
"epoch": 9.603648424543946,
"grad_norm": 0.056007224336220955,
"learning_rate": 9.412251968144548e-06,
"loss": 2.3221,
"step": 30405
},
{
"epoch": 9.605227829108426,
"grad_norm": 0.04247609138890829,
"learning_rate": 9.336920989139075e-06,
"loss": 2.385,
"step": 30410
},
{
"epoch": 9.606807233672905,
"grad_norm": 0.04838922797912883,
"learning_rate": 9.261891264446321e-06,
"loss": 2.3418,
"step": 30415
},
{
"epoch": 9.608386638237384,
"grad_norm": 0.04463021386680935,
"learning_rate": 9.187162816882478e-06,
"loss": 2.3256,
"step": 30420
},
{
"epoch": 9.609966042801863,
"grad_norm": 0.04252370965679977,
"learning_rate": 9.112735669171923e-06,
"loss": 2.3774,
"step": 30425
},
{
"epoch": 9.611545447366343,
"grad_norm": 0.041372974750041105,
"learning_rate": 9.038609843947331e-06,
"loss": 2.3604,
"step": 30430
},
{
"epoch": 9.613124851930822,
"grad_norm": 0.03918200730070356,
"learning_rate": 8.964785363750227e-06,
"loss": 2.3614,
"step": 30435
},
{
"epoch": 9.614704256495301,
"grad_norm": 0.04867686562063659,
"learning_rate": 8.891262251029986e-06,
"loss": 2.3443,
"step": 30440
},
{
"epoch": 9.61628366105978,
"grad_norm": 0.04765697256733076,
"learning_rate": 8.8180405281445e-06,
"loss": 2.4513,
"step": 30445
},
{
"epoch": 9.61786306562426,
"grad_norm": 0.04711140899166385,
"learning_rate": 8.745120217360069e-06,
"loss": 2.302,
"step": 30450
},
{
"epoch": 9.619442470188739,
"grad_norm": 0.06165867220520249,
"learning_rate": 8.67250134085129e-06,
"loss": 2.3276,
"step": 30455
},
{
"epoch": 9.621021874753218,
"grad_norm": 0.04080824588056334,
"learning_rate": 8.600183920701054e-06,
"loss": 2.4288,
"step": 30460
},
{
"epoch": 9.622601279317697,
"grad_norm": 0.0589845625137612,
"learning_rate": 8.528167978900658e-06,
"loss": 2.3658,
"step": 30465
},
{
"epoch": 9.624180683882177,
"grad_norm": 0.040971728425958566,
"learning_rate": 8.456453537349695e-06,
"loss": 2.4291,
"step": 30470
},
{
"epoch": 9.625760088446656,
"grad_norm": 0.04055824535769639,
"learning_rate": 8.385040617856165e-06,
"loss": 2.4538,
"step": 30475
},
{
"epoch": 9.627339493011135,
"grad_norm": 0.03890212954797521,
"learning_rate": 8.313929242136031e-06,
"loss": 2.3181,
"step": 30480
},
{
"epoch": 9.628918897575614,
"grad_norm": 0.04510643757841533,
"learning_rate": 8.243119431813994e-06,
"loss": 2.3916,
"step": 30485
},
{
"epoch": 9.630498302140094,
"grad_norm": 0.0605976137075534,
"learning_rate": 8.172611208422832e-06,
"loss": 2.3935,
"step": 30490
},
{
"epoch": 9.632077706704573,
"grad_norm": 0.04555784366649664,
"learning_rate": 8.102404593403612e-06,
"loss": 2.3827,
"step": 30495
},
{
"epoch": 9.633657111269052,
"grad_norm": 0.05622759311961614,
"learning_rate": 8.032499608105814e-06,
"loss": 2.3199,
"step": 30500
},
{
"epoch": 9.635236515833531,
"grad_norm": 0.04127894921791548,
"learning_rate": 7.962896273787102e-06,
"loss": 2.3836,
"step": 30505
},
{
"epoch": 9.63681592039801,
"grad_norm": 0.04587006613023309,
"learning_rate": 7.893594611613208e-06,
"loss": 2.3174,
"step": 30510
},
{
"epoch": 9.63839532496249,
"grad_norm": 0.04783809377117116,
"learning_rate": 7.8245946426585e-06,
"loss": 2.3802,
"step": 30515
},
{
"epoch": 9.639974729526969,
"grad_norm": 0.05573266032683132,
"learning_rate": 7.755896387905303e-06,
"loss": 2.3265,
"step": 30520
},
{
"epoch": 9.641554134091448,
"grad_norm": 0.038286832964746505,
"learning_rate": 7.687499868244463e-06,
"loss": 2.4079,
"step": 30525
},
{
"epoch": 9.643133538655928,
"grad_norm": 0.04762025727828804,
"learning_rate": 7.619405104474786e-06,
"loss": 2.337,
"step": 30530
},
{
"epoch": 9.644712943220405,
"grad_norm": 0.0470953542275122,
"learning_rate": 7.5516121173035966e-06,
"loss": 2.427,
"step": 30535
},
{
"epoch": 9.646292347784884,
"grad_norm": 0.044777400364591644,
"learning_rate": 7.484120927346183e-06,
"loss": 2.3778,
"step": 30540
},
{
"epoch": 9.647871752349364,
"grad_norm": 0.04723592826455098,
"learning_rate": 7.416931555126239e-06,
"loss": 2.4023,
"step": 30545
},
{
"epoch": 9.649451156913843,
"grad_norm": 0.045076040442776764,
"learning_rate": 7.350044021075641e-06,
"loss": 2.3968,
"step": 30550
},
{
"epoch": 9.651030561478322,
"grad_norm": 0.04036003268200752,
"learning_rate": 7.283458345534455e-06,
"loss": 2.3877,
"step": 30555
},
{
"epoch": 9.652609966042801,
"grad_norm": 0.04560351252702661,
"learning_rate": 7.217174548750927e-06,
"loss": 2.4267,
"step": 30560
},
{
"epoch": 9.65418937060728,
"grad_norm": 0.04756462549138177,
"learning_rate": 7.151192650881488e-06,
"loss": 2.4844,
"step": 30565
},
{
"epoch": 9.65576877517176,
"grad_norm": 0.046416280920711504,
"learning_rate": 7.085512671990979e-06,
"loss": 2.4185,
"step": 30570
},
{
"epoch": 9.657348179736239,
"grad_norm": 0.04094277725135964,
"learning_rate": 7.0201346320520885e-06,
"loss": 2.3085,
"step": 30575
},
{
"epoch": 9.658927584300718,
"grad_norm": 0.0421059556137951,
"learning_rate": 6.955058550945914e-06,
"loss": 2.4972,
"step": 30580
},
{
"epoch": 9.660506988865198,
"grad_norm": 0.051743655208623236,
"learning_rate": 6.8902844484617365e-06,
"loss": 2.3755,
"step": 30585
},
{
"epoch": 9.662086393429677,
"grad_norm": 0.05207865617791107,
"learning_rate": 6.825812344296911e-06,
"loss": 2.3845,
"step": 30590
},
{
"epoch": 9.663665797994156,
"grad_norm": 0.04238329717910449,
"learning_rate": 6.761642258056977e-06,
"loss": 2.4376,
"step": 30595
},
{
"epoch": 9.665245202558635,
"grad_norm": 0.03890907294829127,
"learning_rate": 6.697774209255769e-06,
"loss": 2.5243,
"step": 30600
},
{
"epoch": 9.666824607123115,
"grad_norm": 0.040397506187010615,
"learning_rate": 6.634208217314863e-06,
"loss": 2.4171,
"step": 30605
},
{
"epoch": 9.668404011687594,
"grad_norm": 0.05460175974014248,
"learning_rate": 6.570944301564574e-06,
"loss": 2.3557,
"step": 30610
},
{
"epoch": 9.669983416252073,
"grad_norm": 0.04497588646677044,
"learning_rate": 6.5079824812428465e-06,
"loss": 2.3133,
"step": 30615
},
{
"epoch": 9.671562820816552,
"grad_norm": 0.04670290589313647,
"learning_rate": 6.445322775496032e-06,
"loss": 2.3479,
"step": 30620
},
{
"epoch": 9.673142225381032,
"grad_norm": 0.04177047213439708,
"learning_rate": 6.382965203378666e-06,
"loss": 2.3274,
"step": 30625
},
{
"epoch": 9.67472162994551,
"grad_norm": 0.052684775403997894,
"learning_rate": 6.3209097838531345e-06,
"loss": 2.4518,
"step": 30630
},
{
"epoch": 9.67630103450999,
"grad_norm": 0.04117237377462446,
"learning_rate": 6.259156535790011e-06,
"loss": 2.379,
"step": 30635
},
{
"epoch": 9.67788043907447,
"grad_norm": 0.04181974896568244,
"learning_rate": 6.197705477968385e-06,
"loss": 2.4248,
"step": 30640
},
{
"epoch": 9.679459843638949,
"grad_norm": 0.03963486459269917,
"learning_rate": 6.136556629074863e-06,
"loss": 2.3532,
"step": 30645
},
{
"epoch": 9.681039248203428,
"grad_norm": 0.03986513963167819,
"learning_rate": 6.075710007704571e-06,
"loss": 2.3489,
"step": 30650
},
{
"epoch": 9.682618652767907,
"grad_norm": 0.03910156824584804,
"learning_rate": 6.0151656323604865e-06,
"loss": 2.3829,
"step": 30655
},
{
"epoch": 9.684198057332386,
"grad_norm": 0.042672288156665183,
"learning_rate": 5.95492352145377e-06,
"loss": 2.2549,
"step": 30660
},
{
"epoch": 9.685777461896866,
"grad_norm": 0.047864101614561595,
"learning_rate": 5.894983693303657e-06,
"loss": 2.3353,
"step": 30665
},
{
"epoch": 9.687356866461345,
"grad_norm": 0.04677235295553442,
"learning_rate": 5.835346166137456e-06,
"loss": 2.5169,
"step": 30670
},
{
"epoch": 9.688936271025824,
"grad_norm": 0.0393518213132439,
"learning_rate": 5.776010958090661e-06,
"loss": 2.3498,
"step": 30675
},
{
"epoch": 9.690515675590303,
"grad_norm": 0.04166323858330401,
"learning_rate": 5.7169780872066145e-06,
"loss": 2.3622,
"step": 30680
},
{
"epoch": 9.692095080154782,
"grad_norm": 0.04751055801597481,
"learning_rate": 5.658247571436958e-06,
"loss": 2.4624,
"step": 30685
},
{
"epoch": 9.69367448471926,
"grad_norm": 0.048265634956542285,
"learning_rate": 5.599819428641073e-06,
"loss": 2.4059,
"step": 30690
},
{
"epoch": 9.69525388928374,
"grad_norm": 0.04300194900217398,
"learning_rate": 5.541693676586857e-06,
"loss": 2.4031,
"step": 30695
},
{
"epoch": 9.696833293848218,
"grad_norm": 0.05287579478685596,
"learning_rate": 5.483870332949614e-06,
"loss": 2.3722,
"step": 30700
},
{
"epoch": 9.698412698412698,
"grad_norm": 0.04379440634829869,
"learning_rate": 5.426349415313503e-06,
"loss": 2.3225,
"step": 30705
},
{
"epoch": 9.699992102977177,
"grad_norm": 0.04480391987726241,
"learning_rate": 5.369130941169864e-06,
"loss": 2.498,
"step": 30710
},
{
"epoch": 9.701571507541656,
"grad_norm": 0.055956011611355945,
"learning_rate": 5.312214927918668e-06,
"loss": 2.4603,
"step": 30715
},
{
"epoch": 9.703150912106135,
"grad_norm": 0.05239746357954682,
"learning_rate": 5.255601392867626e-06,
"loss": 2.3886,
"step": 30720
},
{
"epoch": 9.704730316670615,
"grad_norm": 0.04521987865346676,
"learning_rate": 5.199290353232633e-06,
"loss": 2.4192,
"step": 30725
},
{
"epoch": 9.706309721235094,
"grad_norm": 0.0385121905506996,
"learning_rate": 5.143281826137547e-06,
"loss": 2.4245,
"step": 30730
},
{
"epoch": 9.707889125799573,
"grad_norm": 0.05024761315841477,
"learning_rate": 5.087575828614077e-06,
"loss": 2.3304,
"step": 30735
},
{
"epoch": 9.709468530364052,
"grad_norm": 0.04971828992090415,
"learning_rate": 5.0321723776022285e-06,
"loss": 2.4155,
"step": 30740
},
{
"epoch": 9.711047934928532,
"grad_norm": 0.05178697599173527,
"learning_rate": 4.977071489949636e-06,
"loss": 2.3765,
"step": 30745
},
{
"epoch": 9.712627339493011,
"grad_norm": 0.04105378489204273,
"learning_rate": 4.922273182412229e-06,
"loss": 2.3604,
"step": 30750
},
{
"epoch": 9.71420674405749,
"grad_norm": 0.04731059855443261,
"learning_rate": 4.8677774716539005e-06,
"loss": 2.3253,
"step": 30755
},
{
"epoch": 9.71578614862197,
"grad_norm": 0.044619941817862,
"learning_rate": 4.813584374246283e-06,
"loss": 2.3247,
"step": 30760
},
{
"epoch": 9.717365553186449,
"grad_norm": 0.049710979675962795,
"learning_rate": 4.759693906669193e-06,
"loss": 2.4843,
"step": 30765
},
{
"epoch": 9.718944957750928,
"grad_norm": 0.04589773917863417,
"learning_rate": 4.7061060853105245e-06,
"loss": 2.4386,
"step": 30770
},
{
"epoch": 9.720524362315407,
"grad_norm": 0.04962276990711766,
"learning_rate": 4.652820926465795e-06,
"loss": 2.4673,
"step": 30775
},
{
"epoch": 9.722103766879886,
"grad_norm": 0.06204472796634575,
"learning_rate": 4.599838446338933e-06,
"loss": 2.4417,
"step": 30780
},
{
"epoch": 9.723683171444366,
"grad_norm": 0.04288187894005779,
"learning_rate": 4.547158661041273e-06,
"loss": 2.3675,
"step": 30785
},
{
"epoch": 9.725262576008845,
"grad_norm": 0.04264232488693888,
"learning_rate": 4.494781586592556e-06,
"loss": 2.3589,
"step": 30790
},
{
"epoch": 9.726841980573324,
"grad_norm": 0.04045280768756304,
"learning_rate": 4.442707238920262e-06,
"loss": 2.4392,
"step": 30795
},
{
"epoch": 9.728421385137803,
"grad_norm": 0.044577534247766414,
"learning_rate": 4.390935633859949e-06,
"loss": 2.3925,
"step": 30800
},
{
"epoch": 9.730000789702283,
"grad_norm": 0.0628512056528852,
"learning_rate": 4.339466787155022e-06,
"loss": 2.3964,
"step": 30805
},
{
"epoch": 9.731580194266762,
"grad_norm": 0.04678420169862776,
"learning_rate": 4.288300714456739e-06,
"loss": 2.4355,
"step": 30810
},
{
"epoch": 9.733159598831241,
"grad_norm": 0.042966489138362214,
"learning_rate": 4.237437431324432e-06,
"loss": 2.3571,
"step": 30815
},
{
"epoch": 9.73473900339572,
"grad_norm": 0.041981582567709584,
"learning_rate": 4.186876953225282e-06,
"loss": 2.3353,
"step": 30820
},
{
"epoch": 9.7363184079602,
"grad_norm": 0.04462294311148982,
"learning_rate": 4.1366192955345495e-06,
"loss": 2.4082,
"step": 30825
},
{
"epoch": 9.737897812524679,
"grad_norm": 0.05296942511870504,
"learning_rate": 4.086664473535007e-06,
"loss": 2.3646,
"step": 30830
},
{
"epoch": 9.739477217089158,
"grad_norm": 0.061021263036657995,
"learning_rate": 4.037012502417836e-06,
"loss": 2.365,
"step": 30835
},
{
"epoch": 9.741056621653637,
"grad_norm": 0.042408455896456566,
"learning_rate": 3.987663397281627e-06,
"loss": 2.3529,
"step": 30840
},
{
"epoch": 9.742636026218115,
"grad_norm": 0.05110007897643741,
"learning_rate": 3.938617173133485e-06,
"loss": 2.4308,
"step": 30845
},
{
"epoch": 9.744215430782596,
"grad_norm": 0.042637042104907055,
"learning_rate": 3.8898738448877035e-06,
"loss": 2.248,
"step": 30850
},
{
"epoch": 9.745794835347073,
"grad_norm": 0.05276895688387875,
"learning_rate": 3.841433427366981e-06,
"loss": 2.3986,
"step": 30855
},
{
"epoch": 9.747374239911553,
"grad_norm": 0.04629343277981927,
"learning_rate": 3.793295935301755e-06,
"loss": 2.3809,
"step": 30860
},
{
"epoch": 9.748953644476032,
"grad_norm": 0.05131672840523233,
"learning_rate": 3.7454613833302067e-06,
"loss": 2.3555,
"step": 30865
},
{
"epoch": 9.750533049040511,
"grad_norm": 0.04295634906659519,
"learning_rate": 3.6979297859986994e-06,
"loss": 2.3327,
"step": 30870
},
{
"epoch": 9.75211245360499,
"grad_norm": 0.051091952606107854,
"learning_rate": 3.650701157761227e-06,
"loss": 2.4104,
"step": 30875
},
{
"epoch": 9.75369185816947,
"grad_norm": 0.04170818109947054,
"learning_rate": 3.6037755129795235e-06,
"loss": 2.308,
"step": 30880
},
{
"epoch": 9.755271262733949,
"grad_norm": 0.03961119134473402,
"learning_rate": 3.5571528659236187e-06,
"loss": 2.3481,
"step": 30885
},
{
"epoch": 9.756850667298428,
"grad_norm": 0.06305211933680803,
"learning_rate": 3.51083323077106e-06,
"loss": 2.3825,
"step": 30890
},
{
"epoch": 9.758430071862907,
"grad_norm": 0.04611465862398895,
"learning_rate": 3.4648166216074695e-06,
"loss": 2.4526,
"step": 30895
},
{
"epoch": 9.760009476427387,
"grad_norm": 0.0477775299579714,
"learning_rate": 3.419103052425987e-06,
"loss": 2.3879,
"step": 30900
},
{
"epoch": 9.761588880991866,
"grad_norm": 0.047779903939825376,
"learning_rate": 3.373692537127937e-06,
"loss": 2.5331,
"step": 30905
},
{
"epoch": 9.763168285556345,
"grad_norm": 0.04679318402033712,
"learning_rate": 3.3285850895224955e-06,
"loss": 2.3771,
"step": 30910
},
{
"epoch": 9.764747690120824,
"grad_norm": 0.045154703815724034,
"learning_rate": 3.2837807233263574e-06,
"loss": 2.3445,
"step": 30915
},
{
"epoch": 9.766327094685304,
"grad_norm": 0.04264748461729115,
"learning_rate": 3.2392794521642897e-06,
"loss": 2.382,
"step": 30920
},
{
"epoch": 9.767906499249783,
"grad_norm": 0.04750089500426993,
"learning_rate": 3.195081289568802e-06,
"loss": 2.2917,
"step": 30925
},
{
"epoch": 9.769485903814262,
"grad_norm": 0.0426232785587374,
"learning_rate": 3.1511862489803645e-06,
"loss": 2.3385,
"step": 30930
},
{
"epoch": 9.771065308378741,
"grad_norm": 0.04052656479849263,
"learning_rate": 3.1075943437471885e-06,
"loss": 2.3917,
"step": 30935
},
{
"epoch": 9.77264471294322,
"grad_norm": 0.039327189772560396,
"learning_rate": 3.0643055871252267e-06,
"loss": 2.4122,
"step": 30940
},
{
"epoch": 9.7742241175077,
"grad_norm": 0.04677095681220249,
"learning_rate": 3.021319992278282e-06,
"loss": 2.3975,
"step": 30945
},
{
"epoch": 9.775803522072179,
"grad_norm": 0.05579730291696965,
"learning_rate": 2.97863757227812e-06,
"loss": 2.3975,
"step": 30950
},
{
"epoch": 9.777382926636658,
"grad_norm": 0.04784428512811323,
"learning_rate": 2.9362583401041366e-06,
"loss": 2.3602,
"step": 30955
},
{
"epoch": 9.778962331201138,
"grad_norm": 0.0411858140902529,
"learning_rate": 2.894182308643467e-06,
"loss": 2.3335,
"step": 30960
},
{
"epoch": 9.780541735765617,
"grad_norm": 0.050145954867967774,
"learning_rate": 2.852409490691432e-06,
"loss": 2.3699,
"step": 30965
},
{
"epoch": 9.782121140330096,
"grad_norm": 0.04583264587347306,
"learning_rate": 2.8109398989505376e-06,
"loss": 2.4688,
"step": 30970
},
{
"epoch": 9.783700544894575,
"grad_norm": 0.04245562266316667,
"learning_rate": 2.7697735460316952e-06,
"loss": 2.4882,
"step": 30975
},
{
"epoch": 9.785279949459055,
"grad_norm": 0.04696733045462945,
"learning_rate": 2.7289104444532253e-06,
"loss": 2.3811,
"step": 30980
},
{
"epoch": 9.786859354023534,
"grad_norm": 0.0464987441356757,
"learning_rate": 2.68835060664141e-06,
"loss": 2.4104,
"step": 30985
},
{
"epoch": 9.788438758588013,
"grad_norm": 0.045702405023384705,
"learning_rate": 2.6480940449301604e-06,
"loss": 2.3994,
"step": 30990
},
{
"epoch": 9.790018163152492,
"grad_norm": 0.04612368902511887,
"learning_rate": 2.6081407715611295e-06,
"loss": 2.3026,
"step": 30995
},
{
"epoch": 9.79159756771697,
"grad_norm": 0.05515527290172604,
"learning_rate": 2.568490798684153e-06,
"loss": 2.3169,
"step": 31000
},
{
"epoch": 9.79317697228145,
"grad_norm": 0.053000123638092964,
"learning_rate": 2.5291441383562543e-06,
"loss": 2.4969,
"step": 31005
},
{
"epoch": 9.794756376845928,
"grad_norm": 0.040632902763189684,
"learning_rate": 2.4901008025426388e-06,
"loss": 2.3309,
"step": 31010
},
{
"epoch": 9.796335781410408,
"grad_norm": 0.043100027052276765,
"learning_rate": 2.451360803116032e-06,
"loss": 2.3358,
"step": 31015
},
{
"epoch": 9.797915185974887,
"grad_norm": 0.04365306395795135,
"learning_rate": 2.412924151857121e-06,
"loss": 2.3802,
"step": 31020
},
{
"epoch": 9.799494590539366,
"grad_norm": 0.048728724479073275,
"learning_rate": 2.3747908604542235e-06,
"loss": 2.4144,
"step": 31025
},
{
"epoch": 9.801073995103845,
"grad_norm": 0.04094146611112561,
"learning_rate": 2.3369609405035073e-06,
"loss": 2.4692,
"step": 31030
},
{
"epoch": 9.802653399668324,
"grad_norm": 0.04158612571434427,
"learning_rate": 2.29943440350866e-06,
"loss": 2.4477,
"step": 31035
},
{
"epoch": 9.804232804232804,
"grad_norm": 0.05301400544320318,
"learning_rate": 2.2622112608813305e-06,
"loss": 2.3161,
"step": 31040
},
{
"epoch": 9.805812208797283,
"grad_norm": 0.04697171401422431,
"learning_rate": 2.2252915239407978e-06,
"loss": 2.3835,
"step": 31045
},
{
"epoch": 9.807391613361762,
"grad_norm": 0.0430581547166588,
"learning_rate": 2.1886752039141923e-06,
"loss": 2.375,
"step": 31050
},
{
"epoch": 9.808971017926241,
"grad_norm": 0.04091095510825859,
"learning_rate": 2.1523623119361625e-06,
"loss": 2.3768,
"step": 31055
},
{
"epoch": 9.81055042249072,
"grad_norm": 0.04723473573002013,
"learning_rate": 2.1163528590494307e-06,
"loss": 2.3879,
"step": 31060
},
{
"epoch": 9.8121298270552,
"grad_norm": 0.04752032893714477,
"learning_rate": 2.080646856204127e-06,
"loss": 2.3903,
"step": 31065
},
{
"epoch": 9.81370923161968,
"grad_norm": 0.040396078273481684,
"learning_rate": 2.0452443142582323e-06,
"loss": 2.4456,
"step": 31070
},
{
"epoch": 9.815288636184158,
"grad_norm": 0.04194883118868972,
"learning_rate": 2.010145243977357e-06,
"loss": 2.4235,
"step": 31075
},
{
"epoch": 9.816868040748638,
"grad_norm": 0.03963730955798123,
"learning_rate": 1.975349656035075e-06,
"loss": 2.3923,
"step": 31080
},
{
"epoch": 9.818447445313117,
"grad_norm": 0.03779142611215144,
"learning_rate": 1.940857561012366e-06,
"loss": 2.3656,
"step": 31085
},
{
"epoch": 9.820026849877596,
"grad_norm": 0.05077742388623736,
"learning_rate": 1.9066689693981731e-06,
"loss": 2.3962,
"step": 31090
},
{
"epoch": 9.821606254442075,
"grad_norm": 0.04634979034819079,
"learning_rate": 1.8727838915888474e-06,
"loss": 2.3981,
"step": 31095
},
{
"epoch": 9.823185659006555,
"grad_norm": 0.0510649134998081,
"learning_rate": 1.8392023378888122e-06,
"loss": 2.2927,
"step": 31100
},
{
"epoch": 9.824765063571034,
"grad_norm": 0.055070622203620925,
"learning_rate": 1.8059243185097885e-06,
"loss": 2.4004,
"step": 31105
},
{
"epoch": 9.826344468135513,
"grad_norm": 0.04031151443518212,
"learning_rate": 1.7729498435716806e-06,
"loss": 2.4404,
"step": 31110
},
{
"epoch": 9.827923872699992,
"grad_norm": 0.057769974945689806,
"learning_rate": 1.7402789231015791e-06,
"loss": 2.4039,
"step": 31115
},
{
"epoch": 9.829503277264472,
"grad_norm": 0.044190660673983126,
"learning_rate": 1.7079115670346478e-06,
"loss": 2.373,
"step": 31120
},
{
"epoch": 9.831082681828951,
"grad_norm": 0.04084854306165555,
"learning_rate": 1.675847785213569e-06,
"loss": 2.3918,
"step": 31125
},
{
"epoch": 9.83266208639343,
"grad_norm": 0.04859205539272279,
"learning_rate": 1.6440875873886541e-06,
"loss": 2.4677,
"step": 31130
},
{
"epoch": 9.83424149095791,
"grad_norm": 0.04216777782005341,
"learning_rate": 1.6126309832180664e-06,
"loss": 2.4853,
"step": 31135
},
{
"epoch": 9.835820895522389,
"grad_norm": 0.04953110205193987,
"learning_rate": 1.5814779822674875e-06,
"loss": 2.4103,
"step": 31140
},
{
"epoch": 9.837400300086868,
"grad_norm": 0.04180295744217038,
"learning_rate": 1.5506285940103393e-06,
"loss": 2.3793,
"step": 31145
},
{
"epoch": 9.838979704651347,
"grad_norm": 0.047837444378521964,
"learning_rate": 1.5200828278278954e-06,
"loss": 2.4945,
"step": 31150
},
{
"epoch": 9.840559109215826,
"grad_norm": 0.04530582592693608,
"learning_rate": 1.489840693008726e-06,
"loss": 2.4009,
"step": 31155
},
{
"epoch": 9.842138513780306,
"grad_norm": 0.04291881654761109,
"learning_rate": 1.4599021987493632e-06,
"loss": 2.3659,
"step": 31160
},
{
"epoch": 9.843717918344783,
"grad_norm": 0.04355167250909392,
"learning_rate": 1.430267354153858e-06,
"loss": 2.4755,
"step": 31165
},
{
"epoch": 9.845297322909262,
"grad_norm": 0.0427130805628063,
"learning_rate": 1.4009361682340017e-06,
"loss": 2.3501,
"step": 31170
},
{
"epoch": 9.846876727473742,
"grad_norm": 0.040811771130643736,
"learning_rate": 1.3719086499092148e-06,
"loss": 2.3141,
"step": 31175
},
{
"epoch": 9.848456132038221,
"grad_norm": 0.040310308608363724,
"learning_rate": 1.3431848080066588e-06,
"loss": 2.3714,
"step": 31180
},
{
"epoch": 9.8500355366027,
"grad_norm": 0.045963168756802425,
"learning_rate": 1.3147646512610135e-06,
"loss": 2.3644,
"step": 31185
},
{
"epoch": 9.85161494116718,
"grad_norm": 0.03729718237169882,
"learning_rate": 1.2866481883146986e-06,
"loss": 2.347,
"step": 31190
},
{
"epoch": 9.853194345731659,
"grad_norm": 0.052023903170032054,
"learning_rate": 1.258835427717653e-06,
"loss": 2.4908,
"step": 31195
},
{
"epoch": 9.854773750296138,
"grad_norm": 0.04285587877456899,
"learning_rate": 1.2313263779275551e-06,
"loss": 2.3909,
"step": 31200
},
{
"epoch": 9.856353154860617,
"grad_norm": 0.043623908820302676,
"learning_rate": 1.2041210473098252e-06,
"loss": 2.3754,
"step": 31205
},
{
"epoch": 9.857932559425096,
"grad_norm": 0.0449197725350227,
"learning_rate": 1.1772194441374008e-06,
"loss": 2.4296,
"step": 31210
},
{
"epoch": 9.859511963989576,
"grad_norm": 0.040065117368542624,
"learning_rate": 1.15062157659096e-06,
"loss": 2.3674,
"step": 31215
},
{
"epoch": 9.861091368554055,
"grad_norm": 0.04012162349533012,
"learning_rate": 1.1243274527587e-06,
"loss": 2.3932,
"step": 31220
},
{
"epoch": 9.862670773118534,
"grad_norm": 0.048506253143626714,
"learning_rate": 1.0983370806363358e-06,
"loss": 2.4138,
"step": 31225
},
{
"epoch": 9.864250177683013,
"grad_norm": 0.041801382362332766,
"learning_rate": 1.0726504681275452e-06,
"loss": 2.4828,
"step": 31230
},
{
"epoch": 9.865829582247493,
"grad_norm": 0.04437419891825854,
"learning_rate": 1.047267623043524e-06,
"loss": 2.3569,
"step": 31235
},
{
"epoch": 9.867408986811972,
"grad_norm": 0.048059431546183196,
"learning_rate": 1.0221885531027652e-06,
"loss": 2.4038,
"step": 31240
},
{
"epoch": 9.868988391376451,
"grad_norm": 0.04839714228324445,
"learning_rate": 9.974132659319458e-07,
"loss": 2.4167,
"step": 31245
},
{
"epoch": 9.87056779594093,
"grad_norm": 0.0499803154894237,
"learning_rate": 9.729417690649279e-07,
"loss": 2.4503,
"step": 31250
},
{
"epoch": 9.87214720050541,
"grad_norm": 0.04442938220004997,
"learning_rate": 9.487740699433145e-07,
"loss": 2.4478,
"step": 31255
},
{
"epoch": 9.873726605069889,
"grad_norm": 0.03881726819260687,
"learning_rate": 9.249101759164491e-07,
"loss": 2.3652,
"step": 31260
},
{
"epoch": 9.875306009634368,
"grad_norm": 0.044093565278199146,
"learning_rate": 9.013500942410824e-07,
"loss": 2.3736,
"step": 31265
},
{
"epoch": 9.876885414198847,
"grad_norm": 0.05108213400375329,
"learning_rate": 8.780938320817056e-07,
"loss": 2.4016,
"step": 31270
},
{
"epoch": 9.878464818763327,
"grad_norm": 0.043450172850814976,
"learning_rate": 8.551413965105504e-07,
"loss": 2.4129,
"step": 31275
},
{
"epoch": 9.880044223327806,
"grad_norm": 0.04571056199368923,
"learning_rate": 8.324927945070337e-07,
"loss": 2.3449,
"step": 31280
},
{
"epoch": 9.881623627892285,
"grad_norm": 0.03929514397498466,
"learning_rate": 8.101480329587574e-07,
"loss": 2.4344,
"step": 31285
},
{
"epoch": 9.883203032456764,
"grad_norm": 0.05605053636602669,
"learning_rate": 7.881071186602861e-07,
"loss": 2.3708,
"step": 31290
},
{
"epoch": 9.884782437021244,
"grad_norm": 0.040096833215611215,
"learning_rate": 7.663700583144806e-07,
"loss": 2.4076,
"step": 31295
},
{
"epoch": 9.886361841585723,
"grad_norm": 0.04414612578297414,
"learning_rate": 7.449368585311644e-07,
"loss": 2.4094,
"step": 31300
},
{
"epoch": 9.887941246150202,
"grad_norm": 0.04143465975472764,
"learning_rate": 7.23807525828124e-07,
"loss": 2.3948,
"step": 31305
},
{
"epoch": 9.889520650714681,
"grad_norm": 0.03961622831463739,
"learning_rate": 7.029820666306641e-07,
"loss": 2.402,
"step": 31310
},
{
"epoch": 9.89110005527916,
"grad_norm": 0.05701225564541441,
"learning_rate": 6.824604872717188e-07,
"loss": 2.4436,
"step": 31315
},
{
"epoch": 9.892679459843638,
"grad_norm": 0.04003804695594368,
"learning_rate": 6.622427939916298e-07,
"loss": 2.263,
"step": 31320
},
{
"epoch": 9.894258864408119,
"grad_norm": 0.05036660745155673,
"learning_rate": 6.4232899293859e-07,
"loss": 2.4116,
"step": 31325
},
{
"epoch": 9.895838268972597,
"grad_norm": 0.04904166864564292,
"learning_rate": 6.22719090168311e-07,
"loss": 2.37,
"step": 31330
},
{
"epoch": 9.897417673537076,
"grad_norm": 0.053753816783488964,
"learning_rate": 6.034130916439118e-07,
"loss": 2.3696,
"step": 31335
},
{
"epoch": 9.898997078101555,
"grad_norm": 0.03768381693534451,
"learning_rate": 5.844110032362515e-07,
"loss": 2.416,
"step": 31340
},
{
"epoch": 9.900576482666034,
"grad_norm": 0.03941877065191387,
"learning_rate": 5.65712830723708e-07,
"loss": 2.4877,
"step": 31345
},
{
"epoch": 9.902155887230514,
"grad_norm": 0.060231101967682533,
"learning_rate": 5.473185797923996e-07,
"loss": 2.4678,
"step": 31350
},
{
"epoch": 9.903735291794993,
"grad_norm": 0.05347136728291984,
"learning_rate": 5.292282560358519e-07,
"loss": 2.4145,
"step": 31355
},
{
"epoch": 9.905314696359472,
"grad_norm": 0.039082639537203874,
"learning_rate": 5.114418649552199e-07,
"loss": 2.3073,
"step": 31360
},
{
"epoch": 9.906894100923951,
"grad_norm": 0.044346146786145166,
"learning_rate": 4.939594119590663e-07,
"loss": 2.3193,
"step": 31365
},
{
"epoch": 9.90847350548843,
"grad_norm": 0.040929624910032235,
"learning_rate": 4.767809023639158e-07,
"loss": 2.4079,
"step": 31370
},
{
"epoch": 9.91005291005291,
"grad_norm": 0.04329208176770207,
"learning_rate": 4.5990634139359e-07,
"loss": 2.4446,
"step": 31375
},
{
"epoch": 9.911632314617389,
"grad_norm": 0.07861902074514227,
"learning_rate": 4.433357341795396e-07,
"loss": 2.4424,
"step": 31380
},
{
"epoch": 9.913211719181868,
"grad_norm": 0.047631633008037584,
"learning_rate": 4.27069085760623e-07,
"loss": 2.3565,
"step": 31385
},
{
"epoch": 9.914791123746348,
"grad_norm": 0.03730679415803507,
"learning_rate": 4.111064010836607e-07,
"loss": 2.4453,
"step": 31390
},
{
"epoch": 9.916370528310827,
"grad_norm": 0.03886844888904772,
"learning_rate": 3.954476850026589e-07,
"loss": 2.2789,
"step": 31395
},
{
"epoch": 9.917949932875306,
"grad_norm": 0.04317889692874003,
"learning_rate": 3.800929422793642e-07,
"loss": 2.257,
"step": 31400
},
{
"epoch": 9.919529337439785,
"grad_norm": 0.040823580863975674,
"learning_rate": 3.6504217758304147e-07,
"loss": 2.3185,
"step": 31405
},
{
"epoch": 9.921108742004265,
"grad_norm": 0.039782695619364115,
"learning_rate": 3.502953954905852e-07,
"loss": 2.2989,
"step": 31410
},
{
"epoch": 9.922688146568744,
"grad_norm": 0.04409438948650292,
"learning_rate": 3.3585260048629717e-07,
"loss": 2.3599,
"step": 31415
},
{
"epoch": 9.924267551133223,
"grad_norm": 0.05089332354941331,
"learning_rate": 3.217137969622197e-07,
"loss": 2.3699,
"step": 31420
},
{
"epoch": 9.925846955697702,
"grad_norm": 0.041525466589508155,
"learning_rate": 3.078789892179135e-07,
"loss": 2.3521,
"step": 31425
},
{
"epoch": 9.927426360262182,
"grad_norm": 0.05122548282281933,
"learning_rate": 2.943481814603466e-07,
"loss": 2.3181,
"step": 31430
},
{
"epoch": 9.92900576482666,
"grad_norm": 0.037858106490544545,
"learning_rate": 2.8112137780422765e-07,
"loss": 2.3224,
"step": 31435
},
{
"epoch": 9.93058516939114,
"grad_norm": 0.04483147270983839,
"learning_rate": 2.681985822716726e-07,
"loss": 2.44,
"step": 31440
},
{
"epoch": 9.93216457395562,
"grad_norm": 0.0419017418322362,
"learning_rate": 2.555797987924269e-07,
"loss": 2.35,
"step": 31445
},
{
"epoch": 9.933743978520098,
"grad_norm": 0.04684765994945669,
"learning_rate": 2.4326503120397634e-07,
"loss": 2.3613,
"step": 31450
},
{
"epoch": 9.935323383084578,
"grad_norm": 0.04900510243746249,
"learning_rate": 2.3125428325088127e-07,
"loss": 2.391,
"step": 31455
},
{
"epoch": 9.936902787649057,
"grad_norm": 0.05155968176817936,
"learning_rate": 2.1954755858566432e-07,
"loss": 2.3253,
"step": 31460
},
{
"epoch": 9.938482192213536,
"grad_norm": 0.04528411011174494,
"learning_rate": 2.0814486076825566e-07,
"loss": 2.4523,
"step": 31465
},
{
"epoch": 9.940061596778015,
"grad_norm": 0.04943254274441148,
"learning_rate": 1.9704619326621487e-07,
"loss": 2.389,
"step": 31470
},
{
"epoch": 9.941641001342493,
"grad_norm": 0.042564905701143275,
"learning_rate": 1.862515594545089e-07,
"loss": 2.3866,
"step": 31475
},
{
"epoch": 9.943220405906974,
"grad_norm": 0.04164976415564028,
"learning_rate": 1.7576096261562313e-07,
"loss": 2.3793,
"step": 31480
},
{
"epoch": 9.944799810471451,
"grad_norm": 0.0452706836490186,
"learning_rate": 1.6557440593989448e-07,
"loss": 2.3188,
"step": 31485
},
{
"epoch": 9.94637921503593,
"grad_norm": 0.04956588186054781,
"learning_rate": 1.5569189252473415e-07,
"loss": 2.3585,
"step": 31490
},
{
"epoch": 9.94795861960041,
"grad_norm": 0.04468823103013206,
"learning_rate": 1.4611342537562688e-07,
"loss": 2.4117,
"step": 31495
},
{
"epoch": 9.94953802416489,
"grad_norm": 0.03982867230890297,
"learning_rate": 1.3683900740513178e-07,
"loss": 2.3792,
"step": 31500
},
{
"epoch": 9.951117428729368,
"grad_norm": 0.04261144088253554,
"learning_rate": 1.2786864143354837e-07,
"loss": 2.4035,
"step": 31505
},
{
"epoch": 9.952696833293848,
"grad_norm": 0.05113283763969754,
"learning_rate": 1.1920233018880566e-07,
"loss": 2.4228,
"step": 31510
},
{
"epoch": 9.954276237858327,
"grad_norm": 0.04560790509600287,
"learning_rate": 1.1084007630612903e-07,
"loss": 2.3877,
"step": 31515
},
{
"epoch": 9.955855642422806,
"grad_norm": 0.05556904301586004,
"learning_rate": 1.0278188232859531e-07,
"loss": 2.3437,
"step": 31520
},
{
"epoch": 9.957435046987285,
"grad_norm": 0.04668718987766734,
"learning_rate": 9.502775070657776e-08,
"loss": 2.5098,
"step": 31525
},
{
"epoch": 9.959014451551765,
"grad_norm": 0.046887195489042646,
"learning_rate": 8.757768379796804e-08,
"loss": 2.4029,
"step": 31530
},
{
"epoch": 9.960593856116244,
"grad_norm": 0.041706676304071585,
"learning_rate": 8.043168386839827e-08,
"loss": 2.3795,
"step": 31535
},
{
"epoch": 9.962173260680723,
"grad_norm": 0.04488075360311965,
"learning_rate": 7.358975309090799e-08,
"loss": 2.3347,
"step": 31540
},
{
"epoch": 9.963752665245202,
"grad_norm": 0.04196027227225851,
"learning_rate": 6.705189354616615e-08,
"loss": 2.4042,
"step": 31545
},
{
"epoch": 9.965332069809682,
"grad_norm": 0.045232793995175886,
"learning_rate": 6.081810722202707e-08,
"loss": 2.3868,
"step": 31550
},
{
"epoch": 9.966911474374161,
"grad_norm": 0.041741344406426024,
"learning_rate": 5.488839601441864e-08,
"loss": 2.3922,
"step": 31555
},
{
"epoch": 9.96849087893864,
"grad_norm": 0.04742214796670987,
"learning_rate": 4.926276172645405e-08,
"loss": 2.4131,
"step": 31560
},
{
"epoch": 9.97007028350312,
"grad_norm": 0.042601983893819445,
"learning_rate": 4.394120606876495e-08,
"loss": 2.3363,
"step": 31565
},
{
"epoch": 9.971649688067599,
"grad_norm": 0.04327385252232404,
"learning_rate": 3.8923730659612414e-08,
"loss": 2.484,
"step": 31570
},
{
"epoch": 9.973229092632078,
"grad_norm": 0.04202420797597011,
"learning_rate": 3.4210337024886964e-08,
"loss": 2.2765,
"step": 31575
},
{
"epoch": 9.974808497196557,
"grad_norm": 0.041801096069587484,
"learning_rate": 2.9801026597775505e-08,
"loss": 2.4922,
"step": 31580
},
{
"epoch": 9.976387901761036,
"grad_norm": 0.0391322209059303,
"learning_rate": 2.5695800719205408e-08,
"loss": 2.4308,
"step": 31585
},
{
"epoch": 9.977967306325516,
"grad_norm": 0.044431361724413,
"learning_rate": 2.1894660637622467e-08,
"loss": 2.3047,
"step": 31590
},
{
"epoch": 9.979546710889995,
"grad_norm": 0.044143168004319534,
"learning_rate": 1.8397607508768842e-08,
"loss": 2.3034,
"step": 31595
},
{
"epoch": 9.981126115454474,
"grad_norm": 0.04273657060682564,
"learning_rate": 1.5204642396127178e-08,
"loss": 2.3248,
"step": 31600
},
{
"epoch": 9.982705520018953,
"grad_norm": 0.03915895438328436,
"learning_rate": 1.2315766270698526e-08,
"loss": 2.428,
"step": 31605
},
{
"epoch": 9.984284924583433,
"grad_norm": 0.054851881731455776,
"learning_rate": 9.730980010891343e-09,
"loss": 2.4559,
"step": 31610
},
{
"epoch": 9.985864329147912,
"grad_norm": 0.03955674353438591,
"learning_rate": 7.450284402854557e-09,
"loss": 2.3912,
"step": 31615
},
{
"epoch": 9.987443733712391,
"grad_norm": 0.03918676660618922,
"learning_rate": 5.473680140033466e-09,
"loss": 2.3536,
"step": 31620
},
{
"epoch": 9.98902313827687,
"grad_norm": 0.038162441992950706,
"learning_rate": 3.801167823502816e-09,
"loss": 2.3491,
"step": 31625
},
{
"epoch": 9.99060254284135,
"grad_norm": 0.04205200312824167,
"learning_rate": 2.4327479618557746e-09,
"loss": 2.4484,
"step": 31630
},
{
"epoch": 9.992181947405829,
"grad_norm": 0.04447096819750325,
"learning_rate": 1.3684209713149542e-09,
"loss": 2.4543,
"step": 31635
},
{
"epoch": 9.993761351970306,
"grad_norm": 0.0503482955066234,
"learning_rate": 6.08187175399344e-10,
"loss": 2.3356,
"step": 31640
},
{
"epoch": 9.995340756534786,
"grad_norm": 0.04672093855240827,
"learning_rate": 1.5204680536839987e-10,
"loss": 2.4031,
"step": 31645
},
{
"epoch": 9.996920161099265,
"grad_norm": 0.03833379789275092,
"learning_rate": 0.0,
"loss": 2.4458,
"step": 31650
},
{
"epoch": 9.996920161099265,
"eval_loss": 2.395866870880127,
"eval_runtime": 118.7072,
"eval_samples_per_second": 22.315,
"eval_steps_per_second": 5.585,
"step": 31650
},
{
"epoch": 9.996920161099265,
"step": 31650,
"total_flos": 9.148115283266765e+16,
"train_loss": 2.7709856205679606,
"train_runtime": 79261.052,
"train_samples_per_second": 6.39,
"train_steps_per_second": 0.399
}
],
"logging_steps": 5,
"max_steps": 31650,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.148115283266765e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}