Files
titulm-llama-3.2-3b-v1.1/trainer_state.json
ModelHub XC d697383c9d 初始化项目,由ModelHub XC社区提供模型
Model: hishab/titulm-llama-3.2-3b-v1.1
Source: Original Platform
2026-05-30 23:59:24 +08:00

32740 lines
799 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999197238500441,
"eval_steps": 500,
"global_step": 4671,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00021406973321559498,
"grad_norm": 1.1166919279503258,
"learning_rate": 8.510638297872341e-07,
"loss": 1.1827,
"step": 1
},
{
"epoch": 0.00042813946643118997,
"grad_norm": 1.1708289727451116,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.1359,
"step": 2
},
{
"epoch": 0.0006422091996467849,
"grad_norm": 1.0907211593333068,
"learning_rate": 2.553191489361702e-06,
"loss": 1.1557,
"step": 3
},
{
"epoch": 0.0008562789328623799,
"grad_norm": 1.0197824487418166,
"learning_rate": 3.4042553191489363e-06,
"loss": 1.1925,
"step": 4
},
{
"epoch": 0.001070348666077975,
"grad_norm": 0.7931135221390712,
"learning_rate": 4.255319148936171e-06,
"loss": 1.1818,
"step": 5
},
{
"epoch": 0.0012844183992935698,
"grad_norm": 1.2062759929754334,
"learning_rate": 5.106382978723404e-06,
"loss": 1.1836,
"step": 6
},
{
"epoch": 0.001498488132509165,
"grad_norm": 0.7236664543575566,
"learning_rate": 5.957446808510638e-06,
"loss": 1.1129,
"step": 7
},
{
"epoch": 0.0017125578657247599,
"grad_norm": 0.876566120952773,
"learning_rate": 6.808510638297873e-06,
"loss": 1.1466,
"step": 8
},
{
"epoch": 0.0019266275989403548,
"grad_norm": 0.9927832245845079,
"learning_rate": 7.659574468085107e-06,
"loss": 1.089,
"step": 9
},
{
"epoch": 0.00214069733215595,
"grad_norm": 0.6601070620009425,
"learning_rate": 8.510638297872341e-06,
"loss": 1.0964,
"step": 10
},
{
"epoch": 0.0023547670653715448,
"grad_norm": 1.2645696188115831,
"learning_rate": 9.361702127659576e-06,
"loss": 1.1111,
"step": 11
},
{
"epoch": 0.0025688367985871397,
"grad_norm": 0.9038447023024733,
"learning_rate": 1.0212765957446808e-05,
"loss": 1.1165,
"step": 12
},
{
"epoch": 0.0027829065318027346,
"grad_norm": 0.7339109817654351,
"learning_rate": 1.1063829787234044e-05,
"loss": 1.1966,
"step": 13
},
{
"epoch": 0.00299697626501833,
"grad_norm": 0.9353478021948082,
"learning_rate": 1.1914893617021277e-05,
"loss": 1.1288,
"step": 14
},
{
"epoch": 0.003211045998233925,
"grad_norm": 0.7288553042986659,
"learning_rate": 1.2765957446808513e-05,
"loss": 1.1335,
"step": 15
},
{
"epoch": 0.0034251157314495197,
"grad_norm": 0.7356599947982664,
"learning_rate": 1.3617021276595745e-05,
"loss": 1.07,
"step": 16
},
{
"epoch": 0.0036391854646651146,
"grad_norm": 0.8088448974520007,
"learning_rate": 1.4468085106382981e-05,
"loss": 1.0834,
"step": 17
},
{
"epoch": 0.0038532551978807095,
"grad_norm": 0.6185232663513837,
"learning_rate": 1.5319148936170214e-05,
"loss": 1.0397,
"step": 18
},
{
"epoch": 0.004067324931096305,
"grad_norm": 0.6953950133146246,
"learning_rate": 1.6170212765957446e-05,
"loss": 1.0825,
"step": 19
},
{
"epoch": 0.0042813946643119,
"grad_norm": 0.6338356520863616,
"learning_rate": 1.7021276595744682e-05,
"loss": 1.0906,
"step": 20
},
{
"epoch": 0.004495464397527495,
"grad_norm": 0.5541703704791683,
"learning_rate": 1.7872340425531915e-05,
"loss": 1.0918,
"step": 21
},
{
"epoch": 0.0047095341307430896,
"grad_norm": 0.5579133715396074,
"learning_rate": 1.872340425531915e-05,
"loss": 1.0435,
"step": 22
},
{
"epoch": 0.0049236038639586845,
"grad_norm": 0.5909042405991046,
"learning_rate": 1.9574468085106384e-05,
"loss": 1.1142,
"step": 23
},
{
"epoch": 0.005137673597174279,
"grad_norm": 0.601096746808294,
"learning_rate": 2.0425531914893616e-05,
"loss": 1.0311,
"step": 24
},
{
"epoch": 0.005351743330389874,
"grad_norm": 0.606153601431191,
"learning_rate": 2.1276595744680852e-05,
"loss": 1.0409,
"step": 25
},
{
"epoch": 0.005565813063605469,
"grad_norm": 0.7133301692826983,
"learning_rate": 2.2127659574468088e-05,
"loss": 1.0529,
"step": 26
},
{
"epoch": 0.005779882796821064,
"grad_norm": 0.9435346016766639,
"learning_rate": 2.2978723404255324e-05,
"loss": 1.0484,
"step": 27
},
{
"epoch": 0.00599395253003666,
"grad_norm": 1.176958680484456,
"learning_rate": 2.3829787234042553e-05,
"loss": 1.057,
"step": 28
},
{
"epoch": 0.006208022263252255,
"grad_norm": 0.7760780885243199,
"learning_rate": 2.468085106382979e-05,
"loss": 1.0051,
"step": 29
},
{
"epoch": 0.00642209199646785,
"grad_norm": 0.6677333961706371,
"learning_rate": 2.5531914893617025e-05,
"loss": 0.9905,
"step": 30
},
{
"epoch": 0.0066361617296834445,
"grad_norm": 0.834480954253743,
"learning_rate": 2.6382978723404255e-05,
"loss": 1.0342,
"step": 31
},
{
"epoch": 0.0068502314628990394,
"grad_norm": 1.0974033292691274,
"learning_rate": 2.723404255319149e-05,
"loss": 1.0149,
"step": 32
},
{
"epoch": 0.007064301196114634,
"grad_norm": 1.038976902577752,
"learning_rate": 2.8085106382978727e-05,
"loss": 1.0572,
"step": 33
},
{
"epoch": 0.007278370929330229,
"grad_norm": 0.8852689841618762,
"learning_rate": 2.8936170212765963e-05,
"loss": 0.9999,
"step": 34
},
{
"epoch": 0.007492440662545824,
"grad_norm": 0.9301997082176462,
"learning_rate": 2.9787234042553192e-05,
"loss": 1.0109,
"step": 35
},
{
"epoch": 0.007706510395761419,
"grad_norm": 1.5082522529592066,
"learning_rate": 3.063829787234043e-05,
"loss": 1.0071,
"step": 36
},
{
"epoch": 0.007920580128977015,
"grad_norm": 0.7195107242376084,
"learning_rate": 3.1489361702127664e-05,
"loss": 1.0669,
"step": 37
},
{
"epoch": 0.00813464986219261,
"grad_norm": 0.9748082972789284,
"learning_rate": 3.234042553191489e-05,
"loss": 0.9628,
"step": 38
},
{
"epoch": 0.008348719595408205,
"grad_norm": 1.1875078001762558,
"learning_rate": 3.319148936170213e-05,
"loss": 0.9952,
"step": 39
},
{
"epoch": 0.0085627893286238,
"grad_norm": 1.0391692344066028,
"learning_rate": 3.4042553191489365e-05,
"loss": 1.0394,
"step": 40
},
{
"epoch": 0.008776859061839394,
"grad_norm": 1.4756395878896853,
"learning_rate": 3.48936170212766e-05,
"loss": 1.0107,
"step": 41
},
{
"epoch": 0.00899092879505499,
"grad_norm": 0.6635851734986676,
"learning_rate": 3.574468085106383e-05,
"loss": 0.9681,
"step": 42
},
{
"epoch": 0.009204998528270584,
"grad_norm": 1.2729496957274005,
"learning_rate": 3.6595744680851066e-05,
"loss": 0.9411,
"step": 43
},
{
"epoch": 0.009419068261486179,
"grad_norm": 0.7233378367122119,
"learning_rate": 3.74468085106383e-05,
"loss": 0.9916,
"step": 44
},
{
"epoch": 0.009633137994701774,
"grad_norm": 1.1693159971090483,
"learning_rate": 3.829787234042554e-05,
"loss": 0.9817,
"step": 45
},
{
"epoch": 0.009847207727917369,
"grad_norm": 0.7455612181912622,
"learning_rate": 3.914893617021277e-05,
"loss": 0.9939,
"step": 46
},
{
"epoch": 0.010061277461132964,
"grad_norm": 1.290433563215881,
"learning_rate": 4e-05,
"loss": 0.9576,
"step": 47
},
{
"epoch": 0.010275347194348559,
"grad_norm": 1.1054549329447891,
"learning_rate": 3.999999538401831e-05,
"loss": 0.9123,
"step": 48
},
{
"epoch": 0.010489416927564154,
"grad_norm": 1.0856558491489532,
"learning_rate": 3.999998153607536e-05,
"loss": 0.9401,
"step": 49
},
{
"epoch": 0.010703486660779749,
"grad_norm": 1.0090349725115968,
"learning_rate": 3.9999958456177544e-05,
"loss": 0.9271,
"step": 50
},
{
"epoch": 0.010917556393995343,
"grad_norm": 1.2254714038725856,
"learning_rate": 3.999992614433551e-05,
"loss": 0.9731,
"step": 51
},
{
"epoch": 0.011131626127210938,
"grad_norm": 0.7146351542113133,
"learning_rate": 3.999988460056418e-05,
"loss": 0.951,
"step": 52
},
{
"epoch": 0.011345695860426533,
"grad_norm": 0.8766895624956862,
"learning_rate": 3.999983382488274e-05,
"loss": 0.9421,
"step": 53
},
{
"epoch": 0.011559765593642128,
"grad_norm": 1.0455184491547953,
"learning_rate": 3.99997738173146e-05,
"loss": 0.9302,
"step": 54
},
{
"epoch": 0.011773835326857725,
"grad_norm": 0.9023316909274607,
"learning_rate": 3.9999704577887497e-05,
"loss": 0.9737,
"step": 55
},
{
"epoch": 0.01198790506007332,
"grad_norm": 0.8468310557786813,
"learning_rate": 3.9999626106633364e-05,
"loss": 0.9569,
"step": 56
},
{
"epoch": 0.012201974793288915,
"grad_norm": 0.7796679102876239,
"learning_rate": 3.9999538403588424e-05,
"loss": 0.959,
"step": 57
},
{
"epoch": 0.01241604452650451,
"grad_norm": 0.7611741736834936,
"learning_rate": 3.999944146879317e-05,
"loss": 0.9388,
"step": 58
},
{
"epoch": 0.012630114259720104,
"grad_norm": 0.6692315664257814,
"learning_rate": 3.999933530229235e-05,
"loss": 1.0062,
"step": 59
},
{
"epoch": 0.0128441839929357,
"grad_norm": 0.5812982258666269,
"learning_rate": 3.999921990413496e-05,
"loss": 0.9834,
"step": 60
},
{
"epoch": 0.013058253726151294,
"grad_norm": 0.6197457280615808,
"learning_rate": 3.9999095274374274e-05,
"loss": 0.9347,
"step": 61
},
{
"epoch": 0.013272323459366889,
"grad_norm": 0.5540448143621675,
"learning_rate": 3.999896141306782e-05,
"loss": 0.9421,
"step": 62
},
{
"epoch": 0.013486393192582484,
"grad_norm": 0.5797201354380859,
"learning_rate": 3.999881832027739e-05,
"loss": 0.9639,
"step": 63
},
{
"epoch": 0.013700462925798079,
"grad_norm": 0.5593714489515639,
"learning_rate": 3.999866599606903e-05,
"loss": 0.9042,
"step": 64
},
{
"epoch": 0.013914532659013674,
"grad_norm": 0.4702809339742805,
"learning_rate": 3.9998504440513055e-05,
"loss": 0.921,
"step": 65
},
{
"epoch": 0.014128602392229269,
"grad_norm": 0.89907196888218,
"learning_rate": 3.999833365368403e-05,
"loss": 0.9141,
"step": 66
},
{
"epoch": 0.014342672125444864,
"grad_norm": 0.47718688760050365,
"learning_rate": 3.999815363566081e-05,
"loss": 0.9056,
"step": 67
},
{
"epoch": 0.014556741858660458,
"grad_norm": 0.4821685450306697,
"learning_rate": 3.999796438652648e-05,
"loss": 0.9617,
"step": 68
},
{
"epoch": 0.014770811591876053,
"grad_norm": 0.5711772498703427,
"learning_rate": 3.9997765906368394e-05,
"loss": 0.9217,
"step": 69
},
{
"epoch": 0.014984881325091648,
"grad_norm": 0.5043638438184408,
"learning_rate": 3.999755819527817e-05,
"loss": 0.9546,
"step": 70
},
{
"epoch": 0.015198951058307243,
"grad_norm": 0.51402390704286,
"learning_rate": 3.999734125335169e-05,
"loss": 0.9214,
"step": 71
},
{
"epoch": 0.015413020791522838,
"grad_norm": 0.5094766948616595,
"learning_rate": 3.99971150806891e-05,
"loss": 0.9068,
"step": 72
},
{
"epoch": 0.015627090524738433,
"grad_norm": 0.5203099086883713,
"learning_rate": 3.99968796773948e-05,
"loss": 0.9343,
"step": 73
},
{
"epoch": 0.01584116025795403,
"grad_norm": 0.5036751618637034,
"learning_rate": 3.999663504357743e-05,
"loss": 0.9594,
"step": 74
},
{
"epoch": 0.016055229991169623,
"grad_norm": 0.46537339509159126,
"learning_rate": 3.999638117934994e-05,
"loss": 0.9341,
"step": 75
},
{
"epoch": 0.01626929972438522,
"grad_norm": 0.48878004609491565,
"learning_rate": 3.99961180848295e-05,
"loss": 0.9085,
"step": 76
},
{
"epoch": 0.016483369457600813,
"grad_norm": 0.5050176828983611,
"learning_rate": 3.9995845760137556e-05,
"loss": 0.945,
"step": 77
},
{
"epoch": 0.01669743919081641,
"grad_norm": 0.4990493792937551,
"learning_rate": 3.999556420539981e-05,
"loss": 0.9205,
"step": 78
},
{
"epoch": 0.016911508924032002,
"grad_norm": 0.6134109651957985,
"learning_rate": 3.9995273420746235e-05,
"loss": 0.8763,
"step": 79
},
{
"epoch": 0.0171255786572476,
"grad_norm": 0.7417954503922389,
"learning_rate": 3.999497340631106e-05,
"loss": 0.9216,
"step": 80
},
{
"epoch": 0.017339648390463192,
"grad_norm": 0.8060670544765532,
"learning_rate": 3.999466416223275e-05,
"loss": 0.9099,
"step": 81
},
{
"epoch": 0.01755371812367879,
"grad_norm": 0.8087969162506505,
"learning_rate": 3.9994345688654063e-05,
"loss": 0.9038,
"step": 82
},
{
"epoch": 0.017767787856894382,
"grad_norm": 0.7004889255347019,
"learning_rate": 3.999401798572201e-05,
"loss": 0.9014,
"step": 83
},
{
"epoch": 0.01798185759010998,
"grad_norm": 0.5802585102032293,
"learning_rate": 3.999368105358786e-05,
"loss": 0.9031,
"step": 84
},
{
"epoch": 0.018195927323325572,
"grad_norm": 0.6408609753897807,
"learning_rate": 3.9993334892407135e-05,
"loss": 0.895,
"step": 85
},
{
"epoch": 0.01840999705654117,
"grad_norm": 0.6980937286960113,
"learning_rate": 3.999297950233962e-05,
"loss": 0.905,
"step": 86
},
{
"epoch": 0.01862406678975676,
"grad_norm": 0.7130072915591322,
"learning_rate": 3.999261488354937e-05,
"loss": 0.8795,
"step": 87
},
{
"epoch": 0.018838136522972358,
"grad_norm": 0.6452008052964264,
"learning_rate": 3.999224103620468e-05,
"loss": 0.8989,
"step": 88
},
{
"epoch": 0.019052206256187955,
"grad_norm": 0.5404873483636125,
"learning_rate": 3.999185796047813e-05,
"loss": 0.8825,
"step": 89
},
{
"epoch": 0.019266275989403548,
"grad_norm": 0.6099564670308534,
"learning_rate": 3.9991465656546536e-05,
"loss": 0.892,
"step": 90
},
{
"epoch": 0.019480345722619145,
"grad_norm": 0.6069250708476934,
"learning_rate": 3.9991064124591e-05,
"loss": 0.9067,
"step": 91
},
{
"epoch": 0.019694415455834738,
"grad_norm": 0.5483146337131054,
"learning_rate": 3.999065336479685e-05,
"loss": 0.9025,
"step": 92
},
{
"epoch": 0.019908485189050334,
"grad_norm": 0.44566955352426096,
"learning_rate": 3.9990233377353706e-05,
"loss": 0.9234,
"step": 93
},
{
"epoch": 0.020122554922265928,
"grad_norm": 0.4936248836222646,
"learning_rate": 3.998980416245543e-05,
"loss": 0.9049,
"step": 94
},
{
"epoch": 0.020336624655481524,
"grad_norm": 0.5566756740675656,
"learning_rate": 3.998936572030015e-05,
"loss": 0.9151,
"step": 95
},
{
"epoch": 0.020550694388697117,
"grad_norm": 0.49376355752853496,
"learning_rate": 3.998891805109024e-05,
"loss": 0.904,
"step": 96
},
{
"epoch": 0.020764764121912714,
"grad_norm": 0.4792881635842234,
"learning_rate": 3.9988461155032344e-05,
"loss": 0.8702,
"step": 97
},
{
"epoch": 0.020978833855128307,
"grad_norm": 0.5267230099582725,
"learning_rate": 3.998799503233738e-05,
"loss": 0.8907,
"step": 98
},
{
"epoch": 0.021192903588343904,
"grad_norm": 0.467618771139337,
"learning_rate": 3.9987519683220483e-05,
"loss": 0.8849,
"step": 99
},
{
"epoch": 0.021406973321559497,
"grad_norm": 0.48943080169295844,
"learning_rate": 3.99870351079011e-05,
"loss": 0.8757,
"step": 100
},
{
"epoch": 0.021621043054775094,
"grad_norm": 0.5455761297358167,
"learning_rate": 3.9986541306602894e-05,
"loss": 0.874,
"step": 101
},
{
"epoch": 0.021835112787990687,
"grad_norm": 0.5544546890537473,
"learning_rate": 3.998603827955381e-05,
"loss": 0.8614,
"step": 102
},
{
"epoch": 0.022049182521206283,
"grad_norm": 0.5223485214178217,
"learning_rate": 3.9985526026986046e-05,
"loss": 0.8871,
"step": 103
},
{
"epoch": 0.022263252254421877,
"grad_norm": 0.562423681293549,
"learning_rate": 3.998500454913605e-05,
"loss": 0.9012,
"step": 104
},
{
"epoch": 0.022477321987637473,
"grad_norm": 0.6315099710292131,
"learning_rate": 3.998447384624454e-05,
"loss": 0.8732,
"step": 105
},
{
"epoch": 0.022691391720853066,
"grad_norm": 0.5791228632908744,
"learning_rate": 3.9983933918556476e-05,
"loss": 0.8617,
"step": 106
},
{
"epoch": 0.022905461454068663,
"grad_norm": 0.5634022864648549,
"learning_rate": 3.9983384766321106e-05,
"loss": 0.853,
"step": 107
},
{
"epoch": 0.023119531187284256,
"grad_norm": 0.5070564223042243,
"learning_rate": 3.99828263897919e-05,
"loss": 0.8726,
"step": 108
},
{
"epoch": 0.023333600920499853,
"grad_norm": 0.3773709430324831,
"learning_rate": 3.9982258789226625e-05,
"loss": 0.9322,
"step": 109
},
{
"epoch": 0.02354767065371545,
"grad_norm": 0.3609456838554747,
"learning_rate": 3.998168196488727e-05,
"loss": 0.8814,
"step": 110
},
{
"epoch": 0.023761740386931043,
"grad_norm": 0.412295909527699,
"learning_rate": 3.9981095917040094e-05,
"loss": 0.8747,
"step": 111
},
{
"epoch": 0.02397581012014664,
"grad_norm": 0.3762488153855239,
"learning_rate": 3.998050064595562e-05,
"loss": 0.8616,
"step": 112
},
{
"epoch": 0.024189879853362233,
"grad_norm": 0.36866061675524436,
"learning_rate": 3.997989615190862e-05,
"loss": 0.8622,
"step": 113
},
{
"epoch": 0.02440394958657783,
"grad_norm": 0.47042581407904815,
"learning_rate": 3.9979282435178135e-05,
"loss": 0.9049,
"step": 114
},
{
"epoch": 0.024618019319793422,
"grad_norm": 0.37162201772872094,
"learning_rate": 3.9978659496047456e-05,
"loss": 0.8515,
"step": 115
},
{
"epoch": 0.02483208905300902,
"grad_norm": 0.37771982910788005,
"learning_rate": 3.997802733480412e-05,
"loss": 0.8841,
"step": 116
},
{
"epoch": 0.025046158786224612,
"grad_norm": 0.3798986424281923,
"learning_rate": 3.9977385951739935e-05,
"loss": 0.8686,
"step": 117
},
{
"epoch": 0.02526022851944021,
"grad_norm": 0.3799809323160474,
"learning_rate": 3.997673534715097e-05,
"loss": 0.8673,
"step": 118
},
{
"epoch": 0.025474298252655802,
"grad_norm": 0.4943355337856385,
"learning_rate": 3.9976075521337534e-05,
"loss": 0.8803,
"step": 119
},
{
"epoch": 0.0256883679858714,
"grad_norm": 0.39176012267746063,
"learning_rate": 3.997540647460421e-05,
"loss": 0.8276,
"step": 120
},
{
"epoch": 0.02590243771908699,
"grad_norm": 0.35847329416041274,
"learning_rate": 3.997472820725982e-05,
"loss": 0.8546,
"step": 121
},
{
"epoch": 0.02611650745230259,
"grad_norm": 0.3973695987050465,
"learning_rate": 3.997404071961745e-05,
"loss": 0.8595,
"step": 122
},
{
"epoch": 0.02633057718551818,
"grad_norm": 0.4179535298926474,
"learning_rate": 3.9973344011994453e-05,
"loss": 0.892,
"step": 123
},
{
"epoch": 0.026544646918733778,
"grad_norm": 0.4482449207513205,
"learning_rate": 3.9972638084712424e-05,
"loss": 0.8723,
"step": 124
},
{
"epoch": 0.02675871665194937,
"grad_norm": 0.47235961115937525,
"learning_rate": 3.997192293809722e-05,
"loss": 0.9035,
"step": 125
},
{
"epoch": 0.026972786385164968,
"grad_norm": 0.49655813852674496,
"learning_rate": 3.997119857247894e-05,
"loss": 0.8758,
"step": 126
},
{
"epoch": 0.02718685611838056,
"grad_norm": 0.48475706708204314,
"learning_rate": 3.9970464988191965e-05,
"loss": 0.8822,
"step": 127
},
{
"epoch": 0.027400925851596158,
"grad_norm": 0.40880945016693543,
"learning_rate": 3.99697221855749e-05,
"loss": 0.8634,
"step": 128
},
{
"epoch": 0.02761499558481175,
"grad_norm": 0.3655103522350681,
"learning_rate": 3.996897016497063e-05,
"loss": 0.9002,
"step": 129
},
{
"epoch": 0.027829065318027348,
"grad_norm": 0.43172410026660996,
"learning_rate": 3.9968208926726296e-05,
"loss": 0.8895,
"step": 130
},
{
"epoch": 0.02804313505124294,
"grad_norm": 0.46427343900301987,
"learning_rate": 3.9967438471193265e-05,
"loss": 0.8669,
"step": 131
},
{
"epoch": 0.028257204784458537,
"grad_norm": 0.5504951668967951,
"learning_rate": 3.99666587987272e-05,
"loss": 0.8596,
"step": 132
},
{
"epoch": 0.028471274517674134,
"grad_norm": 0.650575248512708,
"learning_rate": 3.9965869909687966e-05,
"loss": 0.8193,
"step": 133
},
{
"epoch": 0.028685344250889727,
"grad_norm": 0.7382625130650217,
"learning_rate": 3.996507180443975e-05,
"loss": 0.8905,
"step": 134
},
{
"epoch": 0.028899413984105324,
"grad_norm": 0.7854088207409329,
"learning_rate": 3.996426448335092e-05,
"loss": 0.8695,
"step": 135
},
{
"epoch": 0.029113483717320917,
"grad_norm": 0.7567675030960987,
"learning_rate": 3.996344794679416e-05,
"loss": 0.8604,
"step": 136
},
{
"epoch": 0.029327553450536514,
"grad_norm": 0.726503109872982,
"learning_rate": 3.996262219514637e-05,
"loss": 0.8397,
"step": 137
},
{
"epoch": 0.029541623183752107,
"grad_norm": 0.6381158401869549,
"learning_rate": 3.996178722878872e-05,
"loss": 0.8965,
"step": 138
},
{
"epoch": 0.029755692916967703,
"grad_norm": 0.48767806648620604,
"learning_rate": 3.996094304810663e-05,
"loss": 0.8345,
"step": 139
},
{
"epoch": 0.029969762650183297,
"grad_norm": 0.45817659042886905,
"learning_rate": 3.996008965348976e-05,
"loss": 0.8845,
"step": 140
},
{
"epoch": 0.030183832383398893,
"grad_norm": 0.47155499357081787,
"learning_rate": 3.995922704533205e-05,
"loss": 0.8762,
"step": 141
},
{
"epoch": 0.030397902116614486,
"grad_norm": 0.5133652574437176,
"learning_rate": 3.995835522403167e-05,
"loss": 0.8772,
"step": 142
},
{
"epoch": 0.030611971849830083,
"grad_norm": 0.47623028462087225,
"learning_rate": 3.995747418999105e-05,
"loss": 0.8216,
"step": 143
},
{
"epoch": 0.030826041583045676,
"grad_norm": 0.43088130022426235,
"learning_rate": 3.9956583943616885e-05,
"loss": 0.8725,
"step": 144
},
{
"epoch": 0.031040111316261273,
"grad_norm": 0.4990034504213856,
"learning_rate": 3.9955684485320094e-05,
"loss": 0.8879,
"step": 145
},
{
"epoch": 0.031254181049476866,
"grad_norm": 0.509088423722999,
"learning_rate": 3.9954775815515885e-05,
"loss": 0.8806,
"step": 146
},
{
"epoch": 0.03146825078269246,
"grad_norm": 0.4878196898279725,
"learning_rate": 3.995385793462369e-05,
"loss": 0.8159,
"step": 147
},
{
"epoch": 0.03168232051590806,
"grad_norm": 0.5378457617051259,
"learning_rate": 3.995293084306719e-05,
"loss": 0.854,
"step": 148
},
{
"epoch": 0.03189639024912365,
"grad_norm": 0.4847293019964765,
"learning_rate": 3.9951994541274345e-05,
"loss": 0.8999,
"step": 149
},
{
"epoch": 0.032110459982339246,
"grad_norm": 0.45735207708928377,
"learning_rate": 3.9951049029677336e-05,
"loss": 0.8507,
"step": 150
},
{
"epoch": 0.03232452971555484,
"grad_norm": 0.4015977411530065,
"learning_rate": 3.995009430871262e-05,
"loss": 0.8433,
"step": 151
},
{
"epoch": 0.03253859944877044,
"grad_norm": 0.4659213769968778,
"learning_rate": 3.994913037882089e-05,
"loss": 0.8377,
"step": 152
},
{
"epoch": 0.03275266918198603,
"grad_norm": 0.496543964665041,
"learning_rate": 3.99481572404471e-05,
"loss": 0.8754,
"step": 153
},
{
"epoch": 0.032966738915201625,
"grad_norm": 0.8011256219814623,
"learning_rate": 3.994717489404044e-05,
"loss": 0.8792,
"step": 154
},
{
"epoch": 0.03318080864841722,
"grad_norm": 0.3857661806297209,
"learning_rate": 3.994618334005437e-05,
"loss": 0.8511,
"step": 155
},
{
"epoch": 0.03339487838163282,
"grad_norm": 0.4892529277103356,
"learning_rate": 3.994518257894658e-05,
"loss": 0.856,
"step": 156
},
{
"epoch": 0.033608948114848415,
"grad_norm": 0.5032040745248906,
"learning_rate": 3.994417261117902e-05,
"loss": 0.8869,
"step": 157
},
{
"epoch": 0.033823017848064005,
"grad_norm": 0.4514552548655316,
"learning_rate": 3.9943153437217894e-05,
"loss": 0.867,
"step": 158
},
{
"epoch": 0.0340370875812796,
"grad_norm": 0.4536746086133719,
"learning_rate": 3.994212505753365e-05,
"loss": 0.8517,
"step": 159
},
{
"epoch": 0.0342511573144952,
"grad_norm": 0.4066334374512129,
"learning_rate": 3.994108747260098e-05,
"loss": 0.864,
"step": 160
},
{
"epoch": 0.034465227047710795,
"grad_norm": 0.4055298856419892,
"learning_rate": 3.994004068289884e-05,
"loss": 0.8737,
"step": 161
},
{
"epoch": 0.034679296780926384,
"grad_norm": 2.104779557084848,
"learning_rate": 3.9938984688910424e-05,
"loss": 0.8407,
"step": 162
},
{
"epoch": 0.03489336651414198,
"grad_norm": 0.6346506015412093,
"learning_rate": 3.9937919491123175e-05,
"loss": 0.828,
"step": 163
},
{
"epoch": 0.03510743624735758,
"grad_norm": 0.5957786882455672,
"learning_rate": 3.9936845090028784e-05,
"loss": 0.8925,
"step": 164
},
{
"epoch": 0.035321505980573174,
"grad_norm": 0.5329421128953273,
"learning_rate": 3.9935761486123204e-05,
"loss": 0.8558,
"step": 165
},
{
"epoch": 0.035535575713788764,
"grad_norm": 0.5299129923584438,
"learning_rate": 3.9934668679906606e-05,
"loss": 0.8133,
"step": 166
},
{
"epoch": 0.03574964544700436,
"grad_norm": 0.5218280672821705,
"learning_rate": 3.9933566671883434e-05,
"loss": 0.8681,
"step": 167
},
{
"epoch": 0.03596371518021996,
"grad_norm": 0.550560457642017,
"learning_rate": 3.993245546256239e-05,
"loss": 0.8476,
"step": 168
},
{
"epoch": 0.036177784913435554,
"grad_norm": 0.4621680061241431,
"learning_rate": 3.993133505245638e-05,
"loss": 0.8354,
"step": 169
},
{
"epoch": 0.036391854646651144,
"grad_norm": 0.6373155217327401,
"learning_rate": 3.9930205442082595e-05,
"loss": 0.8599,
"step": 170
},
{
"epoch": 0.03660592437986674,
"grad_norm": 0.5203859749008951,
"learning_rate": 3.992906663196247e-05,
"loss": 0.8332,
"step": 171
},
{
"epoch": 0.03681999411308234,
"grad_norm": 0.4363094179995528,
"learning_rate": 3.992791862262166e-05,
"loss": 0.86,
"step": 172
},
{
"epoch": 0.037034063846297934,
"grad_norm": 0.42070978411572774,
"learning_rate": 3.992676141459011e-05,
"loss": 0.8536,
"step": 173
},
{
"epoch": 0.03724813357951352,
"grad_norm": 0.45121027670694946,
"learning_rate": 3.992559500840195e-05,
"loss": 0.8453,
"step": 174
},
{
"epoch": 0.03746220331272912,
"grad_norm": 0.4341733950661765,
"learning_rate": 3.992441940459561e-05,
"loss": 0.8563,
"step": 175
},
{
"epoch": 0.037676273045944716,
"grad_norm": 0.47227901314281506,
"learning_rate": 3.992323460371376e-05,
"loss": 0.8721,
"step": 176
},
{
"epoch": 0.03789034277916031,
"grad_norm": 0.48029119556045824,
"learning_rate": 3.992204060630328e-05,
"loss": 0.8403,
"step": 177
},
{
"epoch": 0.03810441251237591,
"grad_norm": 0.4303295879743548,
"learning_rate": 3.992083741291533e-05,
"loss": 0.8446,
"step": 178
},
{
"epoch": 0.0383184822455915,
"grad_norm": 0.3874629842751901,
"learning_rate": 3.991962502410529e-05,
"loss": 0.904,
"step": 179
},
{
"epoch": 0.038532551978807096,
"grad_norm": 0.3858328343883844,
"learning_rate": 3.99184034404328e-05,
"loss": 0.832,
"step": 180
},
{
"epoch": 0.03874662171202269,
"grad_norm": 0.4411864662637025,
"learning_rate": 3.991717266246175e-05,
"loss": 0.841,
"step": 181
},
{
"epoch": 0.03896069144523829,
"grad_norm": 0.4291021290425202,
"learning_rate": 3.991593269076026e-05,
"loss": 0.8698,
"step": 182
},
{
"epoch": 0.03917476117845388,
"grad_norm": 0.3683048851246173,
"learning_rate": 3.991468352590069e-05,
"loss": 0.8542,
"step": 183
},
{
"epoch": 0.039388830911669476,
"grad_norm": 0.3398453757458759,
"learning_rate": 3.9913425168459666e-05,
"loss": 0.8906,
"step": 184
},
{
"epoch": 0.03960290064488507,
"grad_norm": 0.38735565990380716,
"learning_rate": 3.991215761901804e-05,
"loss": 0.8205,
"step": 185
},
{
"epoch": 0.03981697037810067,
"grad_norm": 0.38136761019907073,
"learning_rate": 3.99108808781609e-05,
"loss": 0.8542,
"step": 186
},
{
"epoch": 0.04003104011131626,
"grad_norm": 0.43905108617330535,
"learning_rate": 3.99095949464776e-05,
"loss": 0.8698,
"step": 187
},
{
"epoch": 0.040245109844531855,
"grad_norm": 0.4061491303471741,
"learning_rate": 3.990829982456172e-05,
"loss": 0.8415,
"step": 188
},
{
"epoch": 0.04045917957774745,
"grad_norm": 0.41124646823204997,
"learning_rate": 3.9906995513011084e-05,
"loss": 0.8895,
"step": 189
},
{
"epoch": 0.04067324931096305,
"grad_norm": 0.3334094320198046,
"learning_rate": 3.990568201242775e-05,
"loss": 0.8292,
"step": 190
},
{
"epoch": 0.04088731904417864,
"grad_norm": 0.3844590336608152,
"learning_rate": 3.9904359323418055e-05,
"loss": 0.8981,
"step": 191
},
{
"epoch": 0.041101388777394235,
"grad_norm": 0.4031068929590705,
"learning_rate": 3.990302744659252e-05,
"loss": 0.8412,
"step": 192
},
{
"epoch": 0.04131545851060983,
"grad_norm": 0.36692441593346126,
"learning_rate": 3.9901686382565954e-05,
"loss": 0.8415,
"step": 193
},
{
"epoch": 0.04152952824382543,
"grad_norm": 0.2831303025455792,
"learning_rate": 3.9900336131957386e-05,
"loss": 0.8312,
"step": 194
},
{
"epoch": 0.04174359797704102,
"grad_norm": 0.366415168679558,
"learning_rate": 3.989897669539009e-05,
"loss": 0.8522,
"step": 195
},
{
"epoch": 0.041957667710256615,
"grad_norm": 0.3943598843733109,
"learning_rate": 3.989760807349157e-05,
"loss": 0.853,
"step": 196
},
{
"epoch": 0.04217173744347221,
"grad_norm": 0.3474071769471279,
"learning_rate": 3.989623026689359e-05,
"loss": 0.8656,
"step": 197
},
{
"epoch": 0.04238580717668781,
"grad_norm": 0.31608169403589165,
"learning_rate": 3.989484327623215e-05,
"loss": 0.8117,
"step": 198
},
{
"epoch": 0.042599876909903404,
"grad_norm": 0.37963562176126103,
"learning_rate": 3.9893447102147466e-05,
"loss": 0.8231,
"step": 199
},
{
"epoch": 0.042813946643118994,
"grad_norm": 0.40816728349425735,
"learning_rate": 3.989204174528402e-05,
"loss": 0.8681,
"step": 200
},
{
"epoch": 0.04302801637633459,
"grad_norm": 0.4747247682443595,
"learning_rate": 3.9890627206290505e-05,
"loss": 0.836,
"step": 201
},
{
"epoch": 0.04324208610955019,
"grad_norm": 0.49508739667588336,
"learning_rate": 3.988920348581989e-05,
"loss": 0.8707,
"step": 202
},
{
"epoch": 0.043456155842765784,
"grad_norm": 0.49791535308207097,
"learning_rate": 3.988777058452936e-05,
"loss": 0.8198,
"step": 203
},
{
"epoch": 0.043670225575981374,
"grad_norm": 0.34082252492079196,
"learning_rate": 3.988632850308033e-05,
"loss": 0.8037,
"step": 204
},
{
"epoch": 0.04388429530919697,
"grad_norm": 0.3294186089388978,
"learning_rate": 3.988487724213847e-05,
"loss": 0.8362,
"step": 205
},
{
"epoch": 0.04409836504241257,
"grad_norm": 0.35026743920813685,
"learning_rate": 3.988341680237367e-05,
"loss": 0.8548,
"step": 206
},
{
"epoch": 0.044312434775628164,
"grad_norm": 0.3687532517516464,
"learning_rate": 3.9881947184460076e-05,
"loss": 0.8676,
"step": 207
},
{
"epoch": 0.04452650450884375,
"grad_norm": 0.32475090803242124,
"learning_rate": 3.988046838907606e-05,
"loss": 0.8353,
"step": 208
},
{
"epoch": 0.04474057424205935,
"grad_norm": 0.29199110503381154,
"learning_rate": 3.9878980416904224e-05,
"loss": 0.8643,
"step": 209
},
{
"epoch": 0.04495464397527495,
"grad_norm": 0.4389870509714204,
"learning_rate": 3.987748326863141e-05,
"loss": 0.826,
"step": 210
},
{
"epoch": 0.04516871370849054,
"grad_norm": 0.36760355389096555,
"learning_rate": 3.987597694494872e-05,
"loss": 0.8298,
"step": 211
},
{
"epoch": 0.04538278344170613,
"grad_norm": 0.33074126382300445,
"learning_rate": 3.9874461446551446e-05,
"loss": 0.8178,
"step": 212
},
{
"epoch": 0.04559685317492173,
"grad_norm": 0.3643273559583193,
"learning_rate": 3.9872936774139156e-05,
"loss": 0.8111,
"step": 213
},
{
"epoch": 0.045810922908137326,
"grad_norm": 0.41777917894694583,
"learning_rate": 3.987140292841563e-05,
"loss": 0.8217,
"step": 214
},
{
"epoch": 0.04602499264135292,
"grad_norm": 0.36907755400467723,
"learning_rate": 3.986985991008888e-05,
"loss": 0.821,
"step": 215
},
{
"epoch": 0.04623906237456851,
"grad_norm": 0.4232893993657819,
"learning_rate": 3.986830771987118e-05,
"loss": 0.8158,
"step": 216
},
{
"epoch": 0.04645313210778411,
"grad_norm": 0.4239041644027544,
"learning_rate": 3.9866746358479e-05,
"loss": 0.8421,
"step": 217
},
{
"epoch": 0.046667201840999706,
"grad_norm": 0.40093565968948486,
"learning_rate": 3.986517582663307e-05,
"loss": 0.8159,
"step": 218
},
{
"epoch": 0.0468812715742153,
"grad_norm": 0.4106319864596722,
"learning_rate": 3.986359612505835e-05,
"loss": 0.8368,
"step": 219
},
{
"epoch": 0.0470953413074309,
"grad_norm": 0.4223283176069799,
"learning_rate": 3.9862007254484006e-05,
"loss": 0.8199,
"step": 220
},
{
"epoch": 0.04730941104064649,
"grad_norm": 0.37936290326812794,
"learning_rate": 3.986040921564349e-05,
"loss": 0.838,
"step": 221
},
{
"epoch": 0.047523480773862085,
"grad_norm": 0.35203034317166726,
"learning_rate": 3.985880200927442e-05,
"loss": 0.8538,
"step": 222
},
{
"epoch": 0.04773755050707768,
"grad_norm": 0.3858112167023,
"learning_rate": 3.98571856361187e-05,
"loss": 0.8241,
"step": 223
},
{
"epoch": 0.04795162024029328,
"grad_norm": 0.42734155787690564,
"learning_rate": 3.9855560096922445e-05,
"loss": 0.8149,
"step": 224
},
{
"epoch": 0.04816568997350887,
"grad_norm": 0.4914044224155271,
"learning_rate": 3.985392539243599e-05,
"loss": 0.8224,
"step": 225
},
{
"epoch": 0.048379759706724465,
"grad_norm": 0.5686722761141866,
"learning_rate": 3.9852281523413926e-05,
"loss": 0.8315,
"step": 226
},
{
"epoch": 0.04859382943994006,
"grad_norm": 0.4801795715672078,
"learning_rate": 3.9850628490615047e-05,
"loss": 0.8342,
"step": 227
},
{
"epoch": 0.04880789917315566,
"grad_norm": 0.39210217708523265,
"learning_rate": 3.9848966294802395e-05,
"loss": 0.8082,
"step": 228
},
{
"epoch": 0.04902196890637125,
"grad_norm": 0.3469412230066025,
"learning_rate": 3.9847294936743234e-05,
"loss": 0.7959,
"step": 229
},
{
"epoch": 0.049236038639586845,
"grad_norm": 0.3796726457058207,
"learning_rate": 3.984561441720907e-05,
"loss": 0.8481,
"step": 230
},
{
"epoch": 0.04945010837280244,
"grad_norm": 0.49537905000215987,
"learning_rate": 3.984392473697561e-05,
"loss": 0.8591,
"step": 231
},
{
"epoch": 0.04966417810601804,
"grad_norm": 0.4251626976992016,
"learning_rate": 3.984222589682282e-05,
"loss": 0.8062,
"step": 232
},
{
"epoch": 0.04987824783923363,
"grad_norm": 0.4025171048830565,
"learning_rate": 3.984051789753488e-05,
"loss": 0.8282,
"step": 233
},
{
"epoch": 0.050092317572449224,
"grad_norm": 0.36901048716118134,
"learning_rate": 3.98388007399002e-05,
"loss": 0.824,
"step": 234
},
{
"epoch": 0.05030638730566482,
"grad_norm": 0.3534585695479168,
"learning_rate": 3.983707442471141e-05,
"loss": 0.9405,
"step": 235
},
{
"epoch": 0.05052045703888042,
"grad_norm": 0.4027690525572557,
"learning_rate": 3.983533895276538e-05,
"loss": 0.8181,
"step": 236
},
{
"epoch": 0.05073452677209601,
"grad_norm": 0.4408889835415928,
"learning_rate": 3.98335943248632e-05,
"loss": 0.8307,
"step": 237
},
{
"epoch": 0.050948596505311604,
"grad_norm": 0.35131499744012107,
"learning_rate": 3.983184054181019e-05,
"loss": 0.8083,
"step": 238
},
{
"epoch": 0.0511626662385272,
"grad_norm": 0.32597257746183034,
"learning_rate": 3.983007760441589e-05,
"loss": 0.8272,
"step": 239
},
{
"epoch": 0.0513767359717428,
"grad_norm": 0.39655163014973094,
"learning_rate": 3.9828305513494066e-05,
"loss": 0.8326,
"step": 240
},
{
"epoch": 0.05159080570495839,
"grad_norm": 0.4535156910719396,
"learning_rate": 3.982652426986271e-05,
"loss": 0.8536,
"step": 241
},
{
"epoch": 0.05180487543817398,
"grad_norm": 0.3907601010611339,
"learning_rate": 3.982473387434404e-05,
"loss": 0.8414,
"step": 242
},
{
"epoch": 0.05201894517138958,
"grad_norm": 0.39771764683531524,
"learning_rate": 3.9822934327764516e-05,
"loss": 0.8218,
"step": 243
},
{
"epoch": 0.05223301490460518,
"grad_norm": 0.37328579168951637,
"learning_rate": 3.98211256309548e-05,
"loss": 0.8291,
"step": 244
},
{
"epoch": 0.05244708463782077,
"grad_norm": 0.3633500648158138,
"learning_rate": 3.981930778474976e-05,
"loss": 0.8349,
"step": 245
},
{
"epoch": 0.05266115437103636,
"grad_norm": 0.3702781367598446,
"learning_rate": 3.981748078998854e-05,
"loss": 0.8151,
"step": 246
},
{
"epoch": 0.05287522410425196,
"grad_norm": 0.3602753970348582,
"learning_rate": 3.981564464751445e-05,
"loss": 0.8287,
"step": 247
},
{
"epoch": 0.053089293837467556,
"grad_norm": 0.4071160336422372,
"learning_rate": 3.981379935817508e-05,
"loss": 0.82,
"step": 248
},
{
"epoch": 0.05330336357068315,
"grad_norm": 0.35917590351006656,
"learning_rate": 3.981194492282219e-05,
"loss": 0.831,
"step": 249
},
{
"epoch": 0.05351743330389874,
"grad_norm": 0.32204623832014,
"learning_rate": 3.9810081342311786e-05,
"loss": 0.8394,
"step": 250
},
{
"epoch": 0.05373150303711434,
"grad_norm": 0.3483119661055623,
"learning_rate": 3.9808208617504106e-05,
"loss": 0.8674,
"step": 251
},
{
"epoch": 0.053945572770329936,
"grad_norm": 0.4360437668310791,
"learning_rate": 3.980632674926358e-05,
"loss": 0.8223,
"step": 252
},
{
"epoch": 0.05415964250354553,
"grad_norm": 0.4641704582383462,
"learning_rate": 3.980443573845889e-05,
"loss": 0.8015,
"step": 253
},
{
"epoch": 0.05437371223676112,
"grad_norm": 0.4696957121239442,
"learning_rate": 3.980253558596292e-05,
"loss": 0.8346,
"step": 254
},
{
"epoch": 0.05458778196997672,
"grad_norm": 0.3737882880960115,
"learning_rate": 3.980062629265277e-05,
"loss": 0.8209,
"step": 255
},
{
"epoch": 0.054801851703192316,
"grad_norm": 0.2975144847988321,
"learning_rate": 3.9798707859409774e-05,
"loss": 0.8238,
"step": 256
},
{
"epoch": 0.05501592143640791,
"grad_norm": 0.3667298421919832,
"learning_rate": 3.9796780287119466e-05,
"loss": 0.8354,
"step": 257
},
{
"epoch": 0.0552299911696235,
"grad_norm": 0.40522416701915287,
"learning_rate": 3.9794843576671616e-05,
"loss": 0.8178,
"step": 258
},
{
"epoch": 0.0554440609028391,
"grad_norm": 1.309590427072815,
"learning_rate": 3.979289772896021e-05,
"loss": 0.8378,
"step": 259
},
{
"epoch": 0.055658130636054695,
"grad_norm": 0.42641461789020774,
"learning_rate": 3.9790942744883444e-05,
"loss": 0.811,
"step": 260
},
{
"epoch": 0.05587220036927029,
"grad_norm": 0.563183288710883,
"learning_rate": 3.978897862534374e-05,
"loss": 0.8427,
"step": 261
},
{
"epoch": 0.05608627010248588,
"grad_norm": 0.6529241510079115,
"learning_rate": 3.978700537124772e-05,
"loss": 0.8414,
"step": 262
},
{
"epoch": 0.05630033983570148,
"grad_norm": 0.5458687208648185,
"learning_rate": 3.978502298350625e-05,
"loss": 0.8278,
"step": 263
},
{
"epoch": 0.056514409568917075,
"grad_norm": 0.4534677439535358,
"learning_rate": 3.978303146303438e-05,
"loss": 0.8515,
"step": 264
},
{
"epoch": 0.05672847930213267,
"grad_norm": 0.7640035334309163,
"learning_rate": 3.978103081075141e-05,
"loss": 0.7841,
"step": 265
},
{
"epoch": 0.05694254903534827,
"grad_norm": 0.7697304707149756,
"learning_rate": 3.9779021027580827e-05,
"loss": 0.8562,
"step": 266
},
{
"epoch": 0.05715661876856386,
"grad_norm": 0.5582322749059998,
"learning_rate": 3.977700211445034e-05,
"loss": 0.8212,
"step": 267
},
{
"epoch": 0.057370688501779454,
"grad_norm": 0.5278221215123791,
"learning_rate": 3.9774974072291884e-05,
"loss": 0.8213,
"step": 268
},
{
"epoch": 0.05758475823499505,
"grad_norm": 0.41272355095007723,
"learning_rate": 3.977293690204159e-05,
"loss": 0.7884,
"step": 269
},
{
"epoch": 0.05779882796821065,
"grad_norm": 0.38048044038534395,
"learning_rate": 3.977089060463982e-05,
"loss": 0.8024,
"step": 270
},
{
"epoch": 0.05801289770142624,
"grad_norm": 0.4995866625895206,
"learning_rate": 3.976883518103115e-05,
"loss": 0.7964,
"step": 271
},
{
"epoch": 0.058226967434641834,
"grad_norm": 0.4089655275491415,
"learning_rate": 3.9766770632164336e-05,
"loss": 0.781,
"step": 272
},
{
"epoch": 0.05844103716785743,
"grad_norm": 0.373373102001395,
"learning_rate": 3.976469695899238e-05,
"loss": 0.7916,
"step": 273
},
{
"epoch": 0.05865510690107303,
"grad_norm": 0.3804694016639783,
"learning_rate": 3.9762614162472496e-05,
"loss": 0.7615,
"step": 274
},
{
"epoch": 0.05886917663428862,
"grad_norm": 0.37938721740023695,
"learning_rate": 3.976052224356609e-05,
"loss": 0.8109,
"step": 275
},
{
"epoch": 0.059083246367504214,
"grad_norm": 0.40528840545316336,
"learning_rate": 3.975842120323879e-05,
"loss": 0.8283,
"step": 276
},
{
"epoch": 0.05929731610071981,
"grad_norm": 0.3859803919733042,
"learning_rate": 3.9756311042460434e-05,
"loss": 0.8038,
"step": 277
},
{
"epoch": 0.05951138583393541,
"grad_norm": 0.3330269112958392,
"learning_rate": 3.975419176220506e-05,
"loss": 0.8686,
"step": 278
},
{
"epoch": 0.059725455567150997,
"grad_norm": 0.30938332914596234,
"learning_rate": 3.9752063363450935e-05,
"loss": 0.8186,
"step": 279
},
{
"epoch": 0.05993952530036659,
"grad_norm": 0.6301295625426127,
"learning_rate": 3.974992584718051e-05,
"loss": 0.8481,
"step": 280
},
{
"epoch": 0.06015359503358219,
"grad_norm": 0.4359404599621847,
"learning_rate": 3.974777921438048e-05,
"loss": 0.8328,
"step": 281
},
{
"epoch": 0.060367664766797786,
"grad_norm": 0.474513946124991,
"learning_rate": 3.974562346604171e-05,
"loss": 0.8206,
"step": 282
},
{
"epoch": 0.060581734500013376,
"grad_norm": 0.5311517647733177,
"learning_rate": 3.9743458603159295e-05,
"loss": 0.8154,
"step": 283
},
{
"epoch": 0.06079580423322897,
"grad_norm": 0.4714557871254846,
"learning_rate": 3.974128462673253e-05,
"loss": 0.8523,
"step": 284
},
{
"epoch": 0.06100987396644457,
"grad_norm": 0.3535252356315764,
"learning_rate": 3.973910153776492e-05,
"loss": 0.84,
"step": 285
},
{
"epoch": 0.061223943699660166,
"grad_norm": 0.39662502883369144,
"learning_rate": 3.9736909337264166e-05,
"loss": 0.8414,
"step": 286
},
{
"epoch": 0.06143801343287576,
"grad_norm": 0.42934998708894967,
"learning_rate": 3.97347080262422e-05,
"loss": 0.8042,
"step": 287
},
{
"epoch": 0.06165208316609135,
"grad_norm": 0.4601344074880732,
"learning_rate": 3.9732497605715136e-05,
"loss": 0.8316,
"step": 288
},
{
"epoch": 0.06186615289930695,
"grad_norm": 0.4355363882508308,
"learning_rate": 3.9730278076703293e-05,
"loss": 0.8386,
"step": 289
},
{
"epoch": 0.062080222632522546,
"grad_norm": 0.40532871292062256,
"learning_rate": 3.9728049440231216e-05,
"loss": 0.815,
"step": 290
},
{
"epoch": 0.06229429236573814,
"grad_norm": 0.3613108221158915,
"learning_rate": 3.972581169732762e-05,
"loss": 0.7949,
"step": 291
},
{
"epoch": 0.06250836209895373,
"grad_norm": 0.4526241087199315,
"learning_rate": 3.972356484902546e-05,
"loss": 0.8251,
"step": 292
},
{
"epoch": 0.06272243183216933,
"grad_norm": 0.4780714113750351,
"learning_rate": 3.972130889636187e-05,
"loss": 0.8441,
"step": 293
},
{
"epoch": 0.06293650156538493,
"grad_norm": 0.4004084776021443,
"learning_rate": 3.97190438403782e-05,
"loss": 0.8188,
"step": 294
},
{
"epoch": 0.06315057129860052,
"grad_norm": 0.3604097941785592,
"learning_rate": 3.971676968211998e-05,
"loss": 0.8404,
"step": 295
},
{
"epoch": 0.06336464103181612,
"grad_norm": 0.3982334974950628,
"learning_rate": 3.971448642263697e-05,
"loss": 0.8249,
"step": 296
},
{
"epoch": 0.06357871076503172,
"grad_norm": 0.47301721418460696,
"learning_rate": 3.971219406298312e-05,
"loss": 0.8195,
"step": 297
},
{
"epoch": 0.0637927804982473,
"grad_norm": 0.3805707811072661,
"learning_rate": 3.9709892604216576e-05,
"loss": 0.8268,
"step": 298
},
{
"epoch": 0.0640068502314629,
"grad_norm": 0.4056773003140015,
"learning_rate": 3.970758204739968e-05,
"loss": 0.8127,
"step": 299
},
{
"epoch": 0.06422091996467849,
"grad_norm": 0.4032926251226971,
"learning_rate": 3.9705262393598996e-05,
"loss": 0.8351,
"step": 300
},
{
"epoch": 0.06443498969789409,
"grad_norm": 0.40841832469995953,
"learning_rate": 3.970293364388526e-05,
"loss": 0.7682,
"step": 301
},
{
"epoch": 0.06464905943110968,
"grad_norm": 0.4415611740074898,
"learning_rate": 3.970059579933342e-05,
"loss": 0.801,
"step": 302
},
{
"epoch": 0.06486312916432528,
"grad_norm": 0.3722992901576134,
"learning_rate": 3.969824886102262e-05,
"loss": 0.8077,
"step": 303
},
{
"epoch": 0.06507719889754088,
"grad_norm": 0.3257654805382487,
"learning_rate": 3.969589283003621e-05,
"loss": 0.8045,
"step": 304
},
{
"epoch": 0.06529126863075647,
"grad_norm": 0.34518855917033997,
"learning_rate": 3.969352770746173e-05,
"loss": 0.8056,
"step": 305
},
{
"epoch": 0.06550533836397206,
"grad_norm": 0.3651226904229758,
"learning_rate": 3.96911534943909e-05,
"loss": 0.8487,
"step": 306
},
{
"epoch": 0.06571940809718765,
"grad_norm": 0.39292873911020426,
"learning_rate": 3.9688770191919665e-05,
"loss": 0.8159,
"step": 307
},
{
"epoch": 0.06593347783040325,
"grad_norm": 0.33420897271201755,
"learning_rate": 3.968637780114815e-05,
"loss": 0.8183,
"step": 308
},
{
"epoch": 0.06614754756361885,
"grad_norm": 0.33910887310227855,
"learning_rate": 3.968397632318068e-05,
"loss": 0.8023,
"step": 309
},
{
"epoch": 0.06636161729683444,
"grad_norm": 0.4242209288953011,
"learning_rate": 3.9681565759125775e-05,
"loss": 0.817,
"step": 310
},
{
"epoch": 0.06657568703005004,
"grad_norm": 0.4205864487007387,
"learning_rate": 3.967914611009614e-05,
"loss": 0.8008,
"step": 311
},
{
"epoch": 0.06678975676326564,
"grad_norm": 0.38211559167478176,
"learning_rate": 3.967671737720869e-05,
"loss": 0.8234,
"step": 312
},
{
"epoch": 0.06700382649648123,
"grad_norm": 0.30498176756349277,
"learning_rate": 3.9674279561584514e-05,
"loss": 0.8099,
"step": 313
},
{
"epoch": 0.06721789622969683,
"grad_norm": 0.3177472315752944,
"learning_rate": 3.967183266434891e-05,
"loss": 0.8241,
"step": 314
},
{
"epoch": 0.06743196596291241,
"grad_norm": 0.3991469669718329,
"learning_rate": 3.966937668663136e-05,
"loss": 0.8269,
"step": 315
},
{
"epoch": 0.06764603569612801,
"grad_norm": 0.3836837245424652,
"learning_rate": 3.9666911629565534e-05,
"loss": 0.8051,
"step": 316
},
{
"epoch": 0.0678601054293436,
"grad_norm": 0.33706670379619297,
"learning_rate": 3.966443749428931e-05,
"loss": 0.8179,
"step": 317
},
{
"epoch": 0.0680741751625592,
"grad_norm": 0.3381706480607538,
"learning_rate": 3.966195428194472e-05,
"loss": 0.8051,
"step": 318
},
{
"epoch": 0.0682882448957748,
"grad_norm": 0.33121589164954485,
"learning_rate": 3.965946199367804e-05,
"loss": 0.8183,
"step": 319
},
{
"epoch": 0.0685023146289904,
"grad_norm": 0.32660295615566726,
"learning_rate": 3.9656960630639686e-05,
"loss": 0.8168,
"step": 320
},
{
"epoch": 0.06871638436220599,
"grad_norm": 0.331583354100291,
"learning_rate": 3.965445019398429e-05,
"loss": 0.8055,
"step": 321
},
{
"epoch": 0.06893045409542159,
"grad_norm": 0.3266278746059167,
"learning_rate": 3.9651930684870666e-05,
"loss": 0.8269,
"step": 322
},
{
"epoch": 0.06914452382863717,
"grad_norm": 0.47908107351770507,
"learning_rate": 3.96494021044618e-05,
"loss": 0.8003,
"step": 323
},
{
"epoch": 0.06935859356185277,
"grad_norm": 0.3530060012652866,
"learning_rate": 3.9646864453924905e-05,
"loss": 0.8131,
"step": 324
},
{
"epoch": 0.06957266329506837,
"grad_norm": 0.3295787877177966,
"learning_rate": 3.9644317734431344e-05,
"loss": 0.8097,
"step": 325
},
{
"epoch": 0.06978673302828396,
"grad_norm": 0.3517571963617062,
"learning_rate": 3.964176194715667e-05,
"loss": 0.8061,
"step": 326
},
{
"epoch": 0.07000080276149956,
"grad_norm": 0.3362094473757138,
"learning_rate": 3.963919709328064e-05,
"loss": 0.7805,
"step": 327
},
{
"epoch": 0.07021487249471516,
"grad_norm": 0.32807044312133443,
"learning_rate": 3.9636623173987176e-05,
"loss": 0.8123,
"step": 328
},
{
"epoch": 0.07042894222793075,
"grad_norm": 0.3444530460484444,
"learning_rate": 3.963404019046441e-05,
"loss": 0.8152,
"step": 329
},
{
"epoch": 0.07064301196114635,
"grad_norm": 0.3442453072545327,
"learning_rate": 3.963144814390463e-05,
"loss": 0.8282,
"step": 330
},
{
"epoch": 0.07085708169436195,
"grad_norm": 0.3128444035140549,
"learning_rate": 3.9628847035504326e-05,
"loss": 0.8065,
"step": 331
},
{
"epoch": 0.07107115142757753,
"grad_norm": 0.31629960274605623,
"learning_rate": 3.962623686646416e-05,
"loss": 0.8214,
"step": 332
},
{
"epoch": 0.07128522116079312,
"grad_norm": 0.3443646437494624,
"learning_rate": 3.962361763798899e-05,
"loss": 0.8394,
"step": 333
},
{
"epoch": 0.07149929089400872,
"grad_norm": 0.4808659192640893,
"learning_rate": 3.962098935128783e-05,
"loss": 0.8375,
"step": 334
},
{
"epoch": 0.07171336062722432,
"grad_norm": 0.32973106515477585,
"learning_rate": 3.9618352007573906e-05,
"loss": 0.7917,
"step": 335
},
{
"epoch": 0.07192743036043991,
"grad_norm": 0.34663939104067665,
"learning_rate": 3.961570560806461e-05,
"loss": 0.7989,
"step": 336
},
{
"epoch": 0.07214150009365551,
"grad_norm": 0.3151206600229396,
"learning_rate": 3.9613050153981515e-05,
"loss": 0.8217,
"step": 337
},
{
"epoch": 0.07235556982687111,
"grad_norm": 0.3284998110054707,
"learning_rate": 3.9610385646550374e-05,
"loss": 0.8002,
"step": 338
},
{
"epoch": 0.0725696395600867,
"grad_norm": 0.3316066108268066,
"learning_rate": 3.960771208700111e-05,
"loss": 0.8179,
"step": 339
},
{
"epoch": 0.07278370929330229,
"grad_norm": 0.3107803293102406,
"learning_rate": 3.9605029476567845e-05,
"loss": 0.7983,
"step": 340
},
{
"epoch": 0.07299777902651788,
"grad_norm": 0.3167494965705026,
"learning_rate": 3.960233781648886e-05,
"loss": 0.8023,
"step": 341
},
{
"epoch": 0.07321184875973348,
"grad_norm": 0.28869946757029363,
"learning_rate": 3.959963710800662e-05,
"loss": 0.8063,
"step": 342
},
{
"epoch": 0.07342591849294908,
"grad_norm": 0.3356343455698333,
"learning_rate": 3.9596927352367774e-05,
"loss": 0.8586,
"step": 343
},
{
"epoch": 0.07363998822616467,
"grad_norm": 0.2698423586183263,
"learning_rate": 3.959420855082314e-05,
"loss": 0.7954,
"step": 344
},
{
"epoch": 0.07385405795938027,
"grad_norm": 0.2797285230172307,
"learning_rate": 3.9591480704627695e-05,
"loss": 0.8249,
"step": 345
},
{
"epoch": 0.07406812769259587,
"grad_norm": 0.2872525953074487,
"learning_rate": 3.958874381504063e-05,
"loss": 0.8093,
"step": 346
},
{
"epoch": 0.07428219742581146,
"grad_norm": 0.2794424585889053,
"learning_rate": 3.9585997883325275e-05,
"loss": 0.8004,
"step": 347
},
{
"epoch": 0.07449626715902705,
"grad_norm": 0.28923703879391893,
"learning_rate": 3.958324291074915e-05,
"loss": 0.8063,
"step": 348
},
{
"epoch": 0.07471033689224264,
"grad_norm": 0.28718264474262556,
"learning_rate": 3.9580478898583946e-05,
"loss": 0.8338,
"step": 349
},
{
"epoch": 0.07492440662545824,
"grad_norm": 0.3098163432395999,
"learning_rate": 3.9577705848105534e-05,
"loss": 0.809,
"step": 350
},
{
"epoch": 0.07513847635867384,
"grad_norm": 0.31522990371085624,
"learning_rate": 3.957492376059393e-05,
"loss": 0.8154,
"step": 351
},
{
"epoch": 0.07535254609188943,
"grad_norm": 0.28338087888379176,
"learning_rate": 3.9572132637333354e-05,
"loss": 0.818,
"step": 352
},
{
"epoch": 0.07556661582510503,
"grad_norm": 0.27413238491020875,
"learning_rate": 3.956933247961218e-05,
"loss": 0.789,
"step": 353
},
{
"epoch": 0.07578068555832063,
"grad_norm": 0.29472918038836227,
"learning_rate": 3.956652328872296e-05,
"loss": 0.8045,
"step": 354
},
{
"epoch": 0.07599475529153622,
"grad_norm": 0.32822208650805884,
"learning_rate": 3.956370506596241e-05,
"loss": 0.7943,
"step": 355
},
{
"epoch": 0.07620882502475182,
"grad_norm": 0.3654921111096867,
"learning_rate": 3.956087781263141e-05,
"loss": 0.8134,
"step": 356
},
{
"epoch": 0.0764228947579674,
"grad_norm": 0.3305771615214981,
"learning_rate": 3.955804153003502e-05,
"loss": 0.7889,
"step": 357
},
{
"epoch": 0.076636964491183,
"grad_norm": 0.3186337070746346,
"learning_rate": 3.9555196219482465e-05,
"loss": 0.7702,
"step": 358
},
{
"epoch": 0.0768510342243986,
"grad_norm": 0.315898247232711,
"learning_rate": 3.9552341882287126e-05,
"loss": 0.7864,
"step": 359
},
{
"epoch": 0.07706510395761419,
"grad_norm": 0.2836453847637815,
"learning_rate": 3.9549478519766574e-05,
"loss": 0.7744,
"step": 360
},
{
"epoch": 0.07727917369082979,
"grad_norm": 0.30499627108597266,
"learning_rate": 3.954660613324252e-05,
"loss": 0.8501,
"step": 361
},
{
"epoch": 0.07749324342404539,
"grad_norm": 0.3538275134350595,
"learning_rate": 3.9543724724040854e-05,
"loss": 0.8076,
"step": 362
},
{
"epoch": 0.07770731315726098,
"grad_norm": 0.30000054335125104,
"learning_rate": 3.9540834293491636e-05,
"loss": 0.8131,
"step": 363
},
{
"epoch": 0.07792138289047658,
"grad_norm": 0.2931599936516347,
"learning_rate": 3.953793484292908e-05,
"loss": 0.7891,
"step": 364
},
{
"epoch": 0.07813545262369216,
"grad_norm": 0.29320389703691146,
"learning_rate": 3.9535026373691554e-05,
"loss": 0.8171,
"step": 365
},
{
"epoch": 0.07834952235690776,
"grad_norm": 0.28709837757977674,
"learning_rate": 3.953210888712162e-05,
"loss": 0.8229,
"step": 366
},
{
"epoch": 0.07856359209012335,
"grad_norm": 0.29183743422406655,
"learning_rate": 3.952918238456599e-05,
"loss": 0.785,
"step": 367
},
{
"epoch": 0.07877766182333895,
"grad_norm": 0.3045123243298051,
"learning_rate": 3.952624686737551e-05,
"loss": 0.8198,
"step": 368
},
{
"epoch": 0.07899173155655455,
"grad_norm": 0.30356376258153456,
"learning_rate": 3.952330233690522e-05,
"loss": 0.8174,
"step": 369
},
{
"epoch": 0.07920580128977014,
"grad_norm": 0.3057393338323986,
"learning_rate": 3.9520348794514316e-05,
"loss": 0.8337,
"step": 370
},
{
"epoch": 0.07941987102298574,
"grad_norm": 0.3136540585600542,
"learning_rate": 3.951738624156614e-05,
"loss": 0.772,
"step": 371
},
{
"epoch": 0.07963394075620134,
"grad_norm": 0.308298984980087,
"learning_rate": 3.95144146794282e-05,
"loss": 0.8192,
"step": 372
},
{
"epoch": 0.07984801048941692,
"grad_norm": 0.29718555367357496,
"learning_rate": 3.9511434109472173e-05,
"loss": 0.8334,
"step": 373
},
{
"epoch": 0.08006208022263252,
"grad_norm": 0.2912289078920853,
"learning_rate": 3.950844453307387e-05,
"loss": 0.7954,
"step": 374
},
{
"epoch": 0.08027614995584811,
"grad_norm": 0.3181402386904103,
"learning_rate": 3.9505445951613286e-05,
"loss": 0.7862,
"step": 375
},
{
"epoch": 0.08049021968906371,
"grad_norm": 0.30604477014382697,
"learning_rate": 3.950243836647456e-05,
"loss": 0.8126,
"step": 376
},
{
"epoch": 0.08070428942227931,
"grad_norm": 0.3360179805652376,
"learning_rate": 3.949942177904598e-05,
"loss": 0.7973,
"step": 377
},
{
"epoch": 0.0809183591554949,
"grad_norm": 0.3686297699994102,
"learning_rate": 3.9496396190720004e-05,
"loss": 0.7621,
"step": 378
},
{
"epoch": 0.0811324288887105,
"grad_norm": 0.4008325822424953,
"learning_rate": 3.9493361602893234e-05,
"loss": 0.7653,
"step": 379
},
{
"epoch": 0.0813464986219261,
"grad_norm": 0.3386933604103968,
"learning_rate": 3.9490318016966435e-05,
"loss": 0.8287,
"step": 380
},
{
"epoch": 0.0815605683551417,
"grad_norm": 0.3011982026024404,
"learning_rate": 3.948726543434451e-05,
"loss": 0.8307,
"step": 381
},
{
"epoch": 0.08177463808835728,
"grad_norm": 0.28080627474451375,
"learning_rate": 3.9484203856436536e-05,
"loss": 0.8102,
"step": 382
},
{
"epoch": 0.08198870782157287,
"grad_norm": 0.3080645597652293,
"learning_rate": 3.9481133284655736e-05,
"loss": 0.7848,
"step": 383
},
{
"epoch": 0.08220277755478847,
"grad_norm": 0.3589399876731314,
"learning_rate": 3.9478053720419474e-05,
"loss": 0.7941,
"step": 384
},
{
"epoch": 0.08241684728800407,
"grad_norm": 0.34417850338458056,
"learning_rate": 3.947496516514926e-05,
"loss": 0.8075,
"step": 385
},
{
"epoch": 0.08263091702121966,
"grad_norm": 0.37642738265223674,
"learning_rate": 3.947186762027078e-05,
"loss": 0.7935,
"step": 386
},
{
"epoch": 0.08284498675443526,
"grad_norm": 0.4211305728036991,
"learning_rate": 3.9468761087213864e-05,
"loss": 0.8258,
"step": 387
},
{
"epoch": 0.08305905648765086,
"grad_norm": 0.3826173978469827,
"learning_rate": 3.946564556741246e-05,
"loss": 0.8389,
"step": 388
},
{
"epoch": 0.08327312622086645,
"grad_norm": 0.35118327732229604,
"learning_rate": 3.946252106230469e-05,
"loss": 0.8192,
"step": 389
},
{
"epoch": 0.08348719595408204,
"grad_norm": 0.35210458632394,
"learning_rate": 3.9459387573332826e-05,
"loss": 0.8237,
"step": 390
},
{
"epoch": 0.08370126568729763,
"grad_norm": 0.3337749530435175,
"learning_rate": 3.945624510194328e-05,
"loss": 0.7743,
"step": 391
},
{
"epoch": 0.08391533542051323,
"grad_norm": 0.33434494046784513,
"learning_rate": 3.945309364958662e-05,
"loss": 0.8695,
"step": 392
},
{
"epoch": 0.08412940515372883,
"grad_norm": 0.3241230752049322,
"learning_rate": 3.944993321771754e-05,
"loss": 0.8008,
"step": 393
},
{
"epoch": 0.08434347488694442,
"grad_norm": 0.30413418950235466,
"learning_rate": 3.9446763807794887e-05,
"loss": 0.7955,
"step": 394
},
{
"epoch": 0.08455754462016002,
"grad_norm": 0.33284889602440043,
"learning_rate": 3.944358542128166e-05,
"loss": 0.7702,
"step": 395
},
{
"epoch": 0.08477161435337562,
"grad_norm": 0.36554769164425394,
"learning_rate": 3.944039805964499e-05,
"loss": 0.8267,
"step": 396
},
{
"epoch": 0.08498568408659121,
"grad_norm": 0.29715786433131725,
"learning_rate": 3.943720172435617e-05,
"loss": 0.7628,
"step": 397
},
{
"epoch": 0.08519975381980681,
"grad_norm": 0.38192305128262505,
"learning_rate": 3.943399641689061e-05,
"loss": 0.8062,
"step": 398
},
{
"epoch": 0.08541382355302239,
"grad_norm": 0.3844664213287207,
"learning_rate": 3.943078213872788e-05,
"loss": 0.7531,
"step": 399
},
{
"epoch": 0.08562789328623799,
"grad_norm": 0.3358990433389961,
"learning_rate": 3.942755889135169e-05,
"loss": 0.8012,
"step": 400
},
{
"epoch": 0.08584196301945358,
"grad_norm": 0.43718403319920773,
"learning_rate": 3.9424326676249874e-05,
"loss": 0.7862,
"step": 401
},
{
"epoch": 0.08605603275266918,
"grad_norm": 0.437345292516579,
"learning_rate": 3.942108549491442e-05,
"loss": 0.766,
"step": 402
},
{
"epoch": 0.08627010248588478,
"grad_norm": 0.30500226693756965,
"learning_rate": 3.941783534884146e-05,
"loss": 0.8088,
"step": 403
},
{
"epoch": 0.08648417221910037,
"grad_norm": 0.43272229710292914,
"learning_rate": 3.941457623953125e-05,
"loss": 0.7802,
"step": 404
},
{
"epoch": 0.08669824195231597,
"grad_norm": 0.48940603312831304,
"learning_rate": 3.941130816848818e-05,
"loss": 0.7547,
"step": 405
},
{
"epoch": 0.08691231168553157,
"grad_norm": 0.35487060015388955,
"learning_rate": 3.940803113722079e-05,
"loss": 0.8284,
"step": 406
},
{
"epoch": 0.08712638141874715,
"grad_norm": 0.4442303353198171,
"learning_rate": 3.9404745147241765e-05,
"loss": 0.8189,
"step": 407
},
{
"epoch": 0.08734045115196275,
"grad_norm": 0.40338436474672607,
"learning_rate": 3.94014502000679e-05,
"loss": 0.7775,
"step": 408
},
{
"epoch": 0.08755452088517834,
"grad_norm": 0.3570502308426536,
"learning_rate": 3.939814629722014e-05,
"loss": 0.7955,
"step": 409
},
{
"epoch": 0.08776859061839394,
"grad_norm": 0.38292216978816046,
"learning_rate": 3.939483344022355e-05,
"loss": 0.7958,
"step": 410
},
{
"epoch": 0.08798266035160954,
"grad_norm": 0.3070187493512314,
"learning_rate": 3.9391511630607356e-05,
"loss": 0.7875,
"step": 411
},
{
"epoch": 0.08819673008482513,
"grad_norm": 0.3169319812921083,
"learning_rate": 3.9388180869904885e-05,
"loss": 0.7871,
"step": 412
},
{
"epoch": 0.08841079981804073,
"grad_norm": 0.3646895828122296,
"learning_rate": 3.9384841159653617e-05,
"loss": 0.8015,
"step": 413
},
{
"epoch": 0.08862486955125633,
"grad_norm": 0.39476208220879444,
"learning_rate": 3.9381492501395157e-05,
"loss": 0.7908,
"step": 414
},
{
"epoch": 0.08883893928447191,
"grad_norm": 0.3223532335546537,
"learning_rate": 3.937813489667524e-05,
"loss": 0.7759,
"step": 415
},
{
"epoch": 0.0890530090176875,
"grad_norm": 0.32244150295392837,
"learning_rate": 3.9374768347043724e-05,
"loss": 0.8035,
"step": 416
},
{
"epoch": 0.0892670787509031,
"grad_norm": 0.27716414649444343,
"learning_rate": 3.9371392854054605e-05,
"loss": 0.8271,
"step": 417
},
{
"epoch": 0.0894811484841187,
"grad_norm": 0.31640706444434497,
"learning_rate": 3.936800841926601e-05,
"loss": 0.8002,
"step": 418
},
{
"epoch": 0.0896952182173343,
"grad_norm": 0.28070614377948816,
"learning_rate": 3.936461504424018e-05,
"loss": 0.7636,
"step": 419
},
{
"epoch": 0.0899092879505499,
"grad_norm": 0.323197171382002,
"learning_rate": 3.936121273054349e-05,
"loss": 0.7975,
"step": 420
},
{
"epoch": 0.09012335768376549,
"grad_norm": 0.3275896009129179,
"learning_rate": 3.935780147974646e-05,
"loss": 0.7978,
"step": 421
},
{
"epoch": 0.09033742741698109,
"grad_norm": 0.31620123998333727,
"learning_rate": 3.9354381293423684e-05,
"loss": 0.8278,
"step": 422
},
{
"epoch": 0.09055149715019668,
"grad_norm": 0.33135603565790595,
"learning_rate": 3.935095217315394e-05,
"loss": 0.8121,
"step": 423
},
{
"epoch": 0.09076556688341227,
"grad_norm": 0.2709838730320346,
"learning_rate": 3.9347514120520104e-05,
"loss": 0.7872,
"step": 424
},
{
"epoch": 0.09097963661662786,
"grad_norm": 0.3081730667884663,
"learning_rate": 3.934406713710915e-05,
"loss": 0.7798,
"step": 425
},
{
"epoch": 0.09119370634984346,
"grad_norm": 0.2936912909537816,
"learning_rate": 3.934061122451223e-05,
"loss": 0.7912,
"step": 426
},
{
"epoch": 0.09140777608305906,
"grad_norm": 0.29386606384320585,
"learning_rate": 3.933714638432458e-05,
"loss": 0.7724,
"step": 427
},
{
"epoch": 0.09162184581627465,
"grad_norm": 0.35283616412668806,
"learning_rate": 3.9333672618145545e-05,
"loss": 0.8262,
"step": 428
},
{
"epoch": 0.09183591554949025,
"grad_norm": 0.33350158381331013,
"learning_rate": 3.933018992757862e-05,
"loss": 0.8252,
"step": 429
},
{
"epoch": 0.09204998528270585,
"grad_norm": 0.3679928914569241,
"learning_rate": 3.9326698314231414e-05,
"loss": 0.7915,
"step": 430
},
{
"epoch": 0.09226405501592144,
"grad_norm": 0.3278069512216059,
"learning_rate": 3.932319777971564e-05,
"loss": 0.782,
"step": 431
},
{
"epoch": 0.09247812474913703,
"grad_norm": 0.30042433795201073,
"learning_rate": 3.931968832564716e-05,
"loss": 0.7707,
"step": 432
},
{
"epoch": 0.09269219448235262,
"grad_norm": 0.32775465415944327,
"learning_rate": 3.931616995364589e-05,
"loss": 0.8191,
"step": 433
},
{
"epoch": 0.09290626421556822,
"grad_norm": 0.3317510455541998,
"learning_rate": 3.9312642665335946e-05,
"loss": 0.774,
"step": 434
},
{
"epoch": 0.09312033394878381,
"grad_norm": 0.3815492582776924,
"learning_rate": 3.9309106462345496e-05,
"loss": 0.7965,
"step": 435
},
{
"epoch": 0.09333440368199941,
"grad_norm": 0.3181651283506857,
"learning_rate": 3.930556134630685e-05,
"loss": 0.8283,
"step": 436
},
{
"epoch": 0.09354847341521501,
"grad_norm": 0.30445611297616393,
"learning_rate": 3.930200731885643e-05,
"loss": 0.7769,
"step": 437
},
{
"epoch": 0.0937625431484306,
"grad_norm": 0.4112724368184057,
"learning_rate": 3.9298444381634764e-05,
"loss": 0.8069,
"step": 438
},
{
"epoch": 0.0939766128816462,
"grad_norm": 0.26329988235448815,
"learning_rate": 3.9294872536286495e-05,
"loss": 0.7896,
"step": 439
},
{
"epoch": 0.0941906826148618,
"grad_norm": 0.3415432326279578,
"learning_rate": 3.9291291784460384e-05,
"loss": 0.7944,
"step": 440
},
{
"epoch": 0.09440475234807738,
"grad_norm": 0.36580755245009844,
"learning_rate": 3.92877021278093e-05,
"loss": 0.7967,
"step": 441
},
{
"epoch": 0.09461882208129298,
"grad_norm": 0.3063955546059932,
"learning_rate": 3.928410356799022e-05,
"loss": 0.7832,
"step": 442
},
{
"epoch": 0.09483289181450857,
"grad_norm": 0.2846537103112679,
"learning_rate": 3.9280496106664244e-05,
"loss": 0.8257,
"step": 443
},
{
"epoch": 0.09504696154772417,
"grad_norm": 0.3025145656909554,
"learning_rate": 3.9276879745496546e-05,
"loss": 0.7949,
"step": 444
},
{
"epoch": 0.09526103128093977,
"grad_norm": 0.36490858653861336,
"learning_rate": 3.9273254486156454e-05,
"loss": 0.7866,
"step": 445
},
{
"epoch": 0.09547510101415536,
"grad_norm": 0.3284436117692305,
"learning_rate": 3.9269620330317366e-05,
"loss": 0.801,
"step": 446
},
{
"epoch": 0.09568917074737096,
"grad_norm": 0.28334521928060885,
"learning_rate": 3.9265977279656815e-05,
"loss": 0.7989,
"step": 447
},
{
"epoch": 0.09590324048058656,
"grad_norm": 0.2810576285736024,
"learning_rate": 3.926232533585642e-05,
"loss": 0.7866,
"step": 448
},
{
"epoch": 0.09611731021380214,
"grad_norm": 0.3622475624519783,
"learning_rate": 3.9258664500601905e-05,
"loss": 0.7673,
"step": 449
},
{
"epoch": 0.09633137994701774,
"grad_norm": 0.3731888046724523,
"learning_rate": 3.925499477558311e-05,
"loss": 0.8017,
"step": 450
},
{
"epoch": 0.09654544968023333,
"grad_norm": 0.4283601076692715,
"learning_rate": 3.925131616249398e-05,
"loss": 0.8008,
"step": 451
},
{
"epoch": 0.09675951941344893,
"grad_norm": 0.38686603375095646,
"learning_rate": 3.9247628663032546e-05,
"loss": 0.7818,
"step": 452
},
{
"epoch": 0.09697358914666453,
"grad_norm": 0.335515955694342,
"learning_rate": 3.924393227890096e-05,
"loss": 0.7737,
"step": 453
},
{
"epoch": 0.09718765887988012,
"grad_norm": 0.41277152616792745,
"learning_rate": 3.9240227011805455e-05,
"loss": 0.803,
"step": 454
},
{
"epoch": 0.09740172861309572,
"grad_norm": 0.3704392992755366,
"learning_rate": 3.923651286345638e-05,
"loss": 0.7652,
"step": 455
},
{
"epoch": 0.09761579834631132,
"grad_norm": 0.348136323262541,
"learning_rate": 3.923278983556819e-05,
"loss": 0.8068,
"step": 456
},
{
"epoch": 0.0978298680795269,
"grad_norm": 0.2743148395487614,
"learning_rate": 3.9229057929859416e-05,
"loss": 0.8237,
"step": 457
},
{
"epoch": 0.0980439378127425,
"grad_norm": 0.3201814348066745,
"learning_rate": 3.9225317148052704e-05,
"loss": 0.7556,
"step": 458
},
{
"epoch": 0.09825800754595809,
"grad_norm": 0.3080337682779766,
"learning_rate": 3.9221567491874784e-05,
"loss": 0.7774,
"step": 459
},
{
"epoch": 0.09847207727917369,
"grad_norm": 0.33474051456448056,
"learning_rate": 3.9217808963056496e-05,
"loss": 0.7763,
"step": 460
},
{
"epoch": 0.09868614701238929,
"grad_norm": 0.3233316417468542,
"learning_rate": 3.921404156333277e-05,
"loss": 0.7565,
"step": 461
},
{
"epoch": 0.09890021674560488,
"grad_norm": 0.35917382285440097,
"learning_rate": 3.921026529444264e-05,
"loss": 0.8205,
"step": 462
},
{
"epoch": 0.09911428647882048,
"grad_norm": 0.3720804896494926,
"learning_rate": 3.920648015812921e-05,
"loss": 0.7671,
"step": 463
},
{
"epoch": 0.09932835621203608,
"grad_norm": 0.3605724635094279,
"learning_rate": 3.92026861561397e-05,
"loss": 0.7583,
"step": 464
},
{
"epoch": 0.09954242594525167,
"grad_norm": 0.33828132604779365,
"learning_rate": 3.9198883290225406e-05,
"loss": 0.7565,
"step": 465
},
{
"epoch": 0.09975649567846726,
"grad_norm": 0.2869927458519343,
"learning_rate": 3.919507156214174e-05,
"loss": 0.8346,
"step": 466
},
{
"epoch": 0.09997056541168285,
"grad_norm": 0.29979997671769304,
"learning_rate": 3.919125097364817e-05,
"loss": 0.7978,
"step": 467
},
{
"epoch": 0.10018463514489845,
"grad_norm": 0.3189819781882113,
"learning_rate": 3.918742152650829e-05,
"loss": 0.7558,
"step": 468
},
{
"epoch": 0.10039870487811405,
"grad_norm": 0.33446164378667337,
"learning_rate": 3.918358322248975e-05,
"loss": 0.7986,
"step": 469
},
{
"epoch": 0.10061277461132964,
"grad_norm": 0.3875277277756248,
"learning_rate": 3.917973606336431e-05,
"loss": 0.7844,
"step": 470
},
{
"epoch": 0.10082684434454524,
"grad_norm": 0.3811898107366712,
"learning_rate": 3.9175880050907816e-05,
"loss": 0.7778,
"step": 471
},
{
"epoch": 0.10104091407776083,
"grad_norm": 0.3249475455448723,
"learning_rate": 3.9172015186900196e-05,
"loss": 0.7966,
"step": 472
},
{
"epoch": 0.10125498381097643,
"grad_norm": 0.2814721827325935,
"learning_rate": 3.916814147312546e-05,
"loss": 0.8198,
"step": 473
},
{
"epoch": 0.10146905354419201,
"grad_norm": 0.29531901355457474,
"learning_rate": 3.9164258911371705e-05,
"loss": 0.7657,
"step": 474
},
{
"epoch": 0.10168312327740761,
"grad_norm": 0.2911491443407701,
"learning_rate": 3.916036750343113e-05,
"loss": 0.7798,
"step": 475
},
{
"epoch": 0.10189719301062321,
"grad_norm": 0.3041760892145798,
"learning_rate": 3.9156467251099976e-05,
"loss": 0.7501,
"step": 476
},
{
"epoch": 0.1021112627438388,
"grad_norm": 0.31156761935336486,
"learning_rate": 3.915255815617861e-05,
"loss": 0.7758,
"step": 477
},
{
"epoch": 0.1023253324770544,
"grad_norm": 0.2790026848999721,
"learning_rate": 3.9148640220471464e-05,
"loss": 0.7929,
"step": 478
},
{
"epoch": 0.10253940221027,
"grad_norm": 0.3435996309758551,
"learning_rate": 3.914471344578704e-05,
"loss": 0.8117,
"step": 479
},
{
"epoch": 0.1027534719434856,
"grad_norm": 0.2956753281942712,
"learning_rate": 3.914077783393793e-05,
"loss": 0.8041,
"step": 480
},
{
"epoch": 0.10296754167670119,
"grad_norm": 0.3204234448008445,
"learning_rate": 3.913683338674083e-05,
"loss": 0.8116,
"step": 481
},
{
"epoch": 0.10318161140991677,
"grad_norm": 0.30766385532600793,
"learning_rate": 3.913288010601645e-05,
"loss": 0.7494,
"step": 482
},
{
"epoch": 0.10339568114313237,
"grad_norm": 0.2922689705524498,
"learning_rate": 3.912891799358964e-05,
"loss": 0.7799,
"step": 483
},
{
"epoch": 0.10360975087634797,
"grad_norm": 0.3731373712577605,
"learning_rate": 3.912494705128931e-05,
"loss": 0.7722,
"step": 484
},
{
"epoch": 0.10382382060956356,
"grad_norm": 0.32253916018762024,
"learning_rate": 3.912096728094843e-05,
"loss": 0.7778,
"step": 485
},
{
"epoch": 0.10403789034277916,
"grad_norm": 0.33121589091006576,
"learning_rate": 3.911697868440405e-05,
"loss": 0.7791,
"step": 486
},
{
"epoch": 0.10425196007599476,
"grad_norm": 0.36523733636259026,
"learning_rate": 3.9112981263497304e-05,
"loss": 0.7893,
"step": 487
},
{
"epoch": 0.10446602980921035,
"grad_norm": 0.3451596974843037,
"learning_rate": 3.91089750200734e-05,
"loss": 0.7679,
"step": 488
},
{
"epoch": 0.10468009954242595,
"grad_norm": 0.3386287377778956,
"learning_rate": 3.9104959955981605e-05,
"loss": 0.7524,
"step": 489
},
{
"epoch": 0.10489416927564155,
"grad_norm": 0.2688495289135925,
"learning_rate": 3.910093607307526e-05,
"loss": 0.771,
"step": 490
},
{
"epoch": 0.10510823900885713,
"grad_norm": 0.3143137665297875,
"learning_rate": 3.90969033732118e-05,
"loss": 0.7944,
"step": 491
},
{
"epoch": 0.10532230874207273,
"grad_norm": 0.3189166100146866,
"learning_rate": 3.90928618582527e-05,
"loss": 0.7977,
"step": 492
},
{
"epoch": 0.10553637847528832,
"grad_norm": 0.2977536874353114,
"learning_rate": 3.908881153006351e-05,
"loss": 0.7924,
"step": 493
},
{
"epoch": 0.10575044820850392,
"grad_norm": 0.3359457616162425,
"learning_rate": 3.9084752390513865e-05,
"loss": 0.7522,
"step": 494
},
{
"epoch": 0.10596451794171952,
"grad_norm": 0.3281465702772829,
"learning_rate": 3.908068444147745e-05,
"loss": 0.8004,
"step": 495
},
{
"epoch": 0.10617858767493511,
"grad_norm": 0.28487561432075975,
"learning_rate": 3.907660768483203e-05,
"loss": 0.7744,
"step": 496
},
{
"epoch": 0.10639265740815071,
"grad_norm": 0.31522568771373916,
"learning_rate": 3.9072522122459425e-05,
"loss": 0.785,
"step": 497
},
{
"epoch": 0.1066067271413663,
"grad_norm": 0.3223603592445368,
"learning_rate": 3.906842775624552e-05,
"loss": 0.7704,
"step": 498
},
{
"epoch": 0.10682079687458189,
"grad_norm": 0.2932291793889263,
"learning_rate": 3.906432458808026e-05,
"loss": 0.8022,
"step": 499
},
{
"epoch": 0.10703486660779749,
"grad_norm": 0.3066773167676235,
"learning_rate": 3.9060212619857676e-05,
"loss": 0.7992,
"step": 500
},
{
"epoch": 0.10724893634101308,
"grad_norm": 0.33935414574001344,
"learning_rate": 3.905609185347584e-05,
"loss": 0.7982,
"step": 501
},
{
"epoch": 0.10746300607422868,
"grad_norm": 0.3532180778210077,
"learning_rate": 3.905196229083688e-05,
"loss": 0.7967,
"step": 502
},
{
"epoch": 0.10767707580744428,
"grad_norm": 0.3537033113035032,
"learning_rate": 3.904782393384701e-05,
"loss": 0.8083,
"step": 503
},
{
"epoch": 0.10789114554065987,
"grad_norm": 0.334513713879041,
"learning_rate": 3.9043676784416485e-05,
"loss": 0.7814,
"step": 504
},
{
"epoch": 0.10810521527387547,
"grad_norm": 0.3505105241291179,
"learning_rate": 3.903952084445961e-05,
"loss": 0.7858,
"step": 505
},
{
"epoch": 0.10831928500709107,
"grad_norm": 0.33701530321242656,
"learning_rate": 3.903535611589477e-05,
"loss": 0.8028,
"step": 506
},
{
"epoch": 0.10853335474030666,
"grad_norm": 0.3526054369532639,
"learning_rate": 3.903118260064439e-05,
"loss": 0.7879,
"step": 507
},
{
"epoch": 0.10874742447352224,
"grad_norm": 0.33714070211530794,
"learning_rate": 3.9027000300634955e-05,
"loss": 0.7776,
"step": 508
},
{
"epoch": 0.10896149420673784,
"grad_norm": 0.2949543568958641,
"learning_rate": 3.902280921779702e-05,
"loss": 0.7644,
"step": 509
},
{
"epoch": 0.10917556393995344,
"grad_norm": 0.32674430466929394,
"learning_rate": 3.901860935406517e-05,
"loss": 0.8075,
"step": 510
},
{
"epoch": 0.10938963367316903,
"grad_norm": 0.31174220707856237,
"learning_rate": 3.9014400711378056e-05,
"loss": 0.7646,
"step": 511
},
{
"epoch": 0.10960370340638463,
"grad_norm": 0.3198392027013188,
"learning_rate": 3.901018329167838e-05,
"loss": 0.7711,
"step": 512
},
{
"epoch": 0.10981777313960023,
"grad_norm": 0.31092469844898774,
"learning_rate": 3.9005957096912896e-05,
"loss": 0.7827,
"step": 513
},
{
"epoch": 0.11003184287281582,
"grad_norm": 0.3444589142435659,
"learning_rate": 3.900172212903241e-05,
"loss": 0.7863,
"step": 514
},
{
"epoch": 0.11024591260603142,
"grad_norm": 0.3693579186196287,
"learning_rate": 3.899747838999177e-05,
"loss": 0.7873,
"step": 515
},
{
"epoch": 0.110459982339247,
"grad_norm": 0.29783179523689074,
"learning_rate": 3.8993225881749887e-05,
"loss": 0.7613,
"step": 516
},
{
"epoch": 0.1106740520724626,
"grad_norm": 0.3039123293818144,
"learning_rate": 3.89889646062697e-05,
"loss": 0.7836,
"step": 517
},
{
"epoch": 0.1108881218056782,
"grad_norm": 0.30480384331152066,
"learning_rate": 3.898469456551821e-05,
"loss": 0.7664,
"step": 518
},
{
"epoch": 0.1111021915388938,
"grad_norm": 0.3121062544761374,
"learning_rate": 3.898041576146647e-05,
"loss": 0.7764,
"step": 519
},
{
"epoch": 0.11131626127210939,
"grad_norm": 0.3060990055629138,
"learning_rate": 3.897612819608955e-05,
"loss": 0.7597,
"step": 520
},
{
"epoch": 0.11153033100532499,
"grad_norm": 0.276573895942155,
"learning_rate": 3.8971831871366594e-05,
"loss": 0.7822,
"step": 521
},
{
"epoch": 0.11174440073854058,
"grad_norm": 0.25312672886850446,
"learning_rate": 3.896752678928078e-05,
"loss": 0.7776,
"step": 522
},
{
"epoch": 0.11195847047175618,
"grad_norm": 0.28272209005982685,
"learning_rate": 3.896321295181932e-05,
"loss": 0.7552,
"step": 523
},
{
"epoch": 0.11217254020497176,
"grad_norm": 0.2712780694142558,
"learning_rate": 3.895889036097347e-05,
"loss": 0.7588,
"step": 524
},
{
"epoch": 0.11238660993818736,
"grad_norm": 0.25201874224866017,
"learning_rate": 3.895455901873854e-05,
"loss": 0.7869,
"step": 525
},
{
"epoch": 0.11260067967140296,
"grad_norm": 0.30452270733697234,
"learning_rate": 3.895021892711387e-05,
"loss": 0.7842,
"step": 526
},
{
"epoch": 0.11281474940461855,
"grad_norm": 0.28683093116647973,
"learning_rate": 3.8945870088102825e-05,
"loss": 0.7906,
"step": 527
},
{
"epoch": 0.11302881913783415,
"grad_norm": 0.25289372547757544,
"learning_rate": 3.894151250371283e-05,
"loss": 0.7592,
"step": 528
},
{
"epoch": 0.11324288887104975,
"grad_norm": 0.26538901709259677,
"learning_rate": 3.8937146175955336e-05,
"loss": 0.7851,
"step": 529
},
{
"epoch": 0.11345695860426534,
"grad_norm": 0.29445448102282995,
"learning_rate": 3.893277110684584e-05,
"loss": 0.7793,
"step": 530
},
{
"epoch": 0.11367102833748094,
"grad_norm": 0.32573007017731953,
"learning_rate": 3.892838729840385e-05,
"loss": 0.7473,
"step": 531
},
{
"epoch": 0.11388509807069654,
"grad_norm": 0.3089739158432779,
"learning_rate": 3.892399475265294e-05,
"loss": 0.7649,
"step": 532
},
{
"epoch": 0.11409916780391212,
"grad_norm": 0.2784745964411335,
"learning_rate": 3.8919593471620694e-05,
"loss": 0.786,
"step": 533
},
{
"epoch": 0.11431323753712772,
"grad_norm": 0.2858572786891086,
"learning_rate": 3.8915183457338726e-05,
"loss": 0.7361,
"step": 534
},
{
"epoch": 0.11452730727034331,
"grad_norm": 0.2954335937171979,
"learning_rate": 3.89107647118427e-05,
"loss": 0.7796,
"step": 535
},
{
"epoch": 0.11474137700355891,
"grad_norm": 0.33879091012521695,
"learning_rate": 3.8906337237172314e-05,
"loss": 0.7837,
"step": 536
},
{
"epoch": 0.1149554467367745,
"grad_norm": 0.31576256981083695,
"learning_rate": 3.890190103537126e-05,
"loss": 0.7721,
"step": 537
},
{
"epoch": 0.1151695164699901,
"grad_norm": 0.27847471034641175,
"learning_rate": 3.8897456108487286e-05,
"loss": 0.7754,
"step": 538
},
{
"epoch": 0.1153835862032057,
"grad_norm": 0.29099685803387215,
"learning_rate": 3.889300245857217e-05,
"loss": 0.7616,
"step": 539
},
{
"epoch": 0.1155976559364213,
"grad_norm": 0.32514233887056426,
"learning_rate": 3.888854008768171e-05,
"loss": 0.8162,
"step": 540
},
{
"epoch": 0.11581172566963688,
"grad_norm": 0.27227832221242765,
"learning_rate": 3.8884068997875714e-05,
"loss": 0.7667,
"step": 541
},
{
"epoch": 0.11602579540285247,
"grad_norm": 0.28965360783910543,
"learning_rate": 3.887958919121804e-05,
"loss": 0.8139,
"step": 542
},
{
"epoch": 0.11623986513606807,
"grad_norm": 0.27492062128513606,
"learning_rate": 3.8875100669776554e-05,
"loss": 0.7859,
"step": 543
},
{
"epoch": 0.11645393486928367,
"grad_norm": 0.30950320325187036,
"learning_rate": 3.887060343562315e-05,
"loss": 0.7632,
"step": 544
},
{
"epoch": 0.11666800460249926,
"grad_norm": 0.34115044122556104,
"learning_rate": 3.886609749083375e-05,
"loss": 0.801,
"step": 545
},
{
"epoch": 0.11688207433571486,
"grad_norm": 0.28438131757602064,
"learning_rate": 3.886158283748828e-05,
"loss": 0.8003,
"step": 546
},
{
"epoch": 0.11709614406893046,
"grad_norm": 0.2974028330364232,
"learning_rate": 3.88570594776707e-05,
"loss": 0.7559,
"step": 547
},
{
"epoch": 0.11731021380214605,
"grad_norm": 0.3398375178841318,
"learning_rate": 3.8852527413468984e-05,
"loss": 0.7841,
"step": 548
},
{
"epoch": 0.11752428353536164,
"grad_norm": 0.35188531441607085,
"learning_rate": 3.884798664697512e-05,
"loss": 0.7945,
"step": 549
},
{
"epoch": 0.11773835326857723,
"grad_norm": 0.31114165104756186,
"learning_rate": 3.884343718028513e-05,
"loss": 0.7922,
"step": 550
},
{
"epoch": 0.11795242300179283,
"grad_norm": 0.28450168959567035,
"learning_rate": 3.883887901549903e-05,
"loss": 0.8044,
"step": 551
},
{
"epoch": 0.11816649273500843,
"grad_norm": 0.2740499206171291,
"learning_rate": 3.883431215472086e-05,
"loss": 0.7721,
"step": 552
},
{
"epoch": 0.11838056246822402,
"grad_norm": 0.3228937059359105,
"learning_rate": 3.882973660005868e-05,
"loss": 0.7691,
"step": 553
},
{
"epoch": 0.11859463220143962,
"grad_norm": 0.2534849908445428,
"learning_rate": 3.882515235362456e-05,
"loss": 0.7707,
"step": 554
},
{
"epoch": 0.11880870193465522,
"grad_norm": 0.29382509647086186,
"learning_rate": 3.8820559417534564e-05,
"loss": 0.7436,
"step": 555
},
{
"epoch": 0.11902277166787081,
"grad_norm": 0.36869485323895296,
"learning_rate": 3.8815957793908794e-05,
"loss": 0.7651,
"step": 556
},
{
"epoch": 0.11923684140108641,
"grad_norm": 0.2686317456497916,
"learning_rate": 3.8811347484871353e-05,
"loss": 0.7963,
"step": 557
},
{
"epoch": 0.11945091113430199,
"grad_norm": 0.28997605967274664,
"learning_rate": 3.880672849255035e-05,
"loss": 0.726,
"step": 558
},
{
"epoch": 0.11966498086751759,
"grad_norm": 0.25217042243669957,
"learning_rate": 3.8802100819077905e-05,
"loss": 0.7435,
"step": 559
},
{
"epoch": 0.11987905060073319,
"grad_norm": 0.3093551373710632,
"learning_rate": 3.879746446659013e-05,
"loss": 0.8133,
"step": 560
},
{
"epoch": 0.12009312033394878,
"grad_norm": 0.32250094376072785,
"learning_rate": 3.879281943722718e-05,
"loss": 0.814,
"step": 561
},
{
"epoch": 0.12030719006716438,
"grad_norm": 0.3009295548092589,
"learning_rate": 3.878816573313317e-05,
"loss": 0.7727,
"step": 562
},
{
"epoch": 0.12052125980037998,
"grad_norm": 0.2725509094693102,
"learning_rate": 3.878350335645626e-05,
"loss": 0.7591,
"step": 563
},
{
"epoch": 0.12073532953359557,
"grad_norm": 0.255873272518967,
"learning_rate": 3.877883230934858e-05,
"loss": 0.7694,
"step": 564
},
{
"epoch": 0.12094939926681117,
"grad_norm": 0.29434175814592056,
"learning_rate": 3.8774152593966277e-05,
"loss": 0.7658,
"step": 565
},
{
"epoch": 0.12116346900002675,
"grad_norm": 0.35007367150480406,
"learning_rate": 3.8769464212469504e-05,
"loss": 0.7668,
"step": 566
},
{
"epoch": 0.12137753873324235,
"grad_norm": 0.3665899152199687,
"learning_rate": 3.876476716702242e-05,
"loss": 0.7646,
"step": 567
},
{
"epoch": 0.12159160846645795,
"grad_norm": 0.31901208253235064,
"learning_rate": 3.8760061459793155e-05,
"loss": 0.7801,
"step": 568
},
{
"epoch": 0.12180567819967354,
"grad_norm": 0.2682094020590244,
"learning_rate": 3.8755347092953856e-05,
"loss": 0.7663,
"step": 569
},
{
"epoch": 0.12201974793288914,
"grad_norm": 0.24301063250734176,
"learning_rate": 3.8750624068680684e-05,
"loss": 0.7821,
"step": 570
},
{
"epoch": 0.12223381766610474,
"grad_norm": 0.2927237881643003,
"learning_rate": 3.874589238915376e-05,
"loss": 0.7836,
"step": 571
},
{
"epoch": 0.12244788739932033,
"grad_norm": 0.30694527478392386,
"learning_rate": 3.874115205655722e-05,
"loss": 0.7646,
"step": 572
},
{
"epoch": 0.12266195713253593,
"grad_norm": 0.2541192562387383,
"learning_rate": 3.873640307307921e-05,
"loss": 0.8211,
"step": 573
},
{
"epoch": 0.12287602686575153,
"grad_norm": 0.2810086501685054,
"learning_rate": 3.873164544091183e-05,
"loss": 0.7733,
"step": 574
},
{
"epoch": 0.12309009659896711,
"grad_norm": 0.39706157769586736,
"learning_rate": 3.872687916225121e-05,
"loss": 0.8116,
"step": 575
},
{
"epoch": 0.1233041663321827,
"grad_norm": 0.3540089083723581,
"learning_rate": 3.872210423929744e-05,
"loss": 0.7714,
"step": 576
},
{
"epoch": 0.1235182360653983,
"grad_norm": 0.4016793229760035,
"learning_rate": 3.8717320674254636e-05,
"loss": 0.804,
"step": 577
},
{
"epoch": 0.1237323057986139,
"grad_norm": 0.30846391176315663,
"learning_rate": 3.871252846933087e-05,
"loss": 0.7642,
"step": 578
},
{
"epoch": 0.1239463755318295,
"grad_norm": 0.29856262927914995,
"learning_rate": 3.870772762673821e-05,
"loss": 0.7869,
"step": 579
},
{
"epoch": 0.12416044526504509,
"grad_norm": 0.33404088281171423,
"learning_rate": 3.8702918148692725e-05,
"loss": 0.7793,
"step": 580
},
{
"epoch": 0.12437451499826069,
"grad_norm": 0.3555240010502193,
"learning_rate": 3.869810003741447e-05,
"loss": 0.7814,
"step": 581
},
{
"epoch": 0.12458858473147628,
"grad_norm": 0.36175837427588975,
"learning_rate": 3.869327329512746e-05,
"loss": 0.7654,
"step": 582
},
{
"epoch": 0.12480265446469187,
"grad_norm": 0.31043731011159237,
"learning_rate": 3.868843792405971e-05,
"loss": 0.7798,
"step": 583
},
{
"epoch": 0.12501672419790746,
"grad_norm": 0.26537385873434644,
"learning_rate": 3.868359392644323e-05,
"loss": 0.8154,
"step": 584
},
{
"epoch": 0.12523079393112307,
"grad_norm": 0.3260081381676428,
"learning_rate": 3.8678741304514e-05,
"loss": 0.7503,
"step": 585
},
{
"epoch": 0.12544486366433866,
"grad_norm": 0.3328831596964062,
"learning_rate": 3.8673880060511974e-05,
"loss": 0.7944,
"step": 586
},
{
"epoch": 0.12565893339755424,
"grad_norm": 0.302523997477463,
"learning_rate": 3.86690101966811e-05,
"loss": 0.7289,
"step": 587
},
{
"epoch": 0.12587300313076985,
"grad_norm": 0.31239351228194273,
"learning_rate": 3.866413171526928e-05,
"loss": 0.7573,
"step": 588
},
{
"epoch": 0.12608707286398543,
"grad_norm": 0.28106346332874094,
"learning_rate": 3.865924461852843e-05,
"loss": 0.7687,
"step": 589
},
{
"epoch": 0.12630114259720104,
"grad_norm": 0.25788660211306436,
"learning_rate": 3.8654348908714434e-05,
"loss": 0.8018,
"step": 590
},
{
"epoch": 0.12651521233041663,
"grad_norm": 0.25831164826273156,
"learning_rate": 3.864944458808712e-05,
"loss": 0.7682,
"step": 591
},
{
"epoch": 0.12672928206363224,
"grad_norm": 0.2509999919005387,
"learning_rate": 3.864453165891032e-05,
"loss": 0.7984,
"step": 592
},
{
"epoch": 0.12694335179684782,
"grad_norm": 0.2951328187917488,
"learning_rate": 3.863961012345184e-05,
"loss": 0.785,
"step": 593
},
{
"epoch": 0.12715742153006343,
"grad_norm": 0.26397863417223993,
"learning_rate": 3.863467998398346e-05,
"loss": 0.8013,
"step": 594
},
{
"epoch": 0.127371491263279,
"grad_norm": 0.2583119588714624,
"learning_rate": 3.86297412427809e-05,
"loss": 0.7822,
"step": 595
},
{
"epoch": 0.1275855609964946,
"grad_norm": 0.27064259133436563,
"learning_rate": 3.8624793902123886e-05,
"loss": 0.7944,
"step": 596
},
{
"epoch": 0.1277996307297102,
"grad_norm": 0.29108215935884013,
"learning_rate": 3.86198379642961e-05,
"loss": 0.796,
"step": 597
},
{
"epoch": 0.1280137004629258,
"grad_norm": 0.29671321773104437,
"learning_rate": 3.8614873431585196e-05,
"loss": 0.781,
"step": 598
},
{
"epoch": 0.1282277701961414,
"grad_norm": 0.4195978479010054,
"learning_rate": 3.860990030628279e-05,
"loss": 0.7478,
"step": 599
},
{
"epoch": 0.12844183992935698,
"grad_norm": 0.26840119966490283,
"learning_rate": 3.860491859068447e-05,
"loss": 0.7836,
"step": 600
},
{
"epoch": 0.1286559096625726,
"grad_norm": 0.4129907291581528,
"learning_rate": 3.859992828708979e-05,
"loss": 0.7618,
"step": 601
},
{
"epoch": 0.12886997939578818,
"grad_norm": 0.3238934678575172,
"learning_rate": 3.859492939780226e-05,
"loss": 0.7619,
"step": 602
},
{
"epoch": 0.1290840491290038,
"grad_norm": 0.27898878787820014,
"learning_rate": 3.8589921925129357e-05,
"loss": 0.7629,
"step": 603
},
{
"epoch": 0.12929811886221937,
"grad_norm": 0.2604284751693988,
"learning_rate": 3.8584905871382526e-05,
"loss": 0.787,
"step": 604
},
{
"epoch": 0.12951218859543495,
"grad_norm": 0.2619205407332,
"learning_rate": 3.857988123887716e-05,
"loss": 0.7558,
"step": 605
},
{
"epoch": 0.12972625832865056,
"grad_norm": 0.29313059763695404,
"learning_rate": 3.857484802993263e-05,
"loss": 0.758,
"step": 606
},
{
"epoch": 0.12994032806186614,
"grad_norm": 2.1930454149061887,
"learning_rate": 3.856980624687225e-05,
"loss": 0.7746,
"step": 607
},
{
"epoch": 0.13015439779508176,
"grad_norm": 0.35963399420098147,
"learning_rate": 3.85647558920233e-05,
"loss": 0.7798,
"step": 608
},
{
"epoch": 0.13036846752829734,
"grad_norm": 0.5345692799132874,
"learning_rate": 3.855969696771702e-05,
"loss": 0.8404,
"step": 609
},
{
"epoch": 0.13058253726151295,
"grad_norm": 0.4341539959883955,
"learning_rate": 3.8554629476288596e-05,
"loss": 0.7688,
"step": 610
},
{
"epoch": 0.13079660699472853,
"grad_norm": 0.4512232747327198,
"learning_rate": 3.8549553420077167e-05,
"loss": 0.7458,
"step": 611
},
{
"epoch": 0.13101067672794411,
"grad_norm": 0.337253619260505,
"learning_rate": 3.8544468801425836e-05,
"loss": 0.7954,
"step": 612
},
{
"epoch": 0.13122474646115972,
"grad_norm": 0.32416015765255696,
"learning_rate": 3.853937562268165e-05,
"loss": 0.8084,
"step": 613
},
{
"epoch": 0.1314388161943753,
"grad_norm": 0.38068934309719155,
"learning_rate": 3.853427388619562e-05,
"loss": 0.7465,
"step": 614
},
{
"epoch": 0.13165288592759092,
"grad_norm": 0.3619894703448777,
"learning_rate": 3.852916359432269e-05,
"loss": 0.7527,
"step": 615
},
{
"epoch": 0.1318669556608065,
"grad_norm": 0.35645238503047305,
"learning_rate": 3.852404474942176e-05,
"loss": 0.7819,
"step": 616
},
{
"epoch": 0.1320810253940221,
"grad_norm": 0.35511855267868303,
"learning_rate": 3.8518917353855686e-05,
"loss": 0.7465,
"step": 617
},
{
"epoch": 0.1322950951272377,
"grad_norm": 0.37180162261348115,
"learning_rate": 3.851378140999126e-05,
"loss": 0.7459,
"step": 618
},
{
"epoch": 0.1325091648604533,
"grad_norm": 0.3396390822262118,
"learning_rate": 3.850863692019923e-05,
"loss": 0.7681,
"step": 619
},
{
"epoch": 0.1327232345936689,
"grad_norm": 0.35510271781924185,
"learning_rate": 3.850348388685428e-05,
"loss": 0.7635,
"step": 620
},
{
"epoch": 0.13293730432688447,
"grad_norm": 0.40441804257077346,
"learning_rate": 3.849832231233503e-05,
"loss": 0.7771,
"step": 621
},
{
"epoch": 0.13315137406010008,
"grad_norm": 0.34293311293913303,
"learning_rate": 3.8493152199024074e-05,
"loss": 0.7421,
"step": 622
},
{
"epoch": 0.13336544379331566,
"grad_norm": 0.2912600178622258,
"learning_rate": 3.848797354930791e-05,
"loss": 0.8171,
"step": 623
},
{
"epoch": 0.13357951352653127,
"grad_norm": 0.29306478713974593,
"learning_rate": 3.8482786365577e-05,
"loss": 0.7717,
"step": 624
},
{
"epoch": 0.13379358325974686,
"grad_norm": 0.3326609157566286,
"learning_rate": 3.8477590650225735e-05,
"loss": 0.7854,
"step": 625
},
{
"epoch": 0.13400765299296247,
"grad_norm": 0.26537216605654773,
"learning_rate": 3.847238640565246e-05,
"loss": 0.7672,
"step": 626
},
{
"epoch": 0.13422172272617805,
"grad_norm": 0.2850266903516824,
"learning_rate": 3.846717363425943e-05,
"loss": 0.7699,
"step": 627
},
{
"epoch": 0.13443579245939366,
"grad_norm": 0.32199229672531654,
"learning_rate": 3.846195233845285e-05,
"loss": 0.7673,
"step": 628
},
{
"epoch": 0.13464986219260924,
"grad_norm": 0.3262432652902493,
"learning_rate": 3.8456722520642876e-05,
"loss": 0.7767,
"step": 629
},
{
"epoch": 0.13486393192582483,
"grad_norm": 0.38010201931639875,
"learning_rate": 3.845148418324357e-05,
"loss": 0.7755,
"step": 630
},
{
"epoch": 0.13507800165904044,
"grad_norm": 0.2774173275910418,
"learning_rate": 3.844623732867294e-05,
"loss": 0.7397,
"step": 631
},
{
"epoch": 0.13529207139225602,
"grad_norm": 0.2659211241771369,
"learning_rate": 3.844098195935292e-05,
"loss": 0.7704,
"step": 632
},
{
"epoch": 0.13550614112547163,
"grad_norm": 0.25464525623727563,
"learning_rate": 3.843571807770939e-05,
"loss": 0.7834,
"step": 633
},
{
"epoch": 0.1357202108586872,
"grad_norm": 0.26193574683038773,
"learning_rate": 3.843044568617215e-05,
"loss": 0.7672,
"step": 634
},
{
"epoch": 0.13593428059190282,
"grad_norm": 0.26387639271750185,
"learning_rate": 3.842516478717492e-05,
"loss": 0.788,
"step": 635
},
{
"epoch": 0.1361483503251184,
"grad_norm": 0.29187272474834036,
"learning_rate": 3.841987538315534e-05,
"loss": 0.7655,
"step": 636
},
{
"epoch": 0.136362420058334,
"grad_norm": 0.31168977117009056,
"learning_rate": 3.8414577476555014e-05,
"loss": 0.7487,
"step": 637
},
{
"epoch": 0.1365764897915496,
"grad_norm": 0.2621584434554687,
"learning_rate": 3.840927106981943e-05,
"loss": 0.7553,
"step": 638
},
{
"epoch": 0.13679055952476518,
"grad_norm": 0.2327188756460318,
"learning_rate": 3.8403956165398016e-05,
"loss": 0.7856,
"step": 639
},
{
"epoch": 0.1370046292579808,
"grad_norm": 0.2562349108395864,
"learning_rate": 3.8398632765744127e-05,
"loss": 0.7989,
"step": 640
},
{
"epoch": 0.13721869899119638,
"grad_norm": 0.2610415362265219,
"learning_rate": 3.8393300873315035e-05,
"loss": 0.7759,
"step": 641
},
{
"epoch": 0.13743276872441199,
"grad_norm": 0.31247290427911734,
"learning_rate": 3.8387960490571935e-05,
"loss": 0.7618,
"step": 642
},
{
"epoch": 0.13764683845762757,
"grad_norm": 0.2611576449147401,
"learning_rate": 3.838261161997992e-05,
"loss": 0.7746,
"step": 643
},
{
"epoch": 0.13786090819084318,
"grad_norm": 0.2890329268622078,
"learning_rate": 3.8377254264008044e-05,
"loss": 0.7783,
"step": 644
},
{
"epoch": 0.13807497792405876,
"grad_norm": 0.27299705023008836,
"learning_rate": 3.837188842512924e-05,
"loss": 0.7665,
"step": 645
},
{
"epoch": 0.13828904765727434,
"grad_norm": 0.28975886281296026,
"learning_rate": 3.836651410582037e-05,
"loss": 0.7842,
"step": 646
},
{
"epoch": 0.13850311739048995,
"grad_norm": 0.255916883922782,
"learning_rate": 3.83611313085622e-05,
"loss": 0.7308,
"step": 647
},
{
"epoch": 0.13871718712370554,
"grad_norm": 0.2623143223523937,
"learning_rate": 3.835574003583945e-05,
"loss": 0.7612,
"step": 648
},
{
"epoch": 0.13893125685692115,
"grad_norm": 0.2908807278221207,
"learning_rate": 3.835034029014068e-05,
"loss": 0.7591,
"step": 649
},
{
"epoch": 0.13914532659013673,
"grad_norm": 0.2792529186802475,
"learning_rate": 3.834493207395843e-05,
"loss": 0.7694,
"step": 650
},
{
"epoch": 0.13935939632335234,
"grad_norm": 0.2623369924104658,
"learning_rate": 3.8339515389789115e-05,
"loss": 0.744,
"step": 651
},
{
"epoch": 0.13957346605656792,
"grad_norm": 0.2820722762027879,
"learning_rate": 3.833409024013307e-05,
"loss": 0.7556,
"step": 652
},
{
"epoch": 0.13978753578978353,
"grad_norm": 0.27711755248733916,
"learning_rate": 3.8328656627494534e-05,
"loss": 0.7709,
"step": 653
},
{
"epoch": 0.14000160552299912,
"grad_norm": 0.3811917986398551,
"learning_rate": 3.832321455438165e-05,
"loss": 0.7617,
"step": 654
},
{
"epoch": 0.1402156752562147,
"grad_norm": 0.3028791758140864,
"learning_rate": 3.8317764023306466e-05,
"loss": 0.805,
"step": 655
},
{
"epoch": 0.1404297449894303,
"grad_norm": 0.25141344995387904,
"learning_rate": 3.831230503678494e-05,
"loss": 0.7785,
"step": 656
},
{
"epoch": 0.1406438147226459,
"grad_norm": 0.26261303365336286,
"learning_rate": 3.8306837597336943e-05,
"loss": 0.7856,
"step": 657
},
{
"epoch": 0.1408578844558615,
"grad_norm": 0.2609065168861405,
"learning_rate": 3.830136170748621e-05,
"loss": 0.781,
"step": 658
},
{
"epoch": 0.1410719541890771,
"grad_norm": 0.27034827983831866,
"learning_rate": 3.8295877369760426e-05,
"loss": 0.7581,
"step": 659
},
{
"epoch": 0.1412860239222927,
"grad_norm": 0.2701528651235907,
"learning_rate": 3.829038458669113e-05,
"loss": 0.7654,
"step": 660
},
{
"epoch": 0.14150009365550828,
"grad_norm": 0.2563557451157304,
"learning_rate": 3.828488336081379e-05,
"loss": 0.7903,
"step": 661
},
{
"epoch": 0.1417141633887239,
"grad_norm": 0.2764534665072802,
"learning_rate": 3.827937369466777e-05,
"loss": 0.7526,
"step": 662
},
{
"epoch": 0.14192823312193947,
"grad_norm": 0.24673773612316036,
"learning_rate": 3.8273855590796316e-05,
"loss": 0.7556,
"step": 663
},
{
"epoch": 0.14214230285515506,
"grad_norm": 0.24728120211043053,
"learning_rate": 3.8268329051746564e-05,
"loss": 0.7916,
"step": 664
},
{
"epoch": 0.14235637258837067,
"grad_norm": 0.28711329804957375,
"learning_rate": 3.826279408006957e-05,
"loss": 0.7699,
"step": 665
},
{
"epoch": 0.14257044232158625,
"grad_norm": 0.6682267334163176,
"learning_rate": 3.8257250678320254e-05,
"loss": 0.7595,
"step": 666
},
{
"epoch": 0.14278451205480186,
"grad_norm": 0.29826111975164143,
"learning_rate": 3.825169884905745e-05,
"loss": 0.789,
"step": 667
},
{
"epoch": 0.14299858178801744,
"grad_norm": 0.26844129791093646,
"learning_rate": 3.8246138594843866e-05,
"loss": 0.7858,
"step": 668
},
{
"epoch": 0.14321265152123305,
"grad_norm": 0.25460203600355735,
"learning_rate": 3.824056991824611e-05,
"loss": 0.7768,
"step": 669
},
{
"epoch": 0.14342672125444864,
"grad_norm": 0.25700886657543587,
"learning_rate": 3.823499282183467e-05,
"loss": 0.7903,
"step": 670
},
{
"epoch": 0.14364079098766422,
"grad_norm": 0.27604014778042324,
"learning_rate": 3.822940730818392e-05,
"loss": 0.7751,
"step": 671
},
{
"epoch": 0.14385486072087983,
"grad_norm": 0.2545147079115621,
"learning_rate": 3.822381337987213e-05,
"loss": 0.7561,
"step": 672
},
{
"epoch": 0.1440689304540954,
"grad_norm": 0.23984722698315658,
"learning_rate": 3.821821103948145e-05,
"loss": 0.7623,
"step": 673
},
{
"epoch": 0.14428300018731102,
"grad_norm": 0.867169222204874,
"learning_rate": 3.821260028959789e-05,
"loss": 0.7989,
"step": 674
},
{
"epoch": 0.1444970699205266,
"grad_norm": 0.25654537168602387,
"learning_rate": 3.820698113281139e-05,
"loss": 0.78,
"step": 675
},
{
"epoch": 0.14471113965374222,
"grad_norm": 3.2058329497567497,
"learning_rate": 3.8201353571715724e-05,
"loss": 0.7841,
"step": 676
},
{
"epoch": 0.1449252093869578,
"grad_norm": 0.3697316952651418,
"learning_rate": 3.8195717608908564e-05,
"loss": 0.7375,
"step": 677
},
{
"epoch": 0.1451392791201734,
"grad_norm": 0.5280543469774536,
"learning_rate": 3.8190073246991465e-05,
"loss": 0.7873,
"step": 678
},
{
"epoch": 0.145353348853389,
"grad_norm": 0.465557351184121,
"learning_rate": 3.818442048856986e-05,
"loss": 0.7555,
"step": 679
},
{
"epoch": 0.14556741858660457,
"grad_norm": 0.4186516823478108,
"learning_rate": 3.8178759336253034e-05,
"loss": 0.7612,
"step": 680
},
{
"epoch": 0.14578148831982018,
"grad_norm": 0.3626499032628553,
"learning_rate": 3.817308979265418e-05,
"loss": 0.7677,
"step": 681
},
{
"epoch": 0.14599555805303577,
"grad_norm": 0.4557368296382944,
"learning_rate": 3.816741186039035e-05,
"loss": 0.803,
"step": 682
},
{
"epoch": 0.14620962778625138,
"grad_norm": 0.36389305704256725,
"learning_rate": 3.8161725542082464e-05,
"loss": 0.7697,
"step": 683
},
{
"epoch": 0.14642369751946696,
"grad_norm": 0.40408928941984346,
"learning_rate": 3.8156030840355306e-05,
"loss": 0.7939,
"step": 684
},
{
"epoch": 0.14663776725268257,
"grad_norm": 0.34759856848217446,
"learning_rate": 3.815032775783755e-05,
"loss": 0.7569,
"step": 685
},
{
"epoch": 0.14685183698589815,
"grad_norm": 0.3213468951285394,
"learning_rate": 3.814461629716173e-05,
"loss": 0.789,
"step": 686
},
{
"epoch": 0.14706590671911376,
"grad_norm": 0.28784011586337543,
"learning_rate": 3.813889646096424e-05,
"loss": 0.7645,
"step": 687
},
{
"epoch": 0.14727997645232935,
"grad_norm": 0.34078877386979906,
"learning_rate": 3.8133168251885354e-05,
"loss": 0.7932,
"step": 688
},
{
"epoch": 0.14749404618554493,
"grad_norm": 0.303131014259725,
"learning_rate": 3.8127431672569187e-05,
"loss": 0.7682,
"step": 689
},
{
"epoch": 0.14770811591876054,
"grad_norm": 0.2684582694020625,
"learning_rate": 3.8121686725663744e-05,
"loss": 0.7852,
"step": 690
},
{
"epoch": 0.14792218565197612,
"grad_norm": 0.2837038618704381,
"learning_rate": 3.811593341382088e-05,
"loss": 0.7673,
"step": 691
},
{
"epoch": 0.14813625538519173,
"grad_norm": 0.2844560648737273,
"learning_rate": 3.811017173969632e-05,
"loss": 0.7982,
"step": 692
},
{
"epoch": 0.14835032511840732,
"grad_norm": 0.3119909822976725,
"learning_rate": 3.810440170594964e-05,
"loss": 0.7565,
"step": 693
},
{
"epoch": 0.14856439485162293,
"grad_norm": 0.2690330292340514,
"learning_rate": 3.8098623315244275e-05,
"loss": 0.7689,
"step": 694
},
{
"epoch": 0.1487784645848385,
"grad_norm": 0.2783152561701639,
"learning_rate": 3.809283657024751e-05,
"loss": 0.7855,
"step": 695
},
{
"epoch": 0.1489925343180541,
"grad_norm": 0.2765407147198869,
"learning_rate": 3.8087041473630516e-05,
"loss": 0.7149,
"step": 696
},
{
"epoch": 0.1492066040512697,
"grad_norm": 0.262225712133104,
"learning_rate": 3.8081238028068274e-05,
"loss": 0.7828,
"step": 697
},
{
"epoch": 0.1494206737844853,
"grad_norm": 0.2705245628148132,
"learning_rate": 3.807542623623967e-05,
"loss": 0.7608,
"step": 698
},
{
"epoch": 0.1496347435177009,
"grad_norm": 0.28967389099332874,
"learning_rate": 3.8069606100827396e-05,
"loss": 0.7707,
"step": 699
},
{
"epoch": 0.14984881325091648,
"grad_norm": 0.27952000191705045,
"learning_rate": 3.8063777624518026e-05,
"loss": 0.7777,
"step": 700
},
{
"epoch": 0.1500628829841321,
"grad_norm": 0.5827073160411023,
"learning_rate": 3.805794081000197e-05,
"loss": 0.7687,
"step": 701
},
{
"epoch": 0.15027695271734767,
"grad_norm": 0.27978095776641804,
"learning_rate": 3.8052095659973494e-05,
"loss": 0.7673,
"step": 702
},
{
"epoch": 0.15049102245056328,
"grad_norm": 0.24177725367845768,
"learning_rate": 3.8046242177130707e-05,
"loss": 0.7961,
"step": 703
},
{
"epoch": 0.15070509218377887,
"grad_norm": 0.2719754688841993,
"learning_rate": 3.8040380364175556e-05,
"loss": 0.7848,
"step": 704
},
{
"epoch": 0.15091916191699445,
"grad_norm": 0.28065407377805734,
"learning_rate": 3.8034510223813864e-05,
"loss": 0.7799,
"step": 705
},
{
"epoch": 0.15113323165021006,
"grad_norm": 0.2653947429929618,
"learning_rate": 3.8028631758755264e-05,
"loss": 0.7605,
"step": 706
},
{
"epoch": 0.15134730138342564,
"grad_norm": 0.26084282961218674,
"learning_rate": 3.802274497171325e-05,
"loss": 0.7776,
"step": 707
},
{
"epoch": 0.15156137111664125,
"grad_norm": 0.25782589841291353,
"learning_rate": 3.8016849865405145e-05,
"loss": 0.7772,
"step": 708
},
{
"epoch": 0.15177544084985684,
"grad_norm": 0.28312640843468234,
"learning_rate": 3.801094644255213e-05,
"loss": 0.7663,
"step": 709
},
{
"epoch": 0.15198951058307245,
"grad_norm": 0.27255746129312775,
"learning_rate": 3.80050347058792e-05,
"loss": 0.7677,
"step": 710
},
{
"epoch": 0.15220358031628803,
"grad_norm": 0.24608802340719124,
"learning_rate": 3.799911465811521e-05,
"loss": 0.7639,
"step": 711
},
{
"epoch": 0.15241765004950364,
"grad_norm": 0.2646709801661402,
"learning_rate": 3.799318630199284e-05,
"loss": 0.7451,
"step": 712
},
{
"epoch": 0.15263171978271922,
"grad_norm": 0.26130611058084147,
"learning_rate": 3.798724964024862e-05,
"loss": 0.775,
"step": 713
},
{
"epoch": 0.1528457895159348,
"grad_norm": 0.25417406786538366,
"learning_rate": 3.798130467562288e-05,
"loss": 0.7661,
"step": 714
},
{
"epoch": 0.15305985924915042,
"grad_norm": 0.2860437453192641,
"learning_rate": 3.797535141085983e-05,
"loss": 0.7742,
"step": 715
},
{
"epoch": 0.153273928982366,
"grad_norm": 0.2915108791493047,
"learning_rate": 3.796938984870747e-05,
"loss": 0.7698,
"step": 716
},
{
"epoch": 0.1534879987155816,
"grad_norm": 0.4095923416537272,
"learning_rate": 3.796341999191765e-05,
"loss": 0.7801,
"step": 717
},
{
"epoch": 0.1537020684487972,
"grad_norm": 0.262364819657621,
"learning_rate": 3.795744184324604e-05,
"loss": 0.7525,
"step": 718
},
{
"epoch": 0.1539161381820128,
"grad_norm": 0.28695421488433925,
"learning_rate": 3.7951455405452155e-05,
"loss": 0.776,
"step": 719
},
{
"epoch": 0.15413020791522838,
"grad_norm": 0.3137504716311156,
"learning_rate": 3.794546068129931e-05,
"loss": 0.7682,
"step": 720
},
{
"epoch": 0.15434427764844397,
"grad_norm": 0.3260775023769069,
"learning_rate": 3.793945767355467e-05,
"loss": 0.76,
"step": 721
},
{
"epoch": 0.15455834738165958,
"grad_norm": 0.3143029699594306,
"learning_rate": 3.7933446384989205e-05,
"loss": 0.728,
"step": 722
},
{
"epoch": 0.15477241711487516,
"grad_norm": 0.3432677260109923,
"learning_rate": 3.792742681837772e-05,
"loss": 0.7451,
"step": 723
},
{
"epoch": 0.15498648684809077,
"grad_norm": 0.34307535233147257,
"learning_rate": 3.792139897649883e-05,
"loss": 0.7683,
"step": 724
},
{
"epoch": 0.15520055658130635,
"grad_norm": 0.26039722324676695,
"learning_rate": 3.791536286213498e-05,
"loss": 0.7588,
"step": 725
},
{
"epoch": 0.15541462631452196,
"grad_norm": 0.3180245927328958,
"learning_rate": 3.790931847807243e-05,
"loss": 0.7579,
"step": 726
},
{
"epoch": 0.15562869604773755,
"grad_norm": 0.3656361806954744,
"learning_rate": 3.790326582710125e-05,
"loss": 0.7689,
"step": 727
},
{
"epoch": 0.15584276578095316,
"grad_norm": 0.34451469974838755,
"learning_rate": 3.789720491201534e-05,
"loss": 0.7482,
"step": 728
},
{
"epoch": 0.15605683551416874,
"grad_norm": 0.29215948365756916,
"learning_rate": 3.789113573561241e-05,
"loss": 0.7763,
"step": 729
},
{
"epoch": 0.15627090524738432,
"grad_norm": 0.25004022512355967,
"learning_rate": 3.7885058300693965e-05,
"loss": 0.7807,
"step": 730
},
{
"epoch": 0.15648497498059993,
"grad_norm": 0.3220605418818703,
"learning_rate": 3.7878972610065354e-05,
"loss": 0.8252,
"step": 731
},
{
"epoch": 0.15669904471381552,
"grad_norm": 0.30932527158683015,
"learning_rate": 3.7872878666535716e-05,
"loss": 0.7371,
"step": 732
},
{
"epoch": 0.15691311444703113,
"grad_norm": 0.25967906503546573,
"learning_rate": 3.7866776472918e-05,
"loss": 0.7797,
"step": 733
},
{
"epoch": 0.1571271841802467,
"grad_norm": 0.2551191958568535,
"learning_rate": 3.7860666032028974e-05,
"loss": 0.746,
"step": 734
},
{
"epoch": 0.15734125391346232,
"grad_norm": 0.2651953004886809,
"learning_rate": 3.78545473466892e-05,
"loss": 0.7725,
"step": 735
},
{
"epoch": 0.1575553236466779,
"grad_norm": 0.2766358558914324,
"learning_rate": 3.784842041972305e-05,
"loss": 0.7683,
"step": 736
},
{
"epoch": 0.1577693933798935,
"grad_norm": 0.2735619396043064,
"learning_rate": 3.784228525395872e-05,
"loss": 0.7533,
"step": 737
},
{
"epoch": 0.1579834631131091,
"grad_norm": 0.253304508339416,
"learning_rate": 3.783614185222817e-05,
"loss": 0.7608,
"step": 738
},
{
"epoch": 0.15819753284632468,
"grad_norm": 0.247978568649734,
"learning_rate": 3.7829990217367195e-05,
"loss": 0.7703,
"step": 739
},
{
"epoch": 0.1584116025795403,
"grad_norm": 0.21606299875376397,
"learning_rate": 3.782383035221537e-05,
"loss": 0.7611,
"step": 740
},
{
"epoch": 0.15862567231275587,
"grad_norm": 0.2587787141580186,
"learning_rate": 3.7817662259616084e-05,
"loss": 0.7562,
"step": 741
},
{
"epoch": 0.15883974204597148,
"grad_norm": 0.25957712222095314,
"learning_rate": 3.7811485942416515e-05,
"loss": 0.7725,
"step": 742
},
{
"epoch": 0.15905381177918707,
"grad_norm": 0.23629133910620595,
"learning_rate": 3.780530140346764e-05,
"loss": 0.7791,
"step": 743
},
{
"epoch": 0.15926788151240268,
"grad_norm": 0.2768203978908302,
"learning_rate": 3.779910864562424e-05,
"loss": 0.747,
"step": 744
},
{
"epoch": 0.15948195124561826,
"grad_norm": 0.259458847699784,
"learning_rate": 3.779290767174486e-05,
"loss": 0.772,
"step": 745
},
{
"epoch": 0.15969602097883384,
"grad_norm": 0.26038873795570955,
"learning_rate": 3.778669848469187e-05,
"loss": 0.7919,
"step": 746
},
{
"epoch": 0.15991009071204945,
"grad_norm": 0.25802087574967486,
"learning_rate": 3.778048108733143e-05,
"loss": 0.7675,
"step": 747
},
{
"epoch": 0.16012416044526503,
"grad_norm": 0.263636350635592,
"learning_rate": 3.777425548253346e-05,
"loss": 0.7389,
"step": 748
},
{
"epoch": 0.16033823017848065,
"grad_norm": 0.27426089074668625,
"learning_rate": 3.77680216731717e-05,
"loss": 0.7807,
"step": 749
},
{
"epoch": 0.16055229991169623,
"grad_norm": 0.27602553631119603,
"learning_rate": 3.776177966212366e-05,
"loss": 0.7399,
"step": 750
},
{
"epoch": 0.16076636964491184,
"grad_norm": 0.27755950091141984,
"learning_rate": 3.775552945227064e-05,
"loss": 0.7958,
"step": 751
},
{
"epoch": 0.16098043937812742,
"grad_norm": 0.2712475858518582,
"learning_rate": 3.774927104649773e-05,
"loss": 0.7511,
"step": 752
},
{
"epoch": 0.16119450911134303,
"grad_norm": 0.27952538499117413,
"learning_rate": 3.7743004447693794e-05,
"loss": 0.7607,
"step": 753
},
{
"epoch": 0.16140857884455861,
"grad_norm": 0.309206124797287,
"learning_rate": 3.773672965875148e-05,
"loss": 0.7811,
"step": 754
},
{
"epoch": 0.1616226485777742,
"grad_norm": 0.28451036271115715,
"learning_rate": 3.7730446682567236e-05,
"loss": 0.7821,
"step": 755
},
{
"epoch": 0.1618367183109898,
"grad_norm": 0.2825866984308074,
"learning_rate": 3.7724155522041256e-05,
"loss": 0.7633,
"step": 756
},
{
"epoch": 0.1620507880442054,
"grad_norm": 0.3074401298667024,
"learning_rate": 3.771785618007753e-05,
"loss": 0.7747,
"step": 757
},
{
"epoch": 0.162264857777421,
"grad_norm": 0.4009164697772463,
"learning_rate": 3.771154865958383e-05,
"loss": 0.7744,
"step": 758
},
{
"epoch": 0.16247892751063658,
"grad_norm": 0.3245826957847806,
"learning_rate": 3.770523296347168e-05,
"loss": 0.7595,
"step": 759
},
{
"epoch": 0.1626929972438522,
"grad_norm": 0.2652441909792531,
"learning_rate": 3.769890909465642e-05,
"loss": 0.7741,
"step": 760
},
{
"epoch": 0.16290706697706778,
"grad_norm": 0.3165367792575318,
"learning_rate": 3.769257705605711e-05,
"loss": 0.784,
"step": 761
},
{
"epoch": 0.1631211367102834,
"grad_norm": 0.37822687851375253,
"learning_rate": 3.768623685059662e-05,
"loss": 0.767,
"step": 762
},
{
"epoch": 0.16333520644349897,
"grad_norm": 0.3575859732243416,
"learning_rate": 3.767988848120158e-05,
"loss": 0.7734,
"step": 763
},
{
"epoch": 0.16354927617671455,
"grad_norm": 0.24818763877214692,
"learning_rate": 3.7673531950802373e-05,
"loss": 0.8094,
"step": 764
},
{
"epoch": 0.16376334590993016,
"grad_norm": 0.28354400368932975,
"learning_rate": 3.766716726233318e-05,
"loss": 0.7576,
"step": 765
},
{
"epoch": 0.16397741564314575,
"grad_norm": 0.39658861942795515,
"learning_rate": 3.766079441873192e-05,
"loss": 0.7668,
"step": 766
},
{
"epoch": 0.16419148537636136,
"grad_norm": 0.37765885576146235,
"learning_rate": 3.765441342294028e-05,
"loss": 0.8061,
"step": 767
},
{
"epoch": 0.16440555510957694,
"grad_norm": 0.5915025678108166,
"learning_rate": 3.764802427790372e-05,
"loss": 0.759,
"step": 768
},
{
"epoch": 0.16461962484279255,
"grad_norm": 0.27850811874246983,
"learning_rate": 3.764162698657147e-05,
"loss": 0.7699,
"step": 769
},
{
"epoch": 0.16483369457600813,
"grad_norm": 0.30960138552127586,
"learning_rate": 3.763522155189648e-05,
"loss": 0.8017,
"step": 770
},
{
"epoch": 0.16504776430922374,
"grad_norm": 0.40696548208878003,
"learning_rate": 3.7628807976835516e-05,
"loss": 0.7622,
"step": 771
},
{
"epoch": 0.16526183404243933,
"grad_norm": 0.2886451062321135,
"learning_rate": 3.762238626434906e-05,
"loss": 0.7763,
"step": 772
},
{
"epoch": 0.1654759037756549,
"grad_norm": 0.2592910622465004,
"learning_rate": 3.7615956417401357e-05,
"loss": 0.7401,
"step": 773
},
{
"epoch": 0.16568997350887052,
"grad_norm": 0.2896541504226147,
"learning_rate": 3.760951843896043e-05,
"loss": 0.7524,
"step": 774
},
{
"epoch": 0.1659040432420861,
"grad_norm": 0.28207888316976604,
"learning_rate": 3.7603072331998015e-05,
"loss": 0.8057,
"step": 775
},
{
"epoch": 0.1661181129753017,
"grad_norm": 0.27093980342361823,
"learning_rate": 3.7596618099489645e-05,
"loss": 0.7722,
"step": 776
},
{
"epoch": 0.1663321827085173,
"grad_norm": 0.24124228740960998,
"learning_rate": 3.759015574441456e-05,
"loss": 0.766,
"step": 777
},
{
"epoch": 0.1665462524417329,
"grad_norm": 0.2633646596925862,
"learning_rate": 3.75836852697558e-05,
"loss": 0.7534,
"step": 778
},
{
"epoch": 0.1667603221749485,
"grad_norm": 0.2912341873523871,
"learning_rate": 3.7577206678500096e-05,
"loss": 0.7741,
"step": 779
},
{
"epoch": 0.16697439190816407,
"grad_norm": 0.2687135282669201,
"learning_rate": 3.757071997363797e-05,
"loss": 0.7641,
"step": 780
},
{
"epoch": 0.16718846164137968,
"grad_norm": 0.23712231013906176,
"learning_rate": 3.756422515816367e-05,
"loss": 0.7386,
"step": 781
},
{
"epoch": 0.16740253137459526,
"grad_norm": 0.2783664207712955,
"learning_rate": 3.7557722235075185e-05,
"loss": 0.7641,
"step": 782
},
{
"epoch": 0.16761660110781088,
"grad_norm": 0.25178668122834624,
"learning_rate": 3.7551211207374256e-05,
"loss": 0.7674,
"step": 783
},
{
"epoch": 0.16783067084102646,
"grad_norm": 0.24464602048954365,
"learning_rate": 3.754469207806636e-05,
"loss": 0.7471,
"step": 784
},
{
"epoch": 0.16804474057424207,
"grad_norm": 0.25491223478402747,
"learning_rate": 3.753816485016073e-05,
"loss": 0.782,
"step": 785
},
{
"epoch": 0.16825881030745765,
"grad_norm": 0.24290481349085874,
"learning_rate": 3.7531629526670305e-05,
"loss": 0.7449,
"step": 786
},
{
"epoch": 0.16847288004067326,
"grad_norm": 0.25907191218936754,
"learning_rate": 3.7525086110611775e-05,
"loss": 0.7425,
"step": 787
},
{
"epoch": 0.16868694977388884,
"grad_norm": 0.27513812843783825,
"learning_rate": 3.751853460500559e-05,
"loss": 0.7472,
"step": 788
},
{
"epoch": 0.16890101950710443,
"grad_norm": 0.26492744335289636,
"learning_rate": 3.751197501287589e-05,
"loss": 0.7498,
"step": 789
},
{
"epoch": 0.16911508924032004,
"grad_norm": 0.26303828189678313,
"learning_rate": 3.750540733725059e-05,
"loss": 0.7315,
"step": 790
},
{
"epoch": 0.16932915897353562,
"grad_norm": 0.2214058090704647,
"learning_rate": 3.74988315811613e-05,
"loss": 0.7383,
"step": 791
},
{
"epoch": 0.16954322870675123,
"grad_norm": 0.30065919368053895,
"learning_rate": 3.749224774764339e-05,
"loss": 0.7745,
"step": 792
},
{
"epoch": 0.16975729843996681,
"grad_norm": 0.27094054827229336,
"learning_rate": 3.748565583973594e-05,
"loss": 0.7352,
"step": 793
},
{
"epoch": 0.16997136817318242,
"grad_norm": 0.2542348275920184,
"learning_rate": 3.747905586048176e-05,
"loss": 0.7535,
"step": 794
},
{
"epoch": 0.170185437906398,
"grad_norm": 0.24558353992266338,
"learning_rate": 3.7472447812927395e-05,
"loss": 0.7327,
"step": 795
},
{
"epoch": 0.17039950763961362,
"grad_norm": 0.2351506672864148,
"learning_rate": 3.74658317001231e-05,
"loss": 0.7715,
"step": 796
},
{
"epoch": 0.1706135773728292,
"grad_norm": 0.2758704299920849,
"learning_rate": 3.745920752512287e-05,
"loss": 0.7744,
"step": 797
},
{
"epoch": 0.17082764710604478,
"grad_norm": 0.28828204667769974,
"learning_rate": 3.7452575290984406e-05,
"loss": 0.7693,
"step": 798
},
{
"epoch": 0.1710417168392604,
"grad_norm": 0.2955914891504317,
"learning_rate": 3.744593500076913e-05,
"loss": 0.7772,
"step": 799
},
{
"epoch": 0.17125578657247598,
"grad_norm": 0.2737615699555731,
"learning_rate": 3.74392866575422e-05,
"loss": 0.7657,
"step": 800
},
{
"epoch": 0.1714698563056916,
"grad_norm": 0.26983392116272886,
"learning_rate": 3.743263026437247e-05,
"loss": 0.7412,
"step": 801
},
{
"epoch": 0.17168392603890717,
"grad_norm": 0.29284983297353145,
"learning_rate": 3.742596582433252e-05,
"loss": 0.7595,
"step": 802
},
{
"epoch": 0.17189799577212278,
"grad_norm": 0.26663836921476725,
"learning_rate": 3.741929334049864e-05,
"loss": 0.7386,
"step": 803
},
{
"epoch": 0.17211206550533836,
"grad_norm": 0.2850204051932672,
"learning_rate": 3.741261281595086e-05,
"loss": 0.7635,
"step": 804
},
{
"epoch": 0.17232613523855395,
"grad_norm": 0.27818601095540874,
"learning_rate": 3.740592425377286e-05,
"loss": 0.7637,
"step": 805
},
{
"epoch": 0.17254020497176956,
"grad_norm": 0.24672279606204295,
"learning_rate": 3.73992276570521e-05,
"loss": 0.7389,
"step": 806
},
{
"epoch": 0.17275427470498514,
"grad_norm": 0.25423393775313885,
"learning_rate": 3.73925230288797e-05,
"loss": 0.7545,
"step": 807
},
{
"epoch": 0.17296834443820075,
"grad_norm": 0.3097737609761657,
"learning_rate": 3.73858103723505e-05,
"loss": 0.7623,
"step": 808
},
{
"epoch": 0.17318241417141633,
"grad_norm": 0.2914051914001749,
"learning_rate": 3.7379089690563064e-05,
"loss": 0.7292,
"step": 809
},
{
"epoch": 0.17339648390463194,
"grad_norm": 0.2569206855026681,
"learning_rate": 3.7372360986619646e-05,
"loss": 0.7872,
"step": 810
},
{
"epoch": 0.17361055363784753,
"grad_norm": 0.2469593360745501,
"learning_rate": 3.73656242636262e-05,
"loss": 0.7776,
"step": 811
},
{
"epoch": 0.17382462337106314,
"grad_norm": 0.24572005389612414,
"learning_rate": 3.735887952469237e-05,
"loss": 0.7504,
"step": 812
},
{
"epoch": 0.17403869310427872,
"grad_norm": 0.2458790506465563,
"learning_rate": 3.735212677293153e-05,
"loss": 0.7499,
"step": 813
},
{
"epoch": 0.1742527628374943,
"grad_norm": 0.23899682983295484,
"learning_rate": 3.7345366011460746e-05,
"loss": 0.7511,
"step": 814
},
{
"epoch": 0.1744668325707099,
"grad_norm": 0.3699231564097912,
"learning_rate": 3.733859724340076e-05,
"loss": 0.7564,
"step": 815
},
{
"epoch": 0.1746809023039255,
"grad_norm": 0.2875645194686169,
"learning_rate": 3.733182047187602e-05,
"loss": 0.782,
"step": 816
},
{
"epoch": 0.1748949720371411,
"grad_norm": 0.2627392316870406,
"learning_rate": 3.732503570001468e-05,
"loss": 0.7841,
"step": 817
},
{
"epoch": 0.1751090417703567,
"grad_norm": 0.23716566552296248,
"learning_rate": 3.7318242930948575e-05,
"loss": 0.755,
"step": 818
},
{
"epoch": 0.1753231115035723,
"grad_norm": 0.28114635821103856,
"learning_rate": 3.731144216781324e-05,
"loss": 0.8051,
"step": 819
},
{
"epoch": 0.17553718123678788,
"grad_norm": 0.3054026391376041,
"learning_rate": 3.7304633413747885e-05,
"loss": 0.7843,
"step": 820
},
{
"epoch": 0.1757512509700035,
"grad_norm": 0.25992981601725024,
"learning_rate": 3.7297816671895425e-05,
"loss": 0.747,
"step": 821
},
{
"epoch": 0.17596532070321907,
"grad_norm": 0.23811026243010816,
"learning_rate": 3.7290991945402456e-05,
"loss": 0.7748,
"step": 822
},
{
"epoch": 0.17617939043643466,
"grad_norm": 0.2806740728248504,
"learning_rate": 3.7284159237419255e-05,
"loss": 0.7625,
"step": 823
},
{
"epoch": 0.17639346016965027,
"grad_norm": 0.2676841422875386,
"learning_rate": 3.727731855109979e-05,
"loss": 0.7743,
"step": 824
},
{
"epoch": 0.17660752990286585,
"grad_norm": 0.23284766500209506,
"learning_rate": 3.7270469889601716e-05,
"loss": 0.7365,
"step": 825
},
{
"epoch": 0.17682159963608146,
"grad_norm": 0.2692311864758718,
"learning_rate": 3.7263613256086346e-05,
"loss": 0.753,
"step": 826
},
{
"epoch": 0.17703566936929704,
"grad_norm": 0.2735512134769299,
"learning_rate": 3.72567486537187e-05,
"loss": 0.7305,
"step": 827
},
{
"epoch": 0.17724973910251265,
"grad_norm": 0.26244627961838785,
"learning_rate": 3.7249876085667474e-05,
"loss": 0.7603,
"step": 828
},
{
"epoch": 0.17746380883572824,
"grad_norm": 0.22792906030022086,
"learning_rate": 3.7242995555105016e-05,
"loss": 0.7482,
"step": 829
},
{
"epoch": 0.17767787856894382,
"grad_norm": 0.2530232897181774,
"learning_rate": 3.723610706520738e-05,
"loss": 0.7588,
"step": 830
},
{
"epoch": 0.17789194830215943,
"grad_norm": 0.2426994810142526,
"learning_rate": 3.722921061915427e-05,
"loss": 0.7429,
"step": 831
},
{
"epoch": 0.178106018035375,
"grad_norm": 0.2891343066654247,
"learning_rate": 3.722230622012908e-05,
"loss": 0.7669,
"step": 832
},
{
"epoch": 0.17832008776859062,
"grad_norm": 0.2659092479939417,
"learning_rate": 3.721539387131886e-05,
"loss": 0.7449,
"step": 833
},
{
"epoch": 0.1785341575018062,
"grad_norm": 0.299275820855458,
"learning_rate": 3.720847357591435e-05,
"loss": 0.7485,
"step": 834
},
{
"epoch": 0.17874822723502182,
"grad_norm": 0.31678348023318653,
"learning_rate": 3.720154533710994e-05,
"loss": 0.8065,
"step": 835
},
{
"epoch": 0.1789622969682374,
"grad_norm": 0.274238580197539,
"learning_rate": 3.719460915810368e-05,
"loss": 0.7499,
"step": 836
},
{
"epoch": 0.179176366701453,
"grad_norm": 0.2645299103700658,
"learning_rate": 3.718766504209732e-05,
"loss": 0.748,
"step": 837
},
{
"epoch": 0.1793904364346686,
"grad_norm": 0.33810732073784494,
"learning_rate": 3.718071299229624e-05,
"loss": 0.749,
"step": 838
},
{
"epoch": 0.17960450616788418,
"grad_norm": 0.2707034275854758,
"learning_rate": 3.7173753011909484e-05,
"loss": 0.7428,
"step": 839
},
{
"epoch": 0.1798185759010998,
"grad_norm": 0.278007294454195,
"learning_rate": 3.716678510414978e-05,
"loss": 0.7931,
"step": 840
},
{
"epoch": 0.18003264563431537,
"grad_norm": 0.2873841867788181,
"learning_rate": 3.7159809272233503e-05,
"loss": 0.7483,
"step": 841
},
{
"epoch": 0.18024671536753098,
"grad_norm": 0.25165887340624554,
"learning_rate": 3.715282551938067e-05,
"loss": 0.7667,
"step": 842
},
{
"epoch": 0.18046078510074656,
"grad_norm": 0.23882700215230773,
"learning_rate": 3.714583384881498e-05,
"loss": 0.7666,
"step": 843
},
{
"epoch": 0.18067485483396217,
"grad_norm": 0.3095243256273488,
"learning_rate": 3.713883426376377e-05,
"loss": 0.773,
"step": 844
},
{
"epoch": 0.18088892456717776,
"grad_norm": 0.24101944011500306,
"learning_rate": 3.713182676745804e-05,
"loss": 0.7478,
"step": 845
},
{
"epoch": 0.18110299430039337,
"grad_norm": 0.2609131242953271,
"learning_rate": 3.7124811363132434e-05,
"loss": 0.7338,
"step": 846
},
{
"epoch": 0.18131706403360895,
"grad_norm": 0.23794528277671756,
"learning_rate": 3.711778805402525e-05,
"loss": 0.7341,
"step": 847
},
{
"epoch": 0.18153113376682453,
"grad_norm": 0.2563717398283204,
"learning_rate": 3.711075684337844e-05,
"loss": 0.794,
"step": 848
},
{
"epoch": 0.18174520350004014,
"grad_norm": 0.24247069564237045,
"learning_rate": 3.710371773443759e-05,
"loss": 0.746,
"step": 849
},
{
"epoch": 0.18195927323325572,
"grad_norm": 0.26129577521893677,
"learning_rate": 3.7096670730451945e-05,
"loss": 0.7789,
"step": 850
},
{
"epoch": 0.18217334296647134,
"grad_norm": 0.2316369539031604,
"learning_rate": 3.708961583467438e-05,
"loss": 0.7647,
"step": 851
},
{
"epoch": 0.18238741269968692,
"grad_norm": 0.2421032365965948,
"learning_rate": 3.708255305036144e-05,
"loss": 0.7452,
"step": 852
},
{
"epoch": 0.18260148243290253,
"grad_norm": 0.24479496782932084,
"learning_rate": 3.707548238077328e-05,
"loss": 0.7607,
"step": 853
},
{
"epoch": 0.1828155521661181,
"grad_norm": 0.26513015104086035,
"learning_rate": 3.7068403829173705e-05,
"loss": 0.7811,
"step": 854
},
{
"epoch": 0.1830296218993337,
"grad_norm": 0.22550303237471955,
"learning_rate": 3.7061317398830176e-05,
"loss": 0.7651,
"step": 855
},
{
"epoch": 0.1832436916325493,
"grad_norm": 0.24071132285626767,
"learning_rate": 3.705422309301376e-05,
"loss": 0.7447,
"step": 856
},
{
"epoch": 0.1834577613657649,
"grad_norm": 0.24485261299010438,
"learning_rate": 3.704712091499919e-05,
"loss": 0.7489,
"step": 857
},
{
"epoch": 0.1836718310989805,
"grad_norm": 0.21599402692306946,
"learning_rate": 3.7040010868064814e-05,
"loss": 0.7692,
"step": 858
},
{
"epoch": 0.18388590083219608,
"grad_norm": 0.3574968477206158,
"learning_rate": 3.703289295549261e-05,
"loss": 0.7802,
"step": 859
},
{
"epoch": 0.1840999705654117,
"grad_norm": 0.2416963853432307,
"learning_rate": 3.702576718056819e-05,
"loss": 0.751,
"step": 860
},
{
"epoch": 0.18431404029862727,
"grad_norm": 0.263650749855753,
"learning_rate": 3.7018633546580815e-05,
"loss": 0.7514,
"step": 861
},
{
"epoch": 0.18452811003184288,
"grad_norm": 0.2416012709858466,
"learning_rate": 3.701149205682335e-05,
"loss": 0.7518,
"step": 862
},
{
"epoch": 0.18474217976505847,
"grad_norm": 0.2637897868977282,
"learning_rate": 3.700434271459229e-05,
"loss": 0.7673,
"step": 863
},
{
"epoch": 0.18495624949827405,
"grad_norm": 0.2591986973480914,
"learning_rate": 3.699718552318776e-05,
"loss": 0.758,
"step": 864
},
{
"epoch": 0.18517031923148966,
"grad_norm": 0.28373412976738566,
"learning_rate": 3.69900204859135e-05,
"loss": 0.7556,
"step": 865
},
{
"epoch": 0.18538438896470524,
"grad_norm": 0.26644833684543134,
"learning_rate": 3.698284760607689e-05,
"loss": 0.733,
"step": 866
},
{
"epoch": 0.18559845869792085,
"grad_norm": 0.2557223244454845,
"learning_rate": 3.697566688698892e-05,
"loss": 0.7916,
"step": 867
},
{
"epoch": 0.18581252843113644,
"grad_norm": 0.25936331823578024,
"learning_rate": 3.696847833196419e-05,
"loss": 0.7466,
"step": 868
},
{
"epoch": 0.18602659816435205,
"grad_norm": 0.24833805790191535,
"learning_rate": 3.696128194432092e-05,
"loss": 0.7475,
"step": 869
},
{
"epoch": 0.18624066789756763,
"grad_norm": 0.24301701958171398,
"learning_rate": 3.695407772738095e-05,
"loss": 0.75,
"step": 870
},
{
"epoch": 0.18645473763078324,
"grad_norm": 0.2547341313806687,
"learning_rate": 3.6946865684469735e-05,
"loss": 0.7487,
"step": 871
},
{
"epoch": 0.18666880736399882,
"grad_norm": 0.3514402089251288,
"learning_rate": 3.693964581891635e-05,
"loss": 0.7556,
"step": 872
},
{
"epoch": 0.1868828770972144,
"grad_norm": 0.26073534877205573,
"learning_rate": 3.693241813405346e-05,
"loss": 0.7769,
"step": 873
},
{
"epoch": 0.18709694683043002,
"grad_norm": 0.2531253720240625,
"learning_rate": 3.692518263321736e-05,
"loss": 0.7515,
"step": 874
},
{
"epoch": 0.1873110165636456,
"grad_norm": 0.253007191898049,
"learning_rate": 3.691793931974793e-05,
"loss": 0.762,
"step": 875
},
{
"epoch": 0.1875250862968612,
"grad_norm": 0.23301058060597665,
"learning_rate": 3.6910688196988685e-05,
"loss": 0.7485,
"step": 876
},
{
"epoch": 0.1877391560300768,
"grad_norm": 0.2451215601454093,
"learning_rate": 3.690342926828673e-05,
"loss": 0.758,
"step": 877
},
{
"epoch": 0.1879532257632924,
"grad_norm": 0.2540061937078841,
"learning_rate": 3.689616253699276e-05,
"loss": 0.7562,
"step": 878
},
{
"epoch": 0.18816729549650799,
"grad_norm": 0.2379510276033454,
"learning_rate": 3.68888880064611e-05,
"loss": 0.7295,
"step": 879
},
{
"epoch": 0.1883813652297236,
"grad_norm": 0.2733224328561649,
"learning_rate": 3.688160568004965e-05,
"loss": 0.7238,
"step": 880
},
{
"epoch": 0.18859543496293918,
"grad_norm": 0.2766041481780387,
"learning_rate": 3.687431556111992e-05,
"loss": 0.7542,
"step": 881
},
{
"epoch": 0.18880950469615476,
"grad_norm": 0.295314325954768,
"learning_rate": 3.686701765303701e-05,
"loss": 0.8054,
"step": 882
},
{
"epoch": 0.18902357442937037,
"grad_norm": 0.29212465129928555,
"learning_rate": 3.685971195916963e-05,
"loss": 0.7635,
"step": 883
},
{
"epoch": 0.18923764416258596,
"grad_norm": 0.2773384531911246,
"learning_rate": 3.685239848289008e-05,
"loss": 0.747,
"step": 884
},
{
"epoch": 0.18945171389580157,
"grad_norm": 0.3046770818327443,
"learning_rate": 3.6845077227574234e-05,
"loss": 0.7635,
"step": 885
},
{
"epoch": 0.18966578362901715,
"grad_norm": 0.7223002240752847,
"learning_rate": 3.683774819660158e-05,
"loss": 0.7754,
"step": 886
},
{
"epoch": 0.18987985336223276,
"grad_norm": 0.28505220492776684,
"learning_rate": 3.683041139335518e-05,
"loss": 0.7566,
"step": 887
},
{
"epoch": 0.19009392309544834,
"grad_norm": 0.27583794145550233,
"learning_rate": 3.682306682122168e-05,
"loss": 0.7517,
"step": 888
},
{
"epoch": 0.19030799282866392,
"grad_norm": 0.2555012836872675,
"learning_rate": 3.681571448359135e-05,
"loss": 0.782,
"step": 889
},
{
"epoch": 0.19052206256187953,
"grad_norm": 0.30910545710081916,
"learning_rate": 3.6808354383857983e-05,
"loss": 0.7581,
"step": 890
},
{
"epoch": 0.19073613229509512,
"grad_norm": 0.2651034955880483,
"learning_rate": 3.680098652541901e-05,
"loss": 0.7493,
"step": 891
},
{
"epoch": 0.19095020202831073,
"grad_norm": 0.2513642505801275,
"learning_rate": 3.6793610911675405e-05,
"loss": 0.7579,
"step": 892
},
{
"epoch": 0.1911642717615263,
"grad_norm": 0.27887751715847525,
"learning_rate": 3.678622754603175e-05,
"loss": 0.7508,
"step": 893
},
{
"epoch": 0.19137834149474192,
"grad_norm": 0.27339848683232354,
"learning_rate": 3.6778836431896184e-05,
"loss": 0.7504,
"step": 894
},
{
"epoch": 0.1915924112279575,
"grad_norm": 0.27590508160509225,
"learning_rate": 3.677143757268043e-05,
"loss": 0.7813,
"step": 895
},
{
"epoch": 0.19180648096117311,
"grad_norm": 0.27287031621349384,
"learning_rate": 3.676403097179981e-05,
"loss": 0.7654,
"step": 896
},
{
"epoch": 0.1920205506943887,
"grad_norm": 0.2731546157033016,
"learning_rate": 3.675661663267317e-05,
"loss": 0.7602,
"step": 897
},
{
"epoch": 0.19223462042760428,
"grad_norm": 0.4328428167070174,
"learning_rate": 3.674919455872297e-05,
"loss": 0.7489,
"step": 898
},
{
"epoch": 0.1924486901608199,
"grad_norm": 0.26609179205860484,
"learning_rate": 3.6741764753375216e-05,
"loss": 0.7878,
"step": 899
},
{
"epoch": 0.19266275989403547,
"grad_norm": 0.2560585322095053,
"learning_rate": 3.673432722005951e-05,
"loss": 0.7692,
"step": 900
},
{
"epoch": 0.19287682962725108,
"grad_norm": 0.27375377498463116,
"learning_rate": 3.672688196220899e-05,
"loss": 0.7435,
"step": 901
},
{
"epoch": 0.19309089936046667,
"grad_norm": 0.24204721417261835,
"learning_rate": 3.6719428983260364e-05,
"loss": 0.7619,
"step": 902
},
{
"epoch": 0.19330496909368228,
"grad_norm": 0.27963563350801673,
"learning_rate": 3.6711968286653936e-05,
"loss": 0.7871,
"step": 903
},
{
"epoch": 0.19351903882689786,
"grad_norm": 0.2642109920435115,
"learning_rate": 3.6704499875833536e-05,
"loss": 0.7571,
"step": 904
},
{
"epoch": 0.19373310856011347,
"grad_norm": 0.3544081292478254,
"learning_rate": 3.669702375424658e-05,
"loss": 0.7406,
"step": 905
},
{
"epoch": 0.19394717829332905,
"grad_norm": 0.3042454254833041,
"learning_rate": 3.668953992534402e-05,
"loss": 0.7371,
"step": 906
},
{
"epoch": 0.19416124802654464,
"grad_norm": 0.25046822229158755,
"learning_rate": 3.668204839258038e-05,
"loss": 0.7471,
"step": 907
},
{
"epoch": 0.19437531775976025,
"grad_norm": 0.2870686541223231,
"learning_rate": 3.667454915941373e-05,
"loss": 0.7685,
"step": 908
},
{
"epoch": 0.19458938749297583,
"grad_norm": 0.2508346151097107,
"learning_rate": 3.6667042229305725e-05,
"loss": 0.7228,
"step": 909
},
{
"epoch": 0.19480345722619144,
"grad_norm": 0.5901600838533766,
"learning_rate": 3.665952760572154e-05,
"loss": 0.7617,
"step": 910
},
{
"epoch": 0.19501752695940702,
"grad_norm": 0.29949620098684454,
"learning_rate": 3.6652005292129894e-05,
"loss": 0.7458,
"step": 911
},
{
"epoch": 0.19523159669262263,
"grad_norm": 1.255403538904062,
"learning_rate": 3.66444752920031e-05,
"loss": 0.7635,
"step": 912
},
{
"epoch": 0.19544566642583822,
"grad_norm": 0.2671125999944408,
"learning_rate": 3.6636937608816975e-05,
"loss": 0.7467,
"step": 913
},
{
"epoch": 0.1956597361590538,
"grad_norm": 0.3126725141894936,
"learning_rate": 3.662939224605091e-05,
"loss": 0.7595,
"step": 914
},
{
"epoch": 0.1958738058922694,
"grad_norm": 0.8565454077615011,
"learning_rate": 3.662183920718782e-05,
"loss": 0.8323,
"step": 915
},
{
"epoch": 0.196087875625485,
"grad_norm": 0.3999468363425481,
"learning_rate": 3.661427849571418e-05,
"loss": 0.7466,
"step": 916
},
{
"epoch": 0.1963019453587006,
"grad_norm": 0.30723734570549177,
"learning_rate": 3.660671011512e-05,
"loss": 0.7205,
"step": 917
},
{
"epoch": 0.19651601509191619,
"grad_norm": 0.25734326947114644,
"learning_rate": 3.659913406889883e-05,
"loss": 0.7595,
"step": 918
},
{
"epoch": 0.1967300848251318,
"grad_norm": 0.29978089982244505,
"learning_rate": 3.659155036054777e-05,
"loss": 0.7536,
"step": 919
},
{
"epoch": 0.19694415455834738,
"grad_norm": 0.279094319424851,
"learning_rate": 3.6583958993567424e-05,
"loss": 0.7958,
"step": 920
},
{
"epoch": 0.197158224291563,
"grad_norm": 0.24255895338189498,
"learning_rate": 3.657635997146197e-05,
"loss": 0.7548,
"step": 921
},
{
"epoch": 0.19737229402477857,
"grad_norm": 0.2701795076574947,
"learning_rate": 3.6568753297739094e-05,
"loss": 0.7678,
"step": 922
},
{
"epoch": 0.19758636375799415,
"grad_norm": 0.962851927144585,
"learning_rate": 3.656113897591003e-05,
"loss": 0.7494,
"step": 923
},
{
"epoch": 0.19780043349120977,
"grad_norm": 0.44179348600186596,
"learning_rate": 3.655351700948953e-05,
"loss": 0.7625,
"step": 924
},
{
"epoch": 0.19801450322442535,
"grad_norm": 0.2589381596936857,
"learning_rate": 3.654588740199588e-05,
"loss": 0.7768,
"step": 925
},
{
"epoch": 0.19822857295764096,
"grad_norm": 0.25911085272735385,
"learning_rate": 3.653825015695089e-05,
"loss": 0.7321,
"step": 926
},
{
"epoch": 0.19844264269085654,
"grad_norm": 0.24885041775075306,
"learning_rate": 3.65306052778799e-05,
"loss": 0.7487,
"step": 927
},
{
"epoch": 0.19865671242407215,
"grad_norm": 0.2556786113182652,
"learning_rate": 3.652295276831178e-05,
"loss": 0.7801,
"step": 928
},
{
"epoch": 0.19887078215728773,
"grad_norm": 0.26634249596644843,
"learning_rate": 3.651529263177891e-05,
"loss": 0.7329,
"step": 929
},
{
"epoch": 0.19908485189050334,
"grad_norm": 0.2732515080303684,
"learning_rate": 3.6507624871817194e-05,
"loss": 0.7481,
"step": 930
},
{
"epoch": 0.19929892162371893,
"grad_norm": 0.2672716464049646,
"learning_rate": 3.6499949491966046e-05,
"loss": 0.7448,
"step": 931
},
{
"epoch": 0.1995129913569345,
"grad_norm": 0.2568881179297147,
"learning_rate": 3.649226649576843e-05,
"loss": 0.77,
"step": 932
},
{
"epoch": 0.19972706109015012,
"grad_norm": 0.23810961527881044,
"learning_rate": 3.6484575886770784e-05,
"loss": 0.749,
"step": 933
},
{
"epoch": 0.1999411308233657,
"grad_norm": 0.25038064244934716,
"learning_rate": 3.647687766852308e-05,
"loss": 0.7666,
"step": 934
},
{
"epoch": 0.20015520055658131,
"grad_norm": 0.2283222081414046,
"learning_rate": 3.6469171844578815e-05,
"loss": 0.7702,
"step": 935
},
{
"epoch": 0.2003692702897969,
"grad_norm": 0.25814551947403014,
"learning_rate": 3.6461458418494966e-05,
"loss": 0.7512,
"step": 936
},
{
"epoch": 0.2005833400230125,
"grad_norm": 0.2821156323407736,
"learning_rate": 3.645373739383205e-05,
"loss": 0.7567,
"step": 937
},
{
"epoch": 0.2007974097562281,
"grad_norm": 0.29502512710151657,
"learning_rate": 3.6446008774154075e-05,
"loss": 0.7529,
"step": 938
},
{
"epoch": 0.20101147948944367,
"grad_norm": 0.2371006541241456,
"learning_rate": 3.643827256302855e-05,
"loss": 0.7348,
"step": 939
},
{
"epoch": 0.20122554922265928,
"grad_norm": 0.28220834460938493,
"learning_rate": 3.64305287640265e-05,
"loss": 0.7491,
"step": 940
},
{
"epoch": 0.20143961895587487,
"grad_norm": 0.2743393530532695,
"learning_rate": 3.642277738072246e-05,
"loss": 0.7659,
"step": 941
},
{
"epoch": 0.20165368868909048,
"grad_norm": 0.26341136279409577,
"learning_rate": 3.6415018416694435e-05,
"loss": 0.7735,
"step": 942
},
{
"epoch": 0.20186775842230606,
"grad_norm": 0.23857262810112115,
"learning_rate": 3.640725187552396e-05,
"loss": 0.7051,
"step": 943
},
{
"epoch": 0.20208182815552167,
"grad_norm": 0.2588822104766973,
"learning_rate": 3.6399477760796055e-05,
"loss": 0.7353,
"step": 944
},
{
"epoch": 0.20229589788873725,
"grad_norm": 0.25409577229517644,
"learning_rate": 3.639169607609924e-05,
"loss": 0.7626,
"step": 945
},
{
"epoch": 0.20250996762195286,
"grad_norm": 0.22614850090788918,
"learning_rate": 3.638390682502552e-05,
"loss": 0.7693,
"step": 946
},
{
"epoch": 0.20272403735516845,
"grad_norm": 0.2219331573913887,
"learning_rate": 3.63761100111704e-05,
"loss": 0.7504,
"step": 947
},
{
"epoch": 0.20293810708838403,
"grad_norm": 0.2527122225830614,
"learning_rate": 3.636830563813287e-05,
"loss": 0.7292,
"step": 948
},
{
"epoch": 0.20315217682159964,
"grad_norm": 0.25888609447841554,
"learning_rate": 3.6360493709515427e-05,
"loss": 0.7933,
"step": 949
},
{
"epoch": 0.20336624655481522,
"grad_norm": 0.22355108902056006,
"learning_rate": 3.635267422892404e-05,
"loss": 0.7555,
"step": 950
},
{
"epoch": 0.20358031628803083,
"grad_norm": 0.2477643513389135,
"learning_rate": 3.634484719996816e-05,
"loss": 0.724,
"step": 951
},
{
"epoch": 0.20379438602124642,
"grad_norm": 0.26711413869833667,
"learning_rate": 3.6337012626260736e-05,
"loss": 0.7214,
"step": 952
},
{
"epoch": 0.20400845575446203,
"grad_norm": 0.2518356078233925,
"learning_rate": 3.632917051141818e-05,
"loss": 0.7631,
"step": 953
},
{
"epoch": 0.2042225254876776,
"grad_norm": 0.2727448789347231,
"learning_rate": 3.632132085906042e-05,
"loss": 0.736,
"step": 954
},
{
"epoch": 0.20443659522089322,
"grad_norm": 0.2564131594299474,
"learning_rate": 3.631346367281082e-05,
"loss": 0.7667,
"step": 955
},
{
"epoch": 0.2046506649541088,
"grad_norm": 0.2456704570702374,
"learning_rate": 3.6305598956296255e-05,
"loss": 0.7582,
"step": 956
},
{
"epoch": 0.20486473468732438,
"grad_norm": 0.24376627673782,
"learning_rate": 3.6297726713147065e-05,
"loss": 0.759,
"step": 957
},
{
"epoch": 0.20507880442054,
"grad_norm": 0.21743130219262322,
"learning_rate": 3.628984694699705e-05,
"loss": 0.7407,
"step": 958
},
{
"epoch": 0.20529287415375558,
"grad_norm": 0.2554600431871466,
"learning_rate": 3.6281959661483506e-05,
"loss": 0.7333,
"step": 959
},
{
"epoch": 0.2055069438869712,
"grad_norm": 0.23795030565121542,
"learning_rate": 3.627406486024719e-05,
"loss": 0.7686,
"step": 960
},
{
"epoch": 0.20572101362018677,
"grad_norm": 0.23460618633053193,
"learning_rate": 3.626616254693233e-05,
"loss": 0.7608,
"step": 961
},
{
"epoch": 0.20593508335340238,
"grad_norm": 0.36219050008528314,
"learning_rate": 3.6258252725186614e-05,
"loss": 0.7727,
"step": 962
},
{
"epoch": 0.20614915308661796,
"grad_norm": 0.28802229226357035,
"learning_rate": 3.6250335398661196e-05,
"loss": 0.754,
"step": 963
},
{
"epoch": 0.20636322281983355,
"grad_norm": 0.22913800174835353,
"learning_rate": 3.6242410571010705e-05,
"loss": 0.741,
"step": 964
},
{
"epoch": 0.20657729255304916,
"grad_norm": 0.2162083975992117,
"learning_rate": 3.623447824589323e-05,
"loss": 0.7301,
"step": 965
},
{
"epoch": 0.20679136228626474,
"grad_norm": 0.24966914276568036,
"learning_rate": 3.6226538426970315e-05,
"loss": 0.7288,
"step": 966
},
{
"epoch": 0.20700543201948035,
"grad_norm": 0.26811401307725996,
"learning_rate": 3.621859111790696e-05,
"loss": 0.7704,
"step": 967
},
{
"epoch": 0.20721950175269593,
"grad_norm": 0.8554616315407051,
"learning_rate": 3.621063632237164e-05,
"loss": 0.7557,
"step": 968
},
{
"epoch": 0.20743357148591154,
"grad_norm": 0.26537345378866023,
"learning_rate": 3.620267404403627e-05,
"loss": 0.7481,
"step": 969
},
{
"epoch": 0.20764764121912713,
"grad_norm": 0.265563365564281,
"learning_rate": 3.619470428657622e-05,
"loss": 0.7624,
"step": 970
},
{
"epoch": 0.20786171095234274,
"grad_norm": 0.25337546459776217,
"learning_rate": 3.6186727053670316e-05,
"loss": 0.7434,
"step": 971
},
{
"epoch": 0.20807578068555832,
"grad_norm": 0.28627342771657804,
"learning_rate": 3.617874234900083e-05,
"loss": 0.7776,
"step": 972
},
{
"epoch": 0.2082898504187739,
"grad_norm": 0.290219753909199,
"learning_rate": 3.61707501762535e-05,
"loss": 0.7703,
"step": 973
},
{
"epoch": 0.2085039201519895,
"grad_norm": 0.28497728133674854,
"learning_rate": 3.616275053911749e-05,
"loss": 0.7801,
"step": 974
},
{
"epoch": 0.2087179898852051,
"grad_norm": 0.2767530667311987,
"learning_rate": 3.615474344128542e-05,
"loss": 0.7442,
"step": 975
},
{
"epoch": 0.2089320596184207,
"grad_norm": 0.26315175959980436,
"learning_rate": 3.614672888645334e-05,
"loss": 0.7675,
"step": 976
},
{
"epoch": 0.2091461293516363,
"grad_norm": 0.26937821442955023,
"learning_rate": 3.6138706878320775e-05,
"loss": 0.7707,
"step": 977
},
{
"epoch": 0.2093601990848519,
"grad_norm": 0.3039189057189529,
"learning_rate": 3.613067742059065e-05,
"loss": 0.7409,
"step": 978
},
{
"epoch": 0.20957426881806748,
"grad_norm": 0.3009097585087098,
"learning_rate": 3.6122640516969356e-05,
"loss": 0.7627,
"step": 979
},
{
"epoch": 0.2097883385512831,
"grad_norm": 0.2544442419583323,
"learning_rate": 3.611459617116672e-05,
"loss": 0.7447,
"step": 980
},
{
"epoch": 0.21000240828449868,
"grad_norm": 0.23778116952129275,
"learning_rate": 3.610654438689598e-05,
"loss": 0.7272,
"step": 981
},
{
"epoch": 0.21021647801771426,
"grad_norm": 0.2613464568658064,
"learning_rate": 3.6098485167873845e-05,
"loss": 0.7364,
"step": 982
},
{
"epoch": 0.21043054775092987,
"grad_norm": 0.30803820816106064,
"learning_rate": 3.609041851782042e-05,
"loss": 0.7228,
"step": 983
},
{
"epoch": 0.21064461748414545,
"grad_norm": 0.3056767252526709,
"learning_rate": 3.608234444045927e-05,
"loss": 0.7369,
"step": 984
},
{
"epoch": 0.21085868721736106,
"grad_norm": 0.23492053334567353,
"learning_rate": 3.6074262939517355e-05,
"loss": 0.7333,
"step": 985
},
{
"epoch": 0.21107275695057665,
"grad_norm": 0.2689515273251663,
"learning_rate": 3.60661740187251e-05,
"loss": 0.7374,
"step": 986
},
{
"epoch": 0.21128682668379226,
"grad_norm": 0.2895333948928181,
"learning_rate": 3.605807768181633e-05,
"loss": 0.743,
"step": 987
},
{
"epoch": 0.21150089641700784,
"grad_norm": 0.2549851894277088,
"learning_rate": 3.604997393252829e-05,
"loss": 0.7273,
"step": 988
},
{
"epoch": 0.21171496615022342,
"grad_norm": 0.22971103325219708,
"learning_rate": 3.604186277460166e-05,
"loss": 0.743,
"step": 989
},
{
"epoch": 0.21192903588343903,
"grad_norm": 0.2710269586461271,
"learning_rate": 3.603374421178055e-05,
"loss": 0.7235,
"step": 990
},
{
"epoch": 0.21214310561665461,
"grad_norm": 0.29461367252677395,
"learning_rate": 3.602561824781246e-05,
"loss": 0.7739,
"step": 991
},
{
"epoch": 0.21235717534987023,
"grad_norm": 0.23895229336665744,
"learning_rate": 3.601748488644832e-05,
"loss": 0.7634,
"step": 992
},
{
"epoch": 0.2125712450830858,
"grad_norm": 0.2593252880595089,
"learning_rate": 3.600934413144248e-05,
"loss": 0.7561,
"step": 993
},
{
"epoch": 0.21278531481630142,
"grad_norm": 0.2849973035192068,
"learning_rate": 3.6001195986552694e-05,
"loss": 0.7429,
"step": 994
},
{
"epoch": 0.212999384549517,
"grad_norm": 0.24028559386334664,
"learning_rate": 3.5993040455540135e-05,
"loss": 0.7512,
"step": 995
},
{
"epoch": 0.2132134542827326,
"grad_norm": 0.26559823564831764,
"learning_rate": 3.5984877542169376e-05,
"loss": 0.7224,
"step": 996
},
{
"epoch": 0.2134275240159482,
"grad_norm": 0.2745250932675374,
"learning_rate": 3.59767072502084e-05,
"loss": 0.7631,
"step": 997
},
{
"epoch": 0.21364159374916378,
"grad_norm": 0.24741598704199386,
"learning_rate": 3.596852958342861e-05,
"loss": 0.7256,
"step": 998
},
{
"epoch": 0.2138556634823794,
"grad_norm": 0.24734191368592298,
"learning_rate": 3.5960344545604796e-05,
"loss": 0.7596,
"step": 999
},
{
"epoch": 0.21406973321559497,
"grad_norm": 10.092858210617589,
"learning_rate": 3.595215214051515e-05,
"loss": 0.746,
"step": 1000
},
{
"epoch": 0.21428380294881058,
"grad_norm": 0.4171261524859099,
"learning_rate": 3.594395237194128e-05,
"loss": 0.7935,
"step": 1001
},
{
"epoch": 0.21449787268202616,
"grad_norm": 0.46178238096671537,
"learning_rate": 3.593574524366819e-05,
"loss": 0.7595,
"step": 1002
},
{
"epoch": 0.21471194241524177,
"grad_norm": 0.35196861929128975,
"learning_rate": 3.592753075948426e-05,
"loss": 0.7435,
"step": 1003
},
{
"epoch": 0.21492601214845736,
"grad_norm": 0.36167115218197843,
"learning_rate": 3.5919308923181286e-05,
"loss": 0.7605,
"step": 1004
},
{
"epoch": 0.21514008188167297,
"grad_norm": 0.3696739643849057,
"learning_rate": 3.591107973855445e-05,
"loss": 0.7451,
"step": 1005
},
{
"epoch": 0.21535415161488855,
"grad_norm": 0.37838772822659933,
"learning_rate": 3.590284320940235e-05,
"loss": 0.748,
"step": 1006
},
{
"epoch": 0.21556822134810413,
"grad_norm": 0.31589052796646483,
"learning_rate": 3.589459933952692e-05,
"loss": 0.7552,
"step": 1007
},
{
"epoch": 0.21578229108131974,
"grad_norm": 0.385105987314194,
"learning_rate": 3.588634813273354e-05,
"loss": 0.741,
"step": 1008
},
{
"epoch": 0.21599636081453533,
"grad_norm": 0.38260827645461054,
"learning_rate": 3.587808959283094e-05,
"loss": 0.7506,
"step": 1009
},
{
"epoch": 0.21621043054775094,
"grad_norm": 0.33440889188332873,
"learning_rate": 3.586982372363125e-05,
"loss": 0.7327,
"step": 1010
},
{
"epoch": 0.21642450028096652,
"grad_norm": 0.29313029392688494,
"learning_rate": 3.586155052894998e-05,
"loss": 0.7469,
"step": 1011
},
{
"epoch": 0.21663857001418213,
"grad_norm": 0.340638229924971,
"learning_rate": 3.585327001260602e-05,
"loss": 0.7532,
"step": 1012
},
{
"epoch": 0.2168526397473977,
"grad_norm": 0.3544562477711959,
"learning_rate": 3.5844982178421646e-05,
"loss": 0.7754,
"step": 1013
},
{
"epoch": 0.21706670948061332,
"grad_norm": 0.3006330074376759,
"learning_rate": 3.58366870302225e-05,
"loss": 0.7742,
"step": 1014
},
{
"epoch": 0.2172807792138289,
"grad_norm": 0.28236810688192665,
"learning_rate": 3.5828384571837615e-05,
"loss": 0.7257,
"step": 1015
},
{
"epoch": 0.2174948489470445,
"grad_norm": 0.3736260433358128,
"learning_rate": 3.582007480709939e-05,
"loss": 0.7403,
"step": 1016
},
{
"epoch": 0.2177089186802601,
"grad_norm": 0.34295859341269685,
"learning_rate": 3.581175773984359e-05,
"loss": 0.7507,
"step": 1017
},
{
"epoch": 0.21792298841347568,
"grad_norm": 0.28018933171639143,
"learning_rate": 3.580343337390935e-05,
"loss": 0.7321,
"step": 1018
},
{
"epoch": 0.2181370581466913,
"grad_norm": 0.3285608573750389,
"learning_rate": 3.5795101713139205e-05,
"loss": 0.7501,
"step": 1019
},
{
"epoch": 0.21835112787990688,
"grad_norm": 0.3299448479539171,
"learning_rate": 3.578676276137903e-05,
"loss": 0.7532,
"step": 1020
},
{
"epoch": 0.21856519761312249,
"grad_norm": 0.277610694652717,
"learning_rate": 3.577841652247805e-05,
"loss": 0.7319,
"step": 1021
},
{
"epoch": 0.21877926734633807,
"grad_norm": 0.24757180785673524,
"learning_rate": 3.5770063000288896e-05,
"loss": 0.711,
"step": 1022
},
{
"epoch": 0.21899333707955365,
"grad_norm": 0.28273722392178796,
"learning_rate": 3.5761702198667525e-05,
"loss": 0.7578,
"step": 1023
},
{
"epoch": 0.21920740681276926,
"grad_norm": 0.26298230247381893,
"learning_rate": 3.5753334121473275e-05,
"loss": 0.7492,
"step": 1024
},
{
"epoch": 0.21942147654598484,
"grad_norm": 0.25583233500336755,
"learning_rate": 3.574495877256883e-05,
"loss": 0.739,
"step": 1025
},
{
"epoch": 0.21963554627920046,
"grad_norm": 0.2898634519620882,
"learning_rate": 3.5736576155820236e-05,
"loss": 0.7418,
"step": 1026
},
{
"epoch": 0.21984961601241604,
"grad_norm": 0.25997297043422357,
"learning_rate": 3.57281862750969e-05,
"loss": 0.7487,
"step": 1027
},
{
"epoch": 0.22006368574563165,
"grad_norm": 0.23053881246498512,
"learning_rate": 3.571978913427157e-05,
"loss": 0.7253,
"step": 1028
},
{
"epoch": 0.22027775547884723,
"grad_norm": 0.26850485101261434,
"learning_rate": 3.5711384737220345e-05,
"loss": 0.7384,
"step": 1029
},
{
"epoch": 0.22049182521206284,
"grad_norm": 0.24457392835460862,
"learning_rate": 3.570297308782269e-05,
"loss": 0.7264,
"step": 1030
},
{
"epoch": 0.22070589494527842,
"grad_norm": 0.23022578083012712,
"learning_rate": 3.5694554189961405e-05,
"loss": 0.738,
"step": 1031
},
{
"epoch": 0.220919964678494,
"grad_norm": 0.2525301599694607,
"learning_rate": 3.5686128047522635e-05,
"loss": 0.7138,
"step": 1032
},
{
"epoch": 0.22113403441170962,
"grad_norm": 0.2437538317226544,
"learning_rate": 3.567769466439588e-05,
"loss": 0.7111,
"step": 1033
},
{
"epoch": 0.2213481041449252,
"grad_norm": 0.24709264591753685,
"learning_rate": 3.5669254044473954e-05,
"loss": 0.7323,
"step": 1034
},
{
"epoch": 0.2215621738781408,
"grad_norm": 0.24310991049521027,
"learning_rate": 3.5660806191653055e-05,
"loss": 0.7295,
"step": 1035
},
{
"epoch": 0.2217762436113564,
"grad_norm": 0.22807514507682305,
"learning_rate": 3.565235110983268e-05,
"loss": 0.741,
"step": 1036
},
{
"epoch": 0.221990313344572,
"grad_norm": 0.26964715143263146,
"learning_rate": 3.564388880291569e-05,
"loss": 0.7484,
"step": 1037
},
{
"epoch": 0.2222043830777876,
"grad_norm": 0.2554835794594098,
"learning_rate": 3.5635419274808266e-05,
"loss": 0.7637,
"step": 1038
},
{
"epoch": 0.2224184528110032,
"grad_norm": 0.24403499735322062,
"learning_rate": 3.5626942529419916e-05,
"loss": 0.7457,
"step": 1039
},
{
"epoch": 0.22263252254421878,
"grad_norm": 0.2416178368600129,
"learning_rate": 3.5618458570663515e-05,
"loss": 0.7507,
"step": 1040
},
{
"epoch": 0.22284659227743436,
"grad_norm": 0.21805288171407658,
"learning_rate": 3.5609967402455226e-05,
"loss": 0.735,
"step": 1041
},
{
"epoch": 0.22306066201064997,
"grad_norm": 0.24543246850912478,
"learning_rate": 3.560146902871455e-05,
"loss": 0.7413,
"step": 1042
},
{
"epoch": 0.22327473174386556,
"grad_norm": 0.22441315685460572,
"learning_rate": 3.559296345336433e-05,
"loss": 0.7484,
"step": 1043
},
{
"epoch": 0.22348880147708117,
"grad_norm": 0.2338048943893594,
"learning_rate": 3.558445068033074e-05,
"loss": 0.7277,
"step": 1044
},
{
"epoch": 0.22370287121029675,
"grad_norm": 0.2605556960844784,
"learning_rate": 3.557593071354323e-05,
"loss": 0.7409,
"step": 1045
},
{
"epoch": 0.22391694094351236,
"grad_norm": 0.23718871838302671,
"learning_rate": 3.556740355693462e-05,
"loss": 0.7974,
"step": 1046
},
{
"epoch": 0.22413101067672794,
"grad_norm": 0.20936316183624143,
"learning_rate": 3.5558869214441025e-05,
"loss": 0.7436,
"step": 1047
},
{
"epoch": 0.22434508040994353,
"grad_norm": 0.25239905632152304,
"learning_rate": 3.555032769000188e-05,
"loss": 0.7661,
"step": 1048
},
{
"epoch": 0.22455915014315914,
"grad_norm": 0.5191686141846192,
"learning_rate": 3.554177898755994e-05,
"loss": 0.7506,
"step": 1049
},
{
"epoch": 0.22477321987637472,
"grad_norm": 0.24494503867317957,
"learning_rate": 3.5533223111061276e-05,
"loss": 0.7437,
"step": 1050
},
{
"epoch": 0.22498728960959033,
"grad_norm": 0.22260914180330765,
"learning_rate": 3.552466006445525e-05,
"loss": 0.705,
"step": 1051
},
{
"epoch": 0.2252013593428059,
"grad_norm": 0.2409119289875767,
"learning_rate": 3.551608985169456e-05,
"loss": 0.7392,
"step": 1052
},
{
"epoch": 0.22541542907602152,
"grad_norm": 0.23037441671075173,
"learning_rate": 3.55075124767352e-05,
"loss": 0.7556,
"step": 1053
},
{
"epoch": 0.2256294988092371,
"grad_norm": 0.2413821715606796,
"learning_rate": 3.549892794353647e-05,
"loss": 0.7594,
"step": 1054
},
{
"epoch": 0.22584356854245272,
"grad_norm": 0.23296370989555829,
"learning_rate": 3.549033625606097e-05,
"loss": 0.7523,
"step": 1055
},
{
"epoch": 0.2260576382756683,
"grad_norm": 0.23123454344750505,
"learning_rate": 3.548173741827461e-05,
"loss": 0.7588,
"step": 1056
},
{
"epoch": 0.22627170800888388,
"grad_norm": 0.3488286736625951,
"learning_rate": 3.54731314341466e-05,
"loss": 0.7225,
"step": 1057
},
{
"epoch": 0.2264857777420995,
"grad_norm": 0.22198341459277993,
"learning_rate": 3.546451830764944e-05,
"loss": 0.7514,
"step": 1058
},
{
"epoch": 0.22669984747531507,
"grad_norm": 0.25979761376278093,
"learning_rate": 3.545589804275894e-05,
"loss": 0.77,
"step": 1059
},
{
"epoch": 0.22691391720853069,
"grad_norm": 0.24466064570140464,
"learning_rate": 3.5447270643454196e-05,
"loss": 0.7741,
"step": 1060
},
{
"epoch": 0.22712798694174627,
"grad_norm": 0.28949037020711366,
"learning_rate": 3.5438636113717604e-05,
"loss": 0.7701,
"step": 1061
},
{
"epoch": 0.22734205667496188,
"grad_norm": 0.24924127205366303,
"learning_rate": 3.542999445753485e-05,
"loss": 0.7349,
"step": 1062
},
{
"epoch": 0.22755612640817746,
"grad_norm": 0.24463955301015564,
"learning_rate": 3.5421345678894883e-05,
"loss": 0.7377,
"step": 1063
},
{
"epoch": 0.22777019614139307,
"grad_norm": 0.24132365114750715,
"learning_rate": 3.5412689781789994e-05,
"loss": 0.7447,
"step": 1064
},
{
"epoch": 0.22798426587460865,
"grad_norm": 0.2393135182760011,
"learning_rate": 3.540402677021571e-05,
"loss": 0.7536,
"step": 1065
},
{
"epoch": 0.22819833560782424,
"grad_norm": 0.23470436314271398,
"learning_rate": 3.539535664817087e-05,
"loss": 0.7356,
"step": 1066
},
{
"epoch": 0.22841240534103985,
"grad_norm": 0.24991603565251896,
"learning_rate": 3.538667941965758e-05,
"loss": 0.7471,
"step": 1067
},
{
"epoch": 0.22862647507425543,
"grad_norm": 0.2510669109647726,
"learning_rate": 3.537799508868124e-05,
"loss": 0.7428,
"step": 1068
},
{
"epoch": 0.22884054480747104,
"grad_norm": 0.23343415846091617,
"learning_rate": 3.5369303659250515e-05,
"loss": 0.7624,
"step": 1069
},
{
"epoch": 0.22905461454068662,
"grad_norm": 0.276998861185143,
"learning_rate": 3.5360605135377354e-05,
"loss": 0.7527,
"step": 1070
},
{
"epoch": 0.22926868427390223,
"grad_norm": 0.2462153525809238,
"learning_rate": 3.535189952107699e-05,
"loss": 0.7373,
"step": 1071
},
{
"epoch": 0.22948275400711782,
"grad_norm": 0.2238843819915132,
"learning_rate": 3.53431868203679e-05,
"loss": 0.7281,
"step": 1072
},
{
"epoch": 0.2296968237403334,
"grad_norm": 0.23209022838278037,
"learning_rate": 3.5334467037271864e-05,
"loss": 0.7591,
"step": 1073
},
{
"epoch": 0.229910893473549,
"grad_norm": 0.24566729465175108,
"learning_rate": 3.5325740175813915e-05,
"loss": 0.7503,
"step": 1074
},
{
"epoch": 0.2301249632067646,
"grad_norm": 0.22745802495833817,
"learning_rate": 3.5317006240022355e-05,
"loss": 0.7498,
"step": 1075
},
{
"epoch": 0.2303390329399802,
"grad_norm": 0.24769220269180708,
"learning_rate": 3.5308265233928755e-05,
"loss": 0.7042,
"step": 1076
},
{
"epoch": 0.2305531026731958,
"grad_norm": 0.24851444239254405,
"learning_rate": 3.529951716156794e-05,
"loss": 0.7367,
"step": 1077
},
{
"epoch": 0.2307671724064114,
"grad_norm": 0.23598212975741154,
"learning_rate": 3.529076202697802e-05,
"loss": 0.7306,
"step": 1078
},
{
"epoch": 0.23098124213962698,
"grad_norm": 0.22106312349882618,
"learning_rate": 3.528199983420033e-05,
"loss": 0.7296,
"step": 1079
},
{
"epoch": 0.2311953118728426,
"grad_norm": 0.23293359636391545,
"learning_rate": 3.52732305872795e-05,
"loss": 0.7326,
"step": 1080
},
{
"epoch": 0.23140938160605817,
"grad_norm": 0.25325870616664814,
"learning_rate": 3.526445429026338e-05,
"loss": 0.7302,
"step": 1081
},
{
"epoch": 0.23162345133927376,
"grad_norm": 0.23651232115232096,
"learning_rate": 3.5255670947203104e-05,
"loss": 0.7575,
"step": 1082
},
{
"epoch": 0.23183752107248937,
"grad_norm": 0.24854842349869274,
"learning_rate": 3.5246880562153055e-05,
"loss": 0.7544,
"step": 1083
},
{
"epoch": 0.23205159080570495,
"grad_norm": 0.21848220913183314,
"learning_rate": 3.523808313917084e-05,
"loss": 0.7533,
"step": 1084
},
{
"epoch": 0.23226566053892056,
"grad_norm": 0.2430693946130431,
"learning_rate": 3.5229278682317346e-05,
"loss": 0.7264,
"step": 1085
},
{
"epoch": 0.23247973027213614,
"grad_norm": 0.22773842855288573,
"learning_rate": 3.522046719565669e-05,
"loss": 0.7094,
"step": 1086
},
{
"epoch": 0.23269380000535175,
"grad_norm": 0.23527288317600056,
"learning_rate": 3.521164868325624e-05,
"loss": 0.7344,
"step": 1087
},
{
"epoch": 0.23290786973856734,
"grad_norm": 0.23204699437866774,
"learning_rate": 3.52028231491866e-05,
"loss": 0.7322,
"step": 1088
},
{
"epoch": 0.23312193947178295,
"grad_norm": 0.3894003861294942,
"learning_rate": 3.519399059752163e-05,
"loss": 0.7576,
"step": 1089
},
{
"epoch": 0.23333600920499853,
"grad_norm": 0.22807525846828683,
"learning_rate": 3.5185151032338406e-05,
"loss": 0.7254,
"step": 1090
},
{
"epoch": 0.2335500789382141,
"grad_norm": 0.24709398073098707,
"learning_rate": 3.517630445771727e-05,
"loss": 0.7501,
"step": 1091
},
{
"epoch": 0.23376414867142972,
"grad_norm": 0.2883078871345372,
"learning_rate": 3.516745087774177e-05,
"loss": 0.7511,
"step": 1092
},
{
"epoch": 0.2339782184046453,
"grad_norm": 0.2400295529465924,
"learning_rate": 3.515859029649872e-05,
"loss": 0.7392,
"step": 1093
},
{
"epoch": 0.23419228813786092,
"grad_norm": 0.27051740398227936,
"learning_rate": 3.514972271807813e-05,
"loss": 0.7382,
"step": 1094
},
{
"epoch": 0.2344063578710765,
"grad_norm": 0.220551996922905,
"learning_rate": 3.514084814657327e-05,
"loss": 0.7117,
"step": 1095
},
{
"epoch": 0.2346204276042921,
"grad_norm": 0.28698525292874566,
"learning_rate": 3.513196658608062e-05,
"loss": 0.7352,
"step": 1096
},
{
"epoch": 0.2348344973375077,
"grad_norm": 0.25692057922391903,
"learning_rate": 3.5123078040699895e-05,
"loss": 0.7169,
"step": 1097
},
{
"epoch": 0.23504856707072327,
"grad_norm": 0.21995943099729548,
"learning_rate": 3.511418251453403e-05,
"loss": 0.7453,
"step": 1098
},
{
"epoch": 0.23526263680393888,
"grad_norm": 0.2812453485865409,
"learning_rate": 3.5105280011689186e-05,
"loss": 0.7586,
"step": 1099
},
{
"epoch": 0.23547670653715447,
"grad_norm": 0.26061041513055433,
"learning_rate": 3.5096370536274736e-05,
"loss": 0.7757,
"step": 1100
},
{
"epoch": 0.23569077627037008,
"grad_norm": 0.23762390163994687,
"learning_rate": 3.5087454092403285e-05,
"loss": 0.739,
"step": 1101
},
{
"epoch": 0.23590484600358566,
"grad_norm": 0.23390835824020367,
"learning_rate": 3.507853068419064e-05,
"loss": 0.7727,
"step": 1102
},
{
"epoch": 0.23611891573680127,
"grad_norm": 0.24197047861128176,
"learning_rate": 3.506960031575584e-05,
"loss": 0.7228,
"step": 1103
},
{
"epoch": 0.23633298547001685,
"grad_norm": 0.23622843382472056,
"learning_rate": 3.5060662991221113e-05,
"loss": 0.7552,
"step": 1104
},
{
"epoch": 0.23654705520323246,
"grad_norm": 0.26080172807431923,
"learning_rate": 3.505171871471192e-05,
"loss": 0.7453,
"step": 1105
},
{
"epoch": 0.23676112493644805,
"grad_norm": 0.248947733593152,
"learning_rate": 3.504276749035693e-05,
"loss": 0.7596,
"step": 1106
},
{
"epoch": 0.23697519466966363,
"grad_norm": 0.24080733011178224,
"learning_rate": 3.503380932228799e-05,
"loss": 0.7365,
"step": 1107
},
{
"epoch": 0.23718926440287924,
"grad_norm": 0.21600510072981646,
"learning_rate": 3.502484421464019e-05,
"loss": 0.7673,
"step": 1108
},
{
"epoch": 0.23740333413609482,
"grad_norm": 0.23575839153297223,
"learning_rate": 3.501587217155181e-05,
"loss": 0.7327,
"step": 1109
},
{
"epoch": 0.23761740386931043,
"grad_norm": 0.2626002933160131,
"learning_rate": 3.500689319716432e-05,
"loss": 0.7814,
"step": 1110
},
{
"epoch": 0.23783147360252602,
"grad_norm": 0.22747937420096545,
"learning_rate": 3.4997907295622405e-05,
"loss": 0.7452,
"step": 1111
},
{
"epoch": 0.23804554333574163,
"grad_norm": 0.2394714977449478,
"learning_rate": 3.4988914471073936e-05,
"loss": 0.7526,
"step": 1112
},
{
"epoch": 0.2382596130689572,
"grad_norm": 0.2796432097213277,
"learning_rate": 3.4979914727669984e-05,
"loss": 0.7398,
"step": 1113
},
{
"epoch": 0.23847368280217282,
"grad_norm": 0.25029719142892853,
"learning_rate": 3.497090806956481e-05,
"loss": 0.7305,
"step": 1114
},
{
"epoch": 0.2386877525353884,
"grad_norm": 0.2297835876791337,
"learning_rate": 3.496189450091588e-05,
"loss": 0.7539,
"step": 1115
},
{
"epoch": 0.23890182226860399,
"grad_norm": 0.2497481123456355,
"learning_rate": 3.495287402588385e-05,
"loss": 0.7583,
"step": 1116
},
{
"epoch": 0.2391158920018196,
"grad_norm": 0.27884511406424517,
"learning_rate": 3.494384664863253e-05,
"loss": 0.7186,
"step": 1117
},
{
"epoch": 0.23932996173503518,
"grad_norm": 0.27767521541428375,
"learning_rate": 3.493481237332895e-05,
"loss": 0.7189,
"step": 1118
},
{
"epoch": 0.2395440314682508,
"grad_norm": 0.24207967789590182,
"learning_rate": 3.492577120414333e-05,
"loss": 0.7324,
"step": 1119
},
{
"epoch": 0.23975810120146637,
"grad_norm": 0.22421259080900618,
"learning_rate": 3.4916723145249034e-05,
"loss": 0.7489,
"step": 1120
},
{
"epoch": 0.23997217093468198,
"grad_norm": 0.2772284666752655,
"learning_rate": 3.4907668200822645e-05,
"loss": 0.743,
"step": 1121
},
{
"epoch": 0.24018624066789757,
"grad_norm": 0.26152102978471387,
"learning_rate": 3.48986063750439e-05,
"loss": 0.7288,
"step": 1122
},
{
"epoch": 0.24040031040111318,
"grad_norm": 0.2272798845694307,
"learning_rate": 3.488953767209573e-05,
"loss": 0.7507,
"step": 1123
},
{
"epoch": 0.24061438013432876,
"grad_norm": 0.263098149056709,
"learning_rate": 3.488046209616422e-05,
"loss": 0.722,
"step": 1124
},
{
"epoch": 0.24082844986754434,
"grad_norm": 0.2500442559848116,
"learning_rate": 3.4871379651438656e-05,
"loss": 0.7235,
"step": 1125
},
{
"epoch": 0.24104251960075995,
"grad_norm": 0.21433706064419283,
"learning_rate": 3.486229034211146e-05,
"loss": 0.7543,
"step": 1126
},
{
"epoch": 0.24125658933397554,
"grad_norm": 0.2215477500866708,
"learning_rate": 3.4853194172378256e-05,
"loss": 0.7575,
"step": 1127
},
{
"epoch": 0.24147065906719115,
"grad_norm": 0.27231554106216876,
"learning_rate": 3.48440911464378e-05,
"loss": 0.7728,
"step": 1128
},
{
"epoch": 0.24168472880040673,
"grad_norm": 0.23050816642788935,
"learning_rate": 3.483498126849205e-05,
"loss": 0.7444,
"step": 1129
},
{
"epoch": 0.24189879853362234,
"grad_norm": 0.22813814355783174,
"learning_rate": 3.482586454274611e-05,
"loss": 0.7331,
"step": 1130
},
{
"epoch": 0.24211286826683792,
"grad_norm": 0.2799083074168808,
"learning_rate": 3.481674097340823e-05,
"loss": 0.7462,
"step": 1131
},
{
"epoch": 0.2423269380000535,
"grad_norm": 0.23980352747771683,
"learning_rate": 3.480761056468984e-05,
"loss": 0.7673,
"step": 1132
},
{
"epoch": 0.24254100773326911,
"grad_norm": 0.19925576192225541,
"learning_rate": 3.4798473320805525e-05,
"loss": 0.7199,
"step": 1133
},
{
"epoch": 0.2427550774664847,
"grad_norm": 0.2781882173625236,
"learning_rate": 3.478932924597301e-05,
"loss": 0.7587,
"step": 1134
},
{
"epoch": 0.2429691471997003,
"grad_norm": 0.2637722494697476,
"learning_rate": 3.478017834441319e-05,
"loss": 0.763,
"step": 1135
},
{
"epoch": 0.2431832169329159,
"grad_norm": 0.248322154684677,
"learning_rate": 3.4771020620350096e-05,
"loss": 0.7499,
"step": 1136
},
{
"epoch": 0.2433972866661315,
"grad_norm": 0.2489220850544133,
"learning_rate": 3.4761856078010924e-05,
"loss": 0.7402,
"step": 1137
},
{
"epoch": 0.24361135639934708,
"grad_norm": 0.2581634984844074,
"learning_rate": 3.475268472162601e-05,
"loss": 0.7329,
"step": 1138
},
{
"epoch": 0.2438254261325627,
"grad_norm": 0.23118231452967447,
"learning_rate": 3.4743506555428845e-05,
"loss": 0.7395,
"step": 1139
},
{
"epoch": 0.24403949586577828,
"grad_norm": 0.22158647945102775,
"learning_rate": 3.4734321583656036e-05,
"loss": 0.723,
"step": 1140
},
{
"epoch": 0.24425356559899386,
"grad_norm": 0.30313975204745625,
"learning_rate": 3.472512981054736e-05,
"loss": 0.7586,
"step": 1141
},
{
"epoch": 0.24446763533220947,
"grad_norm": 0.323824729607345,
"learning_rate": 3.471593124034571e-05,
"loss": 0.7459,
"step": 1142
},
{
"epoch": 0.24468170506542505,
"grad_norm": 0.24092939792483786,
"learning_rate": 3.470672587729714e-05,
"loss": 0.7313,
"step": 1143
},
{
"epoch": 0.24489577479864066,
"grad_norm": 0.24306584169238002,
"learning_rate": 3.469751372565083e-05,
"loss": 0.7436,
"step": 1144
},
{
"epoch": 0.24510984453185625,
"grad_norm": 0.3188364211969285,
"learning_rate": 3.468829478965909e-05,
"loss": 0.7699,
"step": 1145
},
{
"epoch": 0.24532391426507186,
"grad_norm": 0.2859441884380527,
"learning_rate": 3.467906907357736e-05,
"loss": 0.7463,
"step": 1146
},
{
"epoch": 0.24553798399828744,
"grad_norm": 0.2242704276233571,
"learning_rate": 3.466983658166422e-05,
"loss": 0.7459,
"step": 1147
},
{
"epoch": 0.24575205373150305,
"grad_norm": 0.26642590934914734,
"learning_rate": 3.4660597318181364e-05,
"loss": 0.7641,
"step": 1148
},
{
"epoch": 0.24596612346471863,
"grad_norm": 0.2966715399351912,
"learning_rate": 3.465135128739363e-05,
"loss": 0.7158,
"step": 1149
},
{
"epoch": 0.24618019319793422,
"grad_norm": 0.2445052443016698,
"learning_rate": 3.464209849356896e-05,
"loss": 0.721,
"step": 1150
},
{
"epoch": 0.24639426293114983,
"grad_norm": 0.26256000088434067,
"learning_rate": 3.463283894097842e-05,
"loss": 0.7366,
"step": 1151
},
{
"epoch": 0.2466083326643654,
"grad_norm": 0.3048545406567788,
"learning_rate": 3.4623572633896224e-05,
"loss": 0.7271,
"step": 1152
},
{
"epoch": 0.24682240239758102,
"grad_norm": 0.28828148093982753,
"learning_rate": 3.4614299576599656e-05,
"loss": 0.7195,
"step": 1153
},
{
"epoch": 0.2470364721307966,
"grad_norm": 0.23559172995254016,
"learning_rate": 3.4605019773369165e-05,
"loss": 0.7311,
"step": 1154
},
{
"epoch": 0.2472505418640122,
"grad_norm": 0.2554881762437298,
"learning_rate": 3.4595733228488284e-05,
"loss": 0.7182,
"step": 1155
},
{
"epoch": 0.2474646115972278,
"grad_norm": 0.24590570891373473,
"learning_rate": 3.458643994624366e-05,
"loss": 0.7418,
"step": 1156
},
{
"epoch": 0.24767868133044338,
"grad_norm": 0.23111157050752615,
"learning_rate": 3.4577139930925053e-05,
"loss": 0.7423,
"step": 1157
},
{
"epoch": 0.247892751063659,
"grad_norm": 0.23485886010963944,
"learning_rate": 3.456783318682534e-05,
"loss": 0.7599,
"step": 1158
},
{
"epoch": 0.24810682079687457,
"grad_norm": 0.24413155764456476,
"learning_rate": 3.455851971824051e-05,
"loss": 0.7146,
"step": 1159
},
{
"epoch": 0.24832089053009018,
"grad_norm": 0.23833202775302623,
"learning_rate": 3.454919952946961e-05,
"loss": 0.7581,
"step": 1160
},
{
"epoch": 0.24853496026330577,
"grad_norm": 0.23615597843344693,
"learning_rate": 3.453987262481485e-05,
"loss": 0.7703,
"step": 1161
},
{
"epoch": 0.24874902999652138,
"grad_norm": 0.25678600028761517,
"learning_rate": 3.4530539008581505e-05,
"loss": 0.771,
"step": 1162
},
{
"epoch": 0.24896309972973696,
"grad_norm": 0.22283876228606897,
"learning_rate": 3.452119868507794e-05,
"loss": 0.7871,
"step": 1163
},
{
"epoch": 0.24917716946295257,
"grad_norm": 0.2380059035707347,
"learning_rate": 3.451185165861566e-05,
"loss": 0.7308,
"step": 1164
},
{
"epoch": 0.24939123919616815,
"grad_norm": 0.2523623714446187,
"learning_rate": 3.450249793350921e-05,
"loss": 0.7592,
"step": 1165
},
{
"epoch": 0.24960530892938373,
"grad_norm": 0.2354571442181655,
"learning_rate": 3.449313751407626e-05,
"loss": 0.7359,
"step": 1166
},
{
"epoch": 0.24981937866259935,
"grad_norm": 0.20846214666118837,
"learning_rate": 3.4483770404637574e-05,
"loss": 0.7448,
"step": 1167
},
{
"epoch": 0.25003344839581493,
"grad_norm": 0.24788370857158232,
"learning_rate": 3.447439660951697e-05,
"loss": 0.7352,
"step": 1168
},
{
"epoch": 0.2502475181290305,
"grad_norm": 0.22760502346048284,
"learning_rate": 3.4465016133041405e-05,
"loss": 0.7554,
"step": 1169
},
{
"epoch": 0.25046158786224615,
"grad_norm": 0.22386578931970105,
"learning_rate": 3.4455628979540856e-05,
"loss": 0.7349,
"step": 1170
},
{
"epoch": 0.25067565759546173,
"grad_norm": 0.24959189418281694,
"learning_rate": 3.444623515334844e-05,
"loss": 0.7138,
"step": 1171
},
{
"epoch": 0.2508897273286773,
"grad_norm": 0.2385615516788312,
"learning_rate": 3.443683465880032e-05,
"loss": 0.7351,
"step": 1172
},
{
"epoch": 0.2511037970618929,
"grad_norm": 0.24990831138591885,
"learning_rate": 3.442742750023575e-05,
"loss": 0.7392,
"step": 1173
},
{
"epoch": 0.2513178667951085,
"grad_norm": 0.24757762975607733,
"learning_rate": 3.441801368199706e-05,
"loss": 0.7597,
"step": 1174
},
{
"epoch": 0.2515319365283241,
"grad_norm": 0.24073704959664105,
"learning_rate": 3.4408593208429637e-05,
"loss": 0.7491,
"step": 1175
},
{
"epoch": 0.2517460062615397,
"grad_norm": 0.20779625732813095,
"learning_rate": 3.439916608388197e-05,
"loss": 0.6953,
"step": 1176
},
{
"epoch": 0.2519600759947553,
"grad_norm": 0.2547420961904698,
"learning_rate": 3.43897323127056e-05,
"loss": 0.7293,
"step": 1177
},
{
"epoch": 0.25217414572797087,
"grad_norm": 0.24464680814797685,
"learning_rate": 3.438029189925513e-05,
"loss": 0.7039,
"step": 1178
},
{
"epoch": 0.2523882154611865,
"grad_norm": 0.21550033836220966,
"learning_rate": 3.437084484788825e-05,
"loss": 0.753,
"step": 1179
},
{
"epoch": 0.2526022851944021,
"grad_norm": 0.24667308792049616,
"learning_rate": 3.436139116296569e-05,
"loss": 0.7513,
"step": 1180
},
{
"epoch": 0.25281635492761767,
"grad_norm": 0.2572438301730163,
"learning_rate": 3.4351930848851264e-05,
"loss": 0.7672,
"step": 1181
},
{
"epoch": 0.25303042466083325,
"grad_norm": 0.2297997590083026,
"learning_rate": 3.4342463909911826e-05,
"loss": 0.7388,
"step": 1182
},
{
"epoch": 0.25324449439404884,
"grad_norm": 0.2407006829080367,
"learning_rate": 3.433299035051731e-05,
"loss": 0.7191,
"step": 1183
},
{
"epoch": 0.2534585641272645,
"grad_norm": 0.26075842853984643,
"learning_rate": 3.432351017504068e-05,
"loss": 0.7334,
"step": 1184
},
{
"epoch": 0.25367263386048006,
"grad_norm": 0.2901402030666382,
"learning_rate": 3.431402338785797e-05,
"loss": 0.7273,
"step": 1185
},
{
"epoch": 0.25388670359369564,
"grad_norm": 0.23686107780870275,
"learning_rate": 3.4304529993348276e-05,
"loss": 0.7407,
"step": 1186
},
{
"epoch": 0.2541007733269112,
"grad_norm": 0.24898576813796555,
"learning_rate": 3.429502999589371e-05,
"loss": 0.7523,
"step": 1187
},
{
"epoch": 0.25431484306012686,
"grad_norm": 0.2813968197460596,
"learning_rate": 3.4285523399879476e-05,
"loss": 0.7289,
"step": 1188
},
{
"epoch": 0.25452891279334244,
"grad_norm": 0.23487785867274336,
"learning_rate": 3.427601020969379e-05,
"loss": 0.755,
"step": 1189
},
{
"epoch": 0.254742982526558,
"grad_norm": 0.24512958456467976,
"learning_rate": 3.426649042972792e-05,
"loss": 0.7274,
"step": 1190
},
{
"epoch": 0.2549570522597736,
"grad_norm": 0.23657665610482212,
"learning_rate": 3.425696406437619e-05,
"loss": 0.7295,
"step": 1191
},
{
"epoch": 0.2551711219929892,
"grad_norm": 0.2324456811817946,
"learning_rate": 3.424743111803594e-05,
"loss": 0.758,
"step": 1192
},
{
"epoch": 0.25538519172620483,
"grad_norm": 0.21708333632414636,
"learning_rate": 3.423789159510757e-05,
"loss": 0.7426,
"step": 1193
},
{
"epoch": 0.2555992614594204,
"grad_norm": 0.24871125843116768,
"learning_rate": 3.4228345499994504e-05,
"loss": 0.741,
"step": 1194
},
{
"epoch": 0.255813331192636,
"grad_norm": 0.2307222244246413,
"learning_rate": 3.42187928371032e-05,
"loss": 0.7458,
"step": 1195
},
{
"epoch": 0.2560274009258516,
"grad_norm": 0.22276180460737532,
"learning_rate": 3.420923361084315e-05,
"loss": 0.7792,
"step": 1196
},
{
"epoch": 0.2562414706590672,
"grad_norm": 0.2305342650065054,
"learning_rate": 3.419966782562687e-05,
"loss": 0.7801,
"step": 1197
},
{
"epoch": 0.2564555403922828,
"grad_norm": 0.21422753082824808,
"learning_rate": 3.4190095485869926e-05,
"loss": 0.7429,
"step": 1198
},
{
"epoch": 0.2566696101254984,
"grad_norm": 0.237125565263133,
"learning_rate": 3.418051659599088e-05,
"loss": 0.7552,
"step": 1199
},
{
"epoch": 0.25688367985871396,
"grad_norm": 0.24684772760226564,
"learning_rate": 3.417093116041133e-05,
"loss": 0.7257,
"step": 1200
},
{
"epoch": 0.25709774959192955,
"grad_norm": 0.24245413705233052,
"learning_rate": 3.4161339183555896e-05,
"loss": 0.7491,
"step": 1201
},
{
"epoch": 0.2573118193251452,
"grad_norm": 0.21715045986213533,
"learning_rate": 3.415174066985222e-05,
"loss": 0.7643,
"step": 1202
},
{
"epoch": 0.25752588905836077,
"grad_norm": 0.2411298591658727,
"learning_rate": 3.4142135623730954e-05,
"loss": 0.7585,
"step": 1203
},
{
"epoch": 0.25773995879157635,
"grad_norm": 0.23090726187919966,
"learning_rate": 3.4132524049625774e-05,
"loss": 0.7471,
"step": 1204
},
{
"epoch": 0.25795402852479193,
"grad_norm": 0.21438308223040606,
"learning_rate": 3.412290595197337e-05,
"loss": 0.7267,
"step": 1205
},
{
"epoch": 0.2581680982580076,
"grad_norm": 0.25239742637018964,
"learning_rate": 3.4113281335213416e-05,
"loss": 0.738,
"step": 1206
},
{
"epoch": 0.25838216799122316,
"grad_norm": 0.20796047833447395,
"learning_rate": 3.4103650203788646e-05,
"loss": 0.7382,
"step": 1207
},
{
"epoch": 0.25859623772443874,
"grad_norm": 0.23967752000872217,
"learning_rate": 3.4094012562144754e-05,
"loss": 0.7378,
"step": 1208
},
{
"epoch": 0.2588103074576543,
"grad_norm": 0.24480785000490024,
"learning_rate": 3.408436841473046e-05,
"loss": 0.7319,
"step": 1209
},
{
"epoch": 0.2590243771908699,
"grad_norm": 0.2591456126780797,
"learning_rate": 3.40747177659975e-05,
"loss": 0.7375,
"step": 1210
},
{
"epoch": 0.25923844692408554,
"grad_norm": 0.2224563979787024,
"learning_rate": 3.406506062040057e-05,
"loss": 0.7396,
"step": 1211
},
{
"epoch": 0.2594525166573011,
"grad_norm": 0.2892949208541926,
"learning_rate": 3.405539698239742e-05,
"loss": 0.738,
"step": 1212
},
{
"epoch": 0.2596665863905167,
"grad_norm": 0.28509861922730945,
"learning_rate": 3.4045726856448745e-05,
"loss": 0.7307,
"step": 1213
},
{
"epoch": 0.2598806561237323,
"grad_norm": 0.2242946439181324,
"learning_rate": 3.403605024701826e-05,
"loss": 0.7416,
"step": 1214
},
{
"epoch": 0.26009472585694793,
"grad_norm": 0.24377794993916513,
"learning_rate": 3.402636715857268e-05,
"loss": 0.7572,
"step": 1215
},
{
"epoch": 0.2603087955901635,
"grad_norm": 0.2776969756968053,
"learning_rate": 3.4016677595581696e-05,
"loss": 0.7408,
"step": 1216
},
{
"epoch": 0.2605228653233791,
"grad_norm": 0.25539707708554316,
"learning_rate": 3.4006981562517985e-05,
"loss": 0.7374,
"step": 1217
},
{
"epoch": 0.2607369350565947,
"grad_norm": 0.2529779543226716,
"learning_rate": 3.3997279063857234e-05,
"loss": 0.7201,
"step": 1218
},
{
"epoch": 0.26095100478981026,
"grad_norm": 0.24168770679893958,
"learning_rate": 3.398757010407809e-05,
"loss": 0.738,
"step": 1219
},
{
"epoch": 0.2611650745230259,
"grad_norm": 0.22336368147850622,
"learning_rate": 3.397785468766219e-05,
"loss": 0.7246,
"step": 1220
},
{
"epoch": 0.2613791442562415,
"grad_norm": 0.26353607775290483,
"learning_rate": 3.3968132819094153e-05,
"loss": 0.7462,
"step": 1221
},
{
"epoch": 0.26159321398945706,
"grad_norm": 0.25318820280116333,
"learning_rate": 3.3958404502861574e-05,
"loss": 0.7608,
"step": 1222
},
{
"epoch": 0.26180728372267265,
"grad_norm": 0.2149599172943751,
"learning_rate": 3.394866974345504e-05,
"loss": 0.7156,
"step": 1223
},
{
"epoch": 0.26202135345588823,
"grad_norm": 0.21534152810417512,
"learning_rate": 3.393892854536807e-05,
"loss": 0.7565,
"step": 1224
},
{
"epoch": 0.26223542318910387,
"grad_norm": 0.24084926310843696,
"learning_rate": 3.3929180913097206e-05,
"loss": 0.7478,
"step": 1225
},
{
"epoch": 0.26244949292231945,
"grad_norm": 0.2148070403910902,
"learning_rate": 3.3919426851141935e-05,
"loss": 0.7192,
"step": 1226
},
{
"epoch": 0.26266356265553503,
"grad_norm": 0.22757017071777,
"learning_rate": 3.39096663640047e-05,
"loss": 0.7341,
"step": 1227
},
{
"epoch": 0.2628776323887506,
"grad_norm": 0.2438716429425449,
"learning_rate": 3.389989945619094e-05,
"loss": 0.7284,
"step": 1228
},
{
"epoch": 0.26309170212196625,
"grad_norm": 0.2139362123552242,
"learning_rate": 3.389012613220904e-05,
"loss": 0.7592,
"step": 1229
},
{
"epoch": 0.26330577185518184,
"grad_norm": 0.21884388234064667,
"learning_rate": 3.3880346396570344e-05,
"loss": 0.6918,
"step": 1230
},
{
"epoch": 0.2635198415883974,
"grad_norm": 0.20923067223168929,
"learning_rate": 3.3870560253789155e-05,
"loss": 0.724,
"step": 1231
},
{
"epoch": 0.263733911321613,
"grad_norm": 0.24306339919153871,
"learning_rate": 3.386076770838274e-05,
"loss": 0.7499,
"step": 1232
},
{
"epoch": 0.2639479810548286,
"grad_norm": 0.2323253062905506,
"learning_rate": 3.385096876487134e-05,
"loss": 0.7435,
"step": 1233
},
{
"epoch": 0.2641620507880442,
"grad_norm": 0.21947749990891102,
"learning_rate": 3.38411634277781e-05,
"loss": 0.7402,
"step": 1234
},
{
"epoch": 0.2643761205212598,
"grad_norm": 0.20947238115140063,
"learning_rate": 3.383135170162916e-05,
"loss": 0.733,
"step": 1235
},
{
"epoch": 0.2645901902544754,
"grad_norm": 0.2148703620295522,
"learning_rate": 3.38215335909536e-05,
"loss": 0.7475,
"step": 1236
},
{
"epoch": 0.26480425998769097,
"grad_norm": 0.22442933534089865,
"learning_rate": 3.3811709100283434e-05,
"loss": 0.7534,
"step": 1237
},
{
"epoch": 0.2650183297209066,
"grad_norm": 0.23414723174854493,
"learning_rate": 3.3801878234153624e-05,
"loss": 0.7487,
"step": 1238
},
{
"epoch": 0.2652323994541222,
"grad_norm": 0.31915631434876957,
"learning_rate": 3.3792040997102093e-05,
"loss": 0.7595,
"step": 1239
},
{
"epoch": 0.2654464691873378,
"grad_norm": 0.2387136592412898,
"learning_rate": 3.3782197393669684e-05,
"loss": 0.7083,
"step": 1240
},
{
"epoch": 0.26566053892055336,
"grad_norm": 0.2390840354417617,
"learning_rate": 3.3772347428400185e-05,
"loss": 0.7535,
"step": 1241
},
{
"epoch": 0.26587460865376894,
"grad_norm": 0.22769911602399937,
"learning_rate": 3.376249110584033e-05,
"loss": 0.7421,
"step": 1242
},
{
"epoch": 0.2660886783869846,
"grad_norm": 0.23289511358940743,
"learning_rate": 3.375262843053976e-05,
"loss": 0.7583,
"step": 1243
},
{
"epoch": 0.26630274812020016,
"grad_norm": 0.21364216869927816,
"learning_rate": 3.3742759407051094e-05,
"loss": 0.7285,
"step": 1244
},
{
"epoch": 0.26651681785341574,
"grad_norm": 0.23627876629788905,
"learning_rate": 3.3732884039929844e-05,
"loss": 0.7323,
"step": 1245
},
{
"epoch": 0.2667308875866313,
"grad_norm": 0.2276106304734522,
"learning_rate": 3.372300233373446e-05,
"loss": 0.7274,
"step": 1246
},
{
"epoch": 0.26694495731984696,
"grad_norm": 0.23001668093135316,
"learning_rate": 3.371311429302632e-05,
"loss": 0.7088,
"step": 1247
},
{
"epoch": 0.26715902705306255,
"grad_norm": 0.2463448454397025,
"learning_rate": 3.370321992236971e-05,
"loss": 0.7208,
"step": 1248
},
{
"epoch": 0.26737309678627813,
"grad_norm": 0.301210826139636,
"learning_rate": 3.369331922633189e-05,
"loss": 0.7203,
"step": 1249
},
{
"epoch": 0.2675871665194937,
"grad_norm": 0.2839366167069765,
"learning_rate": 3.368341220948297e-05,
"loss": 0.7398,
"step": 1250
},
{
"epoch": 0.2678012362527093,
"grad_norm": 0.21128119196927372,
"learning_rate": 3.367349887639602e-05,
"loss": 0.754,
"step": 1251
},
{
"epoch": 0.26801530598592493,
"grad_norm": 0.23116333876179326,
"learning_rate": 3.366357923164702e-05,
"loss": 0.7604,
"step": 1252
},
{
"epoch": 0.2682293757191405,
"grad_norm": 0.22747271551245782,
"learning_rate": 3.3653653279814865e-05,
"loss": 0.7394,
"step": 1253
},
{
"epoch": 0.2684434454523561,
"grad_norm": 0.23004307375556815,
"learning_rate": 3.364372102548135e-05,
"loss": 0.7287,
"step": 1254
},
{
"epoch": 0.2686575151855717,
"grad_norm": 0.2750622226294108,
"learning_rate": 3.3633782473231176e-05,
"loss": 0.7613,
"step": 1255
},
{
"epoch": 0.2688715849187873,
"grad_norm": 0.2672186064726538,
"learning_rate": 3.362383762765198e-05,
"loss": 0.7325,
"step": 1256
},
{
"epoch": 0.2690856546520029,
"grad_norm": 0.2348914065267851,
"learning_rate": 3.361388649333427e-05,
"loss": 0.7169,
"step": 1257
},
{
"epoch": 0.2692997243852185,
"grad_norm": 0.24606329973802127,
"learning_rate": 3.360392907487148e-05,
"loss": 0.7387,
"step": 1258
},
{
"epoch": 0.26951379411843407,
"grad_norm": 0.24919064555519513,
"learning_rate": 3.359396537685992e-05,
"loss": 0.711,
"step": 1259
},
{
"epoch": 0.26972786385164965,
"grad_norm": 0.25219342730910826,
"learning_rate": 3.358399540389884e-05,
"loss": 0.7379,
"step": 1260
},
{
"epoch": 0.2699419335848653,
"grad_norm": 0.2296712182666378,
"learning_rate": 3.3574019160590345e-05,
"loss": 0.7442,
"step": 1261
},
{
"epoch": 0.2701560033180809,
"grad_norm": 0.22192744289815136,
"learning_rate": 3.3564036651539455e-05,
"loss": 0.74,
"step": 1262
},
{
"epoch": 0.27037007305129646,
"grad_norm": 0.24846168601795277,
"learning_rate": 3.355404788135407e-05,
"loss": 0.725,
"step": 1263
},
{
"epoch": 0.27058414278451204,
"grad_norm": 0.25442473984225245,
"learning_rate": 3.3544052854645e-05,
"loss": 0.7159,
"step": 1264
},
{
"epoch": 0.2707982125177277,
"grad_norm": 0.2263136064538683,
"learning_rate": 3.353405157602592e-05,
"loss": 0.7222,
"step": 1265
},
{
"epoch": 0.27101228225094326,
"grad_norm": 0.25067143420904886,
"learning_rate": 3.352404405011342e-05,
"loss": 0.7424,
"step": 1266
},
{
"epoch": 0.27122635198415884,
"grad_norm": 0.2569024734131973,
"learning_rate": 3.351403028152693e-05,
"loss": 0.7412,
"step": 1267
},
{
"epoch": 0.2714404217173744,
"grad_norm": 0.2304888122878882,
"learning_rate": 3.3504010274888806e-05,
"loss": 0.7235,
"step": 1268
},
{
"epoch": 0.27165449145059,
"grad_norm": 0.23808739897672176,
"learning_rate": 3.349398403482426e-05,
"loss": 0.7167,
"step": 1269
},
{
"epoch": 0.27186856118380565,
"grad_norm": 0.2238370810629572,
"learning_rate": 3.348395156596138e-05,
"loss": 0.692,
"step": 1270
},
{
"epoch": 0.27208263091702123,
"grad_norm": 0.26716897637223047,
"learning_rate": 3.347391287293115e-05,
"loss": 0.7471,
"step": 1271
},
{
"epoch": 0.2722967006502368,
"grad_norm": 0.22016470462040894,
"learning_rate": 3.34638679603674e-05,
"loss": 0.742,
"step": 1272
},
{
"epoch": 0.2725107703834524,
"grad_norm": 0.2356505881221704,
"learning_rate": 3.3453816832906835e-05,
"loss": 0.7644,
"step": 1273
},
{
"epoch": 0.272724840116668,
"grad_norm": 0.2543211750149203,
"learning_rate": 3.344375949518906e-05,
"loss": 0.7239,
"step": 1274
},
{
"epoch": 0.2729389098498836,
"grad_norm": 0.24755004119231183,
"learning_rate": 3.343369595185651e-05,
"loss": 0.7264,
"step": 1275
},
{
"epoch": 0.2731529795830992,
"grad_norm": 0.212753245018397,
"learning_rate": 3.3423626207554494e-05,
"loss": 0.7172,
"step": 1276
},
{
"epoch": 0.2733670493163148,
"grad_norm": 0.23528019489141624,
"learning_rate": 3.34135502669312e-05,
"loss": 0.717,
"step": 1277
},
{
"epoch": 0.27358111904953036,
"grad_norm": 0.25147108889505876,
"learning_rate": 3.3403468134637654e-05,
"loss": 0.7155,
"step": 1278
},
{
"epoch": 0.273795188782746,
"grad_norm": 0.2261653568767125,
"learning_rate": 3.339337981532776e-05,
"loss": 0.7383,
"step": 1279
},
{
"epoch": 0.2740092585159616,
"grad_norm": 0.24961114565552953,
"learning_rate": 3.3383285313658254e-05,
"loss": 0.7201,
"step": 1280
},
{
"epoch": 0.27422332824917717,
"grad_norm": 0.2761825909484211,
"learning_rate": 3.337318463428874e-05,
"loss": 0.7258,
"step": 1281
},
{
"epoch": 0.27443739798239275,
"grad_norm": 0.2439268449071247,
"learning_rate": 3.336307778188169e-05,
"loss": 0.7377,
"step": 1282
},
{
"epoch": 0.27465146771560833,
"grad_norm": 0.2529478674719712,
"learning_rate": 3.3352964761102395e-05,
"loss": 0.7486,
"step": 1283
},
{
"epoch": 0.27486553744882397,
"grad_norm": 0.22273915873906183,
"learning_rate": 3.334284557661901e-05,
"loss": 0.7373,
"step": 1284
},
{
"epoch": 0.27507960718203955,
"grad_norm": 0.24813716145623047,
"learning_rate": 3.333272023310253e-05,
"loss": 0.766,
"step": 1285
},
{
"epoch": 0.27529367691525514,
"grad_norm": 0.21863374492148302,
"learning_rate": 3.33225887352268e-05,
"loss": 0.7578,
"step": 1286
},
{
"epoch": 0.2755077466484707,
"grad_norm": 0.24762223940774178,
"learning_rate": 3.331245108766849e-05,
"loss": 0.748,
"step": 1287
},
{
"epoch": 0.27572181638168636,
"grad_norm": 0.2413065434679842,
"learning_rate": 3.330230729510714e-05,
"loss": 0.7267,
"step": 1288
},
{
"epoch": 0.27593588611490194,
"grad_norm": 0.22838099631168504,
"learning_rate": 3.329215736222508e-05,
"loss": 0.6969,
"step": 1289
},
{
"epoch": 0.2761499558481175,
"grad_norm": 0.21462260948933617,
"learning_rate": 3.328200129370752e-05,
"loss": 0.7252,
"step": 1290
},
{
"epoch": 0.2763640255813331,
"grad_norm": 0.23463784112616665,
"learning_rate": 3.327183909424248e-05,
"loss": 0.7257,
"step": 1291
},
{
"epoch": 0.2765780953145487,
"grad_norm": 0.24506200071432127,
"learning_rate": 3.326167076852081e-05,
"loss": 0.7455,
"step": 1292
},
{
"epoch": 0.2767921650477643,
"grad_norm": 0.25487913911280596,
"learning_rate": 3.325149632123618e-05,
"loss": 0.753,
"step": 1293
},
{
"epoch": 0.2770062347809799,
"grad_norm": 0.22380539321134613,
"learning_rate": 3.324131575708512e-05,
"loss": 0.6957,
"step": 1294
},
{
"epoch": 0.2772203045141955,
"grad_norm": 0.22729766641670007,
"learning_rate": 3.323112908076693e-05,
"loss": 0.7592,
"step": 1295
},
{
"epoch": 0.2774343742474111,
"grad_norm": 0.2310693350497247,
"learning_rate": 3.322093629698379e-05,
"loss": 0.7193,
"step": 1296
},
{
"epoch": 0.2776484439806267,
"grad_norm": 0.20681259768160018,
"learning_rate": 3.321073741044065e-05,
"loss": 0.7381,
"step": 1297
},
{
"epoch": 0.2778625137138423,
"grad_norm": 0.2151928003070936,
"learning_rate": 3.32005324258453e-05,
"loss": 0.7313,
"step": 1298
},
{
"epoch": 0.2780765834470579,
"grad_norm": 0.21427855874770377,
"learning_rate": 3.319032134790836e-05,
"loss": 0.7516,
"step": 1299
},
{
"epoch": 0.27829065318027346,
"grad_norm": 0.20595401236647193,
"learning_rate": 3.3180104181343224e-05,
"loss": 0.7176,
"step": 1300
},
{
"epoch": 0.27850472291348904,
"grad_norm": 0.3106695088656347,
"learning_rate": 3.316988093086612e-05,
"loss": 0.7493,
"step": 1301
},
{
"epoch": 0.2787187926467047,
"grad_norm": 0.2340688588742366,
"learning_rate": 3.3159651601196094e-05,
"loss": 0.7354,
"step": 1302
},
{
"epoch": 0.27893286237992027,
"grad_norm": 0.22076851472351364,
"learning_rate": 3.314941619705498e-05,
"loss": 0.7334,
"step": 1303
},
{
"epoch": 0.27914693211313585,
"grad_norm": 0.19874871129521252,
"learning_rate": 3.3139174723167415e-05,
"loss": 0.7589,
"step": 1304
},
{
"epoch": 0.27936100184635143,
"grad_norm": 0.2212115497004667,
"learning_rate": 3.312892718426086e-05,
"loss": 0.7542,
"step": 1305
},
{
"epoch": 0.27957507157956707,
"grad_norm": 0.21415836397243754,
"learning_rate": 3.3118673585065536e-05,
"loss": 0.7369,
"step": 1306
},
{
"epoch": 0.27978914131278265,
"grad_norm": 0.2179798245266278,
"learning_rate": 3.3108413930314506e-05,
"loss": 0.7638,
"step": 1307
},
{
"epoch": 0.28000321104599823,
"grad_norm": 0.2540638570289035,
"learning_rate": 3.30981482247436e-05,
"loss": 0.7414,
"step": 1308
},
{
"epoch": 0.2802172807792138,
"grad_norm": 0.2061278171225783,
"learning_rate": 3.3087876473091455e-05,
"loss": 0.7356,
"step": 1309
},
{
"epoch": 0.2804313505124294,
"grad_norm": 0.20998675886616364,
"learning_rate": 3.307759868009949e-05,
"loss": 0.7475,
"step": 1310
},
{
"epoch": 0.28064542024564504,
"grad_norm": 0.2231058947198689,
"learning_rate": 3.306731485051191e-05,
"loss": 0.7131,
"step": 1311
},
{
"epoch": 0.2808594899788606,
"grad_norm": 0.3566037413688859,
"learning_rate": 3.3057024989075715e-05,
"loss": 0.7525,
"step": 1312
},
{
"epoch": 0.2810735597120762,
"grad_norm": 0.22277082867800663,
"learning_rate": 3.3046729100540686e-05,
"loss": 0.7493,
"step": 1313
},
{
"epoch": 0.2812876294452918,
"grad_norm": 0.1861671724954601,
"learning_rate": 3.3036427189659386e-05,
"loss": 0.7061,
"step": 1314
},
{
"epoch": 0.2815016991785074,
"grad_norm": 0.21021655214095677,
"learning_rate": 3.302611926118716e-05,
"loss": 0.7353,
"step": 1315
},
{
"epoch": 0.281715768911723,
"grad_norm": 0.20877391839607665,
"learning_rate": 3.301580531988213e-05,
"loss": 0.7621,
"step": 1316
},
{
"epoch": 0.2819298386449386,
"grad_norm": 0.20533436000378583,
"learning_rate": 3.300548537050519e-05,
"loss": 0.721,
"step": 1317
},
{
"epoch": 0.2821439083781542,
"grad_norm": 0.20169237903063889,
"learning_rate": 3.2995159417820014e-05,
"loss": 0.7542,
"step": 1318
},
{
"epoch": 0.28235797811136976,
"grad_norm": 0.21409057852008287,
"learning_rate": 3.2984827466593036e-05,
"loss": 0.7658,
"step": 1319
},
{
"epoch": 0.2825720478445854,
"grad_norm": 0.20799828550855554,
"learning_rate": 3.2974489521593474e-05,
"loss": 0.7318,
"step": 1320
},
{
"epoch": 0.282786117577801,
"grad_norm": 0.21440521985054223,
"learning_rate": 3.296414558759329e-05,
"loss": 0.7446,
"step": 1321
},
{
"epoch": 0.28300018731101656,
"grad_norm": 0.20109109765449448,
"learning_rate": 3.295379566936724e-05,
"loss": 0.7237,
"step": 1322
},
{
"epoch": 0.28321425704423214,
"grad_norm": 0.22008644947199202,
"learning_rate": 3.294343977169282e-05,
"loss": 0.7242,
"step": 1323
},
{
"epoch": 0.2834283267774478,
"grad_norm": 0.21810873547560058,
"learning_rate": 3.29330778993503e-05,
"loss": 0.7269,
"step": 1324
},
{
"epoch": 0.28364239651066336,
"grad_norm": 0.2109574149141801,
"learning_rate": 3.292271005712269e-05,
"loss": 0.7139,
"step": 1325
},
{
"epoch": 0.28385646624387895,
"grad_norm": 0.2226470165117003,
"learning_rate": 3.291233624979578e-05,
"loss": 0.7364,
"step": 1326
},
{
"epoch": 0.28407053597709453,
"grad_norm": 0.22180153572255398,
"learning_rate": 3.290195648215809e-05,
"loss": 0.7035,
"step": 1327
},
{
"epoch": 0.2842846057103101,
"grad_norm": 0.21212274759872496,
"learning_rate": 3.289157075900091e-05,
"loss": 0.752,
"step": 1328
},
{
"epoch": 0.28449867544352575,
"grad_norm": 0.22392232963013,
"learning_rate": 3.288117908511826e-05,
"loss": 0.7124,
"step": 1329
},
{
"epoch": 0.28471274517674133,
"grad_norm": 0.2217384973022529,
"learning_rate": 3.287078146530693e-05,
"loss": 0.7119,
"step": 1330
},
{
"epoch": 0.2849268149099569,
"grad_norm": 0.2239696839241841,
"learning_rate": 3.286037790436644e-05,
"loss": 0.709,
"step": 1331
},
{
"epoch": 0.2851408846431725,
"grad_norm": 0.21247932684313287,
"learning_rate": 3.284996840709904e-05,
"loss": 0.7655,
"step": 1332
},
{
"epoch": 0.2853549543763881,
"grad_norm": 0.24100995837849887,
"learning_rate": 3.283955297830975e-05,
"loss": 0.7191,
"step": 1333
},
{
"epoch": 0.2855690241096037,
"grad_norm": 0.21021584901521734,
"learning_rate": 3.2829131622806316e-05,
"loss": 0.7369,
"step": 1334
},
{
"epoch": 0.2857830938428193,
"grad_norm": 0.20031814317637867,
"learning_rate": 3.28187043453992e-05,
"loss": 0.7201,
"step": 1335
},
{
"epoch": 0.2859971635760349,
"grad_norm": 0.2407290690269822,
"learning_rate": 3.2808271150901626e-05,
"loss": 0.7367,
"step": 1336
},
{
"epoch": 0.28621123330925047,
"grad_norm": 0.20307590665925798,
"learning_rate": 3.279783204412954e-05,
"loss": 0.6986,
"step": 1337
},
{
"epoch": 0.2864253030424661,
"grad_norm": 0.24047420783975218,
"learning_rate": 3.2787387029901606e-05,
"loss": 0.7292,
"step": 1338
},
{
"epoch": 0.2866393727756817,
"grad_norm": 0.24157870880732082,
"learning_rate": 3.277693611303922e-05,
"loss": 0.7134,
"step": 1339
},
{
"epoch": 0.28685344250889727,
"grad_norm": 0.22682727467384456,
"learning_rate": 3.276647929836653e-05,
"loss": 0.7023,
"step": 1340
},
{
"epoch": 0.28706751224211285,
"grad_norm": 0.2188005785823767,
"learning_rate": 3.2756016590710355e-05,
"loss": 0.7707,
"step": 1341
},
{
"epoch": 0.28728158197532844,
"grad_norm": 0.28172732336907075,
"learning_rate": 3.274554799490028e-05,
"loss": 0.7272,
"step": 1342
},
{
"epoch": 0.2874956517085441,
"grad_norm": 0.24192696282082157,
"learning_rate": 3.273507351576857e-05,
"loss": 0.7132,
"step": 1343
},
{
"epoch": 0.28770972144175966,
"grad_norm": 0.2315519440674189,
"learning_rate": 3.272459315815025e-05,
"loss": 0.7394,
"step": 1344
},
{
"epoch": 0.28792379117497524,
"grad_norm": 0.26217337426162685,
"learning_rate": 3.2714106926883016e-05,
"loss": 0.7225,
"step": 1345
},
{
"epoch": 0.2881378609081908,
"grad_norm": 0.26990586593344973,
"learning_rate": 3.27036148268073e-05,
"loss": 0.7441,
"step": 1346
},
{
"epoch": 0.28835193064140646,
"grad_norm": 0.21589713648963416,
"learning_rate": 3.2693116862766236e-05,
"loss": 0.7161,
"step": 1347
},
{
"epoch": 0.28856600037462204,
"grad_norm": 0.24421754890717157,
"learning_rate": 3.2682613039605655e-05,
"loss": 0.7207,
"step": 1348
},
{
"epoch": 0.2887800701078376,
"grad_norm": 0.24741110918426046,
"learning_rate": 3.267210336217412e-05,
"loss": 0.7422,
"step": 1349
},
{
"epoch": 0.2889941398410532,
"grad_norm": 0.21218214160427318,
"learning_rate": 3.266158783532287e-05,
"loss": 0.7416,
"step": 1350
},
{
"epoch": 0.2892082095742688,
"grad_norm": 0.23033686820949453,
"learning_rate": 3.2651066463905854e-05,
"loss": 0.724,
"step": 1351
},
{
"epoch": 0.28942227930748443,
"grad_norm": 0.24030135503458686,
"learning_rate": 3.264053925277972e-05,
"loss": 0.7262,
"step": 1352
},
{
"epoch": 0.2896363490407,
"grad_norm": 0.23475361277373719,
"learning_rate": 3.263000620680379e-05,
"loss": 0.7475,
"step": 1353
},
{
"epoch": 0.2898504187739156,
"grad_norm": 0.2060328426111773,
"learning_rate": 3.2619467330840124e-05,
"loss": 0.7456,
"step": 1354
},
{
"epoch": 0.2900644885071312,
"grad_norm": 0.2396608594606869,
"learning_rate": 3.2608922629753444e-05,
"loss": 0.7411,
"step": 1355
},
{
"epoch": 0.2902785582403468,
"grad_norm": 0.21255554908811655,
"learning_rate": 3.259837210841116e-05,
"loss": 0.7543,
"step": 1356
},
{
"epoch": 0.2904926279735624,
"grad_norm": 0.2035296928731616,
"learning_rate": 3.2587815771683364e-05,
"loss": 0.7343,
"step": 1357
},
{
"epoch": 0.290706697706778,
"grad_norm": 0.21053857087589242,
"learning_rate": 3.2577253624442855e-05,
"loss": 0.6848,
"step": 1358
},
{
"epoch": 0.29092076743999357,
"grad_norm": 0.22660109624261895,
"learning_rate": 3.25666856715651e-05,
"loss": 0.7321,
"step": 1359
},
{
"epoch": 0.29113483717320915,
"grad_norm": 0.1899904190919483,
"learning_rate": 3.255611191792824e-05,
"loss": 0.7437,
"step": 1360
},
{
"epoch": 0.2913489069064248,
"grad_norm": 0.22172872661906323,
"learning_rate": 3.254553236841311e-05,
"loss": 0.7482,
"step": 1361
},
{
"epoch": 0.29156297663964037,
"grad_norm": 0.20740244056190774,
"learning_rate": 3.25349470279032e-05,
"loss": 0.7255,
"step": 1362
},
{
"epoch": 0.29177704637285595,
"grad_norm": 0.3158156452257583,
"learning_rate": 3.2524355901284676e-05,
"loss": 0.7662,
"step": 1363
},
{
"epoch": 0.29199111610607154,
"grad_norm": 0.22748707107737778,
"learning_rate": 3.2513758993446406e-05,
"loss": 0.7428,
"step": 1364
},
{
"epoch": 0.2922051858392872,
"grad_norm": 0.21535543945914187,
"learning_rate": 3.2503156309279895e-05,
"loss": 0.7383,
"step": 1365
},
{
"epoch": 0.29241925557250276,
"grad_norm": 0.2113972483014738,
"learning_rate": 3.249254785367931e-05,
"loss": 0.7492,
"step": 1366
},
{
"epoch": 0.29263332530571834,
"grad_norm": 0.218096472040482,
"learning_rate": 3.248193363154151e-05,
"loss": 0.7312,
"step": 1367
},
{
"epoch": 0.2928473950389339,
"grad_norm": 0.22987206607929475,
"learning_rate": 3.2471313647766e-05,
"loss": 0.7477,
"step": 1368
},
{
"epoch": 0.2930614647721495,
"grad_norm": 0.2198837848575135,
"learning_rate": 3.2460687907254933e-05,
"loss": 0.728,
"step": 1369
},
{
"epoch": 0.29327553450536514,
"grad_norm": 0.23854273009753085,
"learning_rate": 3.245005641491314e-05,
"loss": 0.742,
"step": 1370
},
{
"epoch": 0.2934896042385807,
"grad_norm": 0.33658059803919166,
"learning_rate": 3.2439419175648096e-05,
"loss": 0.7506,
"step": 1371
},
{
"epoch": 0.2937036739717963,
"grad_norm": 0.3201983548062593,
"learning_rate": 3.2428776194369936e-05,
"loss": 0.7548,
"step": 1372
},
{
"epoch": 0.2939177437050119,
"grad_norm": 0.22445902352448582,
"learning_rate": 3.241812747599143e-05,
"loss": 0.7137,
"step": 1373
},
{
"epoch": 0.29413181343822753,
"grad_norm": 0.24163029792772803,
"learning_rate": 3.2407473025428014e-05,
"loss": 0.717,
"step": 1374
},
{
"epoch": 0.2943458831714431,
"grad_norm": 0.2328587034862239,
"learning_rate": 3.239681284759776e-05,
"loss": 0.7272,
"step": 1375
},
{
"epoch": 0.2945599529046587,
"grad_norm": 0.20081575381786798,
"learning_rate": 3.23861469474214e-05,
"loss": 0.7434,
"step": 1376
},
{
"epoch": 0.2947740226378743,
"grad_norm": 0.23837871139042788,
"learning_rate": 3.237547532982228e-05,
"loss": 0.7267,
"step": 1377
},
{
"epoch": 0.29498809237108986,
"grad_norm": 0.21823564640646656,
"learning_rate": 3.2364797999726395e-05,
"loss": 0.7141,
"step": 1378
},
{
"epoch": 0.2952021621043055,
"grad_norm": 0.22545105132075569,
"learning_rate": 3.2354114962062394e-05,
"loss": 0.7179,
"step": 1379
},
{
"epoch": 0.2954162318375211,
"grad_norm": 0.23484867044352178,
"learning_rate": 3.234342622176153e-05,
"loss": 0.7148,
"step": 1380
},
{
"epoch": 0.29563030157073666,
"grad_norm": 0.2195872391322405,
"learning_rate": 3.2332731783757724e-05,
"loss": 0.7679,
"step": 1381
},
{
"epoch": 0.29584437130395225,
"grad_norm": 0.21156762066060109,
"learning_rate": 3.232203165298751e-05,
"loss": 0.7815,
"step": 1382
},
{
"epoch": 0.29605844103716783,
"grad_norm": 0.22245947323364718,
"learning_rate": 3.231132583439004e-05,
"loss": 0.7411,
"step": 1383
},
{
"epoch": 0.29627251077038347,
"grad_norm": 0.22536809649023096,
"learning_rate": 3.2300614332907095e-05,
"loss": 0.719,
"step": 1384
},
{
"epoch": 0.29648658050359905,
"grad_norm": 0.19906274829822754,
"learning_rate": 3.228989715348309e-05,
"loss": 0.7461,
"step": 1385
},
{
"epoch": 0.29670065023681463,
"grad_norm": 0.19402806520790786,
"learning_rate": 3.227917430106506e-05,
"loss": 0.7315,
"step": 1386
},
{
"epoch": 0.2969147199700302,
"grad_norm": 0.2309421631043973,
"learning_rate": 3.2268445780602654e-05,
"loss": 0.7407,
"step": 1387
},
{
"epoch": 0.29712878970324585,
"grad_norm": 0.20857649271903783,
"learning_rate": 3.225771159704813e-05,
"loss": 0.7368,
"step": 1388
},
{
"epoch": 0.29734285943646144,
"grad_norm": 0.2013707317699051,
"learning_rate": 3.2246971755356375e-05,
"loss": 0.7009,
"step": 1389
},
{
"epoch": 0.297556929169677,
"grad_norm": 0.20239912216816988,
"learning_rate": 3.223622626048487e-05,
"loss": 0.7168,
"step": 1390
},
{
"epoch": 0.2977709989028926,
"grad_norm": 0.2153847798659455,
"learning_rate": 3.222547511739373e-05,
"loss": 0.7464,
"step": 1391
},
{
"epoch": 0.2979850686361082,
"grad_norm": 0.19938444912860112,
"learning_rate": 3.221471833104565e-05,
"loss": 0.7068,
"step": 1392
},
{
"epoch": 0.2981991383693238,
"grad_norm": 0.21451018840175334,
"learning_rate": 3.220395590640595e-05,
"loss": 0.7129,
"step": 1393
},
{
"epoch": 0.2984132081025394,
"grad_norm": 0.21898499800150237,
"learning_rate": 3.219318784844254e-05,
"loss": 0.7278,
"step": 1394
},
{
"epoch": 0.298627277835755,
"grad_norm": 0.22131841856705786,
"learning_rate": 3.2182414162125945e-05,
"loss": 0.7399,
"step": 1395
},
{
"epoch": 0.2988413475689706,
"grad_norm": 0.2024849180619506,
"learning_rate": 3.2171634852429274e-05,
"loss": 0.7082,
"step": 1396
},
{
"epoch": 0.2990554173021862,
"grad_norm": 0.2145618143912526,
"learning_rate": 3.2160849924328234e-05,
"loss": 0.7286,
"step": 1397
},
{
"epoch": 0.2992694870354018,
"grad_norm": 0.20660470699343808,
"learning_rate": 3.215005938280113e-05,
"loss": 0.7246,
"step": 1398
},
{
"epoch": 0.2994835567686174,
"grad_norm": 0.21058362047175624,
"learning_rate": 3.213926323282886e-05,
"loss": 0.6958,
"step": 1399
},
{
"epoch": 0.29969762650183296,
"grad_norm": 0.20084108245517038,
"learning_rate": 3.2128461479394894e-05,
"loss": 0.7445,
"step": 1400
},
{
"epoch": 0.29991169623504854,
"grad_norm": 0.2263541092646725,
"learning_rate": 3.211765412748532e-05,
"loss": 0.7437,
"step": 1401
},
{
"epoch": 0.3001257659682642,
"grad_norm": 0.23875085738584625,
"learning_rate": 3.210684118208878e-05,
"loss": 0.7201,
"step": 1402
},
{
"epoch": 0.30033983570147976,
"grad_norm": 0.21789335664347195,
"learning_rate": 3.209602264819651e-05,
"loss": 0.7102,
"step": 1403
},
{
"epoch": 0.30055390543469535,
"grad_norm": 0.2046072580137681,
"learning_rate": 3.2085198530802334e-05,
"loss": 0.707,
"step": 1404
},
{
"epoch": 0.30076797516791093,
"grad_norm": 0.2067969290194415,
"learning_rate": 3.207436883490264e-05,
"loss": 0.7162,
"step": 1405
},
{
"epoch": 0.30098204490112657,
"grad_norm": 0.2223918694230222,
"learning_rate": 3.206353356549639e-05,
"loss": 0.696,
"step": 1406
},
{
"epoch": 0.30119611463434215,
"grad_norm": 0.20285869745100288,
"learning_rate": 3.205269272758513e-05,
"loss": 0.7228,
"step": 1407
},
{
"epoch": 0.30141018436755773,
"grad_norm": 0.22057104093564195,
"learning_rate": 3.204184632617297e-05,
"loss": 0.7402,
"step": 1408
},
{
"epoch": 0.3016242541007733,
"grad_norm": 0.21719095695822196,
"learning_rate": 3.2030994366266597e-05,
"loss": 0.7178,
"step": 1409
},
{
"epoch": 0.3018383238339889,
"grad_norm": 0.27094642527673257,
"learning_rate": 3.202013685287524e-05,
"loss": 0.7317,
"step": 1410
},
{
"epoch": 0.30205239356720454,
"grad_norm": 0.2062133401939227,
"learning_rate": 3.2009273791010715e-05,
"loss": 0.7319,
"step": 1411
},
{
"epoch": 0.3022664633004201,
"grad_norm": 0.2090566384014724,
"learning_rate": 3.199840518568739e-05,
"loss": 0.7122,
"step": 1412
},
{
"epoch": 0.3024805330336357,
"grad_norm": 0.2214651222116033,
"learning_rate": 3.1987531041922205e-05,
"loss": 0.7534,
"step": 1413
},
{
"epoch": 0.3026946027668513,
"grad_norm": 0.2072693638819392,
"learning_rate": 3.197665136473463e-05,
"loss": 0.7248,
"step": 1414
},
{
"epoch": 0.3029086725000669,
"grad_norm": 0.21247821692980245,
"learning_rate": 3.196576615914671e-05,
"loss": 0.7134,
"step": 1415
},
{
"epoch": 0.3031227422332825,
"grad_norm": 0.2061763925201024,
"learning_rate": 3.195487543018302e-05,
"loss": 0.7583,
"step": 1416
},
{
"epoch": 0.3033368119664981,
"grad_norm": 0.20360764829818256,
"learning_rate": 3.1943979182870734e-05,
"loss": 0.7353,
"step": 1417
},
{
"epoch": 0.30355088169971367,
"grad_norm": 0.20246447907897855,
"learning_rate": 3.193307742223952e-05,
"loss": 0.6982,
"step": 1418
},
{
"epoch": 0.30376495143292925,
"grad_norm": 0.19760088589285252,
"learning_rate": 3.192217015332161e-05,
"loss": 0.722,
"step": 1419
},
{
"epoch": 0.3039790211661449,
"grad_norm": 0.22099084344142367,
"learning_rate": 3.191125738115178e-05,
"loss": 0.7389,
"step": 1420
},
{
"epoch": 0.3041930908993605,
"grad_norm": 0.205468337901139,
"learning_rate": 3.190033911076735e-05,
"loss": 0.7299,
"step": 1421
},
{
"epoch": 0.30440716063257606,
"grad_norm": 0.19366753031949716,
"learning_rate": 3.1889415347208164e-05,
"loss": 0.7193,
"step": 1422
},
{
"epoch": 0.30462123036579164,
"grad_norm": 0.2279557283397567,
"learning_rate": 3.1878486095516624e-05,
"loss": 0.7141,
"step": 1423
},
{
"epoch": 0.3048353000990073,
"grad_norm": 0.22028831102493454,
"learning_rate": 3.186755136073765e-05,
"loss": 0.7274,
"step": 1424
},
{
"epoch": 0.30504936983222286,
"grad_norm": 0.2020719104817722,
"learning_rate": 3.1856611147918684e-05,
"loss": 0.7481,
"step": 1425
},
{
"epoch": 0.30526343956543844,
"grad_norm": 0.21945753089833947,
"learning_rate": 3.184566546210972e-05,
"loss": 0.7186,
"step": 1426
},
{
"epoch": 0.305477509298654,
"grad_norm": 0.21162314149336317,
"learning_rate": 3.1834714308363266e-05,
"loss": 0.7159,
"step": 1427
},
{
"epoch": 0.3056915790318696,
"grad_norm": 0.21626714707264866,
"learning_rate": 3.182375769173435e-05,
"loss": 0.7268,
"step": 1428
},
{
"epoch": 0.30590564876508525,
"grad_norm": 0.38973557982804796,
"learning_rate": 3.1812795617280527e-05,
"loss": 0.7147,
"step": 1429
},
{
"epoch": 0.30611971849830083,
"grad_norm": 0.21473123297218608,
"learning_rate": 3.180182809006187e-05,
"loss": 0.6822,
"step": 1430
},
{
"epoch": 0.3063337882315164,
"grad_norm": 0.22805434357482182,
"learning_rate": 3.1790855115140974e-05,
"loss": 0.7192,
"step": 1431
},
{
"epoch": 0.306547857964732,
"grad_norm": 0.23933935403155096,
"learning_rate": 3.177987669758293e-05,
"loss": 0.7408,
"step": 1432
},
{
"epoch": 0.30676192769794763,
"grad_norm": 0.2483779401696868,
"learning_rate": 3.176889284245538e-05,
"loss": 0.7529,
"step": 1433
},
{
"epoch": 0.3069759974311632,
"grad_norm": 0.24123188356015066,
"learning_rate": 3.175790355482844e-05,
"loss": 0.7475,
"step": 1434
},
{
"epoch": 0.3071900671643788,
"grad_norm": 0.218235722605162,
"learning_rate": 3.174690883977473e-05,
"loss": 0.7322,
"step": 1435
},
{
"epoch": 0.3074041368975944,
"grad_norm": 0.21883476938584442,
"learning_rate": 3.1735908702369414e-05,
"loss": 0.728,
"step": 1436
},
{
"epoch": 0.30761820663080996,
"grad_norm": 0.25188122569352345,
"learning_rate": 3.1724903147690115e-05,
"loss": 0.7173,
"step": 1437
},
{
"epoch": 0.3078322763640256,
"grad_norm": 0.22963360914059594,
"learning_rate": 3.171389218081699e-05,
"loss": 0.722,
"step": 1438
},
{
"epoch": 0.3080463460972412,
"grad_norm": 0.24033502472741244,
"learning_rate": 3.170287580683268e-05,
"loss": 0.7242,
"step": 1439
},
{
"epoch": 0.30826041583045677,
"grad_norm": 0.457217600022294,
"learning_rate": 3.169185403082232e-05,
"loss": 0.7212,
"step": 1440
},
{
"epoch": 0.30847448556367235,
"grad_norm": 0.1963853889286967,
"learning_rate": 3.1680826857873534e-05,
"loss": 0.725,
"step": 1441
},
{
"epoch": 0.30868855529688793,
"grad_norm": 0.2202312016687697,
"learning_rate": 3.166979429307646e-05,
"loss": 0.7314,
"step": 1442
},
{
"epoch": 0.3089026250301036,
"grad_norm": 0.22396438931842594,
"learning_rate": 3.165875634152371e-05,
"loss": 0.7699,
"step": 1443
},
{
"epoch": 0.30911669476331916,
"grad_norm": 0.20023765292935752,
"learning_rate": 3.1647713008310356e-05,
"loss": 0.7187,
"step": 1444
},
{
"epoch": 0.30933076449653474,
"grad_norm": 0.21390753427895953,
"learning_rate": 3.1636664298534014e-05,
"loss": 0.7523,
"step": 1445
},
{
"epoch": 0.3095448342297503,
"grad_norm": 0.22016320878320222,
"learning_rate": 3.1625610217294734e-05,
"loss": 0.7384,
"step": 1446
},
{
"epoch": 0.30975890396296596,
"grad_norm": 0.2044759879767915,
"learning_rate": 3.1614550769695055e-05,
"loss": 0.7513,
"step": 1447
},
{
"epoch": 0.30997297369618154,
"grad_norm": 0.20787685688090424,
"learning_rate": 3.160348596084e-05,
"loss": 0.7074,
"step": 1448
},
{
"epoch": 0.3101870434293971,
"grad_norm": 0.23711949530366647,
"learning_rate": 3.159241579583707e-05,
"loss": 0.7476,
"step": 1449
},
{
"epoch": 0.3104011131626127,
"grad_norm": 0.20747050361190908,
"learning_rate": 3.158134027979623e-05,
"loss": 0.7212,
"step": 1450
},
{
"epoch": 0.3106151828958283,
"grad_norm": 0.20995264854875686,
"learning_rate": 3.1570259417829914e-05,
"loss": 0.7285,
"step": 1451
},
{
"epoch": 0.31082925262904393,
"grad_norm": 0.21245624472761282,
"learning_rate": 3.155917321505303e-05,
"loss": 0.6909,
"step": 1452
},
{
"epoch": 0.3110433223622595,
"grad_norm": 0.21659310287205993,
"learning_rate": 3.1548081676582954e-05,
"loss": 0.6987,
"step": 1453
},
{
"epoch": 0.3112573920954751,
"grad_norm": 0.21758114627850686,
"learning_rate": 3.153698480753952e-05,
"loss": 0.7438,
"step": 1454
},
{
"epoch": 0.3114714618286907,
"grad_norm": 0.19922319084931434,
"learning_rate": 3.152588261304501e-05,
"loss": 0.7385,
"step": 1455
},
{
"epoch": 0.3116855315619063,
"grad_norm": 0.2016783810836013,
"learning_rate": 3.151477509822418e-05,
"loss": 0.7229,
"step": 1456
},
{
"epoch": 0.3118996012951219,
"grad_norm": 0.22794981419350388,
"learning_rate": 3.150366226820426e-05,
"loss": 0.7301,
"step": 1457
},
{
"epoch": 0.3121136710283375,
"grad_norm": 0.21499412039554525,
"learning_rate": 3.1492544128114876e-05,
"loss": 0.6997,
"step": 1458
},
{
"epoch": 0.31232774076155306,
"grad_norm": 0.21843695096587704,
"learning_rate": 3.1481420683088177e-05,
"loss": 0.7284,
"step": 1459
},
{
"epoch": 0.31254181049476865,
"grad_norm": 0.1971221300159341,
"learning_rate": 3.14702919382587e-05,
"loss": 0.7377,
"step": 1460
},
{
"epoch": 0.3127558802279843,
"grad_norm": 0.240978737107255,
"learning_rate": 3.145915789876346e-05,
"loss": 0.7056,
"step": 1461
},
{
"epoch": 0.31296994996119987,
"grad_norm": 0.21248182584074676,
"learning_rate": 3.1448018569741916e-05,
"loss": 0.7327,
"step": 1462
},
{
"epoch": 0.31318401969441545,
"grad_norm": 0.22093395579421116,
"learning_rate": 3.143687395633595e-05,
"loss": 0.7275,
"step": 1463
},
{
"epoch": 0.31339808942763103,
"grad_norm": 0.20231584780727468,
"learning_rate": 3.1425724063689903e-05,
"loss": 0.6969,
"step": 1464
},
{
"epoch": 0.31361215916084667,
"grad_norm": 0.22308283661802453,
"learning_rate": 3.141456889695055e-05,
"loss": 0.711,
"step": 1465
},
{
"epoch": 0.31382622889406225,
"grad_norm": 0.20868971832779562,
"learning_rate": 3.1403408461267086e-05,
"loss": 0.7506,
"step": 1466
},
{
"epoch": 0.31404029862727784,
"grad_norm": 0.22523386937803858,
"learning_rate": 3.139224276179115e-05,
"loss": 0.7446,
"step": 1467
},
{
"epoch": 0.3142543683604934,
"grad_norm": 0.19954737089394523,
"learning_rate": 3.138107180367682e-05,
"loss": 0.7112,
"step": 1468
},
{
"epoch": 0.314468438093709,
"grad_norm": 0.2542268071847943,
"learning_rate": 3.136989559208056e-05,
"loss": 0.7365,
"step": 1469
},
{
"epoch": 0.31468250782692464,
"grad_norm": 0.20895933764763175,
"learning_rate": 3.135871413216132e-05,
"loss": 0.7755,
"step": 1470
},
{
"epoch": 0.3148965775601402,
"grad_norm": 0.228118964844556,
"learning_rate": 3.134752742908043e-05,
"loss": 0.7356,
"step": 1471
},
{
"epoch": 0.3151106472933558,
"grad_norm": 0.22647240274599476,
"learning_rate": 3.133633548800165e-05,
"loss": 0.7199,
"step": 1472
},
{
"epoch": 0.3153247170265714,
"grad_norm": 0.2475068220750008,
"learning_rate": 3.132513831409116e-05,
"loss": 0.7512,
"step": 1473
},
{
"epoch": 0.315538786759787,
"grad_norm": 0.20471878775694313,
"learning_rate": 3.131393591251755e-05,
"loss": 0.7499,
"step": 1474
},
{
"epoch": 0.3157528564930026,
"grad_norm": 0.2570118035800016,
"learning_rate": 3.130272828845184e-05,
"loss": 0.7217,
"step": 1475
},
{
"epoch": 0.3159669262262182,
"grad_norm": 0.25695595430743884,
"learning_rate": 3.129151544706744e-05,
"loss": 0.715,
"step": 1476
},
{
"epoch": 0.3161809959594338,
"grad_norm": 0.20673248374703146,
"learning_rate": 3.1280297393540185e-05,
"loss": 0.7495,
"step": 1477
},
{
"epoch": 0.31639506569264936,
"grad_norm": 0.23730695452889797,
"learning_rate": 3.12690741330483e-05,
"loss": 0.7295,
"step": 1478
},
{
"epoch": 0.316609135425865,
"grad_norm": 0.20127704834085067,
"learning_rate": 3.125784567077242e-05,
"loss": 0.7148,
"step": 1479
},
{
"epoch": 0.3168232051590806,
"grad_norm": 0.22574231162441197,
"learning_rate": 3.1246612011895595e-05,
"loss": 0.7301,
"step": 1480
},
{
"epoch": 0.31703727489229616,
"grad_norm": 0.204367347369339,
"learning_rate": 3.123537316160324e-05,
"loss": 0.7357,
"step": 1481
},
{
"epoch": 0.31725134462551174,
"grad_norm": 0.2250067446805083,
"learning_rate": 3.122412912508321e-05,
"loss": 0.7463,
"step": 1482
},
{
"epoch": 0.3174654143587274,
"grad_norm": 0.20892073965372557,
"learning_rate": 3.121287990752572e-05,
"loss": 0.7279,
"step": 1483
},
{
"epoch": 0.31767948409194297,
"grad_norm": 0.2037437423808741,
"learning_rate": 3.120162551412339e-05,
"loss": 0.7483,
"step": 1484
},
{
"epoch": 0.31789355382515855,
"grad_norm": 0.20411669175473196,
"learning_rate": 3.119036595007123e-05,
"loss": 0.7178,
"step": 1485
},
{
"epoch": 0.31810762355837413,
"grad_norm": 0.21084258377580037,
"learning_rate": 3.117910122056663e-05,
"loss": 0.7431,
"step": 1486
},
{
"epoch": 0.3183216932915897,
"grad_norm": 0.21409215492256983,
"learning_rate": 3.1167831330809376e-05,
"loss": 0.7326,
"step": 1487
},
{
"epoch": 0.31853576302480535,
"grad_norm": 0.23379332085474894,
"learning_rate": 3.1156556286001615e-05,
"loss": 0.7116,
"step": 1488
},
{
"epoch": 0.31874983275802093,
"grad_norm": 0.24796122337162388,
"learning_rate": 3.1145276091347905e-05,
"loss": 0.765,
"step": 1489
},
{
"epoch": 0.3189639024912365,
"grad_norm": 0.2145407362714362,
"learning_rate": 3.1133990752055146e-05,
"loss": 0.7162,
"step": 1490
},
{
"epoch": 0.3191779722244521,
"grad_norm": 0.23883686076081942,
"learning_rate": 3.112270027333263e-05,
"loss": 0.735,
"step": 1491
},
{
"epoch": 0.3193920419576677,
"grad_norm": 0.22184388987701545,
"learning_rate": 3.111140466039205e-05,
"loss": 0.7159,
"step": 1492
},
{
"epoch": 0.3196061116908833,
"grad_norm": 0.2412817705565827,
"learning_rate": 3.1100103918447405e-05,
"loss": 0.717,
"step": 1493
},
{
"epoch": 0.3198201814240989,
"grad_norm": 0.21485345792419652,
"learning_rate": 3.1088798052715117e-05,
"loss": 0.7485,
"step": 1494
},
{
"epoch": 0.3200342511573145,
"grad_norm": 0.24883652890398836,
"learning_rate": 3.1077487068413936e-05,
"loss": 0.6953,
"step": 1495
},
{
"epoch": 0.32024832089053007,
"grad_norm": 0.24956105496228143,
"learning_rate": 3.1066170970765015e-05,
"loss": 0.7063,
"step": 1496
},
{
"epoch": 0.3204623906237457,
"grad_norm": 0.21042698842894736,
"learning_rate": 3.105484976499182e-05,
"loss": 0.7073,
"step": 1497
},
{
"epoch": 0.3206764603569613,
"grad_norm": 0.23898746630695497,
"learning_rate": 3.104352345632022e-05,
"loss": 0.7297,
"step": 1498
},
{
"epoch": 0.3208905300901769,
"grad_norm": 0.23001509225970712,
"learning_rate": 3.10321920499784e-05,
"loss": 0.7494,
"step": 1499
},
{
"epoch": 0.32110459982339246,
"grad_norm": 0.21966970587762638,
"learning_rate": 3.1020855551196936e-05,
"loss": 0.7466,
"step": 1500
},
{
"epoch": 0.32131866955660804,
"grad_norm": 0.23563745853359952,
"learning_rate": 3.100951396520871e-05,
"loss": 0.7387,
"step": 1501
},
{
"epoch": 0.3215327392898237,
"grad_norm": 0.24266907369743057,
"learning_rate": 3.0998167297249e-05,
"loss": 0.7537,
"step": 1502
},
{
"epoch": 0.32174680902303926,
"grad_norm": 0.1944517361151009,
"learning_rate": 3.09868155525554e-05,
"loss": 0.7026,
"step": 1503
},
{
"epoch": 0.32196087875625484,
"grad_norm": 0.22434107310309512,
"learning_rate": 3.097545873636785e-05,
"loss": 0.7089,
"step": 1504
},
{
"epoch": 0.3221749484894704,
"grad_norm": 0.20815115398118045,
"learning_rate": 3.096409685392864e-05,
"loss": 0.715,
"step": 1505
},
{
"epoch": 0.32238901822268606,
"grad_norm": 0.22420600533564186,
"learning_rate": 3.095272991048239e-05,
"loss": 0.7134,
"step": 1506
},
{
"epoch": 0.32260308795590165,
"grad_norm": 0.2536604211257573,
"learning_rate": 3.0941357911276064e-05,
"loss": 0.7251,
"step": 1507
},
{
"epoch": 0.32281715768911723,
"grad_norm": 0.23107524665004686,
"learning_rate": 3.0929980861558955e-05,
"loss": 0.7004,
"step": 1508
},
{
"epoch": 0.3230312274223328,
"grad_norm": 0.19131080314353968,
"learning_rate": 3.091859876658269e-05,
"loss": 0.7288,
"step": 1509
},
{
"epoch": 0.3232452971555484,
"grad_norm": 0.21450084832077976,
"learning_rate": 3.090721163160122e-05,
"loss": 0.7124,
"step": 1510
},
{
"epoch": 0.32345936688876403,
"grad_norm": 0.2141686093064143,
"learning_rate": 3.0895819461870825e-05,
"loss": 0.7397,
"step": 1511
},
{
"epoch": 0.3236734366219796,
"grad_norm": 0.22980922312876642,
"learning_rate": 3.088442226265012e-05,
"loss": 0.7166,
"step": 1512
},
{
"epoch": 0.3238875063551952,
"grad_norm": 0.21599365474216994,
"learning_rate": 3.0873020039200016e-05,
"loss": 0.6909,
"step": 1513
},
{
"epoch": 0.3241015760884108,
"grad_norm": 0.2261707207767992,
"learning_rate": 3.086161279678377e-05,
"loss": 0.7466,
"step": 1514
},
{
"epoch": 0.3243156458216264,
"grad_norm": 0.20137945351047606,
"learning_rate": 3.085020054066694e-05,
"loss": 0.7189,
"step": 1515
},
{
"epoch": 0.324529715554842,
"grad_norm": 0.23206221114248038,
"learning_rate": 3.08387832761174e-05,
"loss": 0.7089,
"step": 1516
},
{
"epoch": 0.3247437852880576,
"grad_norm": 0.2141593987780817,
"learning_rate": 3.082736100840534e-05,
"loss": 0.7198,
"step": 1517
},
{
"epoch": 0.32495785502127317,
"grad_norm": 0.20897205366311827,
"learning_rate": 3.081593374280326e-05,
"loss": 0.7159,
"step": 1518
},
{
"epoch": 0.32517192475448875,
"grad_norm": 0.2127990393603389,
"learning_rate": 3.0804501484585966e-05,
"loss": 0.7026,
"step": 1519
},
{
"epoch": 0.3253859944877044,
"grad_norm": 0.23938984632207602,
"learning_rate": 3.0793064239030566e-05,
"loss": 0.7144,
"step": 1520
},
{
"epoch": 0.32560006422091997,
"grad_norm": 0.244064604982906,
"learning_rate": 3.078162201141646e-05,
"loss": 0.7148,
"step": 1521
},
{
"epoch": 0.32581413395413555,
"grad_norm": 0.22934179594192933,
"learning_rate": 3.077017480702538e-05,
"loss": 0.742,
"step": 1522
},
{
"epoch": 0.32602820368735114,
"grad_norm": 0.23353887242018262,
"learning_rate": 3.0758722631141326e-05,
"loss": 0.7534,
"step": 1523
},
{
"epoch": 0.3262422734205668,
"grad_norm": 0.21107060960629914,
"learning_rate": 3.07472654890506e-05,
"loss": 0.7264,
"step": 1524
},
{
"epoch": 0.32645634315378236,
"grad_norm": 0.22750805242706576,
"learning_rate": 3.073580338604179e-05,
"loss": 0.7269,
"step": 1525
},
{
"epoch": 0.32667041288699794,
"grad_norm": 0.20500705537461428,
"learning_rate": 3.07243363274058e-05,
"loss": 0.7135,
"step": 1526
},
{
"epoch": 0.3268844826202135,
"grad_norm": 0.2033217865487313,
"learning_rate": 3.0712864318435786e-05,
"loss": 0.7039,
"step": 1527
},
{
"epoch": 0.3270985523534291,
"grad_norm": 0.2281544332583003,
"learning_rate": 3.070138736442721e-05,
"loss": 0.7254,
"step": 1528
},
{
"epoch": 0.32731262208664474,
"grad_norm": 0.23078568100556765,
"learning_rate": 3.068990547067783e-05,
"loss": 0.7495,
"step": 1529
},
{
"epoch": 0.3275266918198603,
"grad_norm": 0.21760372451945423,
"learning_rate": 3.067841864248764e-05,
"loss": 0.7177,
"step": 1530
},
{
"epoch": 0.3277407615530759,
"grad_norm": 0.2162578028839833,
"learning_rate": 3.066692688515896e-05,
"loss": 0.7241,
"step": 1531
},
{
"epoch": 0.3279548312862915,
"grad_norm": 0.23443380671489752,
"learning_rate": 3.065543020399635e-05,
"loss": 0.7417,
"step": 1532
},
{
"epoch": 0.32816890101950713,
"grad_norm": 0.32337526862754307,
"learning_rate": 3.064392860430666e-05,
"loss": 0.7274,
"step": 1533
},
{
"epoch": 0.3283829707527227,
"grad_norm": 0.21848406390806313,
"learning_rate": 3.0632422091399024e-05,
"loss": 0.7641,
"step": 1534
},
{
"epoch": 0.3285970404859383,
"grad_norm": 0.22945002660888902,
"learning_rate": 3.062091067058481e-05,
"loss": 0.7479,
"step": 1535
},
{
"epoch": 0.3288111102191539,
"grad_norm": 0.24754497404511555,
"learning_rate": 3.0609394347177665e-05,
"loss": 0.7162,
"step": 1536
},
{
"epoch": 0.32902517995236946,
"grad_norm": 0.22515973375379672,
"learning_rate": 3.0597873126493515e-05,
"loss": 0.706,
"step": 1537
},
{
"epoch": 0.3292392496855851,
"grad_norm": 0.19490951044382684,
"learning_rate": 3.058634701385053e-05,
"loss": 0.7108,
"step": 1538
},
{
"epoch": 0.3294533194188007,
"grad_norm": 0.2701935725606944,
"learning_rate": 3.057481601456915e-05,
"loss": 0.7377,
"step": 1539
},
{
"epoch": 0.32966738915201627,
"grad_norm": 0.2708519020796206,
"learning_rate": 3.056328013397205e-05,
"loss": 0.7319,
"step": 1540
},
{
"epoch": 0.32988145888523185,
"grad_norm": 0.24208851137501558,
"learning_rate": 3.0551739377384174e-05,
"loss": 0.716,
"step": 1541
},
{
"epoch": 0.3300955286184475,
"grad_norm": 0.26409441978526554,
"learning_rate": 3.0540193750132714e-05,
"loss": 0.732,
"step": 1542
},
{
"epoch": 0.33030959835166307,
"grad_norm": 0.2671257890040448,
"learning_rate": 3.052864325754712e-05,
"loss": 0.7395,
"step": 1543
},
{
"epoch": 0.33052366808487865,
"grad_norm": 0.21147023656331912,
"learning_rate": 3.0517087904959068e-05,
"loss": 0.7486,
"step": 1544
},
{
"epoch": 0.33073773781809424,
"grad_norm": 0.23720722371685085,
"learning_rate": 3.0505527697702497e-05,
"loss": 0.7379,
"step": 1545
},
{
"epoch": 0.3309518075513098,
"grad_norm": 0.24633335731519487,
"learning_rate": 3.049396264111357e-05,
"loss": 0.7073,
"step": 1546
},
{
"epoch": 0.33116587728452546,
"grad_norm": 0.23511021355438164,
"learning_rate": 3.0482392740530697e-05,
"loss": 0.7123,
"step": 1547
},
{
"epoch": 0.33137994701774104,
"grad_norm": 0.25273896051040157,
"learning_rate": 3.0470818001294516e-05,
"loss": 0.7489,
"step": 1548
},
{
"epoch": 0.3315940167509566,
"grad_norm": 0.2844954501216773,
"learning_rate": 3.0459238428747927e-05,
"loss": 0.7388,
"step": 1549
},
{
"epoch": 0.3318080864841722,
"grad_norm": 0.22700294434235596,
"learning_rate": 3.0447654028236013e-05,
"loss": 0.7464,
"step": 1550
},
{
"epoch": 0.3320221562173878,
"grad_norm": 0.24139343046404554,
"learning_rate": 3.0436064805106134e-05,
"loss": 0.6965,
"step": 1551
},
{
"epoch": 0.3322362259506034,
"grad_norm": 0.28762797583938526,
"learning_rate": 3.0424470764707838e-05,
"loss": 0.7248,
"step": 1552
},
{
"epoch": 0.332450295683819,
"grad_norm": 0.24719411986259368,
"learning_rate": 3.041287191239293e-05,
"loss": 0.7212,
"step": 1553
},
{
"epoch": 0.3326643654170346,
"grad_norm": 0.25059962292528026,
"learning_rate": 3.0401268253515398e-05,
"loss": 0.7422,
"step": 1554
},
{
"epoch": 0.3328784351502502,
"grad_norm": 0.23228865151268058,
"learning_rate": 3.0389659793431482e-05,
"loss": 0.7295,
"step": 1555
},
{
"epoch": 0.3330925048834658,
"grad_norm": 0.20578321531990376,
"learning_rate": 3.0378046537499622e-05,
"loss": 0.6944,
"step": 1556
},
{
"epoch": 0.3333065746166814,
"grad_norm": 0.22639186641619344,
"learning_rate": 3.0366428491080485e-05,
"loss": 0.7351,
"step": 1557
},
{
"epoch": 0.333520644349897,
"grad_norm": 0.23255840572300687,
"learning_rate": 3.035480565953693e-05,
"loss": 0.7526,
"step": 1558
},
{
"epoch": 0.33373471408311256,
"grad_norm": 0.1949740285978128,
"learning_rate": 3.0343178048234045e-05,
"loss": 0.7295,
"step": 1559
},
{
"epoch": 0.33394878381632814,
"grad_norm": 0.21275472679390295,
"learning_rate": 3.0331545662539094e-05,
"loss": 0.7225,
"step": 1560
},
{
"epoch": 0.3341628535495438,
"grad_norm": 0.22510941658463754,
"learning_rate": 3.0319908507821588e-05,
"loss": 0.7407,
"step": 1561
},
{
"epoch": 0.33437692328275936,
"grad_norm": 0.18169833239885208,
"learning_rate": 3.0308266589453202e-05,
"loss": 0.73,
"step": 1562
},
{
"epoch": 0.33459099301597495,
"grad_norm": 0.2094184274202531,
"learning_rate": 3.029661991280783e-05,
"loss": 0.7226,
"step": 1563
},
{
"epoch": 0.33480506274919053,
"grad_norm": 0.2242852324279592,
"learning_rate": 3.028496848326155e-05,
"loss": 0.7106,
"step": 1564
},
{
"epoch": 0.33501913248240617,
"grad_norm": 0.2275744277057065,
"learning_rate": 3.0273312306192656e-05,
"loss": 0.7214,
"step": 1565
},
{
"epoch": 0.33523320221562175,
"grad_norm": 0.21437923131620099,
"learning_rate": 3.0261651386981596e-05,
"loss": 0.7013,
"step": 1566
},
{
"epoch": 0.33544727194883733,
"grad_norm": 0.2301184013271844,
"learning_rate": 3.0249985731011045e-05,
"loss": 0.7553,
"step": 1567
},
{
"epoch": 0.3356613416820529,
"grad_norm": 0.20484283659264574,
"learning_rate": 3.0238315343665843e-05,
"loss": 0.7375,
"step": 1568
},
{
"epoch": 0.3358754114152685,
"grad_norm": 0.19523627877554567,
"learning_rate": 3.0226640230333025e-05,
"loss": 0.7475,
"step": 1569
},
{
"epoch": 0.33608948114848414,
"grad_norm": 0.20356523622286868,
"learning_rate": 3.0214960396401792e-05,
"loss": 0.7179,
"step": 1570
},
{
"epoch": 0.3363035508816997,
"grad_norm": 0.21115453230974598,
"learning_rate": 3.020327584726354e-05,
"loss": 0.7487,
"step": 1571
},
{
"epoch": 0.3365176206149153,
"grad_norm": 0.20746541444589975,
"learning_rate": 3.0191586588311835e-05,
"loss": 0.7315,
"step": 1572
},
{
"epoch": 0.3367316903481309,
"grad_norm": 0.20301453422715285,
"learning_rate": 3.0179892624942427e-05,
"loss": 0.7308,
"step": 1573
},
{
"epoch": 0.3369457600813465,
"grad_norm": 0.20402259252972538,
"learning_rate": 3.0168193962553202e-05,
"loss": 0.7228,
"step": 1574
},
{
"epoch": 0.3371598298145621,
"grad_norm": 0.20037789819760432,
"learning_rate": 3.0156490606544265e-05,
"loss": 0.7349,
"step": 1575
},
{
"epoch": 0.3373738995477777,
"grad_norm": 0.18538529041338164,
"learning_rate": 3.014478256231786e-05,
"loss": 0.6992,
"step": 1576
},
{
"epoch": 0.33758796928099327,
"grad_norm": 0.21455149238360505,
"learning_rate": 3.013306983527839e-05,
"loss": 0.7546,
"step": 1577
},
{
"epoch": 0.33780203901420885,
"grad_norm": 0.21237178866595172,
"learning_rate": 3.0121352430832434e-05,
"loss": 0.7366,
"step": 1578
},
{
"epoch": 0.3380161087474245,
"grad_norm": 0.19002221689288828,
"learning_rate": 3.0109630354388725e-05,
"loss": 0.7053,
"step": 1579
},
{
"epoch": 0.3382301784806401,
"grad_norm": 0.20220659300897512,
"learning_rate": 3.0097903611358146e-05,
"loss": 0.7148,
"step": 1580
},
{
"epoch": 0.33844424821385566,
"grad_norm": 0.19654587916224117,
"learning_rate": 3.0086172207153752e-05,
"loss": 0.7082,
"step": 1581
},
{
"epoch": 0.33865831794707124,
"grad_norm": 0.21760388876692274,
"learning_rate": 3.0074436147190728e-05,
"loss": 0.7171,
"step": 1582
},
{
"epoch": 0.3388723876802869,
"grad_norm": 0.18985867130739387,
"learning_rate": 3.0062695436886424e-05,
"loss": 0.7246,
"step": 1583
},
{
"epoch": 0.33908645741350246,
"grad_norm": 0.1983141633307748,
"learning_rate": 3.0050950081660316e-05,
"loss": 0.6926,
"step": 1584
},
{
"epoch": 0.33930052714671805,
"grad_norm": 0.20311032549877156,
"learning_rate": 3.0039200086934063e-05,
"loss": 0.7479,
"step": 1585
},
{
"epoch": 0.33951459687993363,
"grad_norm": 0.20611254761842612,
"learning_rate": 3.0027445458131413e-05,
"loss": 0.7433,
"step": 1586
},
{
"epoch": 0.3397286666131492,
"grad_norm": 0.19158516904924683,
"learning_rate": 3.001568620067831e-05,
"loss": 0.7378,
"step": 1587
},
{
"epoch": 0.33994273634636485,
"grad_norm": 0.22177020322419674,
"learning_rate": 3.0003922320002786e-05,
"loss": 0.7222,
"step": 1588
},
{
"epoch": 0.34015680607958043,
"grad_norm": 0.2250376877792279,
"learning_rate": 2.9992153821535028e-05,
"loss": 0.7738,
"step": 1589
},
{
"epoch": 0.340370875812796,
"grad_norm": 0.19034483792478848,
"learning_rate": 2.9980380710707355e-05,
"loss": 0.7353,
"step": 1590
},
{
"epoch": 0.3405849455460116,
"grad_norm": 0.2705908427950601,
"learning_rate": 2.9968602992954222e-05,
"loss": 0.7323,
"step": 1591
},
{
"epoch": 0.34079901527922724,
"grad_norm": 0.190363268341525,
"learning_rate": 2.9956820673712194e-05,
"loss": 0.7298,
"step": 1592
},
{
"epoch": 0.3410130850124428,
"grad_norm": 0.2257903050391308,
"learning_rate": 2.994503375841997e-05,
"loss": 0.766,
"step": 1593
},
{
"epoch": 0.3412271547456584,
"grad_norm": 0.22424147399106226,
"learning_rate": 2.993324225251837e-05,
"loss": 0.7222,
"step": 1594
},
{
"epoch": 0.341441224478874,
"grad_norm": 0.21605957952365637,
"learning_rate": 2.9921446161450328e-05,
"loss": 0.7341,
"step": 1595
},
{
"epoch": 0.34165529421208957,
"grad_norm": 0.20885516199045895,
"learning_rate": 2.9909645490660896e-05,
"loss": 0.7375,
"step": 1596
},
{
"epoch": 0.3418693639453052,
"grad_norm": 0.20602752803738508,
"learning_rate": 2.989784024559725e-05,
"loss": 0.7619,
"step": 1597
},
{
"epoch": 0.3420834336785208,
"grad_norm": 0.211005366778882,
"learning_rate": 2.9886030431708665e-05,
"loss": 0.7055,
"step": 1598
},
{
"epoch": 0.34229750341173637,
"grad_norm": 0.2079197836533538,
"learning_rate": 2.9874216054446532e-05,
"loss": 0.7363,
"step": 1599
},
{
"epoch": 0.34251157314495195,
"grad_norm": 0.22557406416231265,
"learning_rate": 2.986239711926434e-05,
"loss": 0.7307,
"step": 1600
},
{
"epoch": 0.34272564287816754,
"grad_norm": 0.209959893905052,
"learning_rate": 2.985057363161769e-05,
"loss": 0.7325,
"step": 1601
},
{
"epoch": 0.3429397126113832,
"grad_norm": 0.20610022811042758,
"learning_rate": 2.9838745596964287e-05,
"loss": 0.7328,
"step": 1602
},
{
"epoch": 0.34315378234459876,
"grad_norm": 0.21951783081430787,
"learning_rate": 2.982691302076393e-05,
"loss": 0.7489,
"step": 1603
},
{
"epoch": 0.34336785207781434,
"grad_norm": 0.21431566804952354,
"learning_rate": 2.9815075908478506e-05,
"loss": 0.7282,
"step": 1604
},
{
"epoch": 0.3435819218110299,
"grad_norm": 0.20514990108713682,
"learning_rate": 2.980323426557201e-05,
"loss": 0.745,
"step": 1605
},
{
"epoch": 0.34379599154424556,
"grad_norm": 0.19848649229970577,
"learning_rate": 2.9791388097510526e-05,
"loss": 0.7113,
"step": 1606
},
{
"epoch": 0.34401006127746114,
"grad_norm": 0.1990950695705066,
"learning_rate": 2.9779537409762223e-05,
"loss": 0.7141,
"step": 1607
},
{
"epoch": 0.3442241310106767,
"grad_norm": 0.20268547835912892,
"learning_rate": 2.9767682207797345e-05,
"loss": 0.7089,
"step": 1608
},
{
"epoch": 0.3444382007438923,
"grad_norm": 0.24099431488848325,
"learning_rate": 2.975582249708825e-05,
"loss": 0.7484,
"step": 1609
},
{
"epoch": 0.3446522704771079,
"grad_norm": 0.19793629203445218,
"learning_rate": 2.974395828310934e-05,
"loss": 0.7225,
"step": 1610
},
{
"epoch": 0.34486634021032353,
"grad_norm": 0.20548925189030023,
"learning_rate": 2.9732089571337126e-05,
"loss": 0.6875,
"step": 1611
},
{
"epoch": 0.3450804099435391,
"grad_norm": 0.23042418777640006,
"learning_rate": 2.9720216367250187e-05,
"loss": 0.7027,
"step": 1612
},
{
"epoch": 0.3452944796767547,
"grad_norm": 0.21771843664066826,
"learning_rate": 2.970833867632916e-05,
"loss": 0.7416,
"step": 1613
},
{
"epoch": 0.3455085494099703,
"grad_norm": 0.20521482357688642,
"learning_rate": 2.9696456504056773e-05,
"loss": 0.6956,
"step": 1614
},
{
"epoch": 0.3457226191431859,
"grad_norm": 0.233271321921815,
"learning_rate": 2.9684569855917817e-05,
"loss": 0.7205,
"step": 1615
},
{
"epoch": 0.3459366888764015,
"grad_norm": 0.21040558142057814,
"learning_rate": 2.967267873739914e-05,
"loss": 0.7415,
"step": 1616
},
{
"epoch": 0.3461507586096171,
"grad_norm": 0.21114989102574602,
"learning_rate": 2.9660783153989664e-05,
"loss": 0.7196,
"step": 1617
},
{
"epoch": 0.34636482834283266,
"grad_norm": 0.24747773642343213,
"learning_rate": 2.9648883111180376e-05,
"loss": 0.7414,
"step": 1618
},
{
"epoch": 0.34657889807604825,
"grad_norm": 0.21550307981218678,
"learning_rate": 2.9636978614464298e-05,
"loss": 0.6899,
"step": 1619
},
{
"epoch": 0.3467929678092639,
"grad_norm": 0.19876871136787375,
"learning_rate": 2.962506966933654e-05,
"loss": 0.704,
"step": 1620
},
{
"epoch": 0.34700703754247947,
"grad_norm": 0.24248305331461964,
"learning_rate": 2.9613156281294234e-05,
"loss": 0.7251,
"step": 1621
},
{
"epoch": 0.34722110727569505,
"grad_norm": 0.23234292968903417,
"learning_rate": 2.9601238455836592e-05,
"loss": 0.7362,
"step": 1622
},
{
"epoch": 0.34743517700891063,
"grad_norm": 0.19450745856981094,
"learning_rate": 2.9589316198464853e-05,
"loss": 0.7002,
"step": 1623
},
{
"epoch": 0.34764924674212627,
"grad_norm": 0.23514572854965687,
"learning_rate": 2.957738951468231e-05,
"loss": 0.7314,
"step": 1624
},
{
"epoch": 0.34786331647534185,
"grad_norm": 0.21443200178723576,
"learning_rate": 2.95654584099943e-05,
"loss": 0.7081,
"step": 1625
},
{
"epoch": 0.34807738620855744,
"grad_norm": 0.2000731915774815,
"learning_rate": 2.9553522889908194e-05,
"loss": 0.6902,
"step": 1626
},
{
"epoch": 0.348291455941773,
"grad_norm": 0.22442441262696242,
"learning_rate": 2.9541582959933416e-05,
"loss": 0.7183,
"step": 1627
},
{
"epoch": 0.3485055256749886,
"grad_norm": 0.20769996794973836,
"learning_rate": 2.952963862558141e-05,
"loss": 0.7025,
"step": 1628
},
{
"epoch": 0.34871959540820424,
"grad_norm": 0.20551158106453798,
"learning_rate": 2.9517689892365663e-05,
"loss": 0.7293,
"step": 1629
},
{
"epoch": 0.3489336651414198,
"grad_norm": 0.19655578191389472,
"learning_rate": 2.9505736765801677e-05,
"loss": 0.7518,
"step": 1630
},
{
"epoch": 0.3491477348746354,
"grad_norm": 0.2170350384603782,
"learning_rate": 2.9493779251407003e-05,
"loss": 0.7515,
"step": 1631
},
{
"epoch": 0.349361804607851,
"grad_norm": 0.1949058103607534,
"learning_rate": 2.9481817354701206e-05,
"loss": 0.7222,
"step": 1632
},
{
"epoch": 0.34957587434106663,
"grad_norm": 0.309134693217062,
"learning_rate": 2.9469851081205875e-05,
"loss": 0.7385,
"step": 1633
},
{
"epoch": 0.3497899440742822,
"grad_norm": 0.20833630459963806,
"learning_rate": 2.945788043644462e-05,
"loss": 0.6965,
"step": 1634
},
{
"epoch": 0.3500040138074978,
"grad_norm": 0.21075423947370012,
"learning_rate": 2.944590542594307e-05,
"loss": 0.7187,
"step": 1635
},
{
"epoch": 0.3502180835407134,
"grad_norm": 0.2167490432101183,
"learning_rate": 2.9433926055228866e-05,
"loss": 0.7482,
"step": 1636
},
{
"epoch": 0.35043215327392896,
"grad_norm": 0.20541229872761796,
"learning_rate": 2.942194232983166e-05,
"loss": 0.749,
"step": 1637
},
{
"epoch": 0.3506462230071446,
"grad_norm": 0.297135409790418,
"learning_rate": 2.9409954255283132e-05,
"loss": 0.7295,
"step": 1638
},
{
"epoch": 0.3508602927403602,
"grad_norm": 0.2387083440534443,
"learning_rate": 2.9397961837116935e-05,
"loss": 0.7411,
"step": 1639
},
{
"epoch": 0.35107436247357576,
"grad_norm": 0.20506889266125755,
"learning_rate": 2.9385965080868763e-05,
"loss": 0.7268,
"step": 1640
},
{
"epoch": 0.35128843220679135,
"grad_norm": 0.20896069342101464,
"learning_rate": 2.937396399207629e-05,
"loss": 0.7115,
"step": 1641
},
{
"epoch": 0.351502501940007,
"grad_norm": 0.22158110604347978,
"learning_rate": 2.9361958576279197e-05,
"loss": 0.7267,
"step": 1642
},
{
"epoch": 0.35171657167322257,
"grad_norm": 0.21709748064823825,
"learning_rate": 2.9349948839019165e-05,
"loss": 0.7357,
"step": 1643
},
{
"epoch": 0.35193064140643815,
"grad_norm": 0.21889904612208433,
"learning_rate": 2.9337934785839864e-05,
"loss": 0.7262,
"step": 1644
},
{
"epoch": 0.35214471113965373,
"grad_norm": 0.1983177751255727,
"learning_rate": 2.932591642228696e-05,
"loss": 0.7054,
"step": 1645
},
{
"epoch": 0.3523587808728693,
"grad_norm": 0.3684852676030669,
"learning_rate": 2.9313893753908114e-05,
"loss": 0.6861,
"step": 1646
},
{
"epoch": 0.35257285060608495,
"grad_norm": 0.21530192211391289,
"learning_rate": 2.930186678625295e-05,
"loss": 0.7755,
"step": 1647
},
{
"epoch": 0.35278692033930054,
"grad_norm": 0.19819752885760447,
"learning_rate": 2.9289835524873108e-05,
"loss": 0.6966,
"step": 1648
},
{
"epoch": 0.3530009900725161,
"grad_norm": 0.2284344604462054,
"learning_rate": 2.92777999753222e-05,
"loss": 0.712,
"step": 1649
},
{
"epoch": 0.3532150598057317,
"grad_norm": 0.22478725630612018,
"learning_rate": 2.92657601431558e-05,
"loss": 0.7364,
"step": 1650
},
{
"epoch": 0.35342912953894734,
"grad_norm": 0.22149960765616844,
"learning_rate": 2.9253716033931484e-05,
"loss": 0.7221,
"step": 1651
},
{
"epoch": 0.3536431992721629,
"grad_norm": 0.22881058246798758,
"learning_rate": 2.924166765320878e-05,
"loss": 0.7249,
"step": 1652
},
{
"epoch": 0.3538572690053785,
"grad_norm": 0.2193895470686908,
"learning_rate": 2.9229615006549208e-05,
"loss": 0.719,
"step": 1653
},
{
"epoch": 0.3540713387385941,
"grad_norm": 0.22011323073956504,
"learning_rate": 2.9217558099516242e-05,
"loss": 0.7155,
"step": 1654
},
{
"epoch": 0.35428540847180967,
"grad_norm": 0.22693360239190205,
"learning_rate": 2.9205496937675338e-05,
"loss": 0.7307,
"step": 1655
},
{
"epoch": 0.3544994782050253,
"grad_norm": 0.19702516441339143,
"learning_rate": 2.9193431526593894e-05,
"loss": 0.7205,
"step": 1656
},
{
"epoch": 0.3547135479382409,
"grad_norm": 0.2013038172795989,
"learning_rate": 2.918136187184129e-05,
"loss": 0.7213,
"step": 1657
},
{
"epoch": 0.3549276176714565,
"grad_norm": 0.1996526512490232,
"learning_rate": 2.9169287978988846e-05,
"loss": 0.7269,
"step": 1658
},
{
"epoch": 0.35514168740467206,
"grad_norm": 0.2165464999925143,
"learning_rate": 2.9157209853609864e-05,
"loss": 0.7432,
"step": 1659
},
{
"epoch": 0.35535575713788764,
"grad_norm": 0.21293323524833144,
"learning_rate": 2.914512750127957e-05,
"loss": 0.765,
"step": 1660
},
{
"epoch": 0.3555698268711033,
"grad_norm": 0.21354026133827708,
"learning_rate": 2.9133040927575165e-05,
"loss": 0.7256,
"step": 1661
},
{
"epoch": 0.35578389660431886,
"grad_norm": 0.1964840334247862,
"learning_rate": 2.912095013807579e-05,
"loss": 0.7121,
"step": 1662
},
{
"epoch": 0.35599796633753444,
"grad_norm": 0.19786554348614302,
"learning_rate": 2.910885513836252e-05,
"loss": 0.7447,
"step": 1663
},
{
"epoch": 0.35621203607075,
"grad_norm": 0.20852131506720853,
"learning_rate": 2.90967559340184e-05,
"loss": 0.7022,
"step": 1664
},
{
"epoch": 0.35642610580396566,
"grad_norm": 0.19562913488732922,
"learning_rate": 2.908465253062839e-05,
"loss": 0.7346,
"step": 1665
},
{
"epoch": 0.35664017553718125,
"grad_norm": 0.2032286542306735,
"learning_rate": 2.90725449337794e-05,
"loss": 0.7325,
"step": 1666
},
{
"epoch": 0.35685424527039683,
"grad_norm": 0.2053471624657201,
"learning_rate": 2.906043314906028e-05,
"loss": 0.7423,
"step": 1667
},
{
"epoch": 0.3570683150036124,
"grad_norm": 0.19943603861771186,
"learning_rate": 2.9048317182061808e-05,
"loss": 0.7584,
"step": 1668
},
{
"epoch": 0.357282384736828,
"grad_norm": 0.20361802831648107,
"learning_rate": 2.9036197038376674e-05,
"loss": 0.7357,
"step": 1669
},
{
"epoch": 0.35749645447004363,
"grad_norm": 0.18808887274718247,
"learning_rate": 2.902407272359954e-05,
"loss": 0.734,
"step": 1670
},
{
"epoch": 0.3577105242032592,
"grad_norm": 0.2010547899926018,
"learning_rate": 2.9011944243326958e-05,
"loss": 0.7265,
"step": 1671
},
{
"epoch": 0.3579245939364748,
"grad_norm": 0.1917256441174958,
"learning_rate": 2.8999811603157403e-05,
"loss": 0.716,
"step": 1672
},
{
"epoch": 0.3581386636696904,
"grad_norm": 0.19431126645573318,
"learning_rate": 2.8987674808691292e-05,
"loss": 0.6921,
"step": 1673
},
{
"epoch": 0.358352733402906,
"grad_norm": 0.19781977259116396,
"learning_rate": 2.8975533865530935e-05,
"loss": 0.7569,
"step": 1674
},
{
"epoch": 0.3585668031361216,
"grad_norm": 0.18966150573569404,
"learning_rate": 2.8963388779280583e-05,
"loss": 0.6993,
"step": 1675
},
{
"epoch": 0.3587808728693372,
"grad_norm": 0.2441647082523994,
"learning_rate": 2.8951239555546377e-05,
"loss": 0.7253,
"step": 1676
},
{
"epoch": 0.35899494260255277,
"grad_norm": 0.18954802346774788,
"learning_rate": 2.893908619993637e-05,
"loss": 0.7335,
"step": 1677
},
{
"epoch": 0.35920901233576835,
"grad_norm": 0.20300394226628365,
"learning_rate": 2.892692871806055e-05,
"loss": 0.7149,
"step": 1678
},
{
"epoch": 0.359423082068984,
"grad_norm": 0.18951727794135695,
"learning_rate": 2.891476711553077e-05,
"loss": 0.717,
"step": 1679
},
{
"epoch": 0.3596371518021996,
"grad_norm": 0.24248952830112777,
"learning_rate": 2.8902601397960805e-05,
"loss": 0.7269,
"step": 1680
},
{
"epoch": 0.35985122153541516,
"grad_norm": 0.2191733197768651,
"learning_rate": 2.8890431570966335e-05,
"loss": 0.6912,
"step": 1681
},
{
"epoch": 0.36006529126863074,
"grad_norm": 0.23506541996096175,
"learning_rate": 2.8878257640164923e-05,
"loss": 0.7096,
"step": 1682
},
{
"epoch": 0.3602793610018464,
"grad_norm": 0.1998321006566307,
"learning_rate": 2.886607961117604e-05,
"loss": 0.7373,
"step": 1683
},
{
"epoch": 0.36049343073506196,
"grad_norm": 0.2265943150454131,
"learning_rate": 2.8853897489621036e-05,
"loss": 0.6807,
"step": 1684
},
{
"epoch": 0.36070750046827754,
"grad_norm": 0.22358646815462654,
"learning_rate": 2.8841711281123163e-05,
"loss": 0.6968,
"step": 1685
},
{
"epoch": 0.3609215702014931,
"grad_norm": 0.20582772794801782,
"learning_rate": 2.8829520991307544e-05,
"loss": 0.731,
"step": 1686
},
{
"epoch": 0.3611356399347087,
"grad_norm": 0.2887696199346498,
"learning_rate": 2.8817326625801203e-05,
"loss": 0.7482,
"step": 1687
},
{
"epoch": 0.36134970966792435,
"grad_norm": 0.23325142534643833,
"learning_rate": 2.8805128190233032e-05,
"loss": 0.7334,
"step": 1688
},
{
"epoch": 0.36156377940113993,
"grad_norm": 0.1770642884634449,
"learning_rate": 2.87929256902338e-05,
"loss": 0.7361,
"step": 1689
},
{
"epoch": 0.3617778491343555,
"grad_norm": 0.2368596549079858,
"learning_rate": 2.8780719131436168e-05,
"loss": 0.751,
"step": 1690
},
{
"epoch": 0.3619919188675711,
"grad_norm": 0.1739362619897583,
"learning_rate": 2.8768508519474664e-05,
"loss": 0.7077,
"step": 1691
},
{
"epoch": 0.36220598860078673,
"grad_norm": 0.2147919831310469,
"learning_rate": 2.8756293859985675e-05,
"loss": 0.7318,
"step": 1692
},
{
"epoch": 0.3624200583340023,
"grad_norm": 0.19748325715869405,
"learning_rate": 2.8744075158607468e-05,
"loss": 0.7446,
"step": 1693
},
{
"epoch": 0.3626341280672179,
"grad_norm": 0.2016693979744296,
"learning_rate": 2.8731852420980176e-05,
"loss": 0.7346,
"step": 1694
},
{
"epoch": 0.3628481978004335,
"grad_norm": 0.22805234274826633,
"learning_rate": 2.871962565274579e-05,
"loss": 0.7401,
"step": 1695
},
{
"epoch": 0.36306226753364906,
"grad_norm": 0.21347228887531122,
"learning_rate": 2.8707394859548167e-05,
"loss": 0.7319,
"step": 1696
},
{
"epoch": 0.3632763372668647,
"grad_norm": 0.20867067071041048,
"learning_rate": 2.8695160047033012e-05,
"loss": 0.7381,
"step": 1697
},
{
"epoch": 0.3634904070000803,
"grad_norm": 0.19631814144259052,
"learning_rate": 2.86829212208479e-05,
"loss": 0.7204,
"step": 1698
},
{
"epoch": 0.36370447673329587,
"grad_norm": 0.19467251504422736,
"learning_rate": 2.8670678386642246e-05,
"loss": 0.7196,
"step": 1699
},
{
"epoch": 0.36391854646651145,
"grad_norm": 0.21482620723338536,
"learning_rate": 2.8658431550067317e-05,
"loss": 0.7474,
"step": 1700
},
{
"epoch": 0.3641326161997271,
"grad_norm": 0.4260820557629357,
"learning_rate": 2.8646180716776243e-05,
"loss": 0.6992,
"step": 1701
},
{
"epoch": 0.36434668593294267,
"grad_norm": 0.1942291886013534,
"learning_rate": 2.863392589242397e-05,
"loss": 0.7021,
"step": 1702
},
{
"epoch": 0.36456075566615825,
"grad_norm": 0.20489161062821937,
"learning_rate": 2.8621667082667316e-05,
"loss": 0.7172,
"step": 1703
},
{
"epoch": 0.36477482539937384,
"grad_norm": 0.19535321269181533,
"learning_rate": 2.860940429316491e-05,
"loss": 0.7432,
"step": 1704
},
{
"epoch": 0.3649888951325894,
"grad_norm": 0.23216416966189832,
"learning_rate": 2.859713752957725e-05,
"loss": 0.7035,
"step": 1705
},
{
"epoch": 0.36520296486580506,
"grad_norm": 0.20817838367972874,
"learning_rate": 2.8584866797566645e-05,
"loss": 0.7075,
"step": 1706
},
{
"epoch": 0.36541703459902064,
"grad_norm": 0.20055889426234758,
"learning_rate": 2.857259210279724e-05,
"loss": 0.6914,
"step": 1707
},
{
"epoch": 0.3656311043322362,
"grad_norm": 0.2286761813381872,
"learning_rate": 2.8560313450935012e-05,
"loss": 0.7321,
"step": 1708
},
{
"epoch": 0.3658451740654518,
"grad_norm": 0.2041313674388944,
"learning_rate": 2.854803084764777e-05,
"loss": 0.7244,
"step": 1709
},
{
"epoch": 0.3660592437986674,
"grad_norm": 0.1984995984196707,
"learning_rate": 2.8535744298605127e-05,
"loss": 0.7113,
"step": 1710
},
{
"epoch": 0.366273313531883,
"grad_norm": 0.20423034432358758,
"learning_rate": 2.8523453809478546e-05,
"loss": 0.7375,
"step": 1711
},
{
"epoch": 0.3664873832650986,
"grad_norm": 0.20856433063544874,
"learning_rate": 2.851115938594129e-05,
"loss": 0.7415,
"step": 1712
},
{
"epoch": 0.3667014529983142,
"grad_norm": 0.18630271956272798,
"learning_rate": 2.8498861033668444e-05,
"loss": 0.7234,
"step": 1713
},
{
"epoch": 0.3669155227315298,
"grad_norm": 0.21652364829720752,
"learning_rate": 2.8486558758336896e-05,
"loss": 0.6767,
"step": 1714
},
{
"epoch": 0.3671295924647454,
"grad_norm": 0.18257187565594565,
"learning_rate": 2.8474252565625368e-05,
"loss": 0.7028,
"step": 1715
},
{
"epoch": 0.367343662197961,
"grad_norm": 0.22788704890813255,
"learning_rate": 2.846194246121436e-05,
"loss": 0.7308,
"step": 1716
},
{
"epoch": 0.3675577319311766,
"grad_norm": 0.1904359809868788,
"learning_rate": 2.8449628450786207e-05,
"loss": 0.7392,
"step": 1717
},
{
"epoch": 0.36777180166439216,
"grad_norm": 0.20934717695243124,
"learning_rate": 2.8437310540025033e-05,
"loss": 0.7342,
"step": 1718
},
{
"epoch": 0.36798587139760774,
"grad_norm": 0.18721907363554224,
"learning_rate": 2.8424988734616747e-05,
"loss": 0.7051,
"step": 1719
},
{
"epoch": 0.3681999411308234,
"grad_norm": 0.20690251888753228,
"learning_rate": 2.8412663040249097e-05,
"loss": 0.6912,
"step": 1720
},
{
"epoch": 0.36841401086403897,
"grad_norm": 0.19376329881339552,
"learning_rate": 2.8400333462611578e-05,
"loss": 0.7309,
"step": 1721
},
{
"epoch": 0.36862808059725455,
"grad_norm": 0.20675741407692566,
"learning_rate": 2.8388000007395512e-05,
"loss": 0.7345,
"step": 1722
},
{
"epoch": 0.36884215033047013,
"grad_norm": 0.20526333726628604,
"learning_rate": 2.8375662680294e-05,
"loss": 0.7425,
"step": 1723
},
{
"epoch": 0.36905622006368577,
"grad_norm": 0.18939405189693614,
"learning_rate": 2.836332148700193e-05,
"loss": 0.714,
"step": 1724
},
{
"epoch": 0.36927028979690135,
"grad_norm": 0.2257731688992446,
"learning_rate": 2.8350976433215964e-05,
"loss": 0.7305,
"step": 1725
},
{
"epoch": 0.36948435953011693,
"grad_norm": 0.19399977270023452,
"learning_rate": 2.8338627524634566e-05,
"loss": 0.7,
"step": 1726
},
{
"epoch": 0.3696984292633325,
"grad_norm": 0.22325392542928493,
"learning_rate": 2.832627476695797e-05,
"loss": 0.716,
"step": 1727
},
{
"epoch": 0.3699124989965481,
"grad_norm": 0.2087277878166251,
"learning_rate": 2.831391816588818e-05,
"loss": 0.7319,
"step": 1728
},
{
"epoch": 0.37012656872976374,
"grad_norm": 0.2263158186396704,
"learning_rate": 2.830155772712899e-05,
"loss": 0.7027,
"step": 1729
},
{
"epoch": 0.3703406384629793,
"grad_norm": 0.19873845730486467,
"learning_rate": 2.8289193456385944e-05,
"loss": 0.7292,
"step": 1730
},
{
"epoch": 0.3705547081961949,
"grad_norm": 0.20724591722087535,
"learning_rate": 2.8276825359366374e-05,
"loss": 0.7352,
"step": 1731
},
{
"epoch": 0.3707687779294105,
"grad_norm": 0.2060158701752789,
"learning_rate": 2.8264453441779366e-05,
"loss": 0.7155,
"step": 1732
},
{
"epoch": 0.3709828476626261,
"grad_norm": 0.2115945468043021,
"learning_rate": 2.8252077709335782e-05,
"loss": 0.7258,
"step": 1733
},
{
"epoch": 0.3711969173958417,
"grad_norm": 0.22622279916089783,
"learning_rate": 2.8239698167748232e-05,
"loss": 0.7567,
"step": 1734
},
{
"epoch": 0.3714109871290573,
"grad_norm": 0.22605142673964448,
"learning_rate": 2.8227314822731092e-05,
"loss": 0.7154,
"step": 1735
},
{
"epoch": 0.3716250568622729,
"grad_norm": 0.19441628980835582,
"learning_rate": 2.8214927680000493e-05,
"loss": 0.6928,
"step": 1736
},
{
"epoch": 0.37183912659548846,
"grad_norm": 0.23110417463174424,
"learning_rate": 2.8202536745274307e-05,
"loss": 0.7018,
"step": 1737
},
{
"epoch": 0.3720531963287041,
"grad_norm": 0.21228303132353327,
"learning_rate": 2.819014202427218e-05,
"loss": 0.7367,
"step": 1738
},
{
"epoch": 0.3722672660619197,
"grad_norm": 0.2191197843320111,
"learning_rate": 2.817774352271549e-05,
"loss": 0.721,
"step": 1739
},
{
"epoch": 0.37248133579513526,
"grad_norm": 0.20734714732781498,
"learning_rate": 2.8165341246327357e-05,
"loss": 0.7023,
"step": 1740
},
{
"epoch": 0.37269540552835084,
"grad_norm": 0.21337594074927818,
"learning_rate": 2.8152935200832652e-05,
"loss": 0.6865,
"step": 1741
},
{
"epoch": 0.3729094752615665,
"grad_norm": 0.20082516749197782,
"learning_rate": 2.814052539195798e-05,
"loss": 0.7101,
"step": 1742
},
{
"epoch": 0.37312354499478206,
"grad_norm": 0.22526597451442446,
"learning_rate": 2.8128111825431692e-05,
"loss": 0.7362,
"step": 1743
},
{
"epoch": 0.37333761472799765,
"grad_norm": 0.19957108929175169,
"learning_rate": 2.811569450698387e-05,
"loss": 0.7067,
"step": 1744
},
{
"epoch": 0.37355168446121323,
"grad_norm": 0.19593577994852204,
"learning_rate": 2.8103273442346313e-05,
"loss": 0.7073,
"step": 1745
},
{
"epoch": 0.3737657541944288,
"grad_norm": 0.20281897215685657,
"learning_rate": 2.8090848637252566e-05,
"loss": 0.7202,
"step": 1746
},
{
"epoch": 0.37397982392764445,
"grad_norm": 0.2060240234356956,
"learning_rate": 2.80784200974379e-05,
"loss": 0.7285,
"step": 1747
},
{
"epoch": 0.37419389366086003,
"grad_norm": 0.2080764958143944,
"learning_rate": 2.8065987828639308e-05,
"loss": 0.7067,
"step": 1748
},
{
"epoch": 0.3744079633940756,
"grad_norm": 0.20306795500453428,
"learning_rate": 2.80535518365955e-05,
"loss": 0.7256,
"step": 1749
},
{
"epoch": 0.3746220331272912,
"grad_norm": 0.22562894892026664,
"learning_rate": 2.8041112127046907e-05,
"loss": 0.721,
"step": 1750
},
{
"epoch": 0.37483610286050684,
"grad_norm": 0.19540902006214536,
"learning_rate": 2.802866870573568e-05,
"loss": 0.7279,
"step": 1751
},
{
"epoch": 0.3750501725937224,
"grad_norm": 0.21536280890969858,
"learning_rate": 2.8016221578405666e-05,
"loss": 0.7482,
"step": 1752
},
{
"epoch": 0.375264242326938,
"grad_norm": 0.2185950724307106,
"learning_rate": 2.800377075080245e-05,
"loss": 0.7703,
"step": 1753
},
{
"epoch": 0.3754783120601536,
"grad_norm": 0.5483645678017114,
"learning_rate": 2.799131622867331e-05,
"loss": 0.7735,
"step": 1754
},
{
"epoch": 0.37569238179336917,
"grad_norm": 0.23077590439820195,
"learning_rate": 2.7978858017767227e-05,
"loss": 0.7096,
"step": 1755
},
{
"epoch": 0.3759064515265848,
"grad_norm": 0.21960201264980742,
"learning_rate": 2.7966396123834885e-05,
"loss": 0.7505,
"step": 1756
},
{
"epoch": 0.3761205212598004,
"grad_norm": 0.21333600647268042,
"learning_rate": 2.795393055262867e-05,
"loss": 0.7367,
"step": 1757
},
{
"epoch": 0.37633459099301597,
"grad_norm": 0.22219898638156713,
"learning_rate": 2.794146130990268e-05,
"loss": 0.7608,
"step": 1758
},
{
"epoch": 0.37654866072623155,
"grad_norm": 0.19978754112008906,
"learning_rate": 2.792898840141269e-05,
"loss": 0.7265,
"step": 1759
},
{
"epoch": 0.3767627304594472,
"grad_norm": 0.2505029699203242,
"learning_rate": 2.7916511832916167e-05,
"loss": 0.7155,
"step": 1760
},
{
"epoch": 0.3769768001926628,
"grad_norm": 0.2106146592638751,
"learning_rate": 2.790403161017227e-05,
"loss": 0.7496,
"step": 1761
},
{
"epoch": 0.37719086992587836,
"grad_norm": 0.22070540942901945,
"learning_rate": 2.7891547738941847e-05,
"loss": 0.7108,
"step": 1762
},
{
"epoch": 0.37740493965909394,
"grad_norm": 0.22309957067127134,
"learning_rate": 2.787906022498744e-05,
"loss": 0.7095,
"step": 1763
},
{
"epoch": 0.3776190093923095,
"grad_norm": 0.20847463681053063,
"learning_rate": 2.7866569074073252e-05,
"loss": 0.7152,
"step": 1764
},
{
"epoch": 0.37783307912552516,
"grad_norm": 0.23237175509921956,
"learning_rate": 2.7854074291965183e-05,
"loss": 0.7183,
"step": 1765
},
{
"epoch": 0.37804714885874074,
"grad_norm": 0.23090753195439292,
"learning_rate": 2.78415758844308e-05,
"loss": 0.7201,
"step": 1766
},
{
"epoch": 0.3782612185919563,
"grad_norm": 0.19822757111781839,
"learning_rate": 2.7829073857239342e-05,
"loss": 0.7269,
"step": 1767
},
{
"epoch": 0.3784752883251719,
"grad_norm": 0.21266903325817152,
"learning_rate": 2.7816568216161717e-05,
"loss": 0.7237,
"step": 1768
},
{
"epoch": 0.3786893580583875,
"grad_norm": 0.21239506175053335,
"learning_rate": 2.780405896697052e-05,
"loss": 0.7382,
"step": 1769
},
{
"epoch": 0.37890342779160313,
"grad_norm": 0.1990996390759198,
"learning_rate": 2.7791546115439988e-05,
"loss": 0.6949,
"step": 1770
},
{
"epoch": 0.3791174975248187,
"grad_norm": 0.22595916033927044,
"learning_rate": 2.7779029667346033e-05,
"loss": 0.7287,
"step": 1771
},
{
"epoch": 0.3793315672580343,
"grad_norm": 0.19218409645659082,
"learning_rate": 2.7766509628466223e-05,
"loss": 0.7207,
"step": 1772
},
{
"epoch": 0.3795456369912499,
"grad_norm": 0.21488969989074852,
"learning_rate": 2.7753986004579786e-05,
"loss": 0.6924,
"step": 1773
},
{
"epoch": 0.3797597067244655,
"grad_norm": 0.1999844006155712,
"learning_rate": 2.77414588014676e-05,
"loss": 0.735,
"step": 1774
},
{
"epoch": 0.3799737764576811,
"grad_norm": 0.19940210277813755,
"learning_rate": 2.7728928024912206e-05,
"loss": 0.7231,
"step": 1775
},
{
"epoch": 0.3801878461908967,
"grad_norm": 0.23614070028648362,
"learning_rate": 2.771639368069778e-05,
"loss": 0.7253,
"step": 1776
},
{
"epoch": 0.38040191592411227,
"grad_norm": 0.2177230403996155,
"learning_rate": 2.770385577461016e-05,
"loss": 0.6919,
"step": 1777
},
{
"epoch": 0.38061598565732785,
"grad_norm": 0.21134464251846,
"learning_rate": 2.7691314312436815e-05,
"loss": 0.7054,
"step": 1778
},
{
"epoch": 0.3808300553905435,
"grad_norm": 0.23083480554672203,
"learning_rate": 2.7678769299966864e-05,
"loss": 0.7146,
"step": 1779
},
{
"epoch": 0.38104412512375907,
"grad_norm": 0.2216722698933766,
"learning_rate": 2.766622074299106e-05,
"loss": 0.7199,
"step": 1780
},
{
"epoch": 0.38125819485697465,
"grad_norm": 0.2239407026522653,
"learning_rate": 2.7653668647301797e-05,
"loss": 0.7164,
"step": 1781
},
{
"epoch": 0.38147226459019024,
"grad_norm": 0.2359209677936568,
"learning_rate": 2.76411130186931e-05,
"loss": 0.737,
"step": 1782
},
{
"epoch": 0.3816863343234059,
"grad_norm": 0.20231105289973436,
"learning_rate": 2.7628553862960616e-05,
"loss": 0.7395,
"step": 1783
},
{
"epoch": 0.38190040405662146,
"grad_norm": 0.21307073531594614,
"learning_rate": 2.761599118590163e-05,
"loss": 0.7417,
"step": 1784
},
{
"epoch": 0.38211447378983704,
"grad_norm": 0.2293381408754453,
"learning_rate": 2.760342499331506e-05,
"loss": 0.7273,
"step": 1785
},
{
"epoch": 0.3823285435230526,
"grad_norm": 0.19091102829907344,
"learning_rate": 2.759085529100143e-05,
"loss": 0.7396,
"step": 1786
},
{
"epoch": 0.3825426132562682,
"grad_norm": 0.20829281955244247,
"learning_rate": 2.7578282084762893e-05,
"loss": 0.7144,
"step": 1787
},
{
"epoch": 0.38275668298948384,
"grad_norm": 0.18575514934808443,
"learning_rate": 2.7565705380403218e-05,
"loss": 0.723,
"step": 1788
},
{
"epoch": 0.3829707527226994,
"grad_norm": 0.2018641164754752,
"learning_rate": 2.7553125183727786e-05,
"loss": 0.7005,
"step": 1789
},
{
"epoch": 0.383184822455915,
"grad_norm": 0.19634296099432155,
"learning_rate": 2.7540541500543604e-05,
"loss": 0.7173,
"step": 1790
},
{
"epoch": 0.3833988921891306,
"grad_norm": 0.18750371160676454,
"learning_rate": 2.7527954336659264e-05,
"loss": 0.7109,
"step": 1791
},
{
"epoch": 0.38361296192234623,
"grad_norm": 0.20062052499594968,
"learning_rate": 2.7515363697884983e-05,
"loss": 0.7237,
"step": 1792
},
{
"epoch": 0.3838270316555618,
"grad_norm": 0.18753502787433712,
"learning_rate": 2.750276959003258e-05,
"loss": 0.6676,
"step": 1793
},
{
"epoch": 0.3840411013887774,
"grad_norm": 0.22289732741582527,
"learning_rate": 2.7490172018915462e-05,
"loss": 0.7171,
"step": 1794
},
{
"epoch": 0.384255171121993,
"grad_norm": 0.19399632465467423,
"learning_rate": 2.747757099034865e-05,
"loss": 0.74,
"step": 1795
},
{
"epoch": 0.38446924085520856,
"grad_norm": 0.2332560239406799,
"learning_rate": 2.7464966510148766e-05,
"loss": 0.7242,
"step": 1796
},
{
"epoch": 0.3846833105884242,
"grad_norm": 0.2036769492361591,
"learning_rate": 2.7452358584134e-05,
"loss": 0.6991,
"step": 1797
},
{
"epoch": 0.3848973803216398,
"grad_norm": 0.20329991688548135,
"learning_rate": 2.7439747218124156e-05,
"loss": 0.7407,
"step": 1798
},
{
"epoch": 0.38511145005485536,
"grad_norm": 0.24852491612215835,
"learning_rate": 2.7427132417940606e-05,
"loss": 0.7247,
"step": 1799
},
{
"epoch": 0.38532551978807095,
"grad_norm": 0.20990508651913883,
"learning_rate": 2.741451418940634e-05,
"loss": 0.695,
"step": 1800
},
{
"epoch": 0.3855395895212866,
"grad_norm": 0.2165520777558981,
"learning_rate": 2.7401892538345895e-05,
"loss": 0.7115,
"step": 1801
},
{
"epoch": 0.38575365925450217,
"grad_norm": 0.3442702245739841,
"learning_rate": 2.73892674705854e-05,
"loss": 0.7041,
"step": 1802
},
{
"epoch": 0.38596772898771775,
"grad_norm": 0.20240417308796424,
"learning_rate": 2.7376638991952565e-05,
"loss": 0.6835,
"step": 1803
},
{
"epoch": 0.38618179872093333,
"grad_norm": 0.23248374727170049,
"learning_rate": 2.7364007108276682e-05,
"loss": 0.7169,
"step": 1804
},
{
"epoch": 0.3863958684541489,
"grad_norm": 0.2273561424825165,
"learning_rate": 2.7351371825388597e-05,
"loss": 0.7272,
"step": 1805
},
{
"epoch": 0.38660993818736455,
"grad_norm": 0.1968252023241286,
"learning_rate": 2.7338733149120726e-05,
"loss": 0.74,
"step": 1806
},
{
"epoch": 0.38682400792058014,
"grad_norm": 0.21363516693293966,
"learning_rate": 2.7326091085307078e-05,
"loss": 0.7105,
"step": 1807
},
{
"epoch": 0.3870380776537957,
"grad_norm": 0.20097880326939904,
"learning_rate": 2.7313445639783194e-05,
"loss": 0.7179,
"step": 1808
},
{
"epoch": 0.3872521473870113,
"grad_norm": 0.2989022241395946,
"learning_rate": 2.7300796818386185e-05,
"loss": 0.7153,
"step": 1809
},
{
"epoch": 0.38746621712022694,
"grad_norm": 0.24756322170147138,
"learning_rate": 2.728814462695473e-05,
"loss": 0.7492,
"step": 1810
},
{
"epoch": 0.3876802868534425,
"grad_norm": 0.18832672298797135,
"learning_rate": 2.7275489071329065e-05,
"loss": 0.7232,
"step": 1811
},
{
"epoch": 0.3878943565866581,
"grad_norm": 0.22166867819908395,
"learning_rate": 2.7262830157350957e-05,
"loss": 0.7398,
"step": 1812
},
{
"epoch": 0.3881084263198737,
"grad_norm": 0.19895813698935308,
"learning_rate": 2.7250167890863743e-05,
"loss": 0.7091,
"step": 1813
},
{
"epoch": 0.38832249605308927,
"grad_norm": 0.2029446516468001,
"learning_rate": 2.7237502277712305e-05,
"loss": 0.7358,
"step": 1814
},
{
"epoch": 0.3885365657863049,
"grad_norm": 0.2181862871392029,
"learning_rate": 2.7224833323743064e-05,
"loss": 0.7227,
"step": 1815
},
{
"epoch": 0.3887506355195205,
"grad_norm": 0.2008808770570698,
"learning_rate": 2.7212161034803977e-05,
"loss": 0.706,
"step": 1816
},
{
"epoch": 0.3889647052527361,
"grad_norm": 0.22061140212032707,
"learning_rate": 2.7199485416744572e-05,
"loss": 0.7062,
"step": 1817
},
{
"epoch": 0.38917877498595166,
"grad_norm": 0.2038919546462918,
"learning_rate": 2.718680647541587e-05,
"loss": 0.7384,
"step": 1818
},
{
"epoch": 0.38939284471916724,
"grad_norm": 0.19984869469755115,
"learning_rate": 2.7174124216670462e-05,
"loss": 0.7055,
"step": 1819
},
{
"epoch": 0.3896069144523829,
"grad_norm": 0.19106707428113026,
"learning_rate": 2.7161438646362444e-05,
"loss": 0.6978,
"step": 1820
},
{
"epoch": 0.38982098418559846,
"grad_norm": 0.20034773939138448,
"learning_rate": 2.7148749770347453e-05,
"loss": 0.7443,
"step": 1821
},
{
"epoch": 0.39003505391881405,
"grad_norm": 0.20258415362901117,
"learning_rate": 2.7136057594482656e-05,
"loss": 0.7231,
"step": 1822
},
{
"epoch": 0.39024912365202963,
"grad_norm": 0.2002972710807168,
"learning_rate": 2.712336212462674e-05,
"loss": 0.7508,
"step": 1823
},
{
"epoch": 0.39046319338524527,
"grad_norm": 0.1887666338775884,
"learning_rate": 2.711066336663991e-05,
"loss": 0.711,
"step": 1824
},
{
"epoch": 0.39067726311846085,
"grad_norm": 0.20610052214749,
"learning_rate": 2.709796132638388e-05,
"loss": 0.716,
"step": 1825
},
{
"epoch": 0.39089133285167643,
"grad_norm": 0.18823667581409867,
"learning_rate": 2.7085256009721895e-05,
"loss": 0.7443,
"step": 1826
},
{
"epoch": 0.391105402584892,
"grad_norm": 0.19858919507760137,
"learning_rate": 2.7072547422518707e-05,
"loss": 0.7378,
"step": 1827
},
{
"epoch": 0.3913194723181076,
"grad_norm": 0.19339527041080595,
"learning_rate": 2.705983557064058e-05,
"loss": 0.7071,
"step": 1828
},
{
"epoch": 0.39153354205132324,
"grad_norm": 0.19422095290511043,
"learning_rate": 2.7047120459955274e-05,
"loss": 0.7014,
"step": 1829
},
{
"epoch": 0.3917476117845388,
"grad_norm": 0.20380937267452157,
"learning_rate": 2.7034402096332063e-05,
"loss": 0.7242,
"step": 1830
},
{
"epoch": 0.3919616815177544,
"grad_norm": 0.18695351718613296,
"learning_rate": 2.702168048564172e-05,
"loss": 0.7121,
"step": 1831
},
{
"epoch": 0.39217575125097,
"grad_norm": 0.19470267005279956,
"learning_rate": 2.700895563375652e-05,
"loss": 0.7166,
"step": 1832
},
{
"epoch": 0.3923898209841856,
"grad_norm": 0.19864949644991833,
"learning_rate": 2.699622754655023e-05,
"loss": 0.7385,
"step": 1833
},
{
"epoch": 0.3926038907174012,
"grad_norm": 0.3509291084831578,
"learning_rate": 2.6983496229898114e-05,
"loss": 0.7207,
"step": 1834
},
{
"epoch": 0.3928179604506168,
"grad_norm": 0.2052961511773908,
"learning_rate": 2.6970761689676922e-05,
"loss": 0.7172,
"step": 1835
},
{
"epoch": 0.39303203018383237,
"grad_norm": 0.20756454923945808,
"learning_rate": 2.695802393176489e-05,
"loss": 0.7318,
"step": 1836
},
{
"epoch": 0.39324609991704795,
"grad_norm": 0.19990765561099857,
"learning_rate": 2.6945282962041748e-05,
"loss": 0.7331,
"step": 1837
},
{
"epoch": 0.3934601696502636,
"grad_norm": 0.20308071953283982,
"learning_rate": 2.6932538786388706e-05,
"loss": 0.7546,
"step": 1838
},
{
"epoch": 0.3936742393834792,
"grad_norm": 0.19257776403184904,
"learning_rate": 2.6919791410688456e-05,
"loss": 0.7424,
"step": 1839
},
{
"epoch": 0.39388830911669476,
"grad_norm": 0.3725315128511502,
"learning_rate": 2.6907040840825156e-05,
"loss": 0.7312,
"step": 1840
},
{
"epoch": 0.39410237884991034,
"grad_norm": 0.21448512794040087,
"learning_rate": 2.689428708268444e-05,
"loss": 0.7174,
"step": 1841
},
{
"epoch": 0.394316448583126,
"grad_norm": 0.18667893597869978,
"learning_rate": 2.6881530142153435e-05,
"loss": 0.7325,
"step": 1842
},
{
"epoch": 0.39453051831634156,
"grad_norm": 0.20516658000882992,
"learning_rate": 2.686877002512071e-05,
"loss": 0.7073,
"step": 1843
},
{
"epoch": 0.39474458804955714,
"grad_norm": 0.1885414212304079,
"learning_rate": 2.685600673747631e-05,
"loss": 0.7217,
"step": 1844
},
{
"epoch": 0.3949586577827727,
"grad_norm": 0.20023230636099884,
"learning_rate": 2.684324028511176e-05,
"loss": 0.707,
"step": 1845
},
{
"epoch": 0.3951727275159883,
"grad_norm": 0.2059285593374697,
"learning_rate": 2.683047067392002e-05,
"loss": 0.75,
"step": 1846
},
{
"epoch": 0.39538679724920395,
"grad_norm": 0.18334200201512382,
"learning_rate": 2.6817697909795515e-05,
"loss": 0.6988,
"step": 1847
},
{
"epoch": 0.39560086698241953,
"grad_norm": 0.2156911167146091,
"learning_rate": 2.680492199863414e-05,
"loss": 0.7085,
"step": 1848
},
{
"epoch": 0.3958149367156351,
"grad_norm": 0.19017938237739312,
"learning_rate": 2.6792142946333227e-05,
"loss": 0.707,
"step": 1849
},
{
"epoch": 0.3960290064488507,
"grad_norm": 0.19714275913947663,
"learning_rate": 2.6779360758791562e-05,
"loss": 0.7341,
"step": 1850
},
{
"epoch": 0.39624307618206633,
"grad_norm": 0.20087478676998408,
"learning_rate": 2.6766575441909385e-05,
"loss": 0.7097,
"step": 1851
},
{
"epoch": 0.3964571459152819,
"grad_norm": 0.20793008326447115,
"learning_rate": 2.6753787001588362e-05,
"loss": 0.7248,
"step": 1852
},
{
"epoch": 0.3966712156484975,
"grad_norm": 0.6508073669110191,
"learning_rate": 2.6740995443731633e-05,
"loss": 0.7027,
"step": 1853
},
{
"epoch": 0.3968852853817131,
"grad_norm": 0.19346323481400934,
"learning_rate": 2.6728200774243743e-05,
"loss": 0.7196,
"step": 1854
},
{
"epoch": 0.39709935511492866,
"grad_norm": 0.2159766043372786,
"learning_rate": 2.671540299903069e-05,
"loss": 0.7408,
"step": 1855
},
{
"epoch": 0.3973134248481443,
"grad_norm": 0.19117659085865799,
"learning_rate": 2.670260212399991e-05,
"loss": 0.7003,
"step": 1856
},
{
"epoch": 0.3975274945813599,
"grad_norm": 0.21387736853704956,
"learning_rate": 2.6689798155060255e-05,
"loss": 0.7206,
"step": 1857
},
{
"epoch": 0.39774156431457547,
"grad_norm": 0.21098189486429983,
"learning_rate": 2.6676991098122015e-05,
"loss": 0.6961,
"step": 1858
},
{
"epoch": 0.39795563404779105,
"grad_norm": 0.22704794833238748,
"learning_rate": 2.6664180959096914e-05,
"loss": 0.6859,
"step": 1859
},
{
"epoch": 0.3981697037810067,
"grad_norm": 0.2141209364350246,
"learning_rate": 2.6651367743898077e-05,
"loss": 0.7247,
"step": 1860
},
{
"epoch": 0.3983837735142223,
"grad_norm": 0.21345183388124112,
"learning_rate": 2.6638551458440068e-05,
"loss": 0.7122,
"step": 1861
},
{
"epoch": 0.39859784324743786,
"grad_norm": 0.21988866363729603,
"learning_rate": 2.662573210863886e-05,
"loss": 0.7171,
"step": 1862
},
{
"epoch": 0.39881191298065344,
"grad_norm": 0.22154389839515065,
"learning_rate": 2.6612909700411827e-05,
"loss": 0.7009,
"step": 1863
},
{
"epoch": 0.399025982713869,
"grad_norm": 0.21850811127996428,
"learning_rate": 2.6600084239677794e-05,
"loss": 0.7225,
"step": 1864
},
{
"epoch": 0.39924005244708466,
"grad_norm": 0.2076529383923364,
"learning_rate": 2.658725573235695e-05,
"loss": 0.7059,
"step": 1865
},
{
"epoch": 0.39945412218030024,
"grad_norm": 0.20867367265947082,
"learning_rate": 2.6574424184370927e-05,
"loss": 0.7071,
"step": 1866
},
{
"epoch": 0.3996681919135158,
"grad_norm": 0.20616840346644114,
"learning_rate": 2.6561589601642732e-05,
"loss": 0.7272,
"step": 1867
},
{
"epoch": 0.3998822616467314,
"grad_norm": 0.21582995657450363,
"learning_rate": 2.6548751990096783e-05,
"loss": 0.7313,
"step": 1868
},
{
"epoch": 0.400096331379947,
"grad_norm": 0.21784726207671676,
"learning_rate": 2.6535911355658907e-05,
"loss": 0.7514,
"step": 1869
},
{
"epoch": 0.40031040111316263,
"grad_norm": 0.21100986770680158,
"learning_rate": 2.6523067704256318e-05,
"loss": 0.7352,
"step": 1870
},
{
"epoch": 0.4005244708463782,
"grad_norm": 0.2309081220101376,
"learning_rate": 2.6510221041817613e-05,
"loss": 0.7178,
"step": 1871
},
{
"epoch": 0.4007385405795938,
"grad_norm": 0.20078354730635983,
"learning_rate": 2.6497371374272796e-05,
"loss": 0.7211,
"step": 1872
},
{
"epoch": 0.4009526103128094,
"grad_norm": 0.2299332800045435,
"learning_rate": 2.648451870755324e-05,
"loss": 0.7263,
"step": 1873
},
{
"epoch": 0.401166680046025,
"grad_norm": 0.2047233556524905,
"learning_rate": 2.6471663047591727e-05,
"loss": 0.7087,
"step": 1874
},
{
"epoch": 0.4013807497792406,
"grad_norm": 0.2213926142266927,
"learning_rate": 2.6458804400322393e-05,
"loss": 0.7556,
"step": 1875
},
{
"epoch": 0.4015948195124562,
"grad_norm": 0.2349537978017295,
"learning_rate": 2.6445942771680776e-05,
"loss": 0.726,
"step": 1876
},
{
"epoch": 0.40180888924567176,
"grad_norm": 0.1995934053519248,
"learning_rate": 2.643307816760377e-05,
"loss": 0.6919,
"step": 1877
},
{
"epoch": 0.40202295897888735,
"grad_norm": 0.23635768573095461,
"learning_rate": 2.642021059402966e-05,
"loss": 0.7178,
"step": 1878
},
{
"epoch": 0.402237028712103,
"grad_norm": 0.22746434828024978,
"learning_rate": 2.640734005689809e-05,
"loss": 0.7207,
"step": 1879
},
{
"epoch": 0.40245109844531857,
"grad_norm": 0.2034955598877208,
"learning_rate": 2.639446656215008e-05,
"loss": 0.725,
"step": 1880
},
{
"epoch": 0.40266516817853415,
"grad_norm": 0.24228383143082338,
"learning_rate": 2.6381590115728015e-05,
"loss": 0.7222,
"step": 1881
},
{
"epoch": 0.40287923791174973,
"grad_norm": 0.22084391334038264,
"learning_rate": 2.6368710723575633e-05,
"loss": 0.7226,
"step": 1882
},
{
"epoch": 0.40309330764496537,
"grad_norm": 0.21413001055885492,
"learning_rate": 2.6355828391638036e-05,
"loss": 0.7162,
"step": 1883
},
{
"epoch": 0.40330737737818095,
"grad_norm": 0.25649482700300674,
"learning_rate": 2.634294312586169e-05,
"loss": 0.7188,
"step": 1884
},
{
"epoch": 0.40352144711139654,
"grad_norm": 0.5704672527223288,
"learning_rate": 2.633005493219441e-05,
"loss": 0.7268,
"step": 1885
},
{
"epoch": 0.4037355168446121,
"grad_norm": 0.8083517625708049,
"learning_rate": 2.6317163816585357e-05,
"loss": 0.7172,
"step": 1886
},
{
"epoch": 0.4039495865778277,
"grad_norm": 0.23867837452281146,
"learning_rate": 2.630426978498505e-05,
"loss": 0.7368,
"step": 1887
},
{
"epoch": 0.40416365631104334,
"grad_norm": 0.2609963405401781,
"learning_rate": 2.6291372843345356e-05,
"loss": 0.7167,
"step": 1888
},
{
"epoch": 0.4043777260442589,
"grad_norm": 0.24308875778444147,
"learning_rate": 2.6278472997619467e-05,
"loss": 0.7447,
"step": 1889
},
{
"epoch": 0.4045917957774745,
"grad_norm": 0.24630170170545196,
"learning_rate": 2.626557025376194e-05,
"loss": 0.7288,
"step": 1890
},
{
"epoch": 0.4048058655106901,
"grad_norm": 0.22783814529196547,
"learning_rate": 2.6252664617728655e-05,
"loss": 0.7282,
"step": 1891
},
{
"epoch": 0.4050199352439057,
"grad_norm": 0.20511205874940108,
"learning_rate": 2.6239756095476824e-05,
"loss": 0.6931,
"step": 1892
},
{
"epoch": 0.4052340049771213,
"grad_norm": 0.2168482153786405,
"learning_rate": 2.622684469296501e-05,
"loss": 0.7347,
"step": 1893
},
{
"epoch": 0.4054480747103369,
"grad_norm": 0.20211723967047945,
"learning_rate": 2.6213930416153072e-05,
"loss": 0.7445,
"step": 1894
},
{
"epoch": 0.4056621444435525,
"grad_norm": 0.2225638010297805,
"learning_rate": 2.620101327100224e-05,
"loss": 0.7724,
"step": 1895
},
{
"epoch": 0.40587621417676806,
"grad_norm": 0.22032432763512383,
"learning_rate": 2.6188093263475028e-05,
"loss": 0.7028,
"step": 1896
},
{
"epoch": 0.4060902839099837,
"grad_norm": 0.2024287932420587,
"learning_rate": 2.6175170399535298e-05,
"loss": 0.6996,
"step": 1897
},
{
"epoch": 0.4063043536431993,
"grad_norm": 0.24610753264990934,
"learning_rate": 2.6162244685148212e-05,
"loss": 0.7157,
"step": 1898
},
{
"epoch": 0.40651842337641486,
"grad_norm": 0.23955346304070563,
"learning_rate": 2.614931612628026e-05,
"loss": 0.6938,
"step": 1899
},
{
"epoch": 0.40673249310963044,
"grad_norm": 0.21850323457991389,
"learning_rate": 2.6136384728899236e-05,
"loss": 0.7198,
"step": 1900
},
{
"epoch": 0.4069465628428461,
"grad_norm": 0.273305980195453,
"learning_rate": 2.6123450498974263e-05,
"loss": 0.7383,
"step": 1901
},
{
"epoch": 0.40716063257606167,
"grad_norm": 0.24466052256934773,
"learning_rate": 2.6110513442475743e-05,
"loss": 0.7088,
"step": 1902
},
{
"epoch": 0.40737470230927725,
"grad_norm": 0.24202780793832876,
"learning_rate": 2.6097573565375412e-05,
"loss": 0.7186,
"step": 1903
},
{
"epoch": 0.40758877204249283,
"grad_norm": 0.2549373895446313,
"learning_rate": 2.6084630873646278e-05,
"loss": 0.7164,
"step": 1904
},
{
"epoch": 0.4078028417757084,
"grad_norm": 0.21691531059719876,
"learning_rate": 2.6071685373262668e-05,
"loss": 0.7145,
"step": 1905
},
{
"epoch": 0.40801691150892405,
"grad_norm": 0.25176802444622576,
"learning_rate": 2.605873707020021e-05,
"loss": 0.6862,
"step": 1906
},
{
"epoch": 0.40823098124213963,
"grad_norm": 0.22918949951025416,
"learning_rate": 2.604578597043581e-05,
"loss": 0.7233,
"step": 1907
},
{
"epoch": 0.4084450509753552,
"grad_norm": 0.19971965526751126,
"learning_rate": 2.6032832079947676e-05,
"loss": 0.7391,
"step": 1908
},
{
"epoch": 0.4086591207085708,
"grad_norm": 0.24064664225175378,
"learning_rate": 2.6019875404715293e-05,
"loss": 0.711,
"step": 1909
},
{
"epoch": 0.40887319044178644,
"grad_norm": 0.20561373418668893,
"learning_rate": 2.6006915950719444e-05,
"loss": 0.7371,
"step": 1910
},
{
"epoch": 0.409087260175002,
"grad_norm": 0.22462580804939875,
"learning_rate": 2.599395372394219e-05,
"loss": 0.7016,
"step": 1911
},
{
"epoch": 0.4093013299082176,
"grad_norm": 0.19432680958986376,
"learning_rate": 2.598098873036687e-05,
"loss": 0.7179,
"step": 1912
},
{
"epoch": 0.4095153996414332,
"grad_norm": 0.20345685887146073,
"learning_rate": 2.59680209759781e-05,
"loss": 0.7404,
"step": 1913
},
{
"epoch": 0.40972946937464877,
"grad_norm": 0.204595466035068,
"learning_rate": 2.595505046676177e-05,
"loss": 0.7383,
"step": 1914
},
{
"epoch": 0.4099435391078644,
"grad_norm": 0.22033488011141625,
"learning_rate": 2.5942077208705043e-05,
"loss": 0.7286,
"step": 1915
},
{
"epoch": 0.41015760884108,
"grad_norm": 0.1911724005421792,
"learning_rate": 2.592910120779636e-05,
"loss": 0.7083,
"step": 1916
},
{
"epoch": 0.4103716785742956,
"grad_norm": 0.2325893700714623,
"learning_rate": 2.5916122470025414e-05,
"loss": 0.702,
"step": 1917
},
{
"epoch": 0.41058574830751116,
"grad_norm": 0.20345234686251643,
"learning_rate": 2.5903141001383162e-05,
"loss": 0.7079,
"step": 1918
},
{
"epoch": 0.4107998180407268,
"grad_norm": 0.2094767112427494,
"learning_rate": 2.5890156807861832e-05,
"loss": 0.7248,
"step": 1919
},
{
"epoch": 0.4110138877739424,
"grad_norm": 0.20783405251155448,
"learning_rate": 2.5877169895454902e-05,
"loss": 0.6962,
"step": 1920
},
{
"epoch": 0.41122795750715796,
"grad_norm": 0.1989495634669376,
"learning_rate": 2.58641802701571e-05,
"loss": 0.7393,
"step": 1921
},
{
"epoch": 0.41144202724037354,
"grad_norm": 0.2066335761778752,
"learning_rate": 2.5851187937964426e-05,
"loss": 0.7257,
"step": 1922
},
{
"epoch": 0.4116560969735891,
"grad_norm": 0.2094826026211216,
"learning_rate": 2.5838192904874114e-05,
"loss": 0.6955,
"step": 1923
},
{
"epoch": 0.41187016670680476,
"grad_norm": 0.1910822520168472,
"learning_rate": 2.5825195176884634e-05,
"loss": 0.7483,
"step": 1924
},
{
"epoch": 0.41208423644002035,
"grad_norm": 0.2150962594241971,
"learning_rate": 2.581219475999573e-05,
"loss": 0.7212,
"step": 1925
},
{
"epoch": 0.41229830617323593,
"grad_norm": 0.2150972987064017,
"learning_rate": 2.5799191660208366e-05,
"loss": 0.6952,
"step": 1926
},
{
"epoch": 0.4125123759064515,
"grad_norm": 0.18914642558669026,
"learning_rate": 2.578618588352475e-05,
"loss": 0.7445,
"step": 1927
},
{
"epoch": 0.4127264456396671,
"grad_norm": 0.2255497767475264,
"learning_rate": 2.5773177435948315e-05,
"loss": 0.719,
"step": 1928
},
{
"epoch": 0.41294051537288273,
"grad_norm": 0.2072085937871065,
"learning_rate": 2.5760166323483747e-05,
"loss": 0.6834,
"step": 1929
},
{
"epoch": 0.4131545851060983,
"grad_norm": 0.20275291633445486,
"learning_rate": 2.574715255213695e-05,
"loss": 0.7173,
"step": 1930
},
{
"epoch": 0.4133686548393139,
"grad_norm": 0.21153638621965765,
"learning_rate": 2.5734136127915053e-05,
"loss": 0.7049,
"step": 1931
},
{
"epoch": 0.4135827245725295,
"grad_norm": 0.20017657516515155,
"learning_rate": 2.572111705682642e-05,
"loss": 0.7027,
"step": 1932
},
{
"epoch": 0.4137967943057451,
"grad_norm": 0.1967481674114891,
"learning_rate": 2.5708095344880627e-05,
"loss": 0.6984,
"step": 1933
},
{
"epoch": 0.4140108640389607,
"grad_norm": 0.2046817295814358,
"learning_rate": 2.5695070998088465e-05,
"loss": 0.7212,
"step": 1934
},
{
"epoch": 0.4142249337721763,
"grad_norm": 0.19165665027618903,
"learning_rate": 2.568204402246196e-05,
"loss": 0.7316,
"step": 1935
},
{
"epoch": 0.41443900350539187,
"grad_norm": 0.19922866996038252,
"learning_rate": 2.5669014424014335e-05,
"loss": 0.701,
"step": 1936
},
{
"epoch": 0.41465307323860745,
"grad_norm": 0.2017645241746894,
"learning_rate": 2.5655982208760032e-05,
"loss": 0.7472,
"step": 1937
},
{
"epoch": 0.4148671429718231,
"grad_norm": 0.18544441818376398,
"learning_rate": 2.5642947382714693e-05,
"loss": 0.7339,
"step": 1938
},
{
"epoch": 0.41508121270503867,
"grad_norm": 0.20047856917863255,
"learning_rate": 2.562990995189517e-05,
"loss": 0.7296,
"step": 1939
},
{
"epoch": 0.41529528243825425,
"grad_norm": 0.19468005756206275,
"learning_rate": 2.5616869922319523e-05,
"loss": 0.6956,
"step": 1940
},
{
"epoch": 0.41550935217146984,
"grad_norm": 0.21796914272342383,
"learning_rate": 2.5603827300007e-05,
"loss": 0.7219,
"step": 1941
},
{
"epoch": 0.4157234219046855,
"grad_norm": 0.20537795000094566,
"learning_rate": 2.559078209097805e-05,
"loss": 0.7526,
"step": 1942
},
{
"epoch": 0.41593749163790106,
"grad_norm": 0.20912238933307317,
"learning_rate": 2.5577734301254326e-05,
"loss": 0.7083,
"step": 1943
},
{
"epoch": 0.41615156137111664,
"grad_norm": 0.2097315785188199,
"learning_rate": 2.5564683936858656e-05,
"loss": 0.7165,
"step": 1944
},
{
"epoch": 0.4163656311043322,
"grad_norm": 0.1964083983221885,
"learning_rate": 2.5551631003815073e-05,
"loss": 0.7257,
"step": 1945
},
{
"epoch": 0.4165797008375478,
"grad_norm": 0.21864109289469968,
"learning_rate": 2.553857550814877e-05,
"loss": 0.7024,
"step": 1946
},
{
"epoch": 0.41679377057076344,
"grad_norm": 0.2129954779508628,
"learning_rate": 2.552551745588616e-05,
"loss": 0.7068,
"step": 1947
},
{
"epoch": 0.417007840303979,
"grad_norm": 0.20250983850460993,
"learning_rate": 2.551245685305481e-05,
"loss": 0.7009,
"step": 1948
},
{
"epoch": 0.4172219100371946,
"grad_norm": 0.2321421577627822,
"learning_rate": 2.5499393705683463e-05,
"loss": 0.7214,
"step": 1949
},
{
"epoch": 0.4174359797704102,
"grad_norm": 0.22581848164609403,
"learning_rate": 2.5486328019802048e-05,
"loss": 0.7387,
"step": 1950
},
{
"epoch": 0.41765004950362583,
"grad_norm": 0.20473702821534412,
"learning_rate": 2.5473259801441663e-05,
"loss": 0.7036,
"step": 1951
},
{
"epoch": 0.4178641192368414,
"grad_norm": 0.2272726521511251,
"learning_rate": 2.546018905663457e-05,
"loss": 0.7,
"step": 1952
},
{
"epoch": 0.418078188970057,
"grad_norm": 0.19126378115809958,
"learning_rate": 2.5447115791414206e-05,
"loss": 0.7024,
"step": 1953
},
{
"epoch": 0.4182922587032726,
"grad_norm": 0.21806172821592826,
"learning_rate": 2.543404001181516e-05,
"loss": 0.7166,
"step": 1954
},
{
"epoch": 0.41850632843648816,
"grad_norm": 0.19743518950519123,
"learning_rate": 2.54209617238732e-05,
"loss": 0.7284,
"step": 1955
},
{
"epoch": 0.4187203981697038,
"grad_norm": 0.21011429539358614,
"learning_rate": 2.5407880933625234e-05,
"loss": 0.6994,
"step": 1956
},
{
"epoch": 0.4189344679029194,
"grad_norm": 0.18938765974344965,
"learning_rate": 2.539479764710932e-05,
"loss": 0.7151,
"step": 1957
},
{
"epoch": 0.41914853763613497,
"grad_norm": 0.19169633785753745,
"learning_rate": 2.5381711870364685e-05,
"loss": 0.7416,
"step": 1958
},
{
"epoch": 0.41936260736935055,
"grad_norm": 0.20286117142219692,
"learning_rate": 2.5368623609431707e-05,
"loss": 0.7366,
"step": 1959
},
{
"epoch": 0.4195766771025662,
"grad_norm": 0.18160568259129048,
"learning_rate": 2.5355532870351902e-05,
"loss": 0.7102,
"step": 1960
},
{
"epoch": 0.41979074683578177,
"grad_norm": 0.21587986305313922,
"learning_rate": 2.5342439659167924e-05,
"loss": 0.6974,
"step": 1961
},
{
"epoch": 0.42000481656899735,
"grad_norm": 0.19979989518781668,
"learning_rate": 2.5329343981923584e-05,
"loss": 0.7029,
"step": 1962
},
{
"epoch": 0.42021888630221294,
"grad_norm": 0.2024292381380751,
"learning_rate": 2.5316245844663813e-05,
"loss": 0.7171,
"step": 1963
},
{
"epoch": 0.4204329560354285,
"grad_norm": 0.21342938170999215,
"learning_rate": 2.5303145253434692e-05,
"loss": 0.6812,
"step": 1964
},
{
"epoch": 0.42064702576864416,
"grad_norm": 0.204676288891744,
"learning_rate": 2.529004221428343e-05,
"loss": 0.7186,
"step": 1965
},
{
"epoch": 0.42086109550185974,
"grad_norm": 0.18963774223060337,
"learning_rate": 2.527693673325836e-05,
"loss": 0.7496,
"step": 1966
},
{
"epoch": 0.4210751652350753,
"grad_norm": 0.2454498865550515,
"learning_rate": 2.5263828816408963e-05,
"loss": 0.6841,
"step": 1967
},
{
"epoch": 0.4212892349682909,
"grad_norm": 0.19192947876362684,
"learning_rate": 2.5250718469785812e-05,
"loss": 0.7093,
"step": 1968
},
{
"epoch": 0.42150330470150654,
"grad_norm": 0.20654780614157478,
"learning_rate": 2.523760569944063e-05,
"loss": 0.7041,
"step": 1969
},
{
"epoch": 0.4217173744347221,
"grad_norm": 0.19539311908289664,
"learning_rate": 2.522449051142625e-05,
"loss": 0.7101,
"step": 1970
},
{
"epoch": 0.4219314441679377,
"grad_norm": 0.20980351531615526,
"learning_rate": 2.5211372911796613e-05,
"loss": 0.7144,
"step": 1971
},
{
"epoch": 0.4221455139011533,
"grad_norm": 0.20143555994115453,
"learning_rate": 2.5198252906606778e-05,
"loss": 0.7062,
"step": 1972
},
{
"epoch": 0.4223595836343689,
"grad_norm": 0.23482288302307963,
"learning_rate": 2.5185130501912913e-05,
"loss": 0.7194,
"step": 1973
},
{
"epoch": 0.4225736533675845,
"grad_norm": 0.2629980468265691,
"learning_rate": 2.5172005703772306e-05,
"loss": 0.7517,
"step": 1974
},
{
"epoch": 0.4227877231008001,
"grad_norm": 0.28126509097849256,
"learning_rate": 2.515887851824333e-05,
"loss": 0.7169,
"step": 1975
},
{
"epoch": 0.4230017928340157,
"grad_norm": 0.22778076892904672,
"learning_rate": 2.5145748951385475e-05,
"loss": 0.718,
"step": 1976
},
{
"epoch": 0.42321586256723126,
"grad_norm": 0.23880179955790307,
"learning_rate": 2.5132617009259324e-05,
"loss": 0.7389,
"step": 1977
},
{
"epoch": 0.42342993230044684,
"grad_norm": 0.26830862385188575,
"learning_rate": 2.511948269792656e-05,
"loss": 0.7002,
"step": 1978
},
{
"epoch": 0.4236440020336625,
"grad_norm": 0.19470907074977814,
"learning_rate": 2.5106346023449944e-05,
"loss": 0.7097,
"step": 1979
},
{
"epoch": 0.42385807176687806,
"grad_norm": 0.1917374137557521,
"learning_rate": 2.509320699189336e-05,
"loss": 0.7205,
"step": 1980
},
{
"epoch": 0.42407214150009365,
"grad_norm": 0.22858071347959596,
"learning_rate": 2.5080065609321755e-05,
"loss": 0.7334,
"step": 1981
},
{
"epoch": 0.42428621123330923,
"grad_norm": 0.19521587171761723,
"learning_rate": 2.506692188180116e-05,
"loss": 0.6953,
"step": 1982
},
{
"epoch": 0.42450028096652487,
"grad_norm": 0.2044269464457987,
"learning_rate": 2.5053775815398698e-05,
"loss": 0.6957,
"step": 1983
},
{
"epoch": 0.42471435069974045,
"grad_norm": 0.1949724665009927,
"learning_rate": 2.504062741618257e-05,
"loss": 0.7169,
"step": 1984
},
{
"epoch": 0.42492842043295603,
"grad_norm": 0.18414223116499662,
"learning_rate": 2.5027476690222058e-05,
"loss": 0.735,
"step": 1985
},
{
"epoch": 0.4251424901661716,
"grad_norm": 0.20613056383046116,
"learning_rate": 2.5014323643587504e-05,
"loss": 0.6837,
"step": 1986
},
{
"epoch": 0.4253565598993872,
"grad_norm": 0.18405903615562325,
"learning_rate": 2.5001168282350338e-05,
"loss": 0.7247,
"step": 1987
},
{
"epoch": 0.42557062963260284,
"grad_norm": 0.20336121514951713,
"learning_rate": 2.4988010612583053e-05,
"loss": 0.712,
"step": 1988
},
{
"epoch": 0.4257846993658184,
"grad_norm": 0.19351664028649984,
"learning_rate": 2.4974850640359192e-05,
"loss": 0.7462,
"step": 1989
},
{
"epoch": 0.425998769099034,
"grad_norm": 0.2014274630453742,
"learning_rate": 2.4961688371753385e-05,
"loss": 0.7053,
"step": 1990
},
{
"epoch": 0.4262128388322496,
"grad_norm": 0.22483785467393871,
"learning_rate": 2.494852381284131e-05,
"loss": 0.7255,
"step": 1991
},
{
"epoch": 0.4264269085654652,
"grad_norm": 0.19252394472523177,
"learning_rate": 2.49353569696997e-05,
"loss": 0.7224,
"step": 1992
},
{
"epoch": 0.4266409782986808,
"grad_norm": 0.23309321637811792,
"learning_rate": 2.4922187848406348e-05,
"loss": 0.7487,
"step": 1993
},
{
"epoch": 0.4268550480318964,
"grad_norm": 0.1979293761717918,
"learning_rate": 2.490901645504009e-05,
"loss": 0.7257,
"step": 1994
},
{
"epoch": 0.42706911776511197,
"grad_norm": 0.20447486580244634,
"learning_rate": 2.4895842795680834e-05,
"loss": 0.6863,
"step": 1995
},
{
"epoch": 0.42728318749832755,
"grad_norm": 0.22180729390738765,
"learning_rate": 2.4882666876409495e-05,
"loss": 0.7529,
"step": 1996
},
{
"epoch": 0.4274972572315432,
"grad_norm": 0.20562832861605798,
"learning_rate": 2.486948870330807e-05,
"loss": 0.7051,
"step": 1997
},
{
"epoch": 0.4277113269647588,
"grad_norm": 0.2835800373436375,
"learning_rate": 2.4856308282459575e-05,
"loss": 0.7083,
"step": 1998
},
{
"epoch": 0.42792539669797436,
"grad_norm": 0.2125601755785729,
"learning_rate": 2.4843125619948064e-05,
"loss": 0.706,
"step": 1999
},
{
"epoch": 0.42813946643118994,
"grad_norm": 0.19951958840163336,
"learning_rate": 2.482994072185863e-05,
"loss": 0.7333,
"step": 2000
},
{
"epoch": 0.4283535361644056,
"grad_norm": 0.23352727958314665,
"learning_rate": 2.4816753594277402e-05,
"loss": 0.7056,
"step": 2001
},
{
"epoch": 0.42856760589762116,
"grad_norm": 0.17971296646507665,
"learning_rate": 2.4803564243291534e-05,
"loss": 0.7399,
"step": 2002
},
{
"epoch": 0.42878167563083674,
"grad_norm": 0.21442277455899422,
"learning_rate": 2.4790372674989205e-05,
"loss": 0.6932,
"step": 2003
},
{
"epoch": 0.4289957453640523,
"grad_norm": 0.21751520674543545,
"learning_rate": 2.4777178895459617e-05,
"loss": 0.7191,
"step": 2004
},
{
"epoch": 0.4292098150972679,
"grad_norm": 0.18357623494158104,
"learning_rate": 2.4763982910792993e-05,
"loss": 0.7189,
"step": 2005
},
{
"epoch": 0.42942388483048355,
"grad_norm": 0.19412430991067642,
"learning_rate": 2.475078472708058e-05,
"loss": 0.7033,
"step": 2006
},
{
"epoch": 0.42963795456369913,
"grad_norm": 0.20851442089275044,
"learning_rate": 2.4737584350414635e-05,
"loss": 0.6986,
"step": 2007
},
{
"epoch": 0.4298520242969147,
"grad_norm": 0.2012724231766711,
"learning_rate": 2.4724381786888426e-05,
"loss": 0.6853,
"step": 2008
},
{
"epoch": 0.4300660940301303,
"grad_norm": 0.19014397968561714,
"learning_rate": 2.4711177042596232e-05,
"loss": 0.7229,
"step": 2009
},
{
"epoch": 0.43028016376334594,
"grad_norm": 0.2031082273686637,
"learning_rate": 2.469797012363334e-05,
"loss": 0.7419,
"step": 2010
},
{
"epoch": 0.4304942334965615,
"grad_norm": 0.18665928937625678,
"learning_rate": 2.4684761036096036e-05,
"loss": 0.7099,
"step": 2011
},
{
"epoch": 0.4307083032297771,
"grad_norm": 0.19617117047536298,
"learning_rate": 2.4671549786081615e-05,
"loss": 0.7343,
"step": 2012
},
{
"epoch": 0.4309223729629927,
"grad_norm": 0.20369053805829532,
"learning_rate": 2.4658336379688366e-05,
"loss": 0.7151,
"step": 2013
},
{
"epoch": 0.43113644269620827,
"grad_norm": 0.18934183530174764,
"learning_rate": 2.4645120823015572e-05,
"loss": 0.7272,
"step": 2014
},
{
"epoch": 0.4313505124294239,
"grad_norm": 0.2001952717059697,
"learning_rate": 2.463190312216351e-05,
"loss": 0.6889,
"step": 2015
},
{
"epoch": 0.4315645821626395,
"grad_norm": 0.1989442744871271,
"learning_rate": 2.461868328323344e-05,
"loss": 0.7228,
"step": 2016
},
{
"epoch": 0.43177865189585507,
"grad_norm": 0.21450949569335329,
"learning_rate": 2.4605461312327624e-05,
"loss": 0.7149,
"step": 2017
},
{
"epoch": 0.43199272162907065,
"grad_norm": 0.20680933027804005,
"learning_rate": 2.4592237215549305e-05,
"loss": 0.7267,
"step": 2018
},
{
"epoch": 0.4322067913622863,
"grad_norm": 0.20812795097276574,
"learning_rate": 2.4579010999002683e-05,
"loss": 0.7183,
"step": 2019
},
{
"epoch": 0.4324208610955019,
"grad_norm": 0.24126825216101877,
"learning_rate": 2.4565782668792975e-05,
"loss": 0.7179,
"step": 2020
},
{
"epoch": 0.43263493082871746,
"grad_norm": 0.1933731221445781,
"learning_rate": 2.4552552231026337e-05,
"loss": 0.7175,
"step": 2021
},
{
"epoch": 0.43284900056193304,
"grad_norm": 0.23747618529883308,
"learning_rate": 2.4539319691809924e-05,
"loss": 0.7302,
"step": 2022
},
{
"epoch": 0.4330630702951486,
"grad_norm": 0.21002887072082252,
"learning_rate": 2.4526085057251856e-05,
"loss": 0.7075,
"step": 2023
},
{
"epoch": 0.43327714002836426,
"grad_norm": 0.2096821368675051,
"learning_rate": 2.4512848333461206e-05,
"loss": 0.7227,
"step": 2024
},
{
"epoch": 0.43349120976157984,
"grad_norm": 0.24851552961252657,
"learning_rate": 2.4499609526548033e-05,
"loss": 0.694,
"step": 2025
},
{
"epoch": 0.4337052794947954,
"grad_norm": 0.20719164220240477,
"learning_rate": 2.4486368642623327e-05,
"loss": 0.7098,
"step": 2026
},
{
"epoch": 0.433919349228011,
"grad_norm": 0.20652034437446656,
"learning_rate": 2.447312568779908e-05,
"loss": 0.7256,
"step": 2027
},
{
"epoch": 0.43413341896122665,
"grad_norm": 0.22951985548119255,
"learning_rate": 2.44598806681882e-05,
"loss": 0.7082,
"step": 2028
},
{
"epoch": 0.43434748869444223,
"grad_norm": 0.18357655157498282,
"learning_rate": 2.4446633589904564e-05,
"loss": 0.6882,
"step": 2029
},
{
"epoch": 0.4345615584276578,
"grad_norm": 0.2221418129967864,
"learning_rate": 2.443338445906301e-05,
"loss": 0.71,
"step": 2030
},
{
"epoch": 0.4347756281608734,
"grad_norm": 0.19384174925665745,
"learning_rate": 2.4420133281779297e-05,
"loss": 0.6931,
"step": 2031
},
{
"epoch": 0.434989697894089,
"grad_norm": 0.19453526708516902,
"learning_rate": 2.4406880064170156e-05,
"loss": 0.7394,
"step": 2032
},
{
"epoch": 0.4352037676273046,
"grad_norm": 0.20168817080497953,
"learning_rate": 2.439362481235325e-05,
"loss": 0.7099,
"step": 2033
},
{
"epoch": 0.4354178373605202,
"grad_norm": 0.1923342230318402,
"learning_rate": 2.4380367532447168e-05,
"loss": 0.7287,
"step": 2034
},
{
"epoch": 0.4356319070937358,
"grad_norm": 0.20388412643436957,
"learning_rate": 2.4367108230571453e-05,
"loss": 0.6853,
"step": 2035
},
{
"epoch": 0.43584597682695136,
"grad_norm": 0.2078287711057543,
"learning_rate": 2.4353846912846567e-05,
"loss": 0.7216,
"step": 2036
},
{
"epoch": 0.43606004656016695,
"grad_norm": 0.17972524824407618,
"learning_rate": 2.4340583585393925e-05,
"loss": 0.6891,
"step": 2037
},
{
"epoch": 0.4362741162933826,
"grad_norm": 0.1871383780412847,
"learning_rate": 2.4327318254335845e-05,
"loss": 0.711,
"step": 2038
},
{
"epoch": 0.43648818602659817,
"grad_norm": 0.1876217730924942,
"learning_rate": 2.4314050925795578e-05,
"loss": 0.7329,
"step": 2039
},
{
"epoch": 0.43670225575981375,
"grad_norm": 0.18719098254903513,
"learning_rate": 2.43007816058973e-05,
"loss": 0.7131,
"step": 2040
},
{
"epoch": 0.43691632549302933,
"grad_norm": 0.18356727762805758,
"learning_rate": 2.4287510300766107e-05,
"loss": 0.6964,
"step": 2041
},
{
"epoch": 0.43713039522624497,
"grad_norm": 0.17536655782284724,
"learning_rate": 2.4274237016528e-05,
"loss": 0.6883,
"step": 2042
},
{
"epoch": 0.43734446495946055,
"grad_norm": 0.19540611769329763,
"learning_rate": 2.426096175930992e-05,
"loss": 0.7179,
"step": 2043
},
{
"epoch": 0.43755853469267614,
"grad_norm": 0.1856451253028403,
"learning_rate": 2.424768453523969e-05,
"loss": 0.7021,
"step": 2044
},
{
"epoch": 0.4377726044258917,
"grad_norm": 0.19665118427903588,
"learning_rate": 2.4234405350446055e-05,
"loss": 0.7191,
"step": 2045
},
{
"epoch": 0.4379866741591073,
"grad_norm": 0.27210370905867626,
"learning_rate": 2.422112421105866e-05,
"loss": 0.7391,
"step": 2046
},
{
"epoch": 0.43820074389232294,
"grad_norm": 0.2049765553860846,
"learning_rate": 2.4207841123208055e-05,
"loss": 0.7298,
"step": 2047
},
{
"epoch": 0.4384148136255385,
"grad_norm": 0.18648693299756902,
"learning_rate": 2.419455609302569e-05,
"loss": 0.7176,
"step": 2048
},
{
"epoch": 0.4386288833587541,
"grad_norm": 0.1973831653039735,
"learning_rate": 2.4181269126643918e-05,
"loss": 0.686,
"step": 2049
},
{
"epoch": 0.4388429530919697,
"grad_norm": 0.18632585455591297,
"learning_rate": 2.416798023019596e-05,
"loss": 0.6905,
"step": 2050
},
{
"epoch": 0.43905702282518533,
"grad_norm": 0.18149024732686886,
"learning_rate": 2.4154689409815967e-05,
"loss": 0.6879,
"step": 2051
},
{
"epoch": 0.4392710925584009,
"grad_norm": 0.21353439002266103,
"learning_rate": 2.414139667163894e-05,
"loss": 0.733,
"step": 2052
},
{
"epoch": 0.4394851622916165,
"grad_norm": 0.18791274780190753,
"learning_rate": 2.4128102021800794e-05,
"loss": 0.7366,
"step": 2053
},
{
"epoch": 0.4396992320248321,
"grad_norm": 0.1896264638812108,
"learning_rate": 2.4114805466438315e-05,
"loss": 0.7141,
"step": 2054
},
{
"epoch": 0.43991330175804766,
"grad_norm": 0.18831087940307026,
"learning_rate": 2.4101507011689162e-05,
"loss": 0.711,
"step": 2055
},
{
"epoch": 0.4401273714912633,
"grad_norm": 0.18427173728735963,
"learning_rate": 2.408820666369188e-05,
"loss": 0.7197,
"step": 2056
},
{
"epoch": 0.4403414412244789,
"grad_norm": 0.19079804512387546,
"learning_rate": 2.4074904428585884e-05,
"loss": 0.6952,
"step": 2057
},
{
"epoch": 0.44055551095769446,
"grad_norm": 0.1897212969120429,
"learning_rate": 2.4061600312511468e-05,
"loss": 0.7489,
"step": 2058
},
{
"epoch": 0.44076958069091005,
"grad_norm": 0.19581239342259346,
"learning_rate": 2.4048294321609782e-05,
"loss": 0.7612,
"step": 2059
},
{
"epoch": 0.4409836504241257,
"grad_norm": 0.19963804135525962,
"learning_rate": 2.4034986462022847e-05,
"loss": 0.7355,
"step": 2060
},
{
"epoch": 0.44119772015734127,
"grad_norm": 0.18445213879820282,
"learning_rate": 2.4021676739893547e-05,
"loss": 0.6854,
"step": 2061
},
{
"epoch": 0.44141178989055685,
"grad_norm": 0.19668533263438023,
"learning_rate": 2.4008365161365624e-05,
"loss": 0.7418,
"step": 2062
},
{
"epoch": 0.44162585962377243,
"grad_norm": 0.20749067960177486,
"learning_rate": 2.3995051732583684e-05,
"loss": 0.7091,
"step": 2063
},
{
"epoch": 0.441839929356988,
"grad_norm": 0.2081349996777893,
"learning_rate": 2.3981736459693172e-05,
"loss": 0.7311,
"step": 2064
},
{
"epoch": 0.44205399909020365,
"grad_norm": 0.20498156314794147,
"learning_rate": 2.3968419348840403e-05,
"loss": 0.7133,
"step": 2065
},
{
"epoch": 0.44226806882341924,
"grad_norm": 0.20214150959688085,
"learning_rate": 2.3955100406172533e-05,
"loss": 0.7189,
"step": 2066
},
{
"epoch": 0.4424821385566348,
"grad_norm": 0.18175552919557034,
"learning_rate": 2.394177963783755e-05,
"loss": 0.7188,
"step": 2067
},
{
"epoch": 0.4426962082898504,
"grad_norm": 0.21315627286015912,
"learning_rate": 2.3928457049984294e-05,
"loss": 0.7003,
"step": 2068
},
{
"epoch": 0.44291027802306604,
"grad_norm": 0.17134304533018946,
"learning_rate": 2.391513264876246e-05,
"loss": 0.717,
"step": 2069
},
{
"epoch": 0.4431243477562816,
"grad_norm": 0.20499242709853013,
"learning_rate": 2.390180644032257e-05,
"loss": 0.6917,
"step": 2070
},
{
"epoch": 0.4433384174894972,
"grad_norm": 0.18718900299139493,
"learning_rate": 2.3888478430815963e-05,
"loss": 0.6969,
"step": 2071
},
{
"epoch": 0.4435524872227128,
"grad_norm": 0.18602463829878715,
"learning_rate": 2.387514862639483e-05,
"loss": 0.7029,
"step": 2072
},
{
"epoch": 0.44376655695592837,
"grad_norm": 0.22042874220191952,
"learning_rate": 2.3861817033212185e-05,
"loss": 0.6947,
"step": 2073
},
{
"epoch": 0.443980626689144,
"grad_norm": 0.18009310184104058,
"learning_rate": 2.3848483657421868e-05,
"loss": 0.7088,
"step": 2074
},
{
"epoch": 0.4441946964223596,
"grad_norm": 0.2006868546139837,
"learning_rate": 2.383514850517854e-05,
"loss": 0.6993,
"step": 2075
},
{
"epoch": 0.4444087661555752,
"grad_norm": 0.2062353966689945,
"learning_rate": 2.3821811582637687e-05,
"loss": 0.7176,
"step": 2076
},
{
"epoch": 0.44462283588879076,
"grad_norm": 0.18428277020755152,
"learning_rate": 2.38084728959556e-05,
"loss": 0.7136,
"step": 2077
},
{
"epoch": 0.4448369056220064,
"grad_norm": 0.19040739557886138,
"learning_rate": 2.379513245128939e-05,
"loss": 0.7177,
"step": 2078
},
{
"epoch": 0.445050975355222,
"grad_norm": 0.181095972545011,
"learning_rate": 2.3781790254796993e-05,
"loss": 0.7115,
"step": 2079
},
{
"epoch": 0.44526504508843756,
"grad_norm": 0.213847801473037,
"learning_rate": 2.3768446312637137e-05,
"loss": 0.7004,
"step": 2080
},
{
"epoch": 0.44547911482165314,
"grad_norm": 0.2064309884570408,
"learning_rate": 2.375510063096936e-05,
"loss": 0.7197,
"step": 2081
},
{
"epoch": 0.4456931845548687,
"grad_norm": 0.1890483606201992,
"learning_rate": 2.374175321595401e-05,
"loss": 0.6993,
"step": 2082
},
{
"epoch": 0.44590725428808436,
"grad_norm": 0.21580698521259575,
"learning_rate": 2.372840407375222e-05,
"loss": 0.7168,
"step": 2083
},
{
"epoch": 0.44612132402129995,
"grad_norm": 0.18883476811571928,
"learning_rate": 2.3715053210525937e-05,
"loss": 0.7019,
"step": 2084
},
{
"epoch": 0.44633539375451553,
"grad_norm": 0.18565956158490704,
"learning_rate": 2.3701700632437892e-05,
"loss": 0.725,
"step": 2085
},
{
"epoch": 0.4465494634877311,
"grad_norm": 0.2081209436161066,
"learning_rate": 2.3688346345651612e-05,
"loss": 0.7163,
"step": 2086
},
{
"epoch": 0.4467635332209467,
"grad_norm": 0.18502706911103697,
"learning_rate": 2.367499035633141e-05,
"loss": 0.7079,
"step": 2087
},
{
"epoch": 0.44697760295416233,
"grad_norm": 0.19961209619379117,
"learning_rate": 2.3661632670642386e-05,
"loss": 0.7405,
"step": 2088
},
{
"epoch": 0.4471916726873779,
"grad_norm": 0.18601078708322014,
"learning_rate": 2.3648273294750425e-05,
"loss": 0.6957,
"step": 2089
},
{
"epoch": 0.4474057424205935,
"grad_norm": 0.20980173919175385,
"learning_rate": 2.3634912234822194e-05,
"loss": 0.7033,
"step": 2090
},
{
"epoch": 0.4476198121538091,
"grad_norm": 0.19028429850671252,
"learning_rate": 2.3621549497025118e-05,
"loss": 0.706,
"step": 2091
},
{
"epoch": 0.4478338818870247,
"grad_norm": 0.186215889007293,
"learning_rate": 2.3608185087527432e-05,
"loss": 0.7038,
"step": 2092
},
{
"epoch": 0.4480479516202403,
"grad_norm": 0.18355418658930112,
"learning_rate": 2.3594819012498115e-05,
"loss": 0.6964,
"step": 2093
},
{
"epoch": 0.4482620213534559,
"grad_norm": 0.18758532546585963,
"learning_rate": 2.3581451278106924e-05,
"loss": 0.7057,
"step": 2094
},
{
"epoch": 0.44847609108667147,
"grad_norm": 0.19956411209155378,
"learning_rate": 2.356808189052437e-05,
"loss": 0.7236,
"step": 2095
},
{
"epoch": 0.44869016081988705,
"grad_norm": 0.1778401001903628,
"learning_rate": 2.3554710855921756e-05,
"loss": 0.7422,
"step": 2096
},
{
"epoch": 0.4489042305531027,
"grad_norm": 0.21045480952717957,
"learning_rate": 2.3541338180471115e-05,
"loss": 0.6927,
"step": 2097
},
{
"epoch": 0.4491183002863183,
"grad_norm": 0.19311618494876245,
"learning_rate": 2.352796387034525e-05,
"loss": 0.7094,
"step": 2098
},
{
"epoch": 0.44933237001953386,
"grad_norm": 0.19975375908131546,
"learning_rate": 2.3514587931717724e-05,
"loss": 0.723,
"step": 2099
},
{
"epoch": 0.44954643975274944,
"grad_norm": 0.20640555154849616,
"learning_rate": 2.350121037076284e-05,
"loss": 0.7163,
"step": 2100
},
{
"epoch": 0.4497605094859651,
"grad_norm": 0.1948182699005542,
"learning_rate": 2.3487831193655666e-05,
"loss": 0.719,
"step": 2101
},
{
"epoch": 0.44997457921918066,
"grad_norm": 0.19614745708909373,
"learning_rate": 2.347445040657199e-05,
"loss": 0.7032,
"step": 2102
},
{
"epoch": 0.45018864895239624,
"grad_norm": 0.21701631897555446,
"learning_rate": 2.3461068015688372e-05,
"loss": 0.6824,
"step": 2103
},
{
"epoch": 0.4504027186856118,
"grad_norm": 0.19471397707591673,
"learning_rate": 2.344768402718209e-05,
"loss": 0.7108,
"step": 2104
},
{
"epoch": 0.4506167884188274,
"grad_norm": 0.3836571163793701,
"learning_rate": 2.3434298447231165e-05,
"loss": 0.7346,
"step": 2105
},
{
"epoch": 0.45083085815204305,
"grad_norm": 0.18322266761973127,
"learning_rate": 2.3420911282014373e-05,
"loss": 0.705,
"step": 2106
},
{
"epoch": 0.45104492788525863,
"grad_norm": 0.21850351001536295,
"learning_rate": 2.340752253771119e-05,
"loss": 0.7049,
"step": 2107
},
{
"epoch": 0.4512589976184742,
"grad_norm": 0.18326116754485203,
"learning_rate": 2.339413222050185e-05,
"loss": 0.758,
"step": 2108
},
{
"epoch": 0.4514730673516898,
"grad_norm": 0.22517947759114104,
"learning_rate": 2.3380740336567285e-05,
"loss": 0.7028,
"step": 2109
},
{
"epoch": 0.45168713708490543,
"grad_norm": 0.22960359498581612,
"learning_rate": 2.3367346892089166e-05,
"loss": 0.7203,
"step": 2110
},
{
"epoch": 0.451901206818121,
"grad_norm": 0.21405634986151675,
"learning_rate": 2.335395189324989e-05,
"loss": 0.7425,
"step": 2111
},
{
"epoch": 0.4521152765513366,
"grad_norm": 0.20123821408705836,
"learning_rate": 2.334055534623256e-05,
"loss": 0.7208,
"step": 2112
},
{
"epoch": 0.4523293462845522,
"grad_norm": 0.20140231544872989,
"learning_rate": 2.3327157257220994e-05,
"loss": 0.7107,
"step": 2113
},
{
"epoch": 0.45254341601776776,
"grad_norm": 0.1804525636336557,
"learning_rate": 2.331375763239973e-05,
"loss": 0.7075,
"step": 2114
},
{
"epoch": 0.4527574857509834,
"grad_norm": 0.26302564080958823,
"learning_rate": 2.3300356477954008e-05,
"loss": 0.7043,
"step": 2115
},
{
"epoch": 0.452971555484199,
"grad_norm": 0.19783153830963043,
"learning_rate": 2.328695380006978e-05,
"loss": 0.7424,
"step": 2116
},
{
"epoch": 0.45318562521741457,
"grad_norm": 0.19999876206622547,
"learning_rate": 2.3273549604933693e-05,
"loss": 0.7164,
"step": 2117
},
{
"epoch": 0.45339969495063015,
"grad_norm": 0.19216671514409614,
"learning_rate": 2.3260143898733106e-05,
"loss": 0.7093,
"step": 2118
},
{
"epoch": 0.4536137646838458,
"grad_norm": 0.20994204910084707,
"learning_rate": 2.3246736687656055e-05,
"loss": 0.7162,
"step": 2119
},
{
"epoch": 0.45382783441706137,
"grad_norm": 0.20692097620296712,
"learning_rate": 2.3233327977891295e-05,
"loss": 0.7248,
"step": 2120
},
{
"epoch": 0.45404190415027695,
"grad_norm": 0.20359848058700244,
"learning_rate": 2.321991777562826e-05,
"loss": 0.7529,
"step": 2121
},
{
"epoch": 0.45425597388349254,
"grad_norm": 0.1959436518862538,
"learning_rate": 2.3206506087057076e-05,
"loss": 0.7134,
"step": 2122
},
{
"epoch": 0.4544700436167081,
"grad_norm": 0.20624523193827454,
"learning_rate": 2.319309291836855e-05,
"loss": 0.7262,
"step": 2123
},
{
"epoch": 0.45468411334992376,
"grad_norm": 0.1960895968942528,
"learning_rate": 2.317967827575418e-05,
"loss": 0.7324,
"step": 2124
},
{
"epoch": 0.45489818308313934,
"grad_norm": 0.21014796008763786,
"learning_rate": 2.316626216540614e-05,
"loss": 0.7394,
"step": 2125
},
{
"epoch": 0.4551122528163549,
"grad_norm": 0.1899205626449186,
"learning_rate": 2.315284459351727e-05,
"loss": 0.6772,
"step": 2126
},
{
"epoch": 0.4553263225495705,
"grad_norm": 0.20052880229653133,
"learning_rate": 2.3139425566281118e-05,
"loss": 0.7412,
"step": 2127
},
{
"epoch": 0.45554039228278614,
"grad_norm": 0.19392973954038528,
"learning_rate": 2.312600508989187e-05,
"loss": 0.7218,
"step": 2128
},
{
"epoch": 0.4557544620160017,
"grad_norm": 0.1892379766232198,
"learning_rate": 2.3112583170544395e-05,
"loss": 0.7103,
"step": 2129
},
{
"epoch": 0.4559685317492173,
"grad_norm": 0.19748742318767742,
"learning_rate": 2.309915981443422e-05,
"loss": 0.7146,
"step": 2130
},
{
"epoch": 0.4561826014824329,
"grad_norm": 0.18467300783025856,
"learning_rate": 2.3085735027757548e-05,
"loss": 0.691,
"step": 2131
},
{
"epoch": 0.4563966712156485,
"grad_norm": 0.20449863566538318,
"learning_rate": 2.3072308816711243e-05,
"loss": 0.7143,
"step": 2132
},
{
"epoch": 0.4566107409488641,
"grad_norm": 0.18470223636476016,
"learning_rate": 2.3058881187492808e-05,
"loss": 0.7254,
"step": 2133
},
{
"epoch": 0.4568248106820797,
"grad_norm": 0.21115686334907097,
"learning_rate": 2.304545214630042e-05,
"loss": 0.6858,
"step": 2134
},
{
"epoch": 0.4570388804152953,
"grad_norm": 0.2070400787293092,
"learning_rate": 2.303202169933289e-05,
"loss": 0.7223,
"step": 2135
},
{
"epoch": 0.45725295014851086,
"grad_norm": 0.20071912284859644,
"learning_rate": 2.30185898527897e-05,
"loss": 0.7186,
"step": 2136
},
{
"epoch": 0.4574670198817265,
"grad_norm": 0.20029551412812613,
"learning_rate": 2.3005156612870954e-05,
"loss": 0.7055,
"step": 2137
},
{
"epoch": 0.4576810896149421,
"grad_norm": 0.19326759988473818,
"learning_rate": 2.2991721985777425e-05,
"loss": 0.7031,
"step": 2138
},
{
"epoch": 0.45789515934815767,
"grad_norm": 0.18588353186976161,
"learning_rate": 2.2978285977710496e-05,
"loss": 0.7005,
"step": 2139
},
{
"epoch": 0.45810922908137325,
"grad_norm": 0.18433258134248923,
"learning_rate": 2.2964848594872217e-05,
"loss": 0.7447,
"step": 2140
},
{
"epoch": 0.45832329881458883,
"grad_norm": 0.19340810567094072,
"learning_rate": 2.2951409843465248e-05,
"loss": 0.7423,
"step": 2141
},
{
"epoch": 0.45853736854780447,
"grad_norm": 0.18558020866153613,
"learning_rate": 2.2937969729692902e-05,
"loss": 0.7526,
"step": 2142
},
{
"epoch": 0.45875143828102005,
"grad_norm": 0.2025393114195531,
"learning_rate": 2.292452825975911e-05,
"loss": 0.7396,
"step": 2143
},
{
"epoch": 0.45896550801423563,
"grad_norm": 0.1827442394547482,
"learning_rate": 2.2911085439868425e-05,
"loss": 0.7034,
"step": 2144
},
{
"epoch": 0.4591795777474512,
"grad_norm": 0.1925697563862605,
"learning_rate": 2.2897641276226028e-05,
"loss": 0.7147,
"step": 2145
},
{
"epoch": 0.4593936474806668,
"grad_norm": 0.19874786800278074,
"learning_rate": 2.288419577503772e-05,
"loss": 0.7111,
"step": 2146
},
{
"epoch": 0.45960771721388244,
"grad_norm": 0.18229270314709667,
"learning_rate": 2.2870748942509928e-05,
"loss": 0.6978,
"step": 2147
},
{
"epoch": 0.459821786947098,
"grad_norm": 0.20061734811137624,
"learning_rate": 2.2857300784849672e-05,
"loss": 0.7063,
"step": 2148
},
{
"epoch": 0.4600358566803136,
"grad_norm": 0.19297151527985856,
"learning_rate": 2.2843851308264613e-05,
"loss": 0.7252,
"step": 2149
},
{
"epoch": 0.4602499264135292,
"grad_norm": 0.19265491238083898,
"learning_rate": 2.2830400518962986e-05,
"loss": 0.7352,
"step": 2150
},
{
"epoch": 0.4604639961467448,
"grad_norm": 0.20238884594899037,
"learning_rate": 2.281694842315367e-05,
"loss": 0.7201,
"step": 2151
},
{
"epoch": 0.4606780658799604,
"grad_norm": 0.21753805097857884,
"learning_rate": 2.2803495027046113e-05,
"loss": 0.7296,
"step": 2152
},
{
"epoch": 0.460892135613176,
"grad_norm": 0.21355056727719166,
"learning_rate": 2.2790040336850386e-05,
"loss": 0.6886,
"step": 2153
},
{
"epoch": 0.4611062053463916,
"grad_norm": 0.2220846866903324,
"learning_rate": 2.2776584358777143e-05,
"loss": 0.7268,
"step": 2154
},
{
"epoch": 0.46132027507960716,
"grad_norm": 0.20854007523915405,
"learning_rate": 2.2763127099037646e-05,
"loss": 0.7246,
"step": 2155
},
{
"epoch": 0.4615343448128228,
"grad_norm": 0.18591572673778503,
"learning_rate": 2.274966856384374e-05,
"loss": 0.6805,
"step": 2156
},
{
"epoch": 0.4617484145460384,
"grad_norm": 0.21008652628761115,
"learning_rate": 2.2736208759407853e-05,
"loss": 0.7286,
"step": 2157
},
{
"epoch": 0.46196248427925396,
"grad_norm": 0.17872799128498604,
"learning_rate": 2.2722747691943017e-05,
"loss": 0.7209,
"step": 2158
},
{
"epoch": 0.46217655401246954,
"grad_norm": 0.1995400707408316,
"learning_rate": 2.2709285367662828e-05,
"loss": 0.702,
"step": 2159
},
{
"epoch": 0.4623906237456852,
"grad_norm": 0.1963409303381132,
"learning_rate": 2.2695821792781474e-05,
"loss": 0.716,
"step": 2160
},
{
"epoch": 0.46260469347890076,
"grad_norm": 0.19212834799908093,
"learning_rate": 2.2682356973513714e-05,
"loss": 0.7265,
"step": 2161
},
{
"epoch": 0.46281876321211635,
"grad_norm": 0.18979304318996393,
"learning_rate": 2.2668890916074882e-05,
"loss": 0.7194,
"step": 2162
},
{
"epoch": 0.46303283294533193,
"grad_norm": 0.19390822371444827,
"learning_rate": 2.2655423626680893e-05,
"loss": 0.6937,
"step": 2163
},
{
"epoch": 0.4632469026785475,
"grad_norm": 0.19594228612862122,
"learning_rate": 2.2641955111548223e-05,
"loss": 0.7165,
"step": 2164
},
{
"epoch": 0.46346097241176315,
"grad_norm": 0.328611684424364,
"learning_rate": 2.26284853768939e-05,
"loss": 0.7529,
"step": 2165
},
{
"epoch": 0.46367504214497873,
"grad_norm": 0.18678760685275683,
"learning_rate": 2.2615014428935548e-05,
"loss": 0.7057,
"step": 2166
},
{
"epoch": 0.4638891118781943,
"grad_norm": 0.19678736007739475,
"learning_rate": 2.2601542273891317e-05,
"loss": 0.7522,
"step": 2167
},
{
"epoch": 0.4641031816114099,
"grad_norm": 0.19321796388205487,
"learning_rate": 2.2588068917979933e-05,
"loss": 0.7135,
"step": 2168
},
{
"epoch": 0.46431725134462554,
"grad_norm": 0.17534417061737678,
"learning_rate": 2.257459436742068e-05,
"loss": 0.7165,
"step": 2169
},
{
"epoch": 0.4645313210778411,
"grad_norm": 0.19223165724507915,
"learning_rate": 2.2561118628433377e-05,
"loss": 0.7146,
"step": 2170
},
{
"epoch": 0.4647453908110567,
"grad_norm": 0.19244694558663425,
"learning_rate": 2.2547641707238402e-05,
"loss": 0.7336,
"step": 2171
},
{
"epoch": 0.4649594605442723,
"grad_norm": 0.20335622665185274,
"learning_rate": 2.253416361005668e-05,
"loss": 0.7086,
"step": 2172
},
{
"epoch": 0.46517353027748787,
"grad_norm": 0.5434599869321535,
"learning_rate": 2.2520684343109675e-05,
"loss": 0.6898,
"step": 2173
},
{
"epoch": 0.4653876000107035,
"grad_norm": 0.1889701876371436,
"learning_rate": 2.2507203912619388e-05,
"loss": 0.7254,
"step": 2174
},
{
"epoch": 0.4656016697439191,
"grad_norm": 0.18927889582436427,
"learning_rate": 2.2493722324808368e-05,
"loss": 0.7061,
"step": 2175
},
{
"epoch": 0.46581573947713467,
"grad_norm": 0.19879273062813915,
"learning_rate": 2.2480239585899688e-05,
"loss": 0.7179,
"step": 2176
},
{
"epoch": 0.46602980921035025,
"grad_norm": 0.21245819806627703,
"learning_rate": 2.2466755702116957e-05,
"loss": 0.725,
"step": 2177
},
{
"epoch": 0.4662438789435659,
"grad_norm": 0.18089828361663096,
"learning_rate": 2.24532706796843e-05,
"loss": 0.6809,
"step": 2178
},
{
"epoch": 0.4664579486767815,
"grad_norm": 0.20796703517907206,
"learning_rate": 2.24397845248264e-05,
"loss": 0.7228,
"step": 2179
},
{
"epoch": 0.46667201840999706,
"grad_norm": 0.18658784740039108,
"learning_rate": 2.2426297243768423e-05,
"loss": 0.7125,
"step": 2180
},
{
"epoch": 0.46688608814321264,
"grad_norm": 0.22190722914833058,
"learning_rate": 2.2412808842736083e-05,
"loss": 0.7191,
"step": 2181
},
{
"epoch": 0.4671001578764282,
"grad_norm": 0.18321387426286206,
"learning_rate": 2.23993193279556e-05,
"loss": 0.7026,
"step": 2182
},
{
"epoch": 0.46731422760964386,
"grad_norm": 0.1930234572929713,
"learning_rate": 2.2385828705653697e-05,
"loss": 0.7133,
"step": 2183
},
{
"epoch": 0.46752829734285944,
"grad_norm": 0.21405394720514132,
"learning_rate": 2.2372336982057644e-05,
"loss": 0.7114,
"step": 2184
},
{
"epoch": 0.467742367076075,
"grad_norm": 0.18934323146710155,
"learning_rate": 2.2358844163395177e-05,
"loss": 0.7249,
"step": 2185
},
{
"epoch": 0.4679564368092906,
"grad_norm": 0.21659338425984373,
"learning_rate": 2.2345350255894563e-05,
"loss": 0.7295,
"step": 2186
},
{
"epoch": 0.46817050654250625,
"grad_norm": 0.1937781425082282,
"learning_rate": 2.2331855265784562e-05,
"loss": 0.7068,
"step": 2187
},
{
"epoch": 0.46838457627572183,
"grad_norm": 0.20862195676835077,
"learning_rate": 2.2318359199294443e-05,
"loss": 0.699,
"step": 2188
},
{
"epoch": 0.4685986460089374,
"grad_norm": 0.19356935963089814,
"learning_rate": 2.2304862062653956e-05,
"loss": 0.7364,
"step": 2189
},
{
"epoch": 0.468812715742153,
"grad_norm": 0.21522912935697439,
"learning_rate": 2.2291363862093363e-05,
"loss": 0.7386,
"step": 2190
},
{
"epoch": 0.4690267854753686,
"grad_norm": 0.20601797392840152,
"learning_rate": 2.2277864603843405e-05,
"loss": 0.7279,
"step": 2191
},
{
"epoch": 0.4692408552085842,
"grad_norm": 0.21339548740533315,
"learning_rate": 2.2264364294135316e-05,
"loss": 0.7087,
"step": 2192
},
{
"epoch": 0.4694549249417998,
"grad_norm": 0.21461038435187735,
"learning_rate": 2.2250862939200815e-05,
"loss": 0.7003,
"step": 2193
},
{
"epoch": 0.4696689946750154,
"grad_norm": 0.2063913413177893,
"learning_rate": 2.22373605452721e-05,
"loss": 0.7064,
"step": 2194
},
{
"epoch": 0.46988306440823097,
"grad_norm": 0.21786751809931207,
"learning_rate": 2.2223857118581856e-05,
"loss": 0.709,
"step": 2195
},
{
"epoch": 0.47009713414144655,
"grad_norm": 0.20358799992582777,
"learning_rate": 2.2210352665363234e-05,
"loss": 0.7151,
"step": 2196
},
{
"epoch": 0.4703112038746622,
"grad_norm": 0.2082103443229798,
"learning_rate": 2.219684719184987e-05,
"loss": 0.7424,
"step": 2197
},
{
"epoch": 0.47052527360787777,
"grad_norm": 0.18248657801484386,
"learning_rate": 2.2183340704275862e-05,
"loss": 0.6843,
"step": 2198
},
{
"epoch": 0.47073934334109335,
"grad_norm": 0.22323684825994702,
"learning_rate": 2.216983320887578e-05,
"loss": 0.6894,
"step": 2199
},
{
"epoch": 0.47095341307430894,
"grad_norm": 0.17675028993984335,
"learning_rate": 2.2156324711884665e-05,
"loss": 0.6725,
"step": 2200
},
{
"epoch": 0.4711674828075246,
"grad_norm": 0.22712610008087614,
"learning_rate": 2.2142815219538006e-05,
"loss": 0.7295,
"step": 2201
},
{
"epoch": 0.47138155254074016,
"grad_norm": 0.19932839798355986,
"learning_rate": 2.212930473807177e-05,
"loss": 0.7174,
"step": 2202
},
{
"epoch": 0.47159562227395574,
"grad_norm": 0.2138376063777719,
"learning_rate": 2.2115793273722363e-05,
"loss": 0.717,
"step": 2203
},
{
"epoch": 0.4718096920071713,
"grad_norm": 0.23804969504798007,
"learning_rate": 2.2102280832726644e-05,
"loss": 0.7129,
"step": 2204
},
{
"epoch": 0.4720237617403869,
"grad_norm": 0.19630526223385558,
"learning_rate": 2.208876742132195e-05,
"loss": 0.7186,
"step": 2205
},
{
"epoch": 0.47223783147360254,
"grad_norm": 0.23860936524030293,
"learning_rate": 2.207525304574604e-05,
"loss": 0.6855,
"step": 2206
},
{
"epoch": 0.4724519012068181,
"grad_norm": 0.18635130539411032,
"learning_rate": 2.206173771223712e-05,
"loss": 0.722,
"step": 2207
},
{
"epoch": 0.4726659709400337,
"grad_norm": 0.22061354178715856,
"learning_rate": 2.204822142703385e-05,
"loss": 0.7414,
"step": 2208
},
{
"epoch": 0.4728800406732493,
"grad_norm": 0.17771494882280253,
"learning_rate": 2.2034704196375314e-05,
"loss": 0.7262,
"step": 2209
},
{
"epoch": 0.47309411040646493,
"grad_norm": 0.22177052502399974,
"learning_rate": 2.2021186026501042e-05,
"loss": 0.7394,
"step": 2210
},
{
"epoch": 0.4733081801396805,
"grad_norm": 0.2836119350005059,
"learning_rate": 2.2007666923651007e-05,
"loss": 0.7142,
"step": 2211
},
{
"epoch": 0.4735222498728961,
"grad_norm": 0.20142207277353355,
"learning_rate": 2.1994146894065596e-05,
"loss": 0.7011,
"step": 2212
},
{
"epoch": 0.4737363196061117,
"grad_norm": 0.2195095836282597,
"learning_rate": 2.198062594398562e-05,
"loss": 0.6977,
"step": 2213
},
{
"epoch": 0.47395038933932726,
"grad_norm": 0.1843272108922569,
"learning_rate": 2.1967104079652342e-05,
"loss": 0.6985,
"step": 2214
},
{
"epoch": 0.4741644590725429,
"grad_norm": 0.21025139701485807,
"learning_rate": 2.195358130730742e-05,
"loss": 0.7259,
"step": 2215
},
{
"epoch": 0.4743785288057585,
"grad_norm": 0.1955009279018932,
"learning_rate": 2.1940057633192943e-05,
"loss": 0.6931,
"step": 2216
},
{
"epoch": 0.47459259853897406,
"grad_norm": 0.21209494188514003,
"learning_rate": 2.192653306355141e-05,
"loss": 0.7059,
"step": 2217
},
{
"epoch": 0.47480666827218965,
"grad_norm": 0.1913984807616019,
"learning_rate": 2.1913007604625746e-05,
"loss": 0.7035,
"step": 2218
},
{
"epoch": 0.4750207380054053,
"grad_norm": 0.20272947480292364,
"learning_rate": 2.1899481262659273e-05,
"loss": 0.7242,
"step": 2219
},
{
"epoch": 0.47523480773862087,
"grad_norm": 0.18602166579461094,
"learning_rate": 2.188595404389572e-05,
"loss": 0.7084,
"step": 2220
},
{
"epoch": 0.47544887747183645,
"grad_norm": 0.19229797224622588,
"learning_rate": 2.1872425954579243e-05,
"loss": 0.7243,
"step": 2221
},
{
"epoch": 0.47566294720505203,
"grad_norm": 0.19772756152008628,
"learning_rate": 2.185889700095437e-05,
"loss": 0.7004,
"step": 2222
},
{
"epoch": 0.4758770169382676,
"grad_norm": 0.18776691774748028,
"learning_rate": 2.184536718926604e-05,
"loss": 0.7127,
"step": 2223
},
{
"epoch": 0.47609108667148325,
"grad_norm": 0.19986289483928601,
"learning_rate": 2.1831836525759596e-05,
"loss": 0.7008,
"step": 2224
},
{
"epoch": 0.47630515640469884,
"grad_norm": 0.17887994580565705,
"learning_rate": 2.1818305016680767e-05,
"loss": 0.706,
"step": 2225
},
{
"epoch": 0.4765192261379144,
"grad_norm": 0.20062621179748724,
"learning_rate": 2.1804772668275668e-05,
"loss": 0.7085,
"step": 2226
},
{
"epoch": 0.47673329587113,
"grad_norm": 0.1816060001069782,
"learning_rate": 2.179123948679081e-05,
"loss": 0.6828,
"step": 2227
},
{
"epoch": 0.47694736560434564,
"grad_norm": 0.20607887352035112,
"learning_rate": 2.177770547847309e-05,
"loss": 0.7298,
"step": 2228
},
{
"epoch": 0.4771614353375612,
"grad_norm": 0.18395325230022824,
"learning_rate": 2.1764170649569766e-05,
"loss": 0.7487,
"step": 2229
},
{
"epoch": 0.4773755050707768,
"grad_norm": 0.32829652909794255,
"learning_rate": 2.1750635006328506e-05,
"loss": 0.7182,
"step": 2230
},
{
"epoch": 0.4775895748039924,
"grad_norm": 0.18739509457265122,
"learning_rate": 2.1737098554997322e-05,
"loss": 0.726,
"step": 2231
},
{
"epoch": 0.47780364453720797,
"grad_norm": 0.1899002120746162,
"learning_rate": 2.1723561301824632e-05,
"loss": 0.7011,
"step": 2232
},
{
"epoch": 0.4780177142704236,
"grad_norm": 0.19690319467732328,
"learning_rate": 2.17100232530592e-05,
"loss": 0.6923,
"step": 2233
},
{
"epoch": 0.4782317840036392,
"grad_norm": 0.19066691168538474,
"learning_rate": 2.1696484414950166e-05,
"loss": 0.6811,
"step": 2234
},
{
"epoch": 0.4784458537368548,
"grad_norm": 0.17528084411290765,
"learning_rate": 2.1682944793747032e-05,
"loss": 0.7091,
"step": 2235
},
{
"epoch": 0.47865992347007036,
"grad_norm": 0.18537712986273827,
"learning_rate": 2.1669404395699658e-05,
"loss": 0.7102,
"step": 2236
},
{
"epoch": 0.478873993203286,
"grad_norm": 0.17236799719458346,
"learning_rate": 2.1655863227058273e-05,
"loss": 0.7119,
"step": 2237
},
{
"epoch": 0.4790880629365016,
"grad_norm": 0.19124950022431447,
"learning_rate": 2.1642321294073456e-05,
"loss": 0.7088,
"step": 2238
},
{
"epoch": 0.47930213266971716,
"grad_norm": 0.17534095309799322,
"learning_rate": 2.1628778602996133e-05,
"loss": 0.7063,
"step": 2239
},
{
"epoch": 0.47951620240293275,
"grad_norm": 0.1760151295715762,
"learning_rate": 2.1615235160077594e-05,
"loss": 0.6914,
"step": 2240
},
{
"epoch": 0.47973027213614833,
"grad_norm": 0.2018822797808939,
"learning_rate": 2.160169097156945e-05,
"loss": 0.7299,
"step": 2241
},
{
"epoch": 0.47994434186936397,
"grad_norm": 0.17174582192430415,
"learning_rate": 2.158814604372369e-05,
"loss": 0.7198,
"step": 2242
},
{
"epoch": 0.48015841160257955,
"grad_norm": 0.20404193653752453,
"learning_rate": 2.157460038279263e-05,
"loss": 0.6987,
"step": 2243
},
{
"epoch": 0.48037248133579513,
"grad_norm": 0.2022425540689897,
"learning_rate": 2.1561053995028916e-05,
"loss": 0.7465,
"step": 2244
},
{
"epoch": 0.4805865510690107,
"grad_norm": 0.20972648040376374,
"learning_rate": 2.154750688668553e-05,
"loss": 0.7049,
"step": 2245
},
{
"epoch": 0.48080062080222635,
"grad_norm": 0.17349165190856564,
"learning_rate": 2.1533959064015798e-05,
"loss": 0.707,
"step": 2246
},
{
"epoch": 0.48101469053544194,
"grad_norm": 0.19821909126638199,
"learning_rate": 2.1520410533273372e-05,
"loss": 0.719,
"step": 2247
},
{
"epoch": 0.4812287602686575,
"grad_norm": 0.18990298742636988,
"learning_rate": 2.1506861300712223e-05,
"loss": 0.6977,
"step": 2248
},
{
"epoch": 0.4814428300018731,
"grad_norm": 0.19449175179036848,
"learning_rate": 2.149331137258666e-05,
"loss": 0.7216,
"step": 2249
},
{
"epoch": 0.4816568997350887,
"grad_norm": 0.1810279416218013,
"learning_rate": 2.1479760755151304e-05,
"loss": 0.7056,
"step": 2250
},
{
"epoch": 0.4818709694683043,
"grad_norm": 0.19002562059362754,
"learning_rate": 2.1466209454661088e-05,
"loss": 0.7318,
"step": 2251
},
{
"epoch": 0.4820850392015199,
"grad_norm": 0.1836020226690981,
"learning_rate": 2.1452657477371267e-05,
"loss": 0.7032,
"step": 2252
},
{
"epoch": 0.4822991089347355,
"grad_norm": 0.1842724046109878,
"learning_rate": 2.143910482953742e-05,
"loss": 0.7042,
"step": 2253
},
{
"epoch": 0.48251317866795107,
"grad_norm": 0.18971483550570742,
"learning_rate": 2.142555151741542e-05,
"loss": 0.7135,
"step": 2254
},
{
"epoch": 0.48272724840116665,
"grad_norm": 0.19163180476199815,
"learning_rate": 2.1411997547261444e-05,
"loss": 0.6964,
"step": 2255
},
{
"epoch": 0.4829413181343823,
"grad_norm": 0.18818854261034648,
"learning_rate": 2.139844292533199e-05,
"loss": 0.7184,
"step": 2256
},
{
"epoch": 0.4831553878675979,
"grad_norm": 0.19510093910806356,
"learning_rate": 2.1384887657883836e-05,
"loss": 0.7217,
"step": 2257
},
{
"epoch": 0.48336945760081346,
"grad_norm": 0.19945217780349028,
"learning_rate": 2.1371331751174074e-05,
"loss": 0.7079,
"step": 2258
},
{
"epoch": 0.48358352733402904,
"grad_norm": 0.20874140469478644,
"learning_rate": 2.1357775211460087e-05,
"loss": 0.6922,
"step": 2259
},
{
"epoch": 0.4837975970672447,
"grad_norm": 0.20908910227409855,
"learning_rate": 2.1344218044999554e-05,
"loss": 0.7048,
"step": 2260
},
{
"epoch": 0.48401166680046026,
"grad_norm": 0.20475202083773375,
"learning_rate": 2.1330660258050427e-05,
"loss": 0.7144,
"step": 2261
},
{
"epoch": 0.48422573653367584,
"grad_norm": 0.2515990189295899,
"learning_rate": 2.131710185687096e-05,
"loss": 0.7029,
"step": 2262
},
{
"epoch": 0.4844398062668914,
"grad_norm": 0.19377934490853652,
"learning_rate": 2.130354284771969e-05,
"loss": 0.6908,
"step": 2263
},
{
"epoch": 0.484653876000107,
"grad_norm": 0.2038222499424192,
"learning_rate": 2.1289983236855428e-05,
"loss": 0.7045,
"step": 2264
},
{
"epoch": 0.48486794573332265,
"grad_norm": 0.19998878810257695,
"learning_rate": 2.127642303053726e-05,
"loss": 0.716,
"step": 2265
},
{
"epoch": 0.48508201546653823,
"grad_norm": 0.20571719169026145,
"learning_rate": 2.1262862235024567e-05,
"loss": 0.715,
"step": 2266
},
{
"epoch": 0.4852960851997538,
"grad_norm": 0.19014981487799368,
"learning_rate": 2.1249300856576972e-05,
"loss": 0.7337,
"step": 2267
},
{
"epoch": 0.4855101549329694,
"grad_norm": 0.20788350079763343,
"learning_rate": 2.1235738901454385e-05,
"loss": 0.6961,
"step": 2268
},
{
"epoch": 0.48572422466618503,
"grad_norm": 0.21612197002966596,
"learning_rate": 2.122217637591699e-05,
"loss": 0.7449,
"step": 2269
},
{
"epoch": 0.4859382943994006,
"grad_norm": 0.19972069186062108,
"learning_rate": 2.1208613286225214e-05,
"loss": 0.7216,
"step": 2270
},
{
"epoch": 0.4861523641326162,
"grad_norm": 0.18881056958815423,
"learning_rate": 2.119504963863976e-05,
"loss": 0.7087,
"step": 2271
},
{
"epoch": 0.4863664338658318,
"grad_norm": 0.1925151607037651,
"learning_rate": 2.118148543942158e-05,
"loss": 0.7383,
"step": 2272
},
{
"epoch": 0.48658050359904736,
"grad_norm": 0.195449876371769,
"learning_rate": 2.1167920694831876e-05,
"loss": 0.7107,
"step": 2273
},
{
"epoch": 0.486794573332263,
"grad_norm": 0.20005252445025037,
"learning_rate": 2.1154355411132122e-05,
"loss": 0.6841,
"step": 2274
},
{
"epoch": 0.4870086430654786,
"grad_norm": 0.18416558455168575,
"learning_rate": 2.114078959458403e-05,
"loss": 0.7289,
"step": 2275
},
{
"epoch": 0.48722271279869417,
"grad_norm": 0.20309618756159076,
"learning_rate": 2.1127223251449543e-05,
"loss": 0.6937,
"step": 2276
},
{
"epoch": 0.48743678253190975,
"grad_norm": 0.18878437985187504,
"learning_rate": 2.111365638799087e-05,
"loss": 0.7074,
"step": 2277
},
{
"epoch": 0.4876508522651254,
"grad_norm": 0.20122807405960974,
"learning_rate": 2.110008901047044e-05,
"loss": 0.6767,
"step": 2278
},
{
"epoch": 0.487864921998341,
"grad_norm": 0.1869339865653749,
"learning_rate": 2.108652112515094e-05,
"loss": 0.7267,
"step": 2279
},
{
"epoch": 0.48807899173155656,
"grad_norm": 0.18685701646559502,
"learning_rate": 2.1072952738295284e-05,
"loss": 0.7064,
"step": 2280
},
{
"epoch": 0.48829306146477214,
"grad_norm": 0.18757687185499403,
"learning_rate": 2.1059383856166602e-05,
"loss": 0.7112,
"step": 2281
},
{
"epoch": 0.4885071311979877,
"grad_norm": 0.18045589237477888,
"learning_rate": 2.104581448502827e-05,
"loss": 0.7032,
"step": 2282
},
{
"epoch": 0.48872120093120336,
"grad_norm": 0.2243567008518612,
"learning_rate": 2.103224463114389e-05,
"loss": 0.711,
"step": 2283
},
{
"epoch": 0.48893527066441894,
"grad_norm": 0.17785672479561243,
"learning_rate": 2.1018674300777274e-05,
"loss": 0.6939,
"step": 2284
},
{
"epoch": 0.4891493403976345,
"grad_norm": 0.18966677078905494,
"learning_rate": 2.100510350019247e-05,
"loss": 0.7088,
"step": 2285
},
{
"epoch": 0.4893634101308501,
"grad_norm": 0.19997803944261977,
"learning_rate": 2.099153223565373e-05,
"loss": 0.6697,
"step": 2286
},
{
"epoch": 0.48957747986406575,
"grad_norm": 0.18593898857181565,
"learning_rate": 2.0977960513425523e-05,
"loss": 0.7045,
"step": 2287
},
{
"epoch": 0.48979154959728133,
"grad_norm": 0.18443672398784963,
"learning_rate": 2.096438833977253e-05,
"loss": 0.7163,
"step": 2288
},
{
"epoch": 0.4900056193304969,
"grad_norm": 0.18905170424942327,
"learning_rate": 2.095081572095965e-05,
"loss": 0.6901,
"step": 2289
},
{
"epoch": 0.4902196890637125,
"grad_norm": 0.18465422435741757,
"learning_rate": 2.093724266325197e-05,
"loss": 0.7215,
"step": 2290
},
{
"epoch": 0.4904337587969281,
"grad_norm": 0.24225053244823289,
"learning_rate": 2.0923669172914796e-05,
"loss": 0.7064,
"step": 2291
},
{
"epoch": 0.4906478285301437,
"grad_norm": 0.20772301045923566,
"learning_rate": 2.0910095256213624e-05,
"loss": 0.6744,
"step": 2292
},
{
"epoch": 0.4908618982633593,
"grad_norm": 0.21870061034685065,
"learning_rate": 2.0896520919414142e-05,
"loss": 0.7351,
"step": 2293
},
{
"epoch": 0.4910759679965749,
"grad_norm": 0.20715028862976584,
"learning_rate": 2.0882946168782247e-05,
"loss": 0.6835,
"step": 2294
},
{
"epoch": 0.49129003772979046,
"grad_norm": 0.20582083098621823,
"learning_rate": 2.0869371010584017e-05,
"loss": 0.7072,
"step": 2295
},
{
"epoch": 0.4915041074630061,
"grad_norm": 0.2380438839202891,
"learning_rate": 2.085579545108572e-05,
"loss": 0.7236,
"step": 2296
},
{
"epoch": 0.4917181771962217,
"grad_norm": 0.2050310270181436,
"learning_rate": 2.0842219496553808e-05,
"loss": 0.7367,
"step": 2297
},
{
"epoch": 0.49193224692943727,
"grad_norm": 0.20857720028133456,
"learning_rate": 2.0828643153254918e-05,
"loss": 0.7256,
"step": 2298
},
{
"epoch": 0.49214631666265285,
"grad_norm": 0.1983561651271981,
"learning_rate": 2.081506642745587e-05,
"loss": 0.6879,
"step": 2299
},
{
"epoch": 0.49236038639586843,
"grad_norm": 0.27889800775756773,
"learning_rate": 2.0801489325423642e-05,
"loss": 0.7288,
"step": 2300
},
{
"epoch": 0.49257445612908407,
"grad_norm": 0.21067068280653148,
"learning_rate": 2.0787911853425418e-05,
"loss": 0.7299,
"step": 2301
},
{
"epoch": 0.49278852586229965,
"grad_norm": 0.1901153136259381,
"learning_rate": 2.077433401772852e-05,
"loss": 0.7047,
"step": 2302
},
{
"epoch": 0.49300259559551524,
"grad_norm": 0.20697270328001657,
"learning_rate": 2.0760755824600462e-05,
"loss": 0.7041,
"step": 2303
},
{
"epoch": 0.4932166653287308,
"grad_norm": 0.18261106983665978,
"learning_rate": 2.0747177280308895e-05,
"loss": 0.7081,
"step": 2304
},
{
"epoch": 0.4934307350619464,
"grad_norm": 0.18367551658853998,
"learning_rate": 2.073359839112168e-05,
"loss": 0.6817,
"step": 2305
},
{
"epoch": 0.49364480479516204,
"grad_norm": 0.19832615632886225,
"learning_rate": 2.072001916330678e-05,
"loss": 0.7102,
"step": 2306
},
{
"epoch": 0.4938588745283776,
"grad_norm": 0.21393930601194572,
"learning_rate": 2.0706439603132357e-05,
"loss": 0.732,
"step": 2307
},
{
"epoch": 0.4940729442615932,
"grad_norm": 0.1784007718620903,
"learning_rate": 2.069285971686671e-05,
"loss": 0.7249,
"step": 2308
},
{
"epoch": 0.4942870139948088,
"grad_norm": 0.21697739711195274,
"learning_rate": 2.067927951077828e-05,
"loss": 0.7148,
"step": 2309
},
{
"epoch": 0.4945010837280244,
"grad_norm": 0.1983332192416552,
"learning_rate": 2.0665698991135666e-05,
"loss": 0.7147,
"step": 2310
},
{
"epoch": 0.49471515346124,
"grad_norm": 0.19218358718375153,
"learning_rate": 2.0652118164207624e-05,
"loss": 0.721,
"step": 2311
},
{
"epoch": 0.4949292231944556,
"grad_norm": 0.19910159640157518,
"learning_rate": 2.0638537036263032e-05,
"loss": 0.7113,
"step": 2312
},
{
"epoch": 0.4951432929276712,
"grad_norm": 0.19599306878955114,
"learning_rate": 2.062495561357091e-05,
"loss": 0.7084,
"step": 2313
},
{
"epoch": 0.49535736266088676,
"grad_norm": 0.20615678706193064,
"learning_rate": 2.061137390240042e-05,
"loss": 0.7087,
"step": 2314
},
{
"epoch": 0.4955714323941024,
"grad_norm": 0.18882100983035943,
"learning_rate": 2.059779190902085e-05,
"loss": 0.7557,
"step": 2315
},
{
"epoch": 0.495785502127318,
"grad_norm": 0.28029007840889947,
"learning_rate": 2.0584209639701643e-05,
"loss": 0.6984,
"step": 2316
},
{
"epoch": 0.49599957186053356,
"grad_norm": 0.20861673098181618,
"learning_rate": 2.057062710071233e-05,
"loss": 0.7229,
"step": 2317
},
{
"epoch": 0.49621364159374914,
"grad_norm": 0.19678723848133275,
"learning_rate": 2.055704429832259e-05,
"loss": 0.6991,
"step": 2318
},
{
"epoch": 0.4964277113269648,
"grad_norm": 0.18357941824676727,
"learning_rate": 2.0543461238802224e-05,
"loss": 0.7254,
"step": 2319
},
{
"epoch": 0.49664178106018037,
"grad_norm": 0.184084335224731,
"learning_rate": 2.0529877928421136e-05,
"loss": 0.685,
"step": 2320
},
{
"epoch": 0.49685585079339595,
"grad_norm": 0.1814205878044885,
"learning_rate": 2.0516294373449378e-05,
"loss": 0.7062,
"step": 2321
},
{
"epoch": 0.49706992052661153,
"grad_norm": 0.1831755680322796,
"learning_rate": 2.050271058015708e-05,
"loss": 0.7133,
"step": 2322
},
{
"epoch": 0.4972839902598271,
"grad_norm": 0.17480465497898204,
"learning_rate": 2.0489126554814493e-05,
"loss": 0.6761,
"step": 2323
},
{
"epoch": 0.49749805999304275,
"grad_norm": 0.1951662502268692,
"learning_rate": 2.047554230369199e-05,
"loss": 0.7183,
"step": 2324
},
{
"epoch": 0.49771212972625833,
"grad_norm": 0.17915017788063614,
"learning_rate": 2.0461957833060025e-05,
"loss": 0.6992,
"step": 2325
},
{
"epoch": 0.4979261994594739,
"grad_norm": 0.18937104512906497,
"learning_rate": 2.0448373149189172e-05,
"loss": 0.7005,
"step": 2326
},
{
"epoch": 0.4981402691926895,
"grad_norm": 0.19891811608452778,
"learning_rate": 2.0434788258350094e-05,
"loss": 0.7259,
"step": 2327
},
{
"epoch": 0.49835433892590514,
"grad_norm": 0.1741124121040168,
"learning_rate": 2.0421203166813552e-05,
"loss": 0.6881,
"step": 2328
},
{
"epoch": 0.4985684086591207,
"grad_norm": 0.20268039786935235,
"learning_rate": 2.0407617880850403e-05,
"loss": 0.706,
"step": 2329
},
{
"epoch": 0.4987824783923363,
"grad_norm": 0.17551348267840383,
"learning_rate": 2.039403240673158e-05,
"loss": 0.7259,
"step": 2330
},
{
"epoch": 0.4989965481255519,
"grad_norm": 0.2009671644300957,
"learning_rate": 2.038044675072812e-05,
"loss": 0.7202,
"step": 2331
},
{
"epoch": 0.49921061785876747,
"grad_norm": 0.17093429434405302,
"learning_rate": 2.036686091911114e-05,
"loss": 0.6964,
"step": 2332
},
{
"epoch": 0.4994246875919831,
"grad_norm": 0.191369264221559,
"learning_rate": 2.0353274918151832e-05,
"loss": 0.7355,
"step": 2333
},
{
"epoch": 0.4996387573251987,
"grad_norm": 0.18551508301521066,
"learning_rate": 2.0339688754121468e-05,
"loss": 0.7291,
"step": 2334
},
{
"epoch": 0.4998528270584143,
"grad_norm": 0.17065764579513973,
"learning_rate": 2.0326102433291387e-05,
"loss": 0.6915,
"step": 2335
},
{
"epoch": 0.5000668967916299,
"grad_norm": 0.1764580327298373,
"learning_rate": 2.031251596193303e-05,
"loss": 0.715,
"step": 2336
},
{
"epoch": 0.5002809665248454,
"grad_norm": 0.16821877466370244,
"learning_rate": 2.0298929346317876e-05,
"loss": 0.6839,
"step": 2337
},
{
"epoch": 0.500495036258061,
"grad_norm": 0.17016892606393189,
"learning_rate": 2.0285342592717483e-05,
"loss": 0.6956,
"step": 2338
},
{
"epoch": 0.5007091059912766,
"grad_norm": 0.5551613601013521,
"learning_rate": 2.0271755707403467e-05,
"loss": 0.7196,
"step": 2339
},
{
"epoch": 0.5009231757244923,
"grad_norm": 0.1637426430439625,
"learning_rate": 2.0258168696647517e-05,
"loss": 0.6909,
"step": 2340
},
{
"epoch": 0.5011372454577079,
"grad_norm": 0.16709579796395363,
"learning_rate": 2.0244581566721373e-05,
"loss": 0.6995,
"step": 2341
},
{
"epoch": 0.5013513151909235,
"grad_norm": 0.1731379826836654,
"learning_rate": 2.0230994323896817e-05,
"loss": 0.7312,
"step": 2342
},
{
"epoch": 0.501565384924139,
"grad_norm": 0.16601482400831105,
"learning_rate": 2.021740697444571e-05,
"loss": 0.6862,
"step": 2343
},
{
"epoch": 0.5017794546573546,
"grad_norm": 0.188188415154757,
"learning_rate": 2.020381952463994e-05,
"loss": 0.7243,
"step": 2344
},
{
"epoch": 0.5019935243905702,
"grad_norm": 0.17006152360939655,
"learning_rate": 2.019023198075145e-05,
"loss": 0.7431,
"step": 2345
},
{
"epoch": 0.5022075941237858,
"grad_norm": 0.18321705383784387,
"learning_rate": 2.0176644349052225e-05,
"loss": 0.7106,
"step": 2346
},
{
"epoch": 0.5024216638570014,
"grad_norm": 0.17119446549725556,
"learning_rate": 2.0163056635814294e-05,
"loss": 0.7076,
"step": 2347
},
{
"epoch": 0.502635733590217,
"grad_norm": 0.1782311070606052,
"learning_rate": 2.014946884730972e-05,
"loss": 0.7,
"step": 2348
},
{
"epoch": 0.5028498033234327,
"grad_norm": 0.1783066570538049,
"learning_rate": 2.01358809898106e-05,
"loss": 0.6851,
"step": 2349
},
{
"epoch": 0.5030638730566482,
"grad_norm": 0.1747335542605542,
"learning_rate": 2.0122293069589062e-05,
"loss": 0.6973,
"step": 2350
},
{
"epoch": 0.5032779427898638,
"grad_norm": 0.17782417520246982,
"learning_rate": 2.0108705092917268e-05,
"loss": 0.7129,
"step": 2351
},
{
"epoch": 0.5034920125230794,
"grad_norm": 0.17511836005573053,
"learning_rate": 2.0095117066067398e-05,
"loss": 0.7111,
"step": 2352
},
{
"epoch": 0.503706082256295,
"grad_norm": 0.18274102767963865,
"learning_rate": 2.0081528995311666e-05,
"loss": 0.6832,
"step": 2353
},
{
"epoch": 0.5039201519895106,
"grad_norm": 0.19232256991867974,
"learning_rate": 2.0067940886922305e-05,
"loss": 0.6998,
"step": 2354
},
{
"epoch": 0.5041342217227262,
"grad_norm": 0.19133307064488894,
"learning_rate": 2.005435274717155e-05,
"loss": 0.7169,
"step": 2355
},
{
"epoch": 0.5043482914559417,
"grad_norm": 0.19299429614965022,
"learning_rate": 2.0040764582331666e-05,
"loss": 0.7222,
"step": 2356
},
{
"epoch": 0.5045623611891573,
"grad_norm": 0.20788566687208576,
"learning_rate": 2.002717639867492e-05,
"loss": 0.6984,
"step": 2357
},
{
"epoch": 0.504776430922373,
"grad_norm": 0.17868743120764646,
"learning_rate": 2.0013588202473605e-05,
"loss": 0.685,
"step": 2358
},
{
"epoch": 0.5049905006555886,
"grad_norm": 0.19885115695005554,
"learning_rate": 2e-05,
"loss": 0.7283,
"step": 2359
},
{
"epoch": 0.5052045703888042,
"grad_norm": 0.18236181566858858,
"learning_rate": 1.9986411797526395e-05,
"loss": 0.6857,
"step": 2360
},
{
"epoch": 0.5054186401220198,
"grad_norm": 0.18572831251879549,
"learning_rate": 1.9972823601325084e-05,
"loss": 0.7045,
"step": 2361
},
{
"epoch": 0.5056327098552353,
"grad_norm": 0.19423726834554206,
"learning_rate": 1.9959235417668337e-05,
"loss": 0.6945,
"step": 2362
},
{
"epoch": 0.5058467795884509,
"grad_norm": 0.16967139105862453,
"learning_rate": 1.9945647252828462e-05,
"loss": 0.6808,
"step": 2363
},
{
"epoch": 0.5060608493216665,
"grad_norm": 0.18706938255117278,
"learning_rate": 1.9932059113077705e-05,
"loss": 0.7303,
"step": 2364
},
{
"epoch": 0.5062749190548821,
"grad_norm": 0.18583122159610296,
"learning_rate": 1.9918471004688334e-05,
"loss": 0.7307,
"step": 2365
},
{
"epoch": 0.5064889887880977,
"grad_norm": 0.18808429291888643,
"learning_rate": 1.990488293393261e-05,
"loss": 0.7252,
"step": 2366
},
{
"epoch": 0.5067030585213134,
"grad_norm": 0.1726240382474707,
"learning_rate": 1.989129490708274e-05,
"loss": 0.6967,
"step": 2367
},
{
"epoch": 0.506917128254529,
"grad_norm": 0.18071632718672434,
"learning_rate": 1.9877706930410948e-05,
"loss": 0.6804,
"step": 2368
},
{
"epoch": 0.5071311979877445,
"grad_norm": 0.19357683267040604,
"learning_rate": 1.9864119010189407e-05,
"loss": 0.7231,
"step": 2369
},
{
"epoch": 0.5073452677209601,
"grad_norm": 0.17114860137255122,
"learning_rate": 1.985053115269028e-05,
"loss": 0.6918,
"step": 2370
},
{
"epoch": 0.5075593374541757,
"grad_norm": 0.2572212038646177,
"learning_rate": 1.983694336418571e-05,
"loss": 0.7033,
"step": 2371
},
{
"epoch": 0.5077734071873913,
"grad_norm": 0.18477427085141998,
"learning_rate": 1.9823355650947775e-05,
"loss": 0.7125,
"step": 2372
},
{
"epoch": 0.5079874769206069,
"grad_norm": 0.2741212601974107,
"learning_rate": 1.9809768019248557e-05,
"loss": 0.7322,
"step": 2373
},
{
"epoch": 0.5082015466538224,
"grad_norm": 0.19105586062027488,
"learning_rate": 1.9796180475360064e-05,
"loss": 0.7245,
"step": 2374
},
{
"epoch": 0.508415616387038,
"grad_norm": 0.19053957467597,
"learning_rate": 1.978259302555429e-05,
"loss": 0.7181,
"step": 2375
},
{
"epoch": 0.5086296861202537,
"grad_norm": 0.18842262608680713,
"learning_rate": 1.976900567610319e-05,
"loss": 0.7147,
"step": 2376
},
{
"epoch": 0.5088437558534693,
"grad_norm": 0.17734224520190128,
"learning_rate": 1.9755418433278633e-05,
"loss": 0.7294,
"step": 2377
},
{
"epoch": 0.5090578255866849,
"grad_norm": 0.18709617782173277,
"learning_rate": 1.9741831303352486e-05,
"loss": 0.7143,
"step": 2378
},
{
"epoch": 0.5092718953199005,
"grad_norm": 0.18088516800298257,
"learning_rate": 1.972824429259654e-05,
"loss": 0.7197,
"step": 2379
},
{
"epoch": 0.509485965053116,
"grad_norm": 0.2822352214414661,
"learning_rate": 1.9714657407282527e-05,
"loss": 0.6969,
"step": 2380
},
{
"epoch": 0.5097000347863316,
"grad_norm": 0.1851516478757197,
"learning_rate": 1.970107065368213e-05,
"loss": 0.7339,
"step": 2381
},
{
"epoch": 0.5099141045195472,
"grad_norm": 0.19218223083206867,
"learning_rate": 1.9687484038066976e-05,
"loss": 0.6844,
"step": 2382
},
{
"epoch": 0.5101281742527628,
"grad_norm": 0.17603925537645054,
"learning_rate": 1.9673897566708616e-05,
"loss": 0.6866,
"step": 2383
},
{
"epoch": 0.5103422439859784,
"grad_norm": 0.19739203187639282,
"learning_rate": 1.9660311245878542e-05,
"loss": 0.6979,
"step": 2384
},
{
"epoch": 0.5105563137191941,
"grad_norm": 0.19193124684619162,
"learning_rate": 1.9646725081848178e-05,
"loss": 0.7023,
"step": 2385
},
{
"epoch": 0.5107703834524097,
"grad_norm": 0.17646203306384048,
"learning_rate": 1.9633139080888865e-05,
"loss": 0.701,
"step": 2386
},
{
"epoch": 0.5109844531856252,
"grad_norm": 0.19494243832438965,
"learning_rate": 1.9619553249271882e-05,
"loss": 0.701,
"step": 2387
},
{
"epoch": 0.5111985229188408,
"grad_norm": 0.17101539925619594,
"learning_rate": 1.9605967593268427e-05,
"loss": 0.7008,
"step": 2388
},
{
"epoch": 0.5114125926520564,
"grad_norm": 0.19285906469546862,
"learning_rate": 1.9592382119149604e-05,
"loss": 0.7182,
"step": 2389
},
{
"epoch": 0.511626662385272,
"grad_norm": 0.17375748525740428,
"learning_rate": 1.9578796833186458e-05,
"loss": 0.6884,
"step": 2390
},
{
"epoch": 0.5118407321184876,
"grad_norm": 0.18760868495860536,
"learning_rate": 1.9565211741649913e-05,
"loss": 0.7014,
"step": 2391
},
{
"epoch": 0.5120548018517032,
"grad_norm": 0.17339425461333224,
"learning_rate": 1.9551626850810828e-05,
"loss": 0.717,
"step": 2392
},
{
"epoch": 0.5122688715849187,
"grad_norm": 0.1951409833582809,
"learning_rate": 1.9538042166939982e-05,
"loss": 0.7159,
"step": 2393
},
{
"epoch": 0.5124829413181344,
"grad_norm": 0.17417982450498132,
"learning_rate": 1.9524457696308017e-05,
"loss": 0.7204,
"step": 2394
},
{
"epoch": 0.51269701105135,
"grad_norm": 0.17865834090670113,
"learning_rate": 1.9510873445185514e-05,
"loss": 0.7355,
"step": 2395
},
{
"epoch": 0.5129110807845656,
"grad_norm": 0.17343813112653667,
"learning_rate": 1.949728941984293e-05,
"loss": 0.7189,
"step": 2396
},
{
"epoch": 0.5131251505177812,
"grad_norm": 0.1731638562684162,
"learning_rate": 1.9483705626550625e-05,
"loss": 0.7224,
"step": 2397
},
{
"epoch": 0.5133392202509968,
"grad_norm": 0.17229895338371198,
"learning_rate": 1.9470122071578867e-05,
"loss": 0.7087,
"step": 2398
},
{
"epoch": 0.5135532899842123,
"grad_norm": 0.1701206650462555,
"learning_rate": 1.9456538761197782e-05,
"loss": 0.718,
"step": 2399
},
{
"epoch": 0.5137673597174279,
"grad_norm": 0.17781763681545806,
"learning_rate": 1.944295570167742e-05,
"loss": 0.6886,
"step": 2400
},
{
"epoch": 0.5139814294506435,
"grad_norm": 0.1665046907054677,
"learning_rate": 1.9429372899287678e-05,
"loss": 0.6856,
"step": 2401
},
{
"epoch": 0.5141954991838591,
"grad_norm": 0.17594804152767285,
"learning_rate": 1.941579036029836e-05,
"loss": 0.6817,
"step": 2402
},
{
"epoch": 0.5144095689170748,
"grad_norm": 0.17811135886153168,
"learning_rate": 1.9402208090979152e-05,
"loss": 0.7143,
"step": 2403
},
{
"epoch": 0.5146236386502904,
"grad_norm": 0.16743693823359704,
"learning_rate": 1.9388626097599585e-05,
"loss": 0.7129,
"step": 2404
},
{
"epoch": 0.514837708383506,
"grad_norm": 0.18181922997233096,
"learning_rate": 1.9375044386429103e-05,
"loss": 0.7009,
"step": 2405
},
{
"epoch": 0.5150517781167215,
"grad_norm": 0.18291054112383048,
"learning_rate": 1.9361462963736978e-05,
"loss": 0.705,
"step": 2406
},
{
"epoch": 0.5152658478499371,
"grad_norm": 0.18050104738981726,
"learning_rate": 1.934788183579238e-05,
"loss": 0.711,
"step": 2407
},
{
"epoch": 0.5154799175831527,
"grad_norm": 0.1665601928447741,
"learning_rate": 1.933430100886434e-05,
"loss": 0.7117,
"step": 2408
},
{
"epoch": 0.5156939873163683,
"grad_norm": 0.1899409126838694,
"learning_rate": 1.9320720489221728e-05,
"loss": 0.7109,
"step": 2409
},
{
"epoch": 0.5159080570495839,
"grad_norm": 0.17591987081234786,
"learning_rate": 1.9307140283133305e-05,
"loss": 0.688,
"step": 2410
},
{
"epoch": 0.5161221267827995,
"grad_norm": 0.19600856341742107,
"learning_rate": 1.9293560396867646e-05,
"loss": 0.7295,
"step": 2411
},
{
"epoch": 0.5163361965160151,
"grad_norm": 0.18149886765818582,
"learning_rate": 1.927998083669322e-05,
"loss": 0.7116,
"step": 2412
},
{
"epoch": 0.5165502662492307,
"grad_norm": 0.21087260529720359,
"learning_rate": 1.926640160887833e-05,
"loss": 0.7403,
"step": 2413
},
{
"epoch": 0.5167643359824463,
"grad_norm": 0.1810112048437299,
"learning_rate": 1.92528227196911e-05,
"loss": 0.7074,
"step": 2414
},
{
"epoch": 0.5169784057156619,
"grad_norm": 0.1811772106321527,
"learning_rate": 1.9239244175399548e-05,
"loss": 0.7052,
"step": 2415
},
{
"epoch": 0.5171924754488775,
"grad_norm": 0.18435248533126594,
"learning_rate": 1.9225665982271483e-05,
"loss": 0.7068,
"step": 2416
},
{
"epoch": 0.5174065451820931,
"grad_norm": 0.17567521068438643,
"learning_rate": 1.9212088146574585e-05,
"loss": 0.6824,
"step": 2417
},
{
"epoch": 0.5176206149153086,
"grad_norm": 0.19517579534362361,
"learning_rate": 1.919851067457636e-05,
"loss": 0.7073,
"step": 2418
},
{
"epoch": 0.5178346846485242,
"grad_norm": 0.173956476489934,
"learning_rate": 1.918493357254414e-05,
"loss": 0.7255,
"step": 2419
},
{
"epoch": 0.5180487543817398,
"grad_norm": 0.19263265915578986,
"learning_rate": 1.9171356846745085e-05,
"loss": 0.701,
"step": 2420
},
{
"epoch": 0.5182628241149555,
"grad_norm": 0.1771915668668789,
"learning_rate": 1.91577805034462e-05,
"loss": 0.6748,
"step": 2421
},
{
"epoch": 0.5184768938481711,
"grad_norm": 0.1762121557303122,
"learning_rate": 1.914420454891429e-05,
"loss": 0.7138,
"step": 2422
},
{
"epoch": 0.5186909635813867,
"grad_norm": 0.17280975868188017,
"learning_rate": 1.913062898941599e-05,
"loss": 0.695,
"step": 2423
},
{
"epoch": 0.5189050333146022,
"grad_norm": 0.16191149556216228,
"learning_rate": 1.911705383121776e-05,
"loss": 0.6652,
"step": 2424
},
{
"epoch": 0.5191191030478178,
"grad_norm": 0.18077192547039,
"learning_rate": 1.9103479080585868e-05,
"loss": 0.7123,
"step": 2425
},
{
"epoch": 0.5193331727810334,
"grad_norm": 0.16549281930608375,
"learning_rate": 1.9089904743786383e-05,
"loss": 0.7075,
"step": 2426
},
{
"epoch": 0.519547242514249,
"grad_norm": 0.17844717983546715,
"learning_rate": 1.9076330827085214e-05,
"loss": 0.7379,
"step": 2427
},
{
"epoch": 0.5197613122474646,
"grad_norm": 0.17288216637910517,
"learning_rate": 1.9062757336748034e-05,
"loss": 0.672,
"step": 2428
},
{
"epoch": 0.5199753819806802,
"grad_norm": 0.17782160290731314,
"learning_rate": 1.9049184279040354e-05,
"loss": 0.7098,
"step": 2429
},
{
"epoch": 0.5201894517138959,
"grad_norm": 0.17626367258519576,
"learning_rate": 1.9035611660227476e-05,
"loss": 0.6914,
"step": 2430
},
{
"epoch": 0.5204035214471114,
"grad_norm": 0.1766203569094171,
"learning_rate": 1.9022039486574484e-05,
"loss": 0.6912,
"step": 2431
},
{
"epoch": 0.520617591180327,
"grad_norm": 0.16746964356194322,
"learning_rate": 1.900846776434628e-05,
"loss": 0.684,
"step": 2432
},
{
"epoch": 0.5208316609135426,
"grad_norm": 0.17332127773247646,
"learning_rate": 1.8994896499807534e-05,
"loss": 0.6955,
"step": 2433
},
{
"epoch": 0.5210457306467582,
"grad_norm": 0.1854011416630557,
"learning_rate": 1.8981325699222726e-05,
"loss": 0.6895,
"step": 2434
},
{
"epoch": 0.5212598003799738,
"grad_norm": 0.16844781247433485,
"learning_rate": 1.8967755368856118e-05,
"loss": 0.6974,
"step": 2435
},
{
"epoch": 0.5214738701131894,
"grad_norm": 0.17433800797431942,
"learning_rate": 1.8954185514971733e-05,
"loss": 0.7107,
"step": 2436
},
{
"epoch": 0.5216879398464049,
"grad_norm": 0.16382371927723935,
"learning_rate": 1.8940616143833408e-05,
"loss": 0.6465,
"step": 2437
},
{
"epoch": 0.5219020095796205,
"grad_norm": 0.16725203105456918,
"learning_rate": 1.8927047261704723e-05,
"loss": 0.7091,
"step": 2438
},
{
"epoch": 0.5221160793128361,
"grad_norm": 0.17193985324127656,
"learning_rate": 1.891347887484906e-05,
"loss": 0.7233,
"step": 2439
},
{
"epoch": 0.5223301490460518,
"grad_norm": 0.17006006807904392,
"learning_rate": 1.8899910989529567e-05,
"loss": 0.7243,
"step": 2440
},
{
"epoch": 0.5225442187792674,
"grad_norm": 0.17500226871993238,
"learning_rate": 1.8886343612009138e-05,
"loss": 0.7263,
"step": 2441
},
{
"epoch": 0.522758288512483,
"grad_norm": 0.1693394120196878,
"learning_rate": 1.8872776748550467e-05,
"loss": 0.738,
"step": 2442
},
{
"epoch": 0.5229723582456985,
"grad_norm": 0.17241829192872127,
"learning_rate": 1.8859210405415977e-05,
"loss": 0.718,
"step": 2443
},
{
"epoch": 0.5231864279789141,
"grad_norm": 0.16618010241416503,
"learning_rate": 1.8845644588867878e-05,
"loss": 0.6773,
"step": 2444
},
{
"epoch": 0.5234004977121297,
"grad_norm": 0.18077908505361287,
"learning_rate": 1.883207930516813e-05,
"loss": 0.7226,
"step": 2445
},
{
"epoch": 0.5236145674453453,
"grad_norm": 0.17848471786808595,
"learning_rate": 1.881851456057843e-05,
"loss": 0.7342,
"step": 2446
},
{
"epoch": 0.5238286371785609,
"grad_norm": 0.180198603087818,
"learning_rate": 1.880495036136025e-05,
"loss": 0.7081,
"step": 2447
},
{
"epoch": 0.5240427069117765,
"grad_norm": 0.18003694994380037,
"learning_rate": 1.8791386713774793e-05,
"loss": 0.6921,
"step": 2448
},
{
"epoch": 0.5242567766449922,
"grad_norm": 0.18435628537362891,
"learning_rate": 1.8777823624083014e-05,
"loss": 0.6965,
"step": 2449
},
{
"epoch": 0.5244708463782077,
"grad_norm": 0.1822405965776421,
"learning_rate": 1.876426109854562e-05,
"loss": 0.7269,
"step": 2450
},
{
"epoch": 0.5246849161114233,
"grad_norm": 0.1791863797660689,
"learning_rate": 1.8750699143423034e-05,
"loss": 0.7164,
"step": 2451
},
{
"epoch": 0.5248989858446389,
"grad_norm": 0.18568060559199578,
"learning_rate": 1.8737137764975446e-05,
"loss": 0.7336,
"step": 2452
},
{
"epoch": 0.5251130555778545,
"grad_norm": 0.1740890741721583,
"learning_rate": 1.8723576969462743e-05,
"loss": 0.6711,
"step": 2453
},
{
"epoch": 0.5253271253110701,
"grad_norm": 0.18135128756071045,
"learning_rate": 1.8710016763144575e-05,
"loss": 0.7166,
"step": 2454
},
{
"epoch": 0.5255411950442856,
"grad_norm": 0.1771530160603431,
"learning_rate": 1.8696457152280317e-05,
"loss": 0.6871,
"step": 2455
},
{
"epoch": 0.5257552647775012,
"grad_norm": 0.17283987479795945,
"learning_rate": 1.8682898143129044e-05,
"loss": 0.7281,
"step": 2456
},
{
"epoch": 0.5259693345107168,
"grad_norm": 0.17865973348635084,
"learning_rate": 1.8669339741949577e-05,
"loss": 0.7419,
"step": 2457
},
{
"epoch": 0.5261834042439325,
"grad_norm": 0.1860920436859969,
"learning_rate": 1.8655781955000452e-05,
"loss": 0.6974,
"step": 2458
},
{
"epoch": 0.5263974739771481,
"grad_norm": 0.18102381730441347,
"learning_rate": 1.864222478853991e-05,
"loss": 0.6928,
"step": 2459
},
{
"epoch": 0.5266115437103637,
"grad_norm": 0.18503940273221842,
"learning_rate": 1.8628668248825933e-05,
"loss": 0.7144,
"step": 2460
},
{
"epoch": 0.5268256134435793,
"grad_norm": 0.16673122364110576,
"learning_rate": 1.861511234211617e-05,
"loss": 0.7049,
"step": 2461
},
{
"epoch": 0.5270396831767948,
"grad_norm": 0.17903155801938797,
"learning_rate": 1.8601557074668018e-05,
"loss": 0.6836,
"step": 2462
},
{
"epoch": 0.5272537529100104,
"grad_norm": 0.1696083896982828,
"learning_rate": 1.8588002452738562e-05,
"loss": 0.7278,
"step": 2463
},
{
"epoch": 0.527467822643226,
"grad_norm": 0.18973019059171162,
"learning_rate": 1.857444848258459e-05,
"loss": 0.7013,
"step": 2464
},
{
"epoch": 0.5276818923764416,
"grad_norm": 0.17079550591332204,
"learning_rate": 1.8560895170462582e-05,
"loss": 0.7046,
"step": 2465
},
{
"epoch": 0.5278959621096572,
"grad_norm": 0.3044465689473129,
"learning_rate": 1.8547342522628737e-05,
"loss": 0.7344,
"step": 2466
},
{
"epoch": 0.5281100318428729,
"grad_norm": 0.16982640971868748,
"learning_rate": 1.8533790545338922e-05,
"loss": 0.7013,
"step": 2467
},
{
"epoch": 0.5283241015760884,
"grad_norm": 0.17904394406706206,
"learning_rate": 1.8520239244848703e-05,
"loss": 0.6943,
"step": 2468
},
{
"epoch": 0.528538171309304,
"grad_norm": 0.17778436166215425,
"learning_rate": 1.8506688627413348e-05,
"loss": 0.7192,
"step": 2469
},
{
"epoch": 0.5287522410425196,
"grad_norm": 0.1785608464525092,
"learning_rate": 1.849313869928778e-05,
"loss": 0.6952,
"step": 2470
},
{
"epoch": 0.5289663107757352,
"grad_norm": 0.1905136133771529,
"learning_rate": 1.847958946672663e-05,
"loss": 0.6875,
"step": 2471
},
{
"epoch": 0.5291803805089508,
"grad_norm": 0.17887130244242824,
"learning_rate": 1.8466040935984212e-05,
"loss": 0.7104,
"step": 2472
},
{
"epoch": 0.5293944502421664,
"grad_norm": 0.19150569144067042,
"learning_rate": 1.8452493113314476e-05,
"loss": 0.7318,
"step": 2473
},
{
"epoch": 0.5296085199753819,
"grad_norm": 0.18285341693850285,
"learning_rate": 1.8438946004971097e-05,
"loss": 0.7137,
"step": 2474
},
{
"epoch": 0.5298225897085975,
"grad_norm": 0.19699083379961962,
"learning_rate": 1.8425399617207374e-05,
"loss": 0.7144,
"step": 2475
},
{
"epoch": 0.5300366594418132,
"grad_norm": 0.1800731314307582,
"learning_rate": 1.8411853956276308e-05,
"loss": 0.7416,
"step": 2476
},
{
"epoch": 0.5302507291750288,
"grad_norm": 0.19874803726187476,
"learning_rate": 1.8398309028430553e-05,
"loss": 0.6787,
"step": 2477
},
{
"epoch": 0.5304647989082444,
"grad_norm": 0.1788852593300529,
"learning_rate": 1.8384764839922416e-05,
"loss": 0.7287,
"step": 2478
},
{
"epoch": 0.53067886864146,
"grad_norm": 0.19454786239054195,
"learning_rate": 1.8371221397003877e-05,
"loss": 0.7153,
"step": 2479
},
{
"epoch": 0.5308929383746755,
"grad_norm": 0.1903782778511923,
"learning_rate": 1.835767870592655e-05,
"loss": 0.7272,
"step": 2480
},
{
"epoch": 0.5311070081078911,
"grad_norm": 0.19530298402538088,
"learning_rate": 1.8344136772941726e-05,
"loss": 0.7331,
"step": 2481
},
{
"epoch": 0.5313210778411067,
"grad_norm": 0.1852973371593222,
"learning_rate": 1.833059560430035e-05,
"loss": 0.7076,
"step": 2482
},
{
"epoch": 0.5315351475743223,
"grad_norm": 0.18426494349686592,
"learning_rate": 1.831705520625297e-05,
"loss": 0.6885,
"step": 2483
},
{
"epoch": 0.5317492173075379,
"grad_norm": 0.17152523280607498,
"learning_rate": 1.8303515585049844e-05,
"loss": 0.7142,
"step": 2484
},
{
"epoch": 0.5319632870407536,
"grad_norm": 0.19455230656334577,
"learning_rate": 1.8289976746940802e-05,
"loss": 0.7088,
"step": 2485
},
{
"epoch": 0.5321773567739692,
"grad_norm": 0.17678173349173643,
"learning_rate": 1.8276438698175368e-05,
"loss": 0.6899,
"step": 2486
},
{
"epoch": 0.5323914265071847,
"grad_norm": 0.1825525586582506,
"learning_rate": 1.826290144500268e-05,
"loss": 0.7143,
"step": 2487
},
{
"epoch": 0.5326054962404003,
"grad_norm": 0.1786219845029516,
"learning_rate": 1.82493649936715e-05,
"loss": 0.6967,
"step": 2488
},
{
"epoch": 0.5328195659736159,
"grad_norm": 0.1732376213961871,
"learning_rate": 1.8235829350430244e-05,
"loss": 0.7377,
"step": 2489
},
{
"epoch": 0.5330336357068315,
"grad_norm": 0.1730755691180236,
"learning_rate": 1.822229452152692e-05,
"loss": 0.7046,
"step": 2490
},
{
"epoch": 0.5332477054400471,
"grad_norm": 0.16994482017550197,
"learning_rate": 1.820876051320919e-05,
"loss": 0.704,
"step": 2491
},
{
"epoch": 0.5334617751732627,
"grad_norm": 0.3308067603674807,
"learning_rate": 1.8195227331724335e-05,
"loss": 0.721,
"step": 2492
},
{
"epoch": 0.5336758449064782,
"grad_norm": 0.27628204100007286,
"learning_rate": 1.8181694983319237e-05,
"loss": 0.7047,
"step": 2493
},
{
"epoch": 0.5338899146396939,
"grad_norm": 0.17224375740682724,
"learning_rate": 1.816816347424041e-05,
"loss": 0.7094,
"step": 2494
},
{
"epoch": 0.5341039843729095,
"grad_norm": 0.18466956821318758,
"learning_rate": 1.815463281073396e-05,
"loss": 0.7179,
"step": 2495
},
{
"epoch": 0.5343180541061251,
"grad_norm": 0.21228352619403165,
"learning_rate": 1.814110299904563e-05,
"loss": 0.6739,
"step": 2496
},
{
"epoch": 0.5345321238393407,
"grad_norm": 0.17250103128322827,
"learning_rate": 1.8127574045420764e-05,
"loss": 0.6992,
"step": 2497
},
{
"epoch": 0.5347461935725563,
"grad_norm": 0.2009493652652341,
"learning_rate": 1.8114045956104278e-05,
"loss": 0.7052,
"step": 2498
},
{
"epoch": 0.5349602633057718,
"grad_norm": 0.17549417277338095,
"learning_rate": 1.8100518737340734e-05,
"loss": 0.6988,
"step": 2499
},
{
"epoch": 0.5351743330389874,
"grad_norm": 0.19766238626801938,
"learning_rate": 1.8086992395374258e-05,
"loss": 0.7033,
"step": 2500
},
{
"epoch": 0.535388402772203,
"grad_norm": 0.16937998352030223,
"learning_rate": 1.807346693644859e-05,
"loss": 0.7295,
"step": 2501
},
{
"epoch": 0.5356024725054186,
"grad_norm": 0.19666489403075987,
"learning_rate": 1.805994236680706e-05,
"loss": 0.6958,
"step": 2502
},
{
"epoch": 0.5358165422386343,
"grad_norm": 0.17901559768882327,
"learning_rate": 1.8046418692692587e-05,
"loss": 0.7262,
"step": 2503
},
{
"epoch": 0.5360306119718499,
"grad_norm": 0.1871379220935042,
"learning_rate": 1.8032895920347665e-05,
"loss": 0.6701,
"step": 2504
},
{
"epoch": 0.5362446817050655,
"grad_norm": 0.16836786587567446,
"learning_rate": 1.8019374056014385e-05,
"loss": 0.6858,
"step": 2505
},
{
"epoch": 0.536458751438281,
"grad_norm": 0.20209169038728814,
"learning_rate": 1.8005853105934417e-05,
"loss": 0.7006,
"step": 2506
},
{
"epoch": 0.5366728211714966,
"grad_norm": 0.17277068183601035,
"learning_rate": 1.7992333076349e-05,
"loss": 0.7348,
"step": 2507
},
{
"epoch": 0.5368868909047122,
"grad_norm": 0.1882623040160267,
"learning_rate": 1.7978813973498965e-05,
"loss": 0.7129,
"step": 2508
},
{
"epoch": 0.5371009606379278,
"grad_norm": 0.16952614277498532,
"learning_rate": 1.7965295803624696e-05,
"loss": 0.7196,
"step": 2509
},
{
"epoch": 0.5373150303711434,
"grad_norm": 0.182894354867448,
"learning_rate": 1.795177857296616e-05,
"loss": 0.7146,
"step": 2510
},
{
"epoch": 0.537529100104359,
"grad_norm": 0.17701931790939363,
"learning_rate": 1.793826228776289e-05,
"loss": 0.7213,
"step": 2511
},
{
"epoch": 0.5377431698375746,
"grad_norm": 0.18513538550607694,
"learning_rate": 1.7924746954253966e-05,
"loss": 0.6996,
"step": 2512
},
{
"epoch": 0.5379572395707902,
"grad_norm": 0.17918502491815952,
"learning_rate": 1.791123257867805e-05,
"loss": 0.7212,
"step": 2513
},
{
"epoch": 0.5381713093040058,
"grad_norm": 0.19801345031052334,
"learning_rate": 1.789771916727336e-05,
"loss": 0.7277,
"step": 2514
},
{
"epoch": 0.5383853790372214,
"grad_norm": 0.17546767434987479,
"learning_rate": 1.7884206726277647e-05,
"loss": 0.6851,
"step": 2515
},
{
"epoch": 0.538599448770437,
"grad_norm": 0.18694259453156425,
"learning_rate": 1.787069526192824e-05,
"loss": 0.6869,
"step": 2516
},
{
"epoch": 0.5388135185036526,
"grad_norm": 0.18277557945745132,
"learning_rate": 1.7857184780461997e-05,
"loss": 0.7258,
"step": 2517
},
{
"epoch": 0.5390275882368681,
"grad_norm": 0.19117307738056796,
"learning_rate": 1.7843675288115338e-05,
"loss": 0.6986,
"step": 2518
},
{
"epoch": 0.5392416579700837,
"grad_norm": 0.17177527905827172,
"learning_rate": 1.7830166791124227e-05,
"loss": 0.6963,
"step": 2519
},
{
"epoch": 0.5394557277032993,
"grad_norm": 0.1923204926492419,
"learning_rate": 1.7816659295724145e-05,
"loss": 0.7134,
"step": 2520
},
{
"epoch": 0.539669797436515,
"grad_norm": 0.1677900634007861,
"learning_rate": 1.780315280815014e-05,
"loss": 0.6986,
"step": 2521
},
{
"epoch": 0.5398838671697306,
"grad_norm": 0.1939207425932993,
"learning_rate": 1.7789647334636773e-05,
"loss": 0.6829,
"step": 2522
},
{
"epoch": 0.5400979369029462,
"grad_norm": 0.21773054703624947,
"learning_rate": 1.7776142881418147e-05,
"loss": 0.682,
"step": 2523
},
{
"epoch": 0.5403120066361617,
"grad_norm": 0.18946022567408957,
"learning_rate": 1.7762639454727905e-05,
"loss": 0.7201,
"step": 2524
},
{
"epoch": 0.5405260763693773,
"grad_norm": 0.1780260453553282,
"learning_rate": 1.774913706079919e-05,
"loss": 0.7285,
"step": 2525
},
{
"epoch": 0.5407401461025929,
"grad_norm": 0.1831236711113459,
"learning_rate": 1.7735635705864694e-05,
"loss": 0.7015,
"step": 2526
},
{
"epoch": 0.5409542158358085,
"grad_norm": 0.1947019346245163,
"learning_rate": 1.77221353961566e-05,
"loss": 0.7054,
"step": 2527
},
{
"epoch": 0.5411682855690241,
"grad_norm": 0.17259788856167682,
"learning_rate": 1.770863613790664e-05,
"loss": 0.7167,
"step": 2528
},
{
"epoch": 0.5413823553022397,
"grad_norm": 0.18353811467179074,
"learning_rate": 1.769513793734605e-05,
"loss": 0.7043,
"step": 2529
},
{
"epoch": 0.5415964250354554,
"grad_norm": 0.17218651591518727,
"learning_rate": 1.7681640800705564e-05,
"loss": 0.7058,
"step": 2530
},
{
"epoch": 0.5418104947686709,
"grad_norm": 0.18030673359814622,
"learning_rate": 1.7668144734215448e-05,
"loss": 0.6836,
"step": 2531
},
{
"epoch": 0.5420245645018865,
"grad_norm": 0.17340359072259717,
"learning_rate": 1.7654649744105447e-05,
"loss": 0.724,
"step": 2532
},
{
"epoch": 0.5422386342351021,
"grad_norm": 0.18179653224184358,
"learning_rate": 1.7641155836604826e-05,
"loss": 0.6939,
"step": 2533
},
{
"epoch": 0.5424527039683177,
"grad_norm": 0.16520294440466055,
"learning_rate": 1.7627663017942366e-05,
"loss": 0.7026,
"step": 2534
},
{
"epoch": 0.5426667737015333,
"grad_norm": 0.1941746797113647,
"learning_rate": 1.7614171294346303e-05,
"loss": 0.6831,
"step": 2535
},
{
"epoch": 0.5428808434347488,
"grad_norm": 0.1764436024799967,
"learning_rate": 1.7600680672044412e-05,
"loss": 0.7112,
"step": 2536
},
{
"epoch": 0.5430949131679644,
"grad_norm": 0.17581816549404258,
"learning_rate": 1.758719115726392e-05,
"loss": 0.6996,
"step": 2537
},
{
"epoch": 0.54330898290118,
"grad_norm": 0.19685010722831256,
"learning_rate": 1.7573702756231577e-05,
"loss": 0.6726,
"step": 2538
},
{
"epoch": 0.5435230526343957,
"grad_norm": 0.16555053117582616,
"learning_rate": 1.7560215475173607e-05,
"loss": 0.6879,
"step": 2539
},
{
"epoch": 0.5437371223676113,
"grad_norm": 0.2000040365903853,
"learning_rate": 1.75467293203157e-05,
"loss": 0.7563,
"step": 2540
},
{
"epoch": 0.5439511921008269,
"grad_norm": 0.1777753976061916,
"learning_rate": 1.753324429788305e-05,
"loss": 0.7281,
"step": 2541
},
{
"epoch": 0.5441652618340425,
"grad_norm": 0.1746452388649787,
"learning_rate": 1.751976041410032e-05,
"loss": 0.7215,
"step": 2542
},
{
"epoch": 0.544379331567258,
"grad_norm": 0.178911481431613,
"learning_rate": 1.7506277675191635e-05,
"loss": 0.696,
"step": 2543
},
{
"epoch": 0.5445934013004736,
"grad_norm": 0.16467846087276522,
"learning_rate": 1.7492796087380615e-05,
"loss": 0.708,
"step": 2544
},
{
"epoch": 0.5448074710336892,
"grad_norm": 0.18767225162608395,
"learning_rate": 1.7479315656890332e-05,
"loss": 0.7018,
"step": 2545
},
{
"epoch": 0.5450215407669048,
"grad_norm": 0.15993321565973773,
"learning_rate": 1.7465836389943327e-05,
"loss": 0.6808,
"step": 2546
},
{
"epoch": 0.5452356105001204,
"grad_norm": 0.17624590328709866,
"learning_rate": 1.74523582927616e-05,
"loss": 0.6877,
"step": 2547
},
{
"epoch": 0.545449680233336,
"grad_norm": 0.15943500774359257,
"learning_rate": 1.7438881371566633e-05,
"loss": 0.6834,
"step": 2548
},
{
"epoch": 0.5456637499665516,
"grad_norm": 0.1740527866314105,
"learning_rate": 1.7425405632579328e-05,
"loss": 0.6873,
"step": 2549
},
{
"epoch": 0.5458778196997672,
"grad_norm": 0.17160190918333196,
"learning_rate": 1.741193108202007e-05,
"loss": 0.7255,
"step": 2550
},
{
"epoch": 0.5460918894329828,
"grad_norm": 0.172909564738029,
"learning_rate": 1.739845772610869e-05,
"loss": 0.6853,
"step": 2551
},
{
"epoch": 0.5463059591661984,
"grad_norm": 0.17533365700687084,
"learning_rate": 1.738498557106446e-05,
"loss": 0.715,
"step": 2552
},
{
"epoch": 0.546520028899414,
"grad_norm": 0.1769648455907618,
"learning_rate": 1.7371514623106106e-05,
"loss": 0.6963,
"step": 2553
},
{
"epoch": 0.5467340986326296,
"grad_norm": 0.1712616404508207,
"learning_rate": 1.7358044888451787e-05,
"loss": 0.7238,
"step": 2554
},
{
"epoch": 0.5469481683658451,
"grad_norm": 0.1744943811449071,
"learning_rate": 1.734457637331911e-05,
"loss": 0.6942,
"step": 2555
},
{
"epoch": 0.5471622380990607,
"grad_norm": 0.17051226930599167,
"learning_rate": 1.7331109083925124e-05,
"loss": 0.7011,
"step": 2556
},
{
"epoch": 0.5473763078322763,
"grad_norm": 0.17398364127373958,
"learning_rate": 1.731764302648629e-05,
"loss": 0.7145,
"step": 2557
},
{
"epoch": 0.547590377565492,
"grad_norm": 0.16602616685694127,
"learning_rate": 1.7304178207218536e-05,
"loss": 0.6965,
"step": 2558
},
{
"epoch": 0.5478044472987076,
"grad_norm": 0.17487757201427861,
"learning_rate": 1.729071463233718e-05,
"loss": 0.6809,
"step": 2559
},
{
"epoch": 0.5480185170319232,
"grad_norm": 0.17426771543368935,
"learning_rate": 1.7277252308056986e-05,
"loss": 0.7233,
"step": 2560
},
{
"epoch": 0.5482325867651388,
"grad_norm": 0.1666560575533171,
"learning_rate": 1.726379124059215e-05,
"loss": 0.7168,
"step": 2561
},
{
"epoch": 0.5484466564983543,
"grad_norm": 0.18332265522648047,
"learning_rate": 1.7250331436156263e-05,
"loss": 0.6603,
"step": 2562
},
{
"epoch": 0.5486607262315699,
"grad_norm": 0.16905805750435804,
"learning_rate": 1.7236872900962364e-05,
"loss": 0.7031,
"step": 2563
},
{
"epoch": 0.5488747959647855,
"grad_norm": 0.1743030892947747,
"learning_rate": 1.722341564122286e-05,
"loss": 0.6987,
"step": 2564
},
{
"epoch": 0.5490888656980011,
"grad_norm": 0.17335530962757986,
"learning_rate": 1.7209959663149617e-05,
"loss": 0.6898,
"step": 2565
},
{
"epoch": 0.5493029354312167,
"grad_norm": 0.17775522104816577,
"learning_rate": 1.7196504972953897e-05,
"loss": 0.7169,
"step": 2566
},
{
"epoch": 0.5495170051644324,
"grad_norm": 0.17867472568886555,
"learning_rate": 1.7183051576846335e-05,
"loss": 0.6835,
"step": 2567
},
{
"epoch": 0.5497310748976479,
"grad_norm": 0.17642125315462062,
"learning_rate": 1.716959948103702e-05,
"loss": 0.6756,
"step": 2568
},
{
"epoch": 0.5499451446308635,
"grad_norm": 0.1763120193474745,
"learning_rate": 1.7156148691735394e-05,
"loss": 0.6895,
"step": 2569
},
{
"epoch": 0.5501592143640791,
"grad_norm": 0.16548942879504808,
"learning_rate": 1.7142699215150328e-05,
"loss": 0.7236,
"step": 2570
},
{
"epoch": 0.5503732840972947,
"grad_norm": 0.19127343155067736,
"learning_rate": 1.7129251057490083e-05,
"loss": 0.7185,
"step": 2571
},
{
"epoch": 0.5505873538305103,
"grad_norm": 0.1686480442065763,
"learning_rate": 1.711580422496228e-05,
"loss": 0.6881,
"step": 2572
},
{
"epoch": 0.5508014235637259,
"grad_norm": 0.1895178577464964,
"learning_rate": 1.7102358723773983e-05,
"loss": 0.7061,
"step": 2573
},
{
"epoch": 0.5510154932969414,
"grad_norm": 0.18006601362226637,
"learning_rate": 1.7088914560131582e-05,
"loss": 0.734,
"step": 2574
},
{
"epoch": 0.551229563030157,
"grad_norm": 0.18119314894926422,
"learning_rate": 1.7075471740240893e-05,
"loss": 0.7235,
"step": 2575
},
{
"epoch": 0.5514436327633727,
"grad_norm": 0.18492238624793897,
"learning_rate": 1.70620302703071e-05,
"loss": 0.6922,
"step": 2576
},
{
"epoch": 0.5516577024965883,
"grad_norm": 0.18838903479435018,
"learning_rate": 1.7048590156534752e-05,
"loss": 0.716,
"step": 2577
},
{
"epoch": 0.5518717722298039,
"grad_norm": 0.1806492899153125,
"learning_rate": 1.7035151405127793e-05,
"loss": 0.733,
"step": 2578
},
{
"epoch": 0.5520858419630195,
"grad_norm": 0.21034240583069322,
"learning_rate": 1.7021714022289508e-05,
"loss": 0.7136,
"step": 2579
},
{
"epoch": 0.552299911696235,
"grad_norm": 0.16257297871228057,
"learning_rate": 1.700827801422258e-05,
"loss": 0.7046,
"step": 2580
},
{
"epoch": 0.5525139814294506,
"grad_norm": 0.19590110820251028,
"learning_rate": 1.699484338712905e-05,
"loss": 0.7197,
"step": 2581
},
{
"epoch": 0.5527280511626662,
"grad_norm": 0.17367892373826405,
"learning_rate": 1.6981410147210305e-05,
"loss": 0.688,
"step": 2582
},
{
"epoch": 0.5529421208958818,
"grad_norm": 0.18599098998833577,
"learning_rate": 1.6967978300667112e-05,
"loss": 0.6871,
"step": 2583
},
{
"epoch": 0.5531561906290974,
"grad_norm": 0.17739346439819054,
"learning_rate": 1.6954547853699588e-05,
"loss": 0.7012,
"step": 2584
},
{
"epoch": 0.5533702603623131,
"grad_norm": 0.18558321718696819,
"learning_rate": 1.6941118812507192e-05,
"loss": 0.7022,
"step": 2585
},
{
"epoch": 0.5535843300955287,
"grad_norm": 0.1791576529627214,
"learning_rate": 1.692769118328876e-05,
"loss": 0.6882,
"step": 2586
},
{
"epoch": 0.5537983998287442,
"grad_norm": 0.17295047952605672,
"learning_rate": 1.6914264972242455e-05,
"loss": 0.7063,
"step": 2587
},
{
"epoch": 0.5540124695619598,
"grad_norm": 0.1831145624108626,
"learning_rate": 1.6900840185565788e-05,
"loss": 0.6885,
"step": 2588
},
{
"epoch": 0.5542265392951754,
"grad_norm": 0.17348722888698428,
"learning_rate": 1.6887416829455615e-05,
"loss": 0.7033,
"step": 2589
},
{
"epoch": 0.554440609028391,
"grad_norm": 0.1698163045766598,
"learning_rate": 1.687399491010814e-05,
"loss": 0.7207,
"step": 2590
},
{
"epoch": 0.5546546787616066,
"grad_norm": 0.19242643053915104,
"learning_rate": 1.686057443371889e-05,
"loss": 0.7041,
"step": 2591
},
{
"epoch": 0.5548687484948222,
"grad_norm": 0.16853852755838822,
"learning_rate": 1.684715540648273e-05,
"loss": 0.6866,
"step": 2592
},
{
"epoch": 0.5550828182280377,
"grad_norm": 0.171034883093637,
"learning_rate": 1.6833737834593874e-05,
"loss": 0.6867,
"step": 2593
},
{
"epoch": 0.5552968879612534,
"grad_norm": 0.16969581032873562,
"learning_rate": 1.6820321724245824e-05,
"loss": 0.7015,
"step": 2594
},
{
"epoch": 0.555510957694469,
"grad_norm": 0.17473056491589362,
"learning_rate": 1.6806907081631458e-05,
"loss": 0.6934,
"step": 2595
},
{
"epoch": 0.5557250274276846,
"grad_norm": 0.165962716375049,
"learning_rate": 1.6793493912942927e-05,
"loss": 0.7182,
"step": 2596
},
{
"epoch": 0.5559390971609002,
"grad_norm": 0.1632106660159136,
"learning_rate": 1.678008222437174e-05,
"loss": 0.6909,
"step": 2597
},
{
"epoch": 0.5561531668941158,
"grad_norm": 0.17493867782339634,
"learning_rate": 1.6766672022108712e-05,
"loss": 0.6894,
"step": 2598
},
{
"epoch": 0.5563672366273313,
"grad_norm": 0.15575514788163156,
"learning_rate": 1.6753263312343948e-05,
"loss": 0.6832,
"step": 2599
},
{
"epoch": 0.5565813063605469,
"grad_norm": 0.18760416784486758,
"learning_rate": 1.6739856101266907e-05,
"loss": 0.6981,
"step": 2600
},
{
"epoch": 0.5567953760937625,
"grad_norm": 0.1562575177121708,
"learning_rate": 1.672645039506631e-05,
"loss": 0.6472,
"step": 2601
},
{
"epoch": 0.5570094458269781,
"grad_norm": 0.17204576736151733,
"learning_rate": 1.671304619993022e-05,
"loss": 0.7119,
"step": 2602
},
{
"epoch": 0.5572235155601938,
"grad_norm": 0.17453972682224395,
"learning_rate": 1.6699643522046e-05,
"loss": 0.6771,
"step": 2603
},
{
"epoch": 0.5574375852934094,
"grad_norm": 0.1746598260501258,
"learning_rate": 1.6686242367600272e-05,
"loss": 0.6948,
"step": 2604
},
{
"epoch": 0.557651655026625,
"grad_norm": 0.17631254753360737,
"learning_rate": 1.6672842742779013e-05,
"loss": 0.7102,
"step": 2605
},
{
"epoch": 0.5578657247598405,
"grad_norm": 0.16830374020009,
"learning_rate": 1.6659444653767448e-05,
"loss": 0.7043,
"step": 2606
},
{
"epoch": 0.5580797944930561,
"grad_norm": 0.1753334674085283,
"learning_rate": 1.6646048106750113e-05,
"loss": 0.7314,
"step": 2607
},
{
"epoch": 0.5582938642262717,
"grad_norm": 0.17564178531192245,
"learning_rate": 1.663265310791084e-05,
"loss": 0.6912,
"step": 2608
},
{
"epoch": 0.5585079339594873,
"grad_norm": 0.17138112941188524,
"learning_rate": 1.661925966343272e-05,
"loss": 0.7065,
"step": 2609
},
{
"epoch": 0.5587220036927029,
"grad_norm": 0.16152491010415923,
"learning_rate": 1.6605867779498163e-05,
"loss": 0.6636,
"step": 2610
},
{
"epoch": 0.5589360734259184,
"grad_norm": 0.17239711920107642,
"learning_rate": 1.6592477462288812e-05,
"loss": 0.6798,
"step": 2611
},
{
"epoch": 0.5591501431591341,
"grad_norm": 0.1779910584696381,
"learning_rate": 1.6579088717985627e-05,
"loss": 0.6925,
"step": 2612
},
{
"epoch": 0.5593642128923497,
"grad_norm": 0.17606720772471096,
"learning_rate": 1.656570155276884e-05,
"loss": 0.7108,
"step": 2613
},
{
"epoch": 0.5595782826255653,
"grad_norm": 0.1680597618729834,
"learning_rate": 1.6552315972817918e-05,
"loss": 0.6745,
"step": 2614
},
{
"epoch": 0.5597923523587809,
"grad_norm": 0.158575936679841,
"learning_rate": 1.653893198431164e-05,
"loss": 0.6854,
"step": 2615
},
{
"epoch": 0.5600064220919965,
"grad_norm": 0.18133138312663244,
"learning_rate": 1.6525549593428017e-05,
"loss": 0.6982,
"step": 2616
},
{
"epoch": 0.560220491825212,
"grad_norm": 0.16530422178113097,
"learning_rate": 1.6512168806344337e-05,
"loss": 0.702,
"step": 2617
},
{
"epoch": 0.5604345615584276,
"grad_norm": 0.17993673783077352,
"learning_rate": 1.6498789629237163e-05,
"loss": 0.7126,
"step": 2618
},
{
"epoch": 0.5606486312916432,
"grad_norm": 0.17628390841629152,
"learning_rate": 1.648541206828228e-05,
"loss": 0.6928,
"step": 2619
},
{
"epoch": 0.5608627010248588,
"grad_norm": 0.1644657133198672,
"learning_rate": 1.6472036129654757e-05,
"loss": 0.6985,
"step": 2620
},
{
"epoch": 0.5610767707580745,
"grad_norm": 0.19042556331669824,
"learning_rate": 1.645866181952889e-05,
"loss": 0.6843,
"step": 2621
},
{
"epoch": 0.5612908404912901,
"grad_norm": 0.16715886766047117,
"learning_rate": 1.6445289144078244e-05,
"loss": 0.6817,
"step": 2622
},
{
"epoch": 0.5615049102245057,
"grad_norm": 0.18078979944140733,
"learning_rate": 1.6431918109475634e-05,
"loss": 0.6983,
"step": 2623
},
{
"epoch": 0.5617189799577212,
"grad_norm": 0.16609052584351036,
"learning_rate": 1.6418548721893082e-05,
"loss": 0.7092,
"step": 2624
},
{
"epoch": 0.5619330496909368,
"grad_norm": 0.17347879809534777,
"learning_rate": 1.6405180987501888e-05,
"loss": 0.6927,
"step": 2625
},
{
"epoch": 0.5621471194241524,
"grad_norm": 0.17637115524575825,
"learning_rate": 1.639181491247257e-05,
"loss": 0.7165,
"step": 2626
},
{
"epoch": 0.562361189157368,
"grad_norm": 0.1628063507107413,
"learning_rate": 1.6378450502974882e-05,
"loss": 0.7085,
"step": 2627
},
{
"epoch": 0.5625752588905836,
"grad_norm": 0.1802493672573469,
"learning_rate": 1.6365087765177812e-05,
"loss": 0.7201,
"step": 2628
},
{
"epoch": 0.5627893286237992,
"grad_norm": 0.16543058426564483,
"learning_rate": 1.635172670524958e-05,
"loss": 0.6728,
"step": 2629
},
{
"epoch": 0.5630033983570149,
"grad_norm": 0.17102790589571118,
"learning_rate": 1.633836732935762e-05,
"loss": 0.6901,
"step": 2630
},
{
"epoch": 0.5632174680902304,
"grad_norm": 0.1730735339285674,
"learning_rate": 1.6325009643668592e-05,
"loss": 0.7195,
"step": 2631
},
{
"epoch": 0.563431537823446,
"grad_norm": 0.17142689767208405,
"learning_rate": 1.6311653654348395e-05,
"loss": 0.6971,
"step": 2632
},
{
"epoch": 0.5636456075566616,
"grad_norm": 0.16529788965033945,
"learning_rate": 1.6298299367562114e-05,
"loss": 0.6927,
"step": 2633
},
{
"epoch": 0.5638596772898772,
"grad_norm": 0.18094645166343623,
"learning_rate": 1.6284946789474066e-05,
"loss": 0.6952,
"step": 2634
},
{
"epoch": 0.5640737470230928,
"grad_norm": 0.17062995761548738,
"learning_rate": 1.627159592624779e-05,
"loss": 0.6907,
"step": 2635
},
{
"epoch": 0.5642878167563083,
"grad_norm": 0.17924096999215827,
"learning_rate": 1.6258246784045994e-05,
"loss": 0.6946,
"step": 2636
},
{
"epoch": 0.5645018864895239,
"grad_norm": 0.17725927375921324,
"learning_rate": 1.6244899369030647e-05,
"loss": 0.6766,
"step": 2637
},
{
"epoch": 0.5647159562227395,
"grad_norm": 0.1792954319475049,
"learning_rate": 1.623155368736287e-05,
"loss": 0.7019,
"step": 2638
},
{
"epoch": 0.5649300259559552,
"grad_norm": 0.17573567680536856,
"learning_rate": 1.621820974520301e-05,
"loss": 0.6866,
"step": 2639
},
{
"epoch": 0.5651440956891708,
"grad_norm": 0.17827040307814296,
"learning_rate": 1.6204867548710618e-05,
"loss": 0.7164,
"step": 2640
},
{
"epoch": 0.5653581654223864,
"grad_norm": 0.1823574922633907,
"learning_rate": 1.6191527104044407e-05,
"loss": 0.6762,
"step": 2641
},
{
"epoch": 0.565572235155602,
"grad_norm": 0.17409638027917007,
"learning_rate": 1.6178188417362326e-05,
"loss": 0.6839,
"step": 2642
},
{
"epoch": 0.5657863048888175,
"grad_norm": 0.17981781509710126,
"learning_rate": 1.6164851494821463e-05,
"loss": 0.7023,
"step": 2643
},
{
"epoch": 0.5660003746220331,
"grad_norm": 0.17012733163495317,
"learning_rate": 1.6151516342578132e-05,
"loss": 0.6899,
"step": 2644
},
{
"epoch": 0.5662144443552487,
"grad_norm": 0.16773900252249282,
"learning_rate": 1.6138182966787822e-05,
"loss": 0.6975,
"step": 2645
},
{
"epoch": 0.5664285140884643,
"grad_norm": 0.1597815913860277,
"learning_rate": 1.6124851373605174e-05,
"loss": 0.6907,
"step": 2646
},
{
"epoch": 0.5666425838216799,
"grad_norm": 0.2552136725855513,
"learning_rate": 1.6111521569184047e-05,
"loss": 0.7129,
"step": 2647
},
{
"epoch": 0.5668566535548956,
"grad_norm": 0.16858063026817874,
"learning_rate": 1.609819355967744e-05,
"loss": 0.6801,
"step": 2648
},
{
"epoch": 0.5670707232881111,
"grad_norm": 0.25403964328041073,
"learning_rate": 1.6084867351237538e-05,
"loss": 0.721,
"step": 2649
},
{
"epoch": 0.5672847930213267,
"grad_norm": 0.168614949343269,
"learning_rate": 1.6071542950015713e-05,
"loss": 0.7157,
"step": 2650
},
{
"epoch": 0.5674988627545423,
"grad_norm": 0.1697398856475484,
"learning_rate": 1.605822036216246e-05,
"loss": 0.7337,
"step": 2651
},
{
"epoch": 0.5677129324877579,
"grad_norm": 0.16600190026545894,
"learning_rate": 1.604489959382748e-05,
"loss": 0.695,
"step": 2652
},
{
"epoch": 0.5679270022209735,
"grad_norm": 0.1757455401999051,
"learning_rate": 1.60315806511596e-05,
"loss": 0.7007,
"step": 2653
},
{
"epoch": 0.5681410719541891,
"grad_norm": 0.1656177914157262,
"learning_rate": 1.6018263540306827e-05,
"loss": 0.6779,
"step": 2654
},
{
"epoch": 0.5683551416874046,
"grad_norm": 0.1704239067581863,
"learning_rate": 1.6004948267416326e-05,
"loss": 0.6823,
"step": 2655
},
{
"epoch": 0.5685692114206202,
"grad_norm": 8.912496396978549,
"learning_rate": 1.599163483863438e-05,
"loss": 0.7275,
"step": 2656
},
{
"epoch": 0.5687832811538358,
"grad_norm": 0.1855300823528869,
"learning_rate": 1.5978323260106463e-05,
"loss": 0.6995,
"step": 2657
},
{
"epoch": 0.5689973508870515,
"grad_norm": 0.16269820285223507,
"learning_rate": 1.596501353797716e-05,
"loss": 0.6987,
"step": 2658
},
{
"epoch": 0.5692114206202671,
"grad_norm": 0.23223472693764194,
"learning_rate": 1.595170567839022e-05,
"loss": 0.7094,
"step": 2659
},
{
"epoch": 0.5694254903534827,
"grad_norm": 0.17086118303863418,
"learning_rate": 1.5938399687488536e-05,
"loss": 0.7075,
"step": 2660
},
{
"epoch": 0.5696395600866982,
"grad_norm": 0.18525578051426128,
"learning_rate": 1.5925095571414116e-05,
"loss": 0.6659,
"step": 2661
},
{
"epoch": 0.5698536298199138,
"grad_norm": 0.1697856717238186,
"learning_rate": 1.5911793336308126e-05,
"loss": 0.6949,
"step": 2662
},
{
"epoch": 0.5700676995531294,
"grad_norm": 0.17264177768579375,
"learning_rate": 1.589849298831084e-05,
"loss": 0.7093,
"step": 2663
},
{
"epoch": 0.570281769286345,
"grad_norm": 0.17456447740166398,
"learning_rate": 1.5885194533561688e-05,
"loss": 0.6846,
"step": 2664
},
{
"epoch": 0.5704958390195606,
"grad_norm": 0.1660651117548356,
"learning_rate": 1.5871897978199213e-05,
"loss": 0.7231,
"step": 2665
},
{
"epoch": 0.5707099087527762,
"grad_norm": 0.16830182730304524,
"learning_rate": 1.5858603328361062e-05,
"loss": 0.6791,
"step": 2666
},
{
"epoch": 0.5709239784859919,
"grad_norm": 0.17009361623168645,
"learning_rate": 1.584531059018404e-05,
"loss": 0.6895,
"step": 2667
},
{
"epoch": 0.5711380482192074,
"grad_norm": 0.1636000034657648,
"learning_rate": 1.5832019769804046e-05,
"loss": 0.6814,
"step": 2668
},
{
"epoch": 0.571352117952423,
"grad_norm": 0.16430727377195042,
"learning_rate": 1.5818730873356096e-05,
"loss": 0.7202,
"step": 2669
},
{
"epoch": 0.5715661876856386,
"grad_norm": 0.1751136044026341,
"learning_rate": 1.580544390697431e-05,
"loss": 0.6912,
"step": 2670
},
{
"epoch": 0.5717802574188542,
"grad_norm": 0.16778141925927165,
"learning_rate": 1.579215887679195e-05,
"loss": 0.7356,
"step": 2671
},
{
"epoch": 0.5719943271520698,
"grad_norm": 0.1748980080116598,
"learning_rate": 1.5778875788941348e-05,
"loss": 0.6911,
"step": 2672
},
{
"epoch": 0.5722083968852854,
"grad_norm": 0.16807866825292672,
"learning_rate": 1.576559464955395e-05,
"loss": 0.7164,
"step": 2673
},
{
"epoch": 0.5724224666185009,
"grad_norm": 0.22042166108926137,
"learning_rate": 1.5752315464760316e-05,
"loss": 0.7545,
"step": 2674
},
{
"epoch": 0.5726365363517165,
"grad_norm": 0.16454085108082656,
"learning_rate": 1.5739038240690084e-05,
"loss": 0.6824,
"step": 2675
},
{
"epoch": 0.5728506060849322,
"grad_norm": 0.17524623135800574,
"learning_rate": 1.5725762983472e-05,
"loss": 0.7217,
"step": 2676
},
{
"epoch": 0.5730646758181478,
"grad_norm": 0.17727082057636445,
"learning_rate": 1.5712489699233903e-05,
"loss": 0.7177,
"step": 2677
},
{
"epoch": 0.5732787455513634,
"grad_norm": 0.1756032630367442,
"learning_rate": 1.5699218394102705e-05,
"loss": 0.7081,
"step": 2678
},
{
"epoch": 0.573492815284579,
"grad_norm": 0.18349669376757125,
"learning_rate": 1.5685949074204436e-05,
"loss": 0.7097,
"step": 2679
},
{
"epoch": 0.5737068850177945,
"grad_norm": 0.17347757811077608,
"learning_rate": 1.5672681745664162e-05,
"loss": 0.7062,
"step": 2680
},
{
"epoch": 0.5739209547510101,
"grad_norm": 0.17591544246657548,
"learning_rate": 1.5659416414606075e-05,
"loss": 0.6869,
"step": 2681
},
{
"epoch": 0.5741350244842257,
"grad_norm": 0.1808924848994917,
"learning_rate": 1.5646153087153437e-05,
"loss": 0.6883,
"step": 2682
},
{
"epoch": 0.5743490942174413,
"grad_norm": 0.17121195399124406,
"learning_rate": 1.5632891769428554e-05,
"loss": 0.6745,
"step": 2683
},
{
"epoch": 0.5745631639506569,
"grad_norm": 0.1769316701709479,
"learning_rate": 1.5619632467552842e-05,
"loss": 0.701,
"step": 2684
},
{
"epoch": 0.5747772336838726,
"grad_norm": 0.1722387619036292,
"learning_rate": 1.5606375187646755e-05,
"loss": 0.7072,
"step": 2685
},
{
"epoch": 0.5749913034170882,
"grad_norm": 0.2063465496069471,
"learning_rate": 1.5593119935829844e-05,
"loss": 0.7169,
"step": 2686
},
{
"epoch": 0.5752053731503037,
"grad_norm": 0.16092048876232048,
"learning_rate": 1.557986671822071e-05,
"loss": 0.6831,
"step": 2687
},
{
"epoch": 0.5754194428835193,
"grad_norm": 0.17672007983765237,
"learning_rate": 1.5566615540936994e-05,
"loss": 0.6648,
"step": 2688
},
{
"epoch": 0.5756335126167349,
"grad_norm": 0.16833732076961716,
"learning_rate": 1.5553366410095443e-05,
"loss": 0.6987,
"step": 2689
},
{
"epoch": 0.5758475823499505,
"grad_norm": 0.16961713857908878,
"learning_rate": 1.5540119331811807e-05,
"loss": 0.7092,
"step": 2690
},
{
"epoch": 0.5760616520831661,
"grad_norm": 0.1694528421671422,
"learning_rate": 1.5526874312200923e-05,
"loss": 0.7058,
"step": 2691
},
{
"epoch": 0.5762757218163816,
"grad_norm": 0.17170558961931925,
"learning_rate": 1.5513631357376677e-05,
"loss": 0.682,
"step": 2692
},
{
"epoch": 0.5764897915495972,
"grad_norm": 0.16871501912314427,
"learning_rate": 1.5500390473451977e-05,
"loss": 0.6945,
"step": 2693
},
{
"epoch": 0.5767038612828129,
"grad_norm": 0.17504256833279142,
"learning_rate": 1.54871516665388e-05,
"loss": 0.7149,
"step": 2694
},
{
"epoch": 0.5769179310160285,
"grad_norm": 0.1633022256535095,
"learning_rate": 1.547391494274815e-05,
"loss": 0.6902,
"step": 2695
},
{
"epoch": 0.5771320007492441,
"grad_norm": 0.1655482773500039,
"learning_rate": 1.5460680308190076e-05,
"loss": 0.713,
"step": 2696
},
{
"epoch": 0.5773460704824597,
"grad_norm": 0.16413844359398785,
"learning_rate": 1.544744776897367e-05,
"loss": 0.7046,
"step": 2697
},
{
"epoch": 0.5775601402156753,
"grad_norm": 0.1629869752071672,
"learning_rate": 1.543421733120703e-05,
"loss": 0.7139,
"step": 2698
},
{
"epoch": 0.5777742099488908,
"grad_norm": 0.15968617666908208,
"learning_rate": 1.5420989000997324e-05,
"loss": 0.6876,
"step": 2699
},
{
"epoch": 0.5779882796821064,
"grad_norm": 0.16300519938956157,
"learning_rate": 1.5407762784450705e-05,
"loss": 0.6885,
"step": 2700
},
{
"epoch": 0.578202349415322,
"grad_norm": 0.16071963520224197,
"learning_rate": 1.5394538687672372e-05,
"loss": 0.7394,
"step": 2701
},
{
"epoch": 0.5784164191485376,
"grad_norm": 0.17296846367692134,
"learning_rate": 1.5381316716766566e-05,
"loss": 0.6864,
"step": 2702
},
{
"epoch": 0.5786304888817533,
"grad_norm": 0.16966570424573013,
"learning_rate": 1.5368096877836495e-05,
"loss": 0.6946,
"step": 2703
},
{
"epoch": 0.5788445586149689,
"grad_norm": 0.16801847811970264,
"learning_rate": 1.5354879176984438e-05,
"loss": 0.7012,
"step": 2704
},
{
"epoch": 0.5790586283481844,
"grad_norm": 0.16345224853796655,
"learning_rate": 1.5341663620311638e-05,
"loss": 0.6963,
"step": 2705
},
{
"epoch": 0.5792726980814,
"grad_norm": 0.16318160860193356,
"learning_rate": 1.5328450213918388e-05,
"loss": 0.7224,
"step": 2706
},
{
"epoch": 0.5794867678146156,
"grad_norm": 0.16574054622915096,
"learning_rate": 1.531523896390397e-05,
"loss": 0.73,
"step": 2707
},
{
"epoch": 0.5797008375478312,
"grad_norm": 0.166160445983208,
"learning_rate": 1.5302029876366667e-05,
"loss": 0.7126,
"step": 2708
},
{
"epoch": 0.5799149072810468,
"grad_norm": 0.15654746520301857,
"learning_rate": 1.5288822957403775e-05,
"loss": 0.6844,
"step": 2709
},
{
"epoch": 0.5801289770142624,
"grad_norm": 0.17873582425754628,
"learning_rate": 1.527561821311158e-05,
"loss": 0.7014,
"step": 2710
},
{
"epoch": 0.5803430467474779,
"grad_norm": 0.1640892628615522,
"learning_rate": 1.5262415649585375e-05,
"loss": 0.7246,
"step": 2711
},
{
"epoch": 0.5805571164806936,
"grad_norm": 0.16525787461111266,
"learning_rate": 1.5249215272919424e-05,
"loss": 0.7177,
"step": 2712
},
{
"epoch": 0.5807711862139092,
"grad_norm": 0.1650028056548862,
"learning_rate": 1.5236017089207014e-05,
"loss": 0.6935,
"step": 2713
},
{
"epoch": 0.5809852559471248,
"grad_norm": 0.16254064172306787,
"learning_rate": 1.5222821104540393e-05,
"loss": 0.6909,
"step": 2714
},
{
"epoch": 0.5811993256803404,
"grad_norm": 0.15840833276929261,
"learning_rate": 1.52096273250108e-05,
"loss": 0.662,
"step": 2715
},
{
"epoch": 0.581413395413556,
"grad_norm": 0.16580998810550387,
"learning_rate": 1.5196435756708474e-05,
"loss": 0.6955,
"step": 2716
},
{
"epoch": 0.5816274651467715,
"grad_norm": 0.16274111320780607,
"learning_rate": 1.5183246405722603e-05,
"loss": 0.7137,
"step": 2717
},
{
"epoch": 0.5818415348799871,
"grad_norm": 0.16765261211098964,
"learning_rate": 1.5170059278141371e-05,
"loss": 0.6955,
"step": 2718
},
{
"epoch": 0.5820556046132027,
"grad_norm": 0.1702200327390562,
"learning_rate": 1.5156874380051945e-05,
"loss": 0.7213,
"step": 2719
},
{
"epoch": 0.5822696743464183,
"grad_norm": 0.1854837920569842,
"learning_rate": 1.514369171754043e-05,
"loss": 0.7008,
"step": 2720
},
{
"epoch": 0.582483744079634,
"grad_norm": 0.18219078464002583,
"learning_rate": 1.5130511296691937e-05,
"loss": 0.697,
"step": 2721
},
{
"epoch": 0.5826978138128496,
"grad_norm": 0.1788694770784065,
"learning_rate": 1.5117333123590508e-05,
"loss": 0.7069,
"step": 2722
},
{
"epoch": 0.5829118835460652,
"grad_norm": 0.1746042320161375,
"learning_rate": 1.5104157204319169e-05,
"loss": 0.7078,
"step": 2723
},
{
"epoch": 0.5831259532792807,
"grad_norm": 0.17680237787055583,
"learning_rate": 1.5090983544959914e-05,
"loss": 0.6968,
"step": 2724
},
{
"epoch": 0.5833400230124963,
"grad_norm": 0.17766671131807185,
"learning_rate": 1.5077812151593655e-05,
"loss": 0.7207,
"step": 2725
},
{
"epoch": 0.5835540927457119,
"grad_norm": 0.17263375304450626,
"learning_rate": 1.506464303030031e-05,
"loss": 0.6882,
"step": 2726
},
{
"epoch": 0.5837681624789275,
"grad_norm": 0.1773322650045857,
"learning_rate": 1.5051476187158698e-05,
"loss": 0.7228,
"step": 2727
},
{
"epoch": 0.5839822322121431,
"grad_norm": 0.16299003445156277,
"learning_rate": 1.5038311628246619e-05,
"loss": 0.7027,
"step": 2728
},
{
"epoch": 0.5841963019453587,
"grad_norm": 0.1865519783868536,
"learning_rate": 1.5025149359640816e-05,
"loss": 0.7296,
"step": 2729
},
{
"epoch": 0.5844103716785743,
"grad_norm": 0.1627341280597865,
"learning_rate": 1.5011989387416954e-05,
"loss": 0.6691,
"step": 2730
},
{
"epoch": 0.5846244414117899,
"grad_norm": 0.16657588961357114,
"learning_rate": 1.4998831717649668e-05,
"loss": 0.6908,
"step": 2731
},
{
"epoch": 0.5848385111450055,
"grad_norm": 0.20490407719349335,
"learning_rate": 1.49856763564125e-05,
"loss": 0.7106,
"step": 2732
},
{
"epoch": 0.5850525808782211,
"grad_norm": 0.16340349893692208,
"learning_rate": 1.4972523309777947e-05,
"loss": 0.7386,
"step": 2733
},
{
"epoch": 0.5852666506114367,
"grad_norm": 0.17609615496676675,
"learning_rate": 1.4959372583817438e-05,
"loss": 0.6765,
"step": 2734
},
{
"epoch": 0.5854807203446523,
"grad_norm": 0.16286203130168111,
"learning_rate": 1.4946224184601308e-05,
"loss": 0.6992,
"step": 2735
},
{
"epoch": 0.5856947900778678,
"grad_norm": 0.16676236941421238,
"learning_rate": 1.4933078118198851e-05,
"loss": 0.7078,
"step": 2736
},
{
"epoch": 0.5859088598110834,
"grad_norm": 0.15477849951635472,
"learning_rate": 1.4919934390678252e-05,
"loss": 0.6895,
"step": 2737
},
{
"epoch": 0.586122929544299,
"grad_norm": 0.1704130964499219,
"learning_rate": 1.490679300810664e-05,
"loss": 0.7007,
"step": 2738
},
{
"epoch": 0.5863369992775147,
"grad_norm": 0.15630732815757206,
"learning_rate": 1.4893653976550057e-05,
"loss": 0.6827,
"step": 2739
},
{
"epoch": 0.5865510690107303,
"grad_norm": 0.16091649855807497,
"learning_rate": 1.4880517302073447e-05,
"loss": 0.6951,
"step": 2740
},
{
"epoch": 0.5867651387439459,
"grad_norm": 0.15774658223729976,
"learning_rate": 1.4867382990740686e-05,
"loss": 0.6812,
"step": 2741
},
{
"epoch": 0.5869792084771615,
"grad_norm": 0.1646144254437179,
"learning_rate": 1.4854251048614531e-05,
"loss": 0.7227,
"step": 2742
},
{
"epoch": 0.587193278210377,
"grad_norm": 0.15827064039679917,
"learning_rate": 1.484112148175667e-05,
"loss": 0.7189,
"step": 2743
},
{
"epoch": 0.5874073479435926,
"grad_norm": 0.16587505835067617,
"learning_rate": 1.4827994296227704e-05,
"loss": 0.6933,
"step": 2744
},
{
"epoch": 0.5876214176768082,
"grad_norm": 0.1576660131843526,
"learning_rate": 1.481486949808709e-05,
"loss": 0.7036,
"step": 2745
},
{
"epoch": 0.5878354874100238,
"grad_norm": 0.1618836470440896,
"learning_rate": 1.4801747093393229e-05,
"loss": 0.7007,
"step": 2746
},
{
"epoch": 0.5880495571432394,
"grad_norm": 0.16452588982706243,
"learning_rate": 1.4788627088203394e-05,
"loss": 0.6932,
"step": 2747
},
{
"epoch": 0.5882636268764551,
"grad_norm": 0.16883568099615812,
"learning_rate": 1.4775509488573751e-05,
"loss": 0.7159,
"step": 2748
},
{
"epoch": 0.5884776966096706,
"grad_norm": 0.16325082793248,
"learning_rate": 1.4762394300559373e-05,
"loss": 0.6863,
"step": 2749
},
{
"epoch": 0.5886917663428862,
"grad_norm": 0.16526408761461683,
"learning_rate": 1.474928153021419e-05,
"loss": 0.7169,
"step": 2750
},
{
"epoch": 0.5889058360761018,
"grad_norm": 0.1689393345787796,
"learning_rate": 1.4736171183591044e-05,
"loss": 0.6992,
"step": 2751
},
{
"epoch": 0.5891199058093174,
"grad_norm": 0.5073757563284773,
"learning_rate": 1.4723063266741644e-05,
"loss": 0.7031,
"step": 2752
},
{
"epoch": 0.589333975542533,
"grad_norm": 0.16192856735154973,
"learning_rate": 1.4709957785716582e-05,
"loss": 0.6941,
"step": 2753
},
{
"epoch": 0.5895480452757486,
"grad_norm": 0.16701122262533197,
"learning_rate": 1.4696854746565316e-05,
"loss": 0.6918,
"step": 2754
},
{
"epoch": 0.5897621150089641,
"grad_norm": 0.1796473393516434,
"learning_rate": 1.4683754155336196e-05,
"loss": 0.7097,
"step": 2755
},
{
"epoch": 0.5899761847421797,
"grad_norm": 0.16193738971829216,
"learning_rate": 1.4670656018076428e-05,
"loss": 0.7207,
"step": 2756
},
{
"epoch": 0.5901902544753954,
"grad_norm": 0.17590962510084385,
"learning_rate": 1.4657560340832078e-05,
"loss": 0.6936,
"step": 2757
},
{
"epoch": 0.590404324208611,
"grad_norm": 0.30933642674275985,
"learning_rate": 1.4644467129648106e-05,
"loss": 0.7,
"step": 2758
},
{
"epoch": 0.5906183939418266,
"grad_norm": 0.178397385660487,
"learning_rate": 1.4631376390568294e-05,
"loss": 0.7173,
"step": 2759
},
{
"epoch": 0.5908324636750422,
"grad_norm": 0.19148816228214477,
"learning_rate": 1.4618288129635314e-05,
"loss": 0.6957,
"step": 2760
},
{
"epoch": 0.5910465334082577,
"grad_norm": 0.17337939676446942,
"learning_rate": 1.4605202352890691e-05,
"loss": 0.687,
"step": 2761
},
{
"epoch": 0.5912606031414733,
"grad_norm": 0.16812102940261198,
"learning_rate": 1.4592119066374775e-05,
"loss": 0.715,
"step": 2762
},
{
"epoch": 0.5914746728746889,
"grad_norm": 0.19026483539071762,
"learning_rate": 1.4579038276126806e-05,
"loss": 0.6858,
"step": 2763
},
{
"epoch": 0.5916887426079045,
"grad_norm": 0.1630638900718737,
"learning_rate": 1.456595998818484e-05,
"loss": 0.704,
"step": 2764
},
{
"epoch": 0.5919028123411201,
"grad_norm": 0.16555163385873223,
"learning_rate": 1.4552884208585796e-05,
"loss": 0.6952,
"step": 2765
},
{
"epoch": 0.5921168820743357,
"grad_norm": 0.1668396705760013,
"learning_rate": 1.4539810943365438e-05,
"loss": 0.6782,
"step": 2766
},
{
"epoch": 0.5923309518075514,
"grad_norm": 0.21188407067690096,
"learning_rate": 1.4526740198558345e-05,
"loss": 0.7056,
"step": 2767
},
{
"epoch": 0.5925450215407669,
"grad_norm": 0.1756583186916101,
"learning_rate": 1.4513671980197964e-05,
"loss": 0.7094,
"step": 2768
},
{
"epoch": 0.5927590912739825,
"grad_norm": 0.16932947661767148,
"learning_rate": 1.4500606294316545e-05,
"loss": 0.678,
"step": 2769
},
{
"epoch": 0.5929731610071981,
"grad_norm": 0.1680398258976508,
"learning_rate": 1.4487543146945196e-05,
"loss": 0.7007,
"step": 2770
},
{
"epoch": 0.5931872307404137,
"grad_norm": 0.17438446698121887,
"learning_rate": 1.4474482544113846e-05,
"loss": 0.7141,
"step": 2771
},
{
"epoch": 0.5934013004736293,
"grad_norm": 0.16077979710309473,
"learning_rate": 1.446142449185123e-05,
"loss": 0.6817,
"step": 2772
},
{
"epoch": 0.5936153702068449,
"grad_norm": 0.16452070105806216,
"learning_rate": 1.444836899618494e-05,
"loss": 0.7056,
"step": 2773
},
{
"epoch": 0.5938294399400604,
"grad_norm": 0.17937973900146656,
"learning_rate": 1.4435316063141347e-05,
"loss": 0.7024,
"step": 2774
},
{
"epoch": 0.594043509673276,
"grad_norm": 0.16968575232261435,
"learning_rate": 1.4422265698745676e-05,
"loss": 0.6885,
"step": 2775
},
{
"epoch": 0.5942575794064917,
"grad_norm": 0.16299774524171282,
"learning_rate": 1.4409217909021958e-05,
"loss": 0.7112,
"step": 2776
},
{
"epoch": 0.5944716491397073,
"grad_norm": 0.16631355701424153,
"learning_rate": 1.4396172699993004e-05,
"loss": 0.7231,
"step": 2777
},
{
"epoch": 0.5946857188729229,
"grad_norm": 0.23119997682126242,
"learning_rate": 1.4383130077680489e-05,
"loss": 0.7195,
"step": 2778
},
{
"epoch": 0.5948997886061385,
"grad_norm": 3.138037986720887,
"learning_rate": 1.4370090048104835e-05,
"loss": 0.7344,
"step": 2779
},
{
"epoch": 0.595113858339354,
"grad_norm": 0.20002142315450172,
"learning_rate": 1.435705261728531e-05,
"loss": 0.7108,
"step": 2780
},
{
"epoch": 0.5953279280725696,
"grad_norm": 0.16604958792986207,
"learning_rate": 1.4344017791239976e-05,
"loss": 0.6958,
"step": 2781
},
{
"epoch": 0.5955419978057852,
"grad_norm": 0.1700742915211355,
"learning_rate": 1.4330985575985668e-05,
"loss": 0.7043,
"step": 2782
},
{
"epoch": 0.5957560675390008,
"grad_norm": 0.1705322217778988,
"learning_rate": 1.4317955977538047e-05,
"loss": 0.7016,
"step": 2783
},
{
"epoch": 0.5959701372722164,
"grad_norm": 0.1695689380361238,
"learning_rate": 1.4304929001911538e-05,
"loss": 0.7111,
"step": 2784
},
{
"epoch": 0.5961842070054321,
"grad_norm": 0.16783008074734213,
"learning_rate": 1.4291904655119378e-05,
"loss": 0.7157,
"step": 2785
},
{
"epoch": 0.5963982767386476,
"grad_norm": 0.20520283635870903,
"learning_rate": 1.4278882943173586e-05,
"loss": 0.7137,
"step": 2786
},
{
"epoch": 0.5966123464718632,
"grad_norm": 0.18463803305822762,
"learning_rate": 1.4265863872084947e-05,
"loss": 0.6937,
"step": 2787
},
{
"epoch": 0.5968264162050788,
"grad_norm": 0.166916375414561,
"learning_rate": 1.4252847447863052e-05,
"loss": 0.679,
"step": 2788
},
{
"epoch": 0.5970404859382944,
"grad_norm": 0.16078833682187244,
"learning_rate": 1.4239833676516254e-05,
"loss": 0.7056,
"step": 2789
},
{
"epoch": 0.59725455567151,
"grad_norm": 0.16483393450802797,
"learning_rate": 1.4226822564051685e-05,
"loss": 0.7233,
"step": 2790
},
{
"epoch": 0.5974686254047256,
"grad_norm": 0.17327600889177555,
"learning_rate": 1.4213814116475253e-05,
"loss": 0.7395,
"step": 2791
},
{
"epoch": 0.5976826951379411,
"grad_norm": 0.169055947008346,
"learning_rate": 1.4200808339791636e-05,
"loss": 0.6527,
"step": 2792
},
{
"epoch": 0.5978967648711567,
"grad_norm": 0.19770003659598154,
"learning_rate": 1.418780524000427e-05,
"loss": 0.6789,
"step": 2793
},
{
"epoch": 0.5981108346043724,
"grad_norm": 0.16378956125142946,
"learning_rate": 1.4174804823115369e-05,
"loss": 0.683,
"step": 2794
},
{
"epoch": 0.598324904337588,
"grad_norm": 0.1749418451018798,
"learning_rate": 1.4161807095125898e-05,
"loss": 0.7052,
"step": 2795
},
{
"epoch": 0.5985389740708036,
"grad_norm": 0.1655426744522246,
"learning_rate": 1.4148812062035577e-05,
"loss": 0.7057,
"step": 2796
},
{
"epoch": 0.5987530438040192,
"grad_norm": 0.16307665483456588,
"learning_rate": 1.4135819729842903e-05,
"loss": 0.7071,
"step": 2797
},
{
"epoch": 0.5989671135372348,
"grad_norm": 0.1676887388934184,
"learning_rate": 1.412283010454511e-05,
"loss": 0.713,
"step": 2798
},
{
"epoch": 0.5991811832704503,
"grad_norm": 0.1707732594147557,
"learning_rate": 1.4109843192138173e-05,
"loss": 0.696,
"step": 2799
},
{
"epoch": 0.5993952530036659,
"grad_norm": 0.1592237316422158,
"learning_rate": 1.409685899861685e-05,
"loss": 0.6917,
"step": 2800
},
{
"epoch": 0.5996093227368815,
"grad_norm": 0.164504155732762,
"learning_rate": 1.4083877529974594e-05,
"loss": 0.6847,
"step": 2801
},
{
"epoch": 0.5998233924700971,
"grad_norm": 0.16961189605041485,
"learning_rate": 1.4070898792203643e-05,
"loss": 0.7063,
"step": 2802
},
{
"epoch": 0.6000374622033128,
"grad_norm": 0.16920821739088487,
"learning_rate": 1.405792279129496e-05,
"loss": 0.675,
"step": 2803
},
{
"epoch": 0.6002515319365284,
"grad_norm": 0.1639359750351902,
"learning_rate": 1.4044949533238237e-05,
"loss": 0.7148,
"step": 2804
},
{
"epoch": 0.6004656016697439,
"grad_norm": 0.16622171843395203,
"learning_rate": 1.4031979024021913e-05,
"loss": 0.6685,
"step": 2805
},
{
"epoch": 0.6006796714029595,
"grad_norm": 0.16717356812615172,
"learning_rate": 1.4019011269633138e-05,
"loss": 0.6911,
"step": 2806
},
{
"epoch": 0.6008937411361751,
"grad_norm": 0.16108208873293745,
"learning_rate": 1.4006046276057813e-05,
"loss": 0.6939,
"step": 2807
},
{
"epoch": 0.6011078108693907,
"grad_norm": 0.16295024827925517,
"learning_rate": 1.3993084049280563e-05,
"loss": 0.6948,
"step": 2808
},
{
"epoch": 0.6013218806026063,
"grad_norm": 0.15832520855149404,
"learning_rate": 1.398012459528471e-05,
"loss": 0.7063,
"step": 2809
},
{
"epoch": 0.6015359503358219,
"grad_norm": 0.16825850548728402,
"learning_rate": 1.3967167920052336e-05,
"loss": 0.7239,
"step": 2810
},
{
"epoch": 0.6017500200690374,
"grad_norm": 0.15987064419423355,
"learning_rate": 1.3954214029564195e-05,
"loss": 0.7036,
"step": 2811
},
{
"epoch": 0.6019640898022531,
"grad_norm": 0.1680518196512475,
"learning_rate": 1.394126292979979e-05,
"loss": 0.7191,
"step": 2812
},
{
"epoch": 0.6021781595354687,
"grad_norm": 0.16920494731449956,
"learning_rate": 1.3928314626737338e-05,
"loss": 0.7226,
"step": 2813
},
{
"epoch": 0.6023922292686843,
"grad_norm": 0.15814715156032466,
"learning_rate": 1.3915369126353728e-05,
"loss": 0.6922,
"step": 2814
},
{
"epoch": 0.6026062990018999,
"grad_norm": 0.18116438775219987,
"learning_rate": 1.3902426434624601e-05,
"loss": 0.7135,
"step": 2815
},
{
"epoch": 0.6028203687351155,
"grad_norm": 0.15802119074031223,
"learning_rate": 1.3889486557524258e-05,
"loss": 0.6722,
"step": 2816
},
{
"epoch": 0.603034438468331,
"grad_norm": 0.17503587907093565,
"learning_rate": 1.387654950102574e-05,
"loss": 0.7054,
"step": 2817
},
{
"epoch": 0.6032485082015466,
"grad_norm": 0.16905747935656973,
"learning_rate": 1.3863615271100767e-05,
"loss": 0.6916,
"step": 2818
},
{
"epoch": 0.6034625779347622,
"grad_norm": 0.16820107102584045,
"learning_rate": 1.3850683873719746e-05,
"loss": 0.6971,
"step": 2819
},
{
"epoch": 0.6036766476679778,
"grad_norm": 0.16442523150197488,
"learning_rate": 1.38377553148518e-05,
"loss": 0.6819,
"step": 2820
},
{
"epoch": 0.6038907174011935,
"grad_norm": 0.16279692335300602,
"learning_rate": 1.3824829600464709e-05,
"loss": 0.708,
"step": 2821
},
{
"epoch": 0.6041047871344091,
"grad_norm": 0.17084118113946872,
"learning_rate": 1.3811906736524972e-05,
"loss": 0.6849,
"step": 2822
},
{
"epoch": 0.6043188568676247,
"grad_norm": 0.16452064589010315,
"learning_rate": 1.3798986728997767e-05,
"loss": 0.702,
"step": 2823
},
{
"epoch": 0.6045329266008402,
"grad_norm": 0.16673368973665786,
"learning_rate": 1.3786069583846926e-05,
"loss": 0.6938,
"step": 2824
},
{
"epoch": 0.6047469963340558,
"grad_norm": 0.1693800605040336,
"learning_rate": 1.3773155307035002e-05,
"loss": 0.6875,
"step": 2825
},
{
"epoch": 0.6049610660672714,
"grad_norm": 0.16765783009207916,
"learning_rate": 1.376024390452318e-05,
"loss": 0.7236,
"step": 2826
},
{
"epoch": 0.605175135800487,
"grad_norm": 0.17137048506079616,
"learning_rate": 1.3747335382271345e-05,
"loss": 0.7096,
"step": 2827
},
{
"epoch": 0.6053892055337026,
"grad_norm": 0.17064714750208643,
"learning_rate": 1.3734429746238066e-05,
"loss": 0.7192,
"step": 2828
},
{
"epoch": 0.6056032752669182,
"grad_norm": 0.1780840211898227,
"learning_rate": 1.3721527002380535e-05,
"loss": 0.7098,
"step": 2829
},
{
"epoch": 0.6058173450001338,
"grad_norm": 0.1733188902182287,
"learning_rate": 1.370862715665465e-05,
"loss": 0.7256,
"step": 2830
},
{
"epoch": 0.6060314147333494,
"grad_norm": 0.16486273290790035,
"learning_rate": 1.3695730215014955e-05,
"loss": 0.7036,
"step": 2831
},
{
"epoch": 0.606245484466565,
"grad_norm": 0.17658189792895368,
"learning_rate": 1.3682836183414647e-05,
"loss": 0.679,
"step": 2832
},
{
"epoch": 0.6064595541997806,
"grad_norm": 0.16447354117646157,
"learning_rate": 1.3669945067805596e-05,
"loss": 0.6766,
"step": 2833
},
{
"epoch": 0.6066736239329962,
"grad_norm": 0.17992482061826953,
"learning_rate": 1.3657056874138315e-05,
"loss": 0.7033,
"step": 2834
},
{
"epoch": 0.6068876936662118,
"grad_norm": 0.16356972374040374,
"learning_rate": 1.364417160836197e-05,
"loss": 0.7154,
"step": 2835
},
{
"epoch": 0.6071017633994273,
"grad_norm": 0.17107500255646887,
"learning_rate": 1.3631289276424374e-05,
"loss": 0.7159,
"step": 2836
},
{
"epoch": 0.6073158331326429,
"grad_norm": 0.1785890302760153,
"learning_rate": 1.3618409884271993e-05,
"loss": 0.7215,
"step": 2837
},
{
"epoch": 0.6075299028658585,
"grad_norm": 0.16887974727734947,
"learning_rate": 1.360553343784992e-05,
"loss": 0.6903,
"step": 2838
},
{
"epoch": 0.6077439725990742,
"grad_norm": 0.17943646777936517,
"learning_rate": 1.3592659943101914e-05,
"loss": 0.6875,
"step": 2839
},
{
"epoch": 0.6079580423322898,
"grad_norm": 0.15998224774645078,
"learning_rate": 1.3579789405970347e-05,
"loss": 0.689,
"step": 2840
},
{
"epoch": 0.6081721120655054,
"grad_norm": 0.17218073051440003,
"learning_rate": 1.3566921832396234e-05,
"loss": 0.7032,
"step": 2841
},
{
"epoch": 0.608386181798721,
"grad_norm": 0.1651642459289121,
"learning_rate": 1.3554057228319236e-05,
"loss": 0.6949,
"step": 2842
},
{
"epoch": 0.6086002515319365,
"grad_norm": 0.1671213785682561,
"learning_rate": 1.354119559967761e-05,
"loss": 0.7063,
"step": 2843
},
{
"epoch": 0.6088143212651521,
"grad_norm": 0.1780644297694708,
"learning_rate": 1.3528336952408277e-05,
"loss": 0.7025,
"step": 2844
},
{
"epoch": 0.6090283909983677,
"grad_norm": 0.1680704473732953,
"learning_rate": 1.3515481292446762e-05,
"loss": 0.7032,
"step": 2845
},
{
"epoch": 0.6092424607315833,
"grad_norm": 0.17020266357413008,
"learning_rate": 1.3502628625727208e-05,
"loss": 0.6782,
"step": 2846
},
{
"epoch": 0.6094565304647989,
"grad_norm": 0.245010912957747,
"learning_rate": 1.3489778958182393e-05,
"loss": 0.7111,
"step": 2847
},
{
"epoch": 0.6096706001980146,
"grad_norm": 0.1641355606340915,
"learning_rate": 1.3476932295743685e-05,
"loss": 0.6618,
"step": 2848
},
{
"epoch": 0.6098846699312301,
"grad_norm": 0.17186124968937724,
"learning_rate": 1.3464088644341091e-05,
"loss": 0.6968,
"step": 2849
},
{
"epoch": 0.6100987396644457,
"grad_norm": 0.17715077313952188,
"learning_rate": 1.3451248009903222e-05,
"loss": 0.7078,
"step": 2850
},
{
"epoch": 0.6103128093976613,
"grad_norm": 0.15681559309428414,
"learning_rate": 1.3438410398357273e-05,
"loss": 0.676,
"step": 2851
},
{
"epoch": 0.6105268791308769,
"grad_norm": 0.1745691176010804,
"learning_rate": 1.3425575815629084e-05,
"loss": 0.6734,
"step": 2852
},
{
"epoch": 0.6107409488640925,
"grad_norm": 0.17127724325606236,
"learning_rate": 1.3412744267643051e-05,
"loss": 0.7271,
"step": 2853
},
{
"epoch": 0.610955018597308,
"grad_norm": 0.16361323942944692,
"learning_rate": 1.3399915760322211e-05,
"loss": 0.6776,
"step": 2854
},
{
"epoch": 0.6111690883305236,
"grad_norm": 0.17723093582319852,
"learning_rate": 1.338709029958818e-05,
"loss": 0.6939,
"step": 2855
},
{
"epoch": 0.6113831580637392,
"grad_norm": 0.16608857951563702,
"learning_rate": 1.337426789136115e-05,
"loss": 0.6909,
"step": 2856
},
{
"epoch": 0.6115972277969549,
"grad_norm": 0.18475664294283847,
"learning_rate": 1.3361448541559944e-05,
"loss": 0.7132,
"step": 2857
},
{
"epoch": 0.6118112975301705,
"grad_norm": 0.18309689093673312,
"learning_rate": 1.334863225610193e-05,
"loss": 0.7389,
"step": 2858
},
{
"epoch": 0.6120253672633861,
"grad_norm": 0.20160900986128505,
"learning_rate": 1.3335819040903091e-05,
"loss": 0.7195,
"step": 2859
},
{
"epoch": 0.6122394369966017,
"grad_norm": 0.18710150081023777,
"learning_rate": 1.3323008901877991e-05,
"loss": 0.6991,
"step": 2860
},
{
"epoch": 0.6124535067298172,
"grad_norm": 0.17717707834980048,
"learning_rate": 1.331020184493975e-05,
"loss": 0.7325,
"step": 2861
},
{
"epoch": 0.6126675764630328,
"grad_norm": 0.16784155215696425,
"learning_rate": 1.3297397876000103e-05,
"loss": 0.711,
"step": 2862
},
{
"epoch": 0.6128816461962484,
"grad_norm": 0.1725802521420512,
"learning_rate": 1.3284597000969314e-05,
"loss": 0.7224,
"step": 2863
},
{
"epoch": 0.613095715929464,
"grad_norm": 1.5390276737933177,
"learning_rate": 1.3271799225756259e-05,
"loss": 0.7148,
"step": 2864
},
{
"epoch": 0.6133097856626796,
"grad_norm": 0.17687346419640848,
"learning_rate": 1.3259004556268374e-05,
"loss": 0.6831,
"step": 2865
},
{
"epoch": 0.6135238553958953,
"grad_norm": 0.18585619933688044,
"learning_rate": 1.3246212998411636e-05,
"loss": 0.704,
"step": 2866
},
{
"epoch": 0.6137379251291109,
"grad_norm": 0.18105808829446995,
"learning_rate": 1.3233424558090624e-05,
"loss": 0.7002,
"step": 2867
},
{
"epoch": 0.6139519948623264,
"grad_norm": 0.17855654258899187,
"learning_rate": 1.322063924120844e-05,
"loss": 0.695,
"step": 2868
},
{
"epoch": 0.614166064595542,
"grad_norm": 0.18355776755397543,
"learning_rate": 1.3207857053666773e-05,
"loss": 0.6831,
"step": 2869
},
{
"epoch": 0.6143801343287576,
"grad_norm": 0.1902327712386683,
"learning_rate": 1.3195078001365864e-05,
"loss": 0.6831,
"step": 2870
},
{
"epoch": 0.6145942040619732,
"grad_norm": 0.17204234073682728,
"learning_rate": 1.3182302090204484e-05,
"loss": 0.6969,
"step": 2871
},
{
"epoch": 0.6148082737951888,
"grad_norm": 0.17094098670129862,
"learning_rate": 1.3169529326079984e-05,
"loss": 0.7152,
"step": 2872
},
{
"epoch": 0.6150223435284043,
"grad_norm": 0.2117693784640695,
"learning_rate": 1.3156759714888244e-05,
"loss": 0.7198,
"step": 2873
},
{
"epoch": 0.6152364132616199,
"grad_norm": 0.17009886451328768,
"learning_rate": 1.3143993262523687e-05,
"loss": 0.6989,
"step": 2874
},
{
"epoch": 0.6154504829948355,
"grad_norm": 0.1779596592890098,
"learning_rate": 1.3131229974879296e-05,
"loss": 0.6734,
"step": 2875
},
{
"epoch": 0.6156645527280512,
"grad_norm": 0.17921322382485147,
"learning_rate": 1.3118469857846571e-05,
"loss": 0.6743,
"step": 2876
},
{
"epoch": 0.6158786224612668,
"grad_norm": 0.17691893997151076,
"learning_rate": 1.3105712917315565e-05,
"loss": 0.703,
"step": 2877
},
{
"epoch": 0.6160926921944824,
"grad_norm": 0.16694540052827558,
"learning_rate": 1.3092959159174851e-05,
"loss": 0.6896,
"step": 2878
},
{
"epoch": 0.616306761927698,
"grad_norm": 0.1790079753898222,
"learning_rate": 1.3080208589311556e-05,
"loss": 0.6901,
"step": 2879
},
{
"epoch": 0.6165208316609135,
"grad_norm": 0.17878494361234837,
"learning_rate": 1.3067461213611297e-05,
"loss": 0.6518,
"step": 2880
},
{
"epoch": 0.6167349013941291,
"grad_norm": 0.18703174230007547,
"learning_rate": 1.3054717037958254e-05,
"loss": 0.7004,
"step": 2881
},
{
"epoch": 0.6169489711273447,
"grad_norm": 0.17219382822652507,
"learning_rate": 1.3041976068235118e-05,
"loss": 0.6819,
"step": 2882
},
{
"epoch": 0.6171630408605603,
"grad_norm": 0.19117048808355613,
"learning_rate": 1.3029238310323086e-05,
"loss": 0.6767,
"step": 2883
},
{
"epoch": 0.6173771105937759,
"grad_norm": 0.17508629085109303,
"learning_rate": 1.3016503770101898e-05,
"loss": 0.7224,
"step": 2884
},
{
"epoch": 0.6175911803269916,
"grad_norm": 0.18228936156076614,
"learning_rate": 1.3003772453449775e-05,
"loss": 0.6842,
"step": 2885
},
{
"epoch": 0.6178052500602071,
"grad_norm": 0.1767314759678005,
"learning_rate": 1.2991044366243482e-05,
"loss": 0.7206,
"step": 2886
},
{
"epoch": 0.6180193197934227,
"grad_norm": 0.1928813647012719,
"learning_rate": 1.2978319514358288e-05,
"loss": 0.7231,
"step": 2887
},
{
"epoch": 0.6182333895266383,
"grad_norm": 0.17987899265982177,
"learning_rate": 1.2965597903667942e-05,
"loss": 0.7104,
"step": 2888
},
{
"epoch": 0.6184474592598539,
"grad_norm": 0.17763217163885667,
"learning_rate": 1.2952879540044738e-05,
"loss": 0.6771,
"step": 2889
},
{
"epoch": 0.6186615289930695,
"grad_norm": 0.17860682345936435,
"learning_rate": 1.2940164429359427e-05,
"loss": 0.7033,
"step": 2890
},
{
"epoch": 0.6188755987262851,
"grad_norm": 0.1888543358700002,
"learning_rate": 1.2927452577481291e-05,
"loss": 0.6885,
"step": 2891
},
{
"epoch": 0.6190896684595006,
"grad_norm": 0.17864857931412279,
"learning_rate": 1.2914743990278112e-05,
"loss": 0.7343,
"step": 2892
},
{
"epoch": 0.6193037381927162,
"grad_norm": 0.172214273624621,
"learning_rate": 1.2902038673616124e-05,
"loss": 0.6857,
"step": 2893
},
{
"epoch": 0.6195178079259319,
"grad_norm": 0.18740147605806642,
"learning_rate": 1.2889336633360101e-05,
"loss": 0.7228,
"step": 2894
},
{
"epoch": 0.6197318776591475,
"grad_norm": 0.17032178588734434,
"learning_rate": 1.2876637875373263e-05,
"loss": 0.6954,
"step": 2895
},
{
"epoch": 0.6199459473923631,
"grad_norm": 0.2258489905632264,
"learning_rate": 1.2863942405517342e-05,
"loss": 0.6721,
"step": 2896
},
{
"epoch": 0.6201600171255787,
"grad_norm": 0.17617785733416969,
"learning_rate": 1.2851250229652552e-05,
"loss": 0.7157,
"step": 2897
},
{
"epoch": 0.6203740868587942,
"grad_norm": 0.18391050906383133,
"learning_rate": 1.283856135363756e-05,
"loss": 0.7062,
"step": 2898
},
{
"epoch": 0.6205881565920098,
"grad_norm": 0.1691362509596803,
"learning_rate": 1.282587578332955e-05,
"loss": 0.7037,
"step": 2899
},
{
"epoch": 0.6208022263252254,
"grad_norm": 0.18639845336118688,
"learning_rate": 1.281319352458413e-05,
"loss": 0.7055,
"step": 2900
},
{
"epoch": 0.621016296058441,
"grad_norm": 0.17607930458619322,
"learning_rate": 1.280051458325543e-05,
"loss": 0.6917,
"step": 2901
},
{
"epoch": 0.6212303657916566,
"grad_norm": 0.1738789578756451,
"learning_rate": 1.2787838965196024e-05,
"loss": 0.7173,
"step": 2902
},
{
"epoch": 0.6214444355248723,
"grad_norm": 0.1801430032805143,
"learning_rate": 1.2775166676256942e-05,
"loss": 0.7097,
"step": 2903
},
{
"epoch": 0.6216585052580879,
"grad_norm": 0.16793363576521397,
"learning_rate": 1.2762497722287705e-05,
"loss": 0.7335,
"step": 2904
},
{
"epoch": 0.6218725749913034,
"grad_norm": 0.18048841789581735,
"learning_rate": 1.2749832109136262e-05,
"loss": 0.6932,
"step": 2905
},
{
"epoch": 0.622086644724519,
"grad_norm": 0.16611617081444183,
"learning_rate": 1.2737169842649046e-05,
"loss": 0.7228,
"step": 2906
},
{
"epoch": 0.6223007144577346,
"grad_norm": 0.17939443935567653,
"learning_rate": 1.2724510928670944e-05,
"loss": 0.7221,
"step": 2907
},
{
"epoch": 0.6225147841909502,
"grad_norm": 0.1539017358556344,
"learning_rate": 1.271185537304527e-05,
"loss": 0.7059,
"step": 2908
},
{
"epoch": 0.6227288539241658,
"grad_norm": 0.17268144585332496,
"learning_rate": 1.2699203181613822e-05,
"loss": 0.6663,
"step": 2909
},
{
"epoch": 0.6229429236573814,
"grad_norm": 0.30648407676549616,
"learning_rate": 1.2686554360216814e-05,
"loss": 0.71,
"step": 2910
},
{
"epoch": 0.6231569933905969,
"grad_norm": 0.16047629951681336,
"learning_rate": 1.2673908914692925e-05,
"loss": 0.7051,
"step": 2911
},
{
"epoch": 0.6233710631238126,
"grad_norm": 0.17246879244750765,
"learning_rate": 1.2661266850879277e-05,
"loss": 0.6907,
"step": 2912
},
{
"epoch": 0.6235851328570282,
"grad_norm": 0.15638788280326743,
"learning_rate": 1.264862817461141e-05,
"loss": 0.7258,
"step": 2913
},
{
"epoch": 0.6237992025902438,
"grad_norm": 0.17085846723621334,
"learning_rate": 1.2635992891723322e-05,
"loss": 0.6643,
"step": 2914
},
{
"epoch": 0.6240132723234594,
"grad_norm": 0.16726585249472334,
"learning_rate": 1.2623361008047437e-05,
"loss": 0.6735,
"step": 2915
},
{
"epoch": 0.624227342056675,
"grad_norm": 0.16704252457088484,
"learning_rate": 1.2610732529414605e-05,
"loss": 0.6826,
"step": 2916
},
{
"epoch": 0.6244414117898905,
"grad_norm": 0.1763544082611917,
"learning_rate": 1.2598107461654111e-05,
"loss": 0.726,
"step": 2917
},
{
"epoch": 0.6246554815231061,
"grad_norm": 0.16341085542734637,
"learning_rate": 1.2585485810593665e-05,
"loss": 0.6876,
"step": 2918
},
{
"epoch": 0.6248695512563217,
"grad_norm": 0.17757472243715544,
"learning_rate": 1.2572867582059396e-05,
"loss": 0.7041,
"step": 2919
},
{
"epoch": 0.6250836209895373,
"grad_norm": 0.16358487378286496,
"learning_rate": 1.256025278187585e-05,
"loss": 0.6879,
"step": 2920
},
{
"epoch": 0.625297690722753,
"grad_norm": 0.17619618742073248,
"learning_rate": 1.254764141586601e-05,
"loss": 0.6778,
"step": 2921
},
{
"epoch": 0.6255117604559686,
"grad_norm": 0.16199914831698128,
"learning_rate": 1.2535033489851242e-05,
"loss": 0.6679,
"step": 2922
},
{
"epoch": 0.6257258301891842,
"grad_norm": 0.16538948362297123,
"learning_rate": 1.2522429009651349e-05,
"loss": 0.6995,
"step": 2923
},
{
"epoch": 0.6259398999223997,
"grad_norm": 0.17833038921739666,
"learning_rate": 1.2509827981084546e-05,
"loss": 0.6696,
"step": 2924
},
{
"epoch": 0.6261539696556153,
"grad_norm": 0.17068080819660977,
"learning_rate": 1.249723040996743e-05,
"loss": 0.7211,
"step": 2925
},
{
"epoch": 0.6263680393888309,
"grad_norm": 0.18804925573823228,
"learning_rate": 1.2484636302115027e-05,
"loss": 0.684,
"step": 2926
},
{
"epoch": 0.6265821091220465,
"grad_norm": 0.16950128218439603,
"learning_rate": 1.2472045663340744e-05,
"loss": 0.7075,
"step": 2927
},
{
"epoch": 0.6267961788552621,
"grad_norm": 0.17381394632953662,
"learning_rate": 1.2459458499456401e-05,
"loss": 0.6687,
"step": 2928
},
{
"epoch": 0.6270102485884776,
"grad_norm": 0.1596582427830223,
"learning_rate": 1.2446874816272216e-05,
"loss": 0.7093,
"step": 2929
},
{
"epoch": 0.6272243183216933,
"grad_norm": 0.18923414202347508,
"learning_rate": 1.2434294619596785e-05,
"loss": 0.7368,
"step": 2930
},
{
"epoch": 0.6274383880549089,
"grad_norm": 0.16394900734918538,
"learning_rate": 1.2421717915237114e-05,
"loss": 0.7145,
"step": 2931
},
{
"epoch": 0.6276524577881245,
"grad_norm": 0.1719139295670378,
"learning_rate": 1.2409144708998574e-05,
"loss": 0.6892,
"step": 2932
},
{
"epoch": 0.6278665275213401,
"grad_norm": 0.16953471095261508,
"learning_rate": 1.239657500668494e-05,
"loss": 0.6911,
"step": 2933
},
{
"epoch": 0.6280805972545557,
"grad_norm": 0.1710169225060896,
"learning_rate": 1.2384008814098376e-05,
"loss": 0.7124,
"step": 2934
},
{
"epoch": 0.6282946669877713,
"grad_norm": 0.17020357873357828,
"learning_rate": 1.2371446137039391e-05,
"loss": 0.692,
"step": 2935
},
{
"epoch": 0.6285087367209868,
"grad_norm": 0.15469496832129448,
"learning_rate": 1.2358886981306912e-05,
"loss": 0.6875,
"step": 2936
},
{
"epoch": 0.6287228064542024,
"grad_norm": 0.17390880002807188,
"learning_rate": 1.2346331352698206e-05,
"loss": 0.7002,
"step": 2937
},
{
"epoch": 0.628936876187418,
"grad_norm": 0.1642273622259631,
"learning_rate": 1.2333779257008937e-05,
"loss": 0.7126,
"step": 2938
},
{
"epoch": 0.6291509459206337,
"grad_norm": 0.16678568279818254,
"learning_rate": 1.232123070003314e-05,
"loss": 0.7264,
"step": 2939
},
{
"epoch": 0.6293650156538493,
"grad_norm": 0.16718697862885099,
"learning_rate": 1.2308685687563186e-05,
"loss": 0.7394,
"step": 2940
},
{
"epoch": 0.6295790853870649,
"grad_norm": 0.1524015290174883,
"learning_rate": 1.2296144225389847e-05,
"loss": 0.6749,
"step": 2941
},
{
"epoch": 0.6297931551202804,
"grad_norm": 0.1850826223936878,
"learning_rate": 1.2283606319302224e-05,
"loss": 0.7031,
"step": 2942
},
{
"epoch": 0.630007224853496,
"grad_norm": 0.2534360107397994,
"learning_rate": 1.2271071975087799e-05,
"loss": 0.6736,
"step": 2943
},
{
"epoch": 0.6302212945867116,
"grad_norm": 0.17441563231800425,
"learning_rate": 1.2258541198532407e-05,
"loss": 0.7029,
"step": 2944
},
{
"epoch": 0.6304353643199272,
"grad_norm": 0.23075740781320633,
"learning_rate": 1.2246013995420221e-05,
"loss": 0.6882,
"step": 2945
},
{
"epoch": 0.6306494340531428,
"grad_norm": 0.16999008530809995,
"learning_rate": 1.2233490371533786e-05,
"loss": 0.7079,
"step": 2946
},
{
"epoch": 0.6308635037863584,
"grad_norm": 0.2866628942833885,
"learning_rate": 1.2220970332653972e-05,
"loss": 0.716,
"step": 2947
},
{
"epoch": 0.631077573519574,
"grad_norm": 0.18768464303026294,
"learning_rate": 1.2208453884560012e-05,
"loss": 0.7176,
"step": 2948
},
{
"epoch": 0.6312916432527896,
"grad_norm": 0.16261756239700065,
"learning_rate": 1.2195941033029484e-05,
"loss": 0.7224,
"step": 2949
},
{
"epoch": 0.6315057129860052,
"grad_norm": 0.16951721449406248,
"learning_rate": 1.2183431783838281e-05,
"loss": 0.7019,
"step": 2950
},
{
"epoch": 0.6317197827192208,
"grad_norm": 0.18069610584991522,
"learning_rate": 1.2170926142760666e-05,
"loss": 0.7204,
"step": 2951
},
{
"epoch": 0.6319338524524364,
"grad_norm": 0.16441193668221984,
"learning_rate": 1.2158424115569205e-05,
"loss": 0.6933,
"step": 2952
},
{
"epoch": 0.632147922185652,
"grad_norm": 0.9628811658647918,
"learning_rate": 1.2145925708034815e-05,
"loss": 0.692,
"step": 2953
},
{
"epoch": 0.6323619919188675,
"grad_norm": 0.16391576680482073,
"learning_rate": 1.2133430925926753e-05,
"loss": 0.6858,
"step": 2954
},
{
"epoch": 0.6325760616520831,
"grad_norm": 0.17916529443289175,
"learning_rate": 1.2120939775012564e-05,
"loss": 0.6964,
"step": 2955
},
{
"epoch": 0.6327901313852987,
"grad_norm": 0.1605899428782057,
"learning_rate": 1.2108452261058156e-05,
"loss": 0.6797,
"step": 2956
},
{
"epoch": 0.6330042011185144,
"grad_norm": 0.16429996074127887,
"learning_rate": 1.2095968389827739e-05,
"loss": 0.7115,
"step": 2957
},
{
"epoch": 0.63321827085173,
"grad_norm": 0.1670154397729422,
"learning_rate": 1.2083488167083843e-05,
"loss": 0.6816,
"step": 2958
},
{
"epoch": 0.6334323405849456,
"grad_norm": 0.1666621364478631,
"learning_rate": 1.2071011598587315e-05,
"loss": 0.7101,
"step": 2959
},
{
"epoch": 0.6336464103181612,
"grad_norm": 0.16588852427192485,
"learning_rate": 1.2058538690097321e-05,
"loss": 0.6677,
"step": 2960
},
{
"epoch": 0.6338604800513767,
"grad_norm": 0.17186574899635917,
"learning_rate": 1.2046069447371332e-05,
"loss": 0.7184,
"step": 2961
},
{
"epoch": 0.6340745497845923,
"grad_norm": 0.1673490981243338,
"learning_rate": 1.203360387616512e-05,
"loss": 0.717,
"step": 2962
},
{
"epoch": 0.6342886195178079,
"grad_norm": 0.16743151579505755,
"learning_rate": 1.2021141982232785e-05,
"loss": 0.6991,
"step": 2963
},
{
"epoch": 0.6345026892510235,
"grad_norm": 0.16168715457808672,
"learning_rate": 1.2008683771326697e-05,
"loss": 0.69,
"step": 2964
},
{
"epoch": 0.6347167589842391,
"grad_norm": 0.18168044103232656,
"learning_rate": 1.199622924919755e-05,
"loss": 0.6986,
"step": 2965
},
{
"epoch": 0.6349308287174548,
"grad_norm": 0.16381373979373812,
"learning_rate": 1.1983778421594341e-05,
"loss": 0.7132,
"step": 2966
},
{
"epoch": 0.6351448984506703,
"grad_norm": 0.16316974024076497,
"learning_rate": 1.1971331294264328e-05,
"loss": 0.6968,
"step": 2967
},
{
"epoch": 0.6353589681838859,
"grad_norm": 0.16193383262435015,
"learning_rate": 1.19588878729531e-05,
"loss": 0.6855,
"step": 2968
},
{
"epoch": 0.6355730379171015,
"grad_norm": 0.15944271823824435,
"learning_rate": 1.1946448163404503e-05,
"loss": 0.6831,
"step": 2969
},
{
"epoch": 0.6357871076503171,
"grad_norm": 0.16073582155028612,
"learning_rate": 1.1934012171360692e-05,
"loss": 0.7037,
"step": 2970
},
{
"epoch": 0.6360011773835327,
"grad_norm": 0.17182927970614129,
"learning_rate": 1.1921579902562103e-05,
"loss": 0.7215,
"step": 2971
},
{
"epoch": 0.6362152471167483,
"grad_norm": 0.15885051786833096,
"learning_rate": 1.1909151362747437e-05,
"loss": 0.7016,
"step": 2972
},
{
"epoch": 0.6364293168499638,
"grad_norm": 0.18147169309209343,
"learning_rate": 1.1896726557653699e-05,
"loss": 0.7137,
"step": 2973
},
{
"epoch": 0.6366433865831794,
"grad_norm": 0.1671580602611548,
"learning_rate": 1.188430549301614e-05,
"loss": 0.6932,
"step": 2974
},
{
"epoch": 0.6368574563163951,
"grad_norm": 0.1704553591271745,
"learning_rate": 1.187188817456831e-05,
"loss": 0.6844,
"step": 2975
},
{
"epoch": 0.6370715260496107,
"grad_norm": 0.16752180285730586,
"learning_rate": 1.1859474608042025e-05,
"loss": 0.6948,
"step": 2976
},
{
"epoch": 0.6372855957828263,
"grad_norm": 0.1607453595825214,
"learning_rate": 1.1847064799167351e-05,
"loss": 0.7071,
"step": 2977
},
{
"epoch": 0.6374996655160419,
"grad_norm": 0.1601945790617932,
"learning_rate": 1.1834658753672653e-05,
"loss": 0.6875,
"step": 2978
},
{
"epoch": 0.6377137352492575,
"grad_norm": 0.16138230498616143,
"learning_rate": 1.1822256477284517e-05,
"loss": 0.7072,
"step": 2979
},
{
"epoch": 0.637927804982473,
"grad_norm": 0.15835635684726207,
"learning_rate": 1.1809857975727819e-05,
"loss": 0.6952,
"step": 2980
},
{
"epoch": 0.6381418747156886,
"grad_norm": 0.16026940319647917,
"learning_rate": 1.1797463254725696e-05,
"loss": 0.689,
"step": 2981
},
{
"epoch": 0.6383559444489042,
"grad_norm": 0.16442983720923277,
"learning_rate": 1.1785072319999513e-05,
"loss": 0.6809,
"step": 2982
},
{
"epoch": 0.6385700141821198,
"grad_norm": 0.16612737878971637,
"learning_rate": 1.1772685177268916e-05,
"loss": 0.6945,
"step": 2983
},
{
"epoch": 0.6387840839153354,
"grad_norm": 0.17809310058237487,
"learning_rate": 1.1760301832251773e-05,
"loss": 0.7226,
"step": 2984
},
{
"epoch": 0.6389981536485511,
"grad_norm": 0.16395607933048745,
"learning_rate": 1.174792229066422e-05,
"loss": 0.6691,
"step": 2985
},
{
"epoch": 0.6392122233817666,
"grad_norm": 0.1637606976705473,
"learning_rate": 1.173554655822064e-05,
"loss": 0.6909,
"step": 2986
},
{
"epoch": 0.6394262931149822,
"grad_norm": 0.163746544450305,
"learning_rate": 1.172317464063363e-05,
"loss": 0.695,
"step": 2987
},
{
"epoch": 0.6396403628481978,
"grad_norm": 0.1854282152064345,
"learning_rate": 1.1710806543614066e-05,
"loss": 0.705,
"step": 2988
},
{
"epoch": 0.6398544325814134,
"grad_norm": 0.17679673871559604,
"learning_rate": 1.1698442272871018e-05,
"loss": 0.7063,
"step": 2989
},
{
"epoch": 0.640068502314629,
"grad_norm": 0.19045542334904472,
"learning_rate": 1.168608183411182e-05,
"loss": 0.684,
"step": 2990
},
{
"epoch": 0.6402825720478446,
"grad_norm": 0.16123138168999393,
"learning_rate": 1.1673725233042033e-05,
"loss": 0.6965,
"step": 2991
},
{
"epoch": 0.6404966417810601,
"grad_norm": 0.1714294989618222,
"learning_rate": 1.166137247536543e-05,
"loss": 0.7443,
"step": 2992
},
{
"epoch": 0.6407107115142757,
"grad_norm": 0.18048674301266115,
"learning_rate": 1.1649023566784039e-05,
"loss": 0.7048,
"step": 2993
},
{
"epoch": 0.6409247812474914,
"grad_norm": 0.1560685331579729,
"learning_rate": 1.1636678512998074e-05,
"loss": 0.6938,
"step": 2994
},
{
"epoch": 0.641138850980707,
"grad_norm": 0.18494732591236804,
"learning_rate": 1.1624337319705995e-05,
"loss": 0.6826,
"step": 2995
},
{
"epoch": 0.6413529207139226,
"grad_norm": 0.16380366820778122,
"learning_rate": 1.1611999992604491e-05,
"loss": 0.7013,
"step": 2996
},
{
"epoch": 0.6415669904471382,
"grad_norm": 0.1651962122147673,
"learning_rate": 1.159966653738842e-05,
"loss": 0.7049,
"step": 2997
},
{
"epoch": 0.6417810601803537,
"grad_norm": 0.1662693680476804,
"learning_rate": 1.1587336959750912e-05,
"loss": 0.7223,
"step": 2998
},
{
"epoch": 0.6419951299135693,
"grad_norm": 0.23710229772752486,
"learning_rate": 1.1575011265383251e-05,
"loss": 0.7146,
"step": 2999
},
{
"epoch": 0.6422091996467849,
"grad_norm": 0.16185101214389352,
"learning_rate": 1.156268945997498e-05,
"loss": 0.7379,
"step": 3000
},
{
"epoch": 0.6424232693800005,
"grad_norm": 0.16935471526485132,
"learning_rate": 1.1550371549213797e-05,
"loss": 0.7042,
"step": 3001
},
{
"epoch": 0.6426373391132161,
"grad_norm": 0.17068356974370424,
"learning_rate": 1.1538057538785638e-05,
"loss": 0.7292,
"step": 3002
},
{
"epoch": 0.6428514088464318,
"grad_norm": 0.16082713244281638,
"learning_rate": 1.152574743437464e-05,
"loss": 0.6771,
"step": 3003
},
{
"epoch": 0.6430654785796474,
"grad_norm": 0.17464502471767457,
"learning_rate": 1.1513441241663105e-05,
"loss": 0.6896,
"step": 3004
},
{
"epoch": 0.6432795483128629,
"grad_norm": 0.16351997860068648,
"learning_rate": 1.150113896633157e-05,
"loss": 0.7032,
"step": 3005
},
{
"epoch": 0.6434936180460785,
"grad_norm": 0.1690926534684481,
"learning_rate": 1.1488840614058716e-05,
"loss": 0.6733,
"step": 3006
},
{
"epoch": 0.6437076877792941,
"grad_norm": 0.1784838317003333,
"learning_rate": 1.1476546190521456e-05,
"loss": 0.7136,
"step": 3007
},
{
"epoch": 0.6439217575125097,
"grad_norm": 0.1785785968046288,
"learning_rate": 1.146425570139488e-05,
"loss": 0.7067,
"step": 3008
},
{
"epoch": 0.6441358272457253,
"grad_norm": 0.1713124320675924,
"learning_rate": 1.145196915235224e-05,
"loss": 0.694,
"step": 3009
},
{
"epoch": 0.6443498969789409,
"grad_norm": 0.15675533806258324,
"learning_rate": 1.1439686549064996e-05,
"loss": 0.6652,
"step": 3010
},
{
"epoch": 0.6445639667121564,
"grad_norm": 0.1696309691171974,
"learning_rate": 1.1427407897202767e-05,
"loss": 0.7052,
"step": 3011
},
{
"epoch": 0.6447780364453721,
"grad_norm": 0.16317109100786786,
"learning_rate": 1.1415133202433357e-05,
"loss": 0.6714,
"step": 3012
},
{
"epoch": 0.6449921061785877,
"grad_norm": 0.16253351295127938,
"learning_rate": 1.1402862470422753e-05,
"loss": 0.6907,
"step": 3013
},
{
"epoch": 0.6452061759118033,
"grad_norm": 0.1729193360632724,
"learning_rate": 1.139059570683509e-05,
"loss": 0.7118,
"step": 3014
},
{
"epoch": 0.6454202456450189,
"grad_norm": 0.1702780987278593,
"learning_rate": 1.1378332917332696e-05,
"loss": 0.6995,
"step": 3015
},
{
"epoch": 0.6456343153782345,
"grad_norm": 0.17896418896916347,
"learning_rate": 1.1366074107576035e-05,
"loss": 0.7024,
"step": 3016
},
{
"epoch": 0.64584838511145,
"grad_norm": 0.16240683657758037,
"learning_rate": 1.1353819283223762e-05,
"loss": 0.7202,
"step": 3017
},
{
"epoch": 0.6460624548446656,
"grad_norm": 0.16927526374792506,
"learning_rate": 1.1341568449932688e-05,
"loss": 0.7099,
"step": 3018
},
{
"epoch": 0.6462765245778812,
"grad_norm": 0.1600543302235835,
"learning_rate": 1.132932161335776e-05,
"loss": 0.72,
"step": 3019
},
{
"epoch": 0.6464905943110968,
"grad_norm": 0.16347278785357336,
"learning_rate": 1.131707877915211e-05,
"loss": 0.7024,
"step": 3020
},
{
"epoch": 0.6467046640443125,
"grad_norm": 0.17352557510729236,
"learning_rate": 1.1304839952966993e-05,
"loss": 0.7082,
"step": 3021
},
{
"epoch": 0.6469187337775281,
"grad_norm": 0.20927375224275424,
"learning_rate": 1.1292605140451838e-05,
"loss": 0.6843,
"step": 3022
},
{
"epoch": 0.6471328035107436,
"grad_norm": 0.1704525197454697,
"learning_rate": 1.128037434725422e-05,
"loss": 0.6987,
"step": 3023
},
{
"epoch": 0.6473468732439592,
"grad_norm": 0.23404052924423965,
"learning_rate": 1.126814757901983e-05,
"loss": 0.703,
"step": 3024
},
{
"epoch": 0.6475609429771748,
"grad_norm": 0.16378192029244704,
"learning_rate": 1.1255924841392542e-05,
"loss": 0.6913,
"step": 3025
},
{
"epoch": 0.6477750127103904,
"grad_norm": 0.17260696387702695,
"learning_rate": 1.1243706140014333e-05,
"loss": 0.7071,
"step": 3026
},
{
"epoch": 0.647989082443606,
"grad_norm": 0.15840289277180297,
"learning_rate": 1.1231491480525341e-05,
"loss": 0.7295,
"step": 3027
},
{
"epoch": 0.6482031521768216,
"grad_norm": 0.1710246421139846,
"learning_rate": 1.1219280868563838e-05,
"loss": 0.7092,
"step": 3028
},
{
"epoch": 0.6484172219100371,
"grad_norm": 0.16019620902751636,
"learning_rate": 1.1207074309766204e-05,
"loss": 0.7031,
"step": 3029
},
{
"epoch": 0.6486312916432528,
"grad_norm": 0.15480753153701207,
"learning_rate": 1.1194871809766981e-05,
"loss": 0.6942,
"step": 3030
},
{
"epoch": 0.6488453613764684,
"grad_norm": 0.15943952059365776,
"learning_rate": 1.1182673374198805e-05,
"loss": 0.7083,
"step": 3031
},
{
"epoch": 0.649059431109684,
"grad_norm": 0.16123658316260847,
"learning_rate": 1.1170479008692457e-05,
"loss": 0.7095,
"step": 3032
},
{
"epoch": 0.6492735008428996,
"grad_norm": 0.15275844768832486,
"learning_rate": 1.1158288718876844e-05,
"loss": 0.6771,
"step": 3033
},
{
"epoch": 0.6494875705761152,
"grad_norm": 0.15629234469138292,
"learning_rate": 1.1146102510378964e-05,
"loss": 0.705,
"step": 3034
},
{
"epoch": 0.6497016403093308,
"grad_norm": 0.1527165321738287,
"learning_rate": 1.1133920388823967e-05,
"loss": 0.6864,
"step": 3035
},
{
"epoch": 0.6499157100425463,
"grad_norm": 0.15532618501401466,
"learning_rate": 1.1121742359835079e-05,
"loss": 0.6703,
"step": 3036
},
{
"epoch": 0.6501297797757619,
"grad_norm": 0.14565902138468276,
"learning_rate": 1.1109568429033669e-05,
"loss": 0.6715,
"step": 3037
},
{
"epoch": 0.6503438495089775,
"grad_norm": 0.16350793160863714,
"learning_rate": 1.1097398602039202e-05,
"loss": 0.6857,
"step": 3038
},
{
"epoch": 0.6505579192421932,
"grad_norm": 0.15971597977022928,
"learning_rate": 1.1085232884469236e-05,
"loss": 0.7233,
"step": 3039
},
{
"epoch": 0.6507719889754088,
"grad_norm": 0.16056666146955634,
"learning_rate": 1.107307128193946e-05,
"loss": 0.7156,
"step": 3040
},
{
"epoch": 0.6509860587086244,
"grad_norm": 0.15708115566962028,
"learning_rate": 1.106091380006363e-05,
"loss": 0.6877,
"step": 3041
},
{
"epoch": 0.6512001284418399,
"grad_norm": 0.16039488460755236,
"learning_rate": 1.1048760444453636e-05,
"loss": 0.7052,
"step": 3042
},
{
"epoch": 0.6514141981750555,
"grad_norm": 0.16257983769300854,
"learning_rate": 1.1036611220719426e-05,
"loss": 0.7038,
"step": 3043
},
{
"epoch": 0.6516282679082711,
"grad_norm": 0.15917878917205924,
"learning_rate": 1.102446613446907e-05,
"loss": 0.6955,
"step": 3044
},
{
"epoch": 0.6518423376414867,
"grad_norm": 0.15910433212899805,
"learning_rate": 1.1012325191308721e-05,
"loss": 0.7029,
"step": 3045
},
{
"epoch": 0.6520564073747023,
"grad_norm": 0.16049647340918968,
"learning_rate": 1.1000188396842604e-05,
"loss": 0.6945,
"step": 3046
},
{
"epoch": 0.6522704771079179,
"grad_norm": 0.1574848396997355,
"learning_rate": 1.0988055756673057e-05,
"loss": 0.7204,
"step": 3047
},
{
"epoch": 0.6524845468411336,
"grad_norm": 0.17062483486919586,
"learning_rate": 1.0975927276400466e-05,
"loss": 0.6952,
"step": 3048
},
{
"epoch": 0.6526986165743491,
"grad_norm": 0.15999655958826292,
"learning_rate": 1.0963802961623329e-05,
"loss": 0.7188,
"step": 3049
},
{
"epoch": 0.6529126863075647,
"grad_norm": 0.16372948371536275,
"learning_rate": 1.0951682817938209e-05,
"loss": 0.7047,
"step": 3050
},
{
"epoch": 0.6531267560407803,
"grad_norm": 0.16804253558519006,
"learning_rate": 1.0939566850939727e-05,
"loss": 0.7231,
"step": 3051
},
{
"epoch": 0.6533408257739959,
"grad_norm": 0.1637492977611271,
"learning_rate": 1.092745506622061e-05,
"loss": 0.6955,
"step": 3052
},
{
"epoch": 0.6535548955072115,
"grad_norm": 0.15823061897236976,
"learning_rate": 1.091534746937162e-05,
"loss": 0.7004,
"step": 3053
},
{
"epoch": 0.653768965240427,
"grad_norm": 0.16666509369899177,
"learning_rate": 1.0903244065981608e-05,
"loss": 0.6903,
"step": 3054
},
{
"epoch": 0.6539830349736426,
"grad_norm": 0.160025184887067,
"learning_rate": 1.0891144861637488e-05,
"loss": 0.6899,
"step": 3055
},
{
"epoch": 0.6541971047068582,
"grad_norm": 0.15938454296733964,
"learning_rate": 1.087904986192422e-05,
"loss": 0.7026,
"step": 3056
},
{
"epoch": 0.6544111744400739,
"grad_norm": 0.16917273847620276,
"learning_rate": 1.0866959072424838e-05,
"loss": 0.6996,
"step": 3057
},
{
"epoch": 0.6546252441732895,
"grad_norm": 0.1533162588092453,
"learning_rate": 1.0854872498720436e-05,
"loss": 0.6947,
"step": 3058
},
{
"epoch": 0.6548393139065051,
"grad_norm": 0.15399321658021684,
"learning_rate": 1.0842790146390144e-05,
"loss": 0.7034,
"step": 3059
},
{
"epoch": 0.6550533836397207,
"grad_norm": 0.16290717373727154,
"learning_rate": 1.0830712021011154e-05,
"loss": 0.6889,
"step": 3060
},
{
"epoch": 0.6552674533729362,
"grad_norm": 0.15982311858370116,
"learning_rate": 1.081863812815872e-05,
"loss": 0.6897,
"step": 3061
},
{
"epoch": 0.6554815231061518,
"grad_norm": 0.15789580043324297,
"learning_rate": 1.080656847340611e-05,
"loss": 0.6998,
"step": 3062
},
{
"epoch": 0.6556955928393674,
"grad_norm": 0.1673228222261171,
"learning_rate": 1.0794503062324664e-05,
"loss": 0.6905,
"step": 3063
},
{
"epoch": 0.655909662572583,
"grad_norm": 0.16043079916395062,
"learning_rate": 1.078244190048376e-05,
"loss": 0.7073,
"step": 3064
},
{
"epoch": 0.6561237323057986,
"grad_norm": 0.15737028797569128,
"learning_rate": 1.0770384993450796e-05,
"loss": 0.6915,
"step": 3065
},
{
"epoch": 0.6563378020390143,
"grad_norm": 0.15954350175409163,
"learning_rate": 1.0758332346791219e-05,
"loss": 0.6979,
"step": 3066
},
{
"epoch": 0.6565518717722298,
"grad_norm": 0.15346727175746847,
"learning_rate": 1.0746283966068525e-05,
"loss": 0.6764,
"step": 3067
},
{
"epoch": 0.6567659415054454,
"grad_norm": 0.15789737841353488,
"learning_rate": 1.0734239856844204e-05,
"loss": 0.685,
"step": 3068
},
{
"epoch": 0.656980011238661,
"grad_norm": 0.16314731016209819,
"learning_rate": 1.07222000246778e-05,
"loss": 0.7213,
"step": 3069
},
{
"epoch": 0.6571940809718766,
"grad_norm": 0.1560279985031777,
"learning_rate": 1.0710164475126894e-05,
"loss": 0.6879,
"step": 3070
},
{
"epoch": 0.6574081507050922,
"grad_norm": 0.16051110359534035,
"learning_rate": 1.069813321374705e-05,
"loss": 0.6985,
"step": 3071
},
{
"epoch": 0.6576222204383078,
"grad_norm": 0.16599332379590576,
"learning_rate": 1.0686106246091895e-05,
"loss": 0.7206,
"step": 3072
},
{
"epoch": 0.6578362901715233,
"grad_norm": 0.161663267904669,
"learning_rate": 1.0674083577713037e-05,
"loss": 0.666,
"step": 3073
},
{
"epoch": 0.6580503599047389,
"grad_norm": 0.16341807962206745,
"learning_rate": 1.0662065214160131e-05,
"loss": 0.6873,
"step": 3074
},
{
"epoch": 0.6582644296379546,
"grad_norm": 0.16331829109326712,
"learning_rate": 1.0650051160980835e-05,
"loss": 0.6894,
"step": 3075
},
{
"epoch": 0.6584784993711702,
"grad_norm": 0.15664775482017015,
"learning_rate": 1.06380414237208e-05,
"loss": 0.6825,
"step": 3076
},
{
"epoch": 0.6586925691043858,
"grad_norm": 0.15899760291435164,
"learning_rate": 1.0626036007923712e-05,
"loss": 0.6679,
"step": 3077
},
{
"epoch": 0.6589066388376014,
"grad_norm": 0.16818363978877052,
"learning_rate": 1.061403491913124e-05,
"loss": 0.7008,
"step": 3078
},
{
"epoch": 0.659120708570817,
"grad_norm": 0.15140120050036712,
"learning_rate": 1.0602038162883064e-05,
"loss": 0.7001,
"step": 3079
},
{
"epoch": 0.6593347783040325,
"grad_norm": 0.16143170483265978,
"learning_rate": 1.0590045744716875e-05,
"loss": 0.686,
"step": 3080
},
{
"epoch": 0.6595488480372481,
"grad_norm": 0.15927642747166015,
"learning_rate": 1.0578057670168338e-05,
"loss": 0.6738,
"step": 3081
},
{
"epoch": 0.6597629177704637,
"grad_norm": 0.15323202609648254,
"learning_rate": 1.0566073944771142e-05,
"loss": 0.6865,
"step": 3082
},
{
"epoch": 0.6599769875036793,
"grad_norm": 0.17252273612411162,
"learning_rate": 1.0554094574056935e-05,
"loss": 0.689,
"step": 3083
},
{
"epoch": 0.660191057236895,
"grad_norm": 0.16141619589301429,
"learning_rate": 1.0542119563555388e-05,
"loss": 0.6969,
"step": 3084
},
{
"epoch": 0.6604051269701106,
"grad_norm": 0.1756548012941864,
"learning_rate": 1.0530148918794131e-05,
"loss": 0.6843,
"step": 3085
},
{
"epoch": 0.6606191967033261,
"grad_norm": 0.15636751889672348,
"learning_rate": 1.0518182645298798e-05,
"loss": 0.7057,
"step": 3086
},
{
"epoch": 0.6608332664365417,
"grad_norm": 0.16599761739994906,
"learning_rate": 1.0506220748593003e-05,
"loss": 0.7073,
"step": 3087
},
{
"epoch": 0.6610473361697573,
"grad_norm": 0.16266530492319733,
"learning_rate": 1.0494263234198328e-05,
"loss": 0.7037,
"step": 3088
},
{
"epoch": 0.6612614059029729,
"grad_norm": 0.18291412199766832,
"learning_rate": 1.0482310107634349e-05,
"loss": 0.7001,
"step": 3089
},
{
"epoch": 0.6614754756361885,
"grad_norm": 0.15793773921931004,
"learning_rate": 1.0470361374418592e-05,
"loss": 0.6884,
"step": 3090
},
{
"epoch": 0.661689545369404,
"grad_norm": 0.16229554901334134,
"learning_rate": 1.0458417040066582e-05,
"loss": 0.7033,
"step": 3091
},
{
"epoch": 0.6619036151026196,
"grad_norm": 0.18626607373816426,
"learning_rate": 1.0446477110091809e-05,
"loss": 0.679,
"step": 3092
},
{
"epoch": 0.6621176848358352,
"grad_norm": 0.15838847604553272,
"learning_rate": 1.0434541590005702e-05,
"loss": 0.7191,
"step": 3093
},
{
"epoch": 0.6623317545690509,
"grad_norm": 0.16304191574001972,
"learning_rate": 1.0422610485317696e-05,
"loss": 0.6702,
"step": 3094
},
{
"epoch": 0.6625458243022665,
"grad_norm": 0.16061600809628618,
"learning_rate": 1.041068380153515e-05,
"loss": 0.6856,
"step": 3095
},
{
"epoch": 0.6627598940354821,
"grad_norm": 0.14957425317487621,
"learning_rate": 1.0398761544163411e-05,
"loss": 0.6799,
"step": 3096
},
{
"epoch": 0.6629739637686977,
"grad_norm": 0.19783111977513954,
"learning_rate": 1.038684371870577e-05,
"loss": 0.7037,
"step": 3097
},
{
"epoch": 0.6631880335019132,
"grad_norm": 0.16150087091595297,
"learning_rate": 1.0374930330663467e-05,
"loss": 0.7072,
"step": 3098
},
{
"epoch": 0.6634021032351288,
"grad_norm": 0.16981422418322475,
"learning_rate": 1.0363021385535709e-05,
"loss": 0.7223,
"step": 3099
},
{
"epoch": 0.6636161729683444,
"grad_norm": 0.28908513988228524,
"learning_rate": 1.0351116888819632e-05,
"loss": 0.6844,
"step": 3100
},
{
"epoch": 0.66383024270156,
"grad_norm": 0.1607792246483595,
"learning_rate": 1.0339216846010336e-05,
"loss": 0.6907,
"step": 3101
},
{
"epoch": 0.6640443124347756,
"grad_norm": 0.16236528757651575,
"learning_rate": 1.0327321262600867e-05,
"loss": 0.7155,
"step": 3102
},
{
"epoch": 0.6642583821679913,
"grad_norm": 0.16303089783476565,
"learning_rate": 1.0315430144082188e-05,
"loss": 0.7112,
"step": 3103
},
{
"epoch": 0.6644724519012069,
"grad_norm": 0.16116795157622021,
"learning_rate": 1.0303543495943233e-05,
"loss": 0.6892,
"step": 3104
},
{
"epoch": 0.6646865216344224,
"grad_norm": 0.1623009662404361,
"learning_rate": 1.0291661323670845e-05,
"loss": 0.6864,
"step": 3105
},
{
"epoch": 0.664900591367638,
"grad_norm": 0.1574929762562193,
"learning_rate": 1.0279783632749818e-05,
"loss": 0.6661,
"step": 3106
},
{
"epoch": 0.6651146611008536,
"grad_norm": 0.16442713608472861,
"learning_rate": 1.0267910428662878e-05,
"loss": 0.7152,
"step": 3107
},
{
"epoch": 0.6653287308340692,
"grad_norm": 0.1611659903693138,
"learning_rate": 1.0256041716890662e-05,
"loss": 0.6974,
"step": 3108
},
{
"epoch": 0.6655428005672848,
"grad_norm": 0.15415718884072935,
"learning_rate": 1.0244177502911762e-05,
"loss": 0.7233,
"step": 3109
},
{
"epoch": 0.6657568703005003,
"grad_norm": 0.15663701645788064,
"learning_rate": 1.0232317792202658e-05,
"loss": 0.7062,
"step": 3110
},
{
"epoch": 0.6659709400337159,
"grad_norm": 0.1660121352485925,
"learning_rate": 1.0220462590237781e-05,
"loss": 0.7041,
"step": 3111
},
{
"epoch": 0.6661850097669316,
"grad_norm": 0.1493054827494839,
"learning_rate": 1.0208611902489478e-05,
"loss": 0.684,
"step": 3112
},
{
"epoch": 0.6663990795001472,
"grad_norm": 0.16242277806578512,
"learning_rate": 1.0196765734427992e-05,
"loss": 0.6799,
"step": 3113
},
{
"epoch": 0.6666131492333628,
"grad_norm": 0.15561334950737316,
"learning_rate": 1.0184924091521502e-05,
"loss": 0.703,
"step": 3114
},
{
"epoch": 0.6668272189665784,
"grad_norm": 0.1579829671750343,
"learning_rate": 1.0173086979236077e-05,
"loss": 0.7197,
"step": 3115
},
{
"epoch": 0.667041288699794,
"grad_norm": 0.1533501087623317,
"learning_rate": 1.0161254403035711e-05,
"loss": 0.6914,
"step": 3116
},
{
"epoch": 0.6672553584330095,
"grad_norm": 0.15860615118073362,
"learning_rate": 1.0149426368382316e-05,
"loss": 0.7257,
"step": 3117
},
{
"epoch": 0.6674694281662251,
"grad_norm": 0.15062396088380706,
"learning_rate": 1.0137602880735665e-05,
"loss": 0.6871,
"step": 3118
},
{
"epoch": 0.6676834978994407,
"grad_norm": 0.15271449812538404,
"learning_rate": 1.0125783945553478e-05,
"loss": 0.6857,
"step": 3119
},
{
"epoch": 0.6678975676326563,
"grad_norm": 0.1550816402235058,
"learning_rate": 1.011396956829134e-05,
"loss": 0.6688,
"step": 3120
},
{
"epoch": 0.668111637365872,
"grad_norm": 0.1569722840211998,
"learning_rate": 1.0102159754402751e-05,
"loss": 0.6725,
"step": 3121
},
{
"epoch": 0.6683257070990876,
"grad_norm": 0.15539662546724703,
"learning_rate": 1.009035450933911e-05,
"loss": 0.7149,
"step": 3122
},
{
"epoch": 0.6685397768323031,
"grad_norm": 0.1607548003730955,
"learning_rate": 1.0078553838549679e-05,
"loss": 0.6999,
"step": 3123
},
{
"epoch": 0.6687538465655187,
"grad_norm": 0.15180709022372793,
"learning_rate": 1.006675774748164e-05,
"loss": 0.6639,
"step": 3124
},
{
"epoch": 0.6689679162987343,
"grad_norm": 0.15921001860779627,
"learning_rate": 1.0054966241580036e-05,
"loss": 0.6822,
"step": 3125
},
{
"epoch": 0.6691819860319499,
"grad_norm": 0.1681690814035098,
"learning_rate": 1.0043179326287818e-05,
"loss": 0.7023,
"step": 3126
},
{
"epoch": 0.6693960557651655,
"grad_norm": 0.15606853472344315,
"learning_rate": 1.0031397007045785e-05,
"loss": 0.7039,
"step": 3127
},
{
"epoch": 0.6696101254983811,
"grad_norm": 0.16079448082099368,
"learning_rate": 1.0019619289292648e-05,
"loss": 0.7082,
"step": 3128
},
{
"epoch": 0.6698241952315966,
"grad_norm": 0.15619147163508657,
"learning_rate": 1.0007846178464984e-05,
"loss": 0.6797,
"step": 3129
},
{
"epoch": 0.6700382649648123,
"grad_norm": 0.16050429543267425,
"learning_rate": 9.996077679997225e-06,
"loss": 0.6909,
"step": 3130
},
{
"epoch": 0.6702523346980279,
"grad_norm": 0.16513716623223665,
"learning_rate": 9.984313799321705e-06,
"loss": 0.7146,
"step": 3131
},
{
"epoch": 0.6704664044312435,
"grad_norm": 0.16168181479241397,
"learning_rate": 9.97255454186859e-06,
"loss": 0.673,
"step": 3132
},
{
"epoch": 0.6706804741644591,
"grad_norm": 0.1644963549644342,
"learning_rate": 9.960799913065945e-06,
"loss": 0.6998,
"step": 3133
},
{
"epoch": 0.6708945438976747,
"grad_norm": 0.1685040716482276,
"learning_rate": 9.94904991833969e-06,
"loss": 0.6878,
"step": 3134
},
{
"epoch": 0.6711086136308902,
"grad_norm": 0.1769403242524765,
"learning_rate": 9.937304563113588e-06,
"loss": 0.7104,
"step": 3135
},
{
"epoch": 0.6713226833641058,
"grad_norm": 0.15623683196959876,
"learning_rate": 9.925563852809277e-06,
"loss": 0.6956,
"step": 3136
},
{
"epoch": 0.6715367530973214,
"grad_norm": 0.17605621966527873,
"learning_rate": 9.913827792846256e-06,
"loss": 0.7108,
"step": 3137
},
{
"epoch": 0.671750822830537,
"grad_norm": 0.16534408460196132,
"learning_rate": 9.902096388641857e-06,
"loss": 0.6905,
"step": 3138
},
{
"epoch": 0.6719648925637527,
"grad_norm": 0.1522216941277003,
"learning_rate": 9.890369645611287e-06,
"loss": 0.6907,
"step": 3139
},
{
"epoch": 0.6721789622969683,
"grad_norm": 0.17269849774579715,
"learning_rate": 9.878647569167574e-06,
"loss": 0.7322,
"step": 3140
},
{
"epoch": 0.6723930320301839,
"grad_norm": 0.1663999999530521,
"learning_rate": 9.866930164721615e-06,
"loss": 0.7128,
"step": 3141
},
{
"epoch": 0.6726071017633994,
"grad_norm": 0.15884796574635146,
"learning_rate": 9.855217437682153e-06,
"loss": 0.7037,
"step": 3142
},
{
"epoch": 0.672821171496615,
"grad_norm": 0.1618128699996525,
"learning_rate": 9.84350939345574e-06,
"loss": 0.6869,
"step": 3143
},
{
"epoch": 0.6730352412298306,
"grad_norm": 0.18494989120822972,
"learning_rate": 9.831806037446799e-06,
"loss": 0.6954,
"step": 3144
},
{
"epoch": 0.6732493109630462,
"grad_norm": 0.16320847668955887,
"learning_rate": 9.820107375057587e-06,
"loss": 0.6853,
"step": 3145
},
{
"epoch": 0.6734633806962618,
"grad_norm": 0.15345569238811857,
"learning_rate": 9.808413411688166e-06,
"loss": 0.7107,
"step": 3146
},
{
"epoch": 0.6736774504294774,
"grad_norm": 0.17216020080991495,
"learning_rate": 9.796724152736459e-06,
"loss": 0.7337,
"step": 3147
},
{
"epoch": 0.673891520162693,
"grad_norm": 0.15632077929741134,
"learning_rate": 9.785039603598211e-06,
"loss": 0.739,
"step": 3148
},
{
"epoch": 0.6741055898959086,
"grad_norm": 0.15844641185229077,
"learning_rate": 9.773359769666979e-06,
"loss": 0.7148,
"step": 3149
},
{
"epoch": 0.6743196596291242,
"grad_norm": 0.1863008428946735,
"learning_rate": 9.761684656334153e-06,
"loss": 0.6896,
"step": 3150
},
{
"epoch": 0.6745337293623398,
"grad_norm": 0.15964610498381,
"learning_rate": 9.75001426898896e-06,
"loss": 0.6856,
"step": 3151
},
{
"epoch": 0.6747477990955554,
"grad_norm": 0.16358660961328506,
"learning_rate": 9.738348613018404e-06,
"loss": 0.7097,
"step": 3152
},
{
"epoch": 0.674961868828771,
"grad_norm": 0.37476799614030293,
"learning_rate": 9.726687693807346e-06,
"loss": 0.6836,
"step": 3153
},
{
"epoch": 0.6751759385619865,
"grad_norm": 0.1561915002792,
"learning_rate": 9.715031516738449e-06,
"loss": 0.7144,
"step": 3154
},
{
"epoch": 0.6753900082952021,
"grad_norm": 0.15804131303086497,
"learning_rate": 9.703380087192172e-06,
"loss": 0.664,
"step": 3155
},
{
"epoch": 0.6756040780284177,
"grad_norm": 0.1642980645109622,
"learning_rate": 9.691733410546803e-06,
"loss": 0.673,
"step": 3156
},
{
"epoch": 0.6758181477616334,
"grad_norm": 0.15965839899632348,
"learning_rate": 9.680091492178414e-06,
"loss": 0.6993,
"step": 3157
},
{
"epoch": 0.676032217494849,
"grad_norm": 0.17361081052545546,
"learning_rate": 9.668454337460903e-06,
"loss": 0.6821,
"step": 3158
},
{
"epoch": 0.6762462872280646,
"grad_norm": 0.15442536683044944,
"learning_rate": 9.65682195176596e-06,
"loss": 0.6816,
"step": 3159
},
{
"epoch": 0.6764603569612802,
"grad_norm": 0.2751371714063181,
"learning_rate": 9.645194340463066e-06,
"loss": 0.7192,
"step": 3160
},
{
"epoch": 0.6766744266944957,
"grad_norm": 0.16031316581342847,
"learning_rate": 9.633571508919517e-06,
"loss": 0.6663,
"step": 3161
},
{
"epoch": 0.6768884964277113,
"grad_norm": 0.1605092561481887,
"learning_rate": 9.621953462500376e-06,
"loss": 0.7064,
"step": 3162
},
{
"epoch": 0.6771025661609269,
"grad_norm": 0.15314851988646203,
"learning_rate": 9.610340206568516e-06,
"loss": 0.6978,
"step": 3163
},
{
"epoch": 0.6773166358941425,
"grad_norm": 0.16377340468062307,
"learning_rate": 9.598731746484609e-06,
"loss": 0.708,
"step": 3164
},
{
"epoch": 0.6775307056273581,
"grad_norm": 0.155316449530693,
"learning_rate": 9.587128087607076e-06,
"loss": 0.6815,
"step": 3165
},
{
"epoch": 0.6777447753605738,
"grad_norm": 0.16467278791151196,
"learning_rate": 9.575529235292167e-06,
"loss": 0.6884,
"step": 3166
},
{
"epoch": 0.6779588450937893,
"grad_norm": 0.1551656837635727,
"learning_rate": 9.563935194893873e-06,
"loss": 0.6763,
"step": 3167
},
{
"epoch": 0.6781729148270049,
"grad_norm": 0.1626814061488549,
"learning_rate": 9.552345971763995e-06,
"loss": 0.6747,
"step": 3168
},
{
"epoch": 0.6783869845602205,
"grad_norm": 0.1584305609053603,
"learning_rate": 9.540761571252081e-06,
"loss": 0.6853,
"step": 3169
},
{
"epoch": 0.6786010542934361,
"grad_norm": 0.1627757079442043,
"learning_rate": 9.529181998705484e-06,
"loss": 0.6885,
"step": 3170
},
{
"epoch": 0.6788151240266517,
"grad_norm": 0.15702841227085676,
"learning_rate": 9.517607259469315e-06,
"loss": 0.6986,
"step": 3171
},
{
"epoch": 0.6790291937598673,
"grad_norm": 0.15514903257969306,
"learning_rate": 9.506037358886438e-06,
"loss": 0.7051,
"step": 3172
},
{
"epoch": 0.6792432634930828,
"grad_norm": 0.1588871422175614,
"learning_rate": 9.494472302297513e-06,
"loss": 0.6797,
"step": 3173
},
{
"epoch": 0.6794573332262984,
"grad_norm": 0.15465620509642436,
"learning_rate": 9.482912095040935e-06,
"loss": 0.7042,
"step": 3174
},
{
"epoch": 0.6796714029595141,
"grad_norm": 0.15477902193079654,
"learning_rate": 9.471356742452881e-06,
"loss": 0.7312,
"step": 3175
},
{
"epoch": 0.6798854726927297,
"grad_norm": 0.154888446804333,
"learning_rate": 9.459806249867291e-06,
"loss": 0.6874,
"step": 3176
},
{
"epoch": 0.6800995424259453,
"grad_norm": 0.15403428259120724,
"learning_rate": 9.448260622615833e-06,
"loss": 0.6899,
"step": 3177
},
{
"epoch": 0.6803136121591609,
"grad_norm": 0.15486401377898562,
"learning_rate": 9.436719866027964e-06,
"loss": 0.7176,
"step": 3178
},
{
"epoch": 0.6805276818923764,
"grad_norm": 0.16917034306985979,
"learning_rate": 9.42518398543086e-06,
"loss": 0.692,
"step": 3179
},
{
"epoch": 0.680741751625592,
"grad_norm": 0.15405636111575113,
"learning_rate": 9.413652986149469e-06,
"loss": 0.7086,
"step": 3180
},
{
"epoch": 0.6809558213588076,
"grad_norm": 0.16976708114096917,
"learning_rate": 9.40212687350649e-06,
"loss": 0.6965,
"step": 3181
},
{
"epoch": 0.6811698910920232,
"grad_norm": 0.15729070246954535,
"learning_rate": 9.390605652822338e-06,
"loss": 0.6783,
"step": 3182
},
{
"epoch": 0.6813839608252388,
"grad_norm": 0.15539637254515223,
"learning_rate": 9.3790893294152e-06,
"loss": 0.7329,
"step": 3183
},
{
"epoch": 0.6815980305584545,
"grad_norm": 0.15200131940661787,
"learning_rate": 9.367577908600982e-06,
"loss": 0.7162,
"step": 3184
},
{
"epoch": 0.68181210029167,
"grad_norm": 0.15464749938535283,
"learning_rate": 9.356071395693336e-06,
"loss": 0.6939,
"step": 3185
},
{
"epoch": 0.6820261700248856,
"grad_norm": 0.15238123162748352,
"learning_rate": 9.344569796003658e-06,
"loss": 0.7004,
"step": 3186
},
{
"epoch": 0.6822402397581012,
"grad_norm": 0.14639624950887936,
"learning_rate": 9.333073114841047e-06,
"loss": 0.6769,
"step": 3187
},
{
"epoch": 0.6824543094913168,
"grad_norm": 0.15527939673346694,
"learning_rate": 9.321581357512368e-06,
"loss": 0.6919,
"step": 3188
},
{
"epoch": 0.6826683792245324,
"grad_norm": 0.15415946644278458,
"learning_rate": 9.31009452932218e-06,
"loss": 0.6889,
"step": 3189
},
{
"epoch": 0.682882448957748,
"grad_norm": 0.14968837572851434,
"learning_rate": 9.298612635572789e-06,
"loss": 0.6933,
"step": 3190
},
{
"epoch": 0.6830965186909636,
"grad_norm": 0.16013721408530462,
"learning_rate": 9.287135681564221e-06,
"loss": 0.6753,
"step": 3191
},
{
"epoch": 0.6833105884241791,
"grad_norm": 0.17109248256134058,
"learning_rate": 9.275663672594207e-06,
"loss": 0.6925,
"step": 3192
},
{
"epoch": 0.6835246581573948,
"grad_norm": 0.2166480431249797,
"learning_rate": 9.264196613958214e-06,
"loss": 0.6956,
"step": 3193
},
{
"epoch": 0.6837387278906104,
"grad_norm": 0.15860227185907372,
"learning_rate": 9.252734510949407e-06,
"loss": 0.6835,
"step": 3194
},
{
"epoch": 0.683952797623826,
"grad_norm": 0.1605212642264318,
"learning_rate": 9.241277368858674e-06,
"loss": 0.7096,
"step": 3195
},
{
"epoch": 0.6841668673570416,
"grad_norm": 0.15730764500498864,
"learning_rate": 9.229825192974622e-06,
"loss": 0.6816,
"step": 3196
},
{
"epoch": 0.6843809370902572,
"grad_norm": 0.15379061456916696,
"learning_rate": 9.218377988583537e-06,
"loss": 0.6991,
"step": 3197
},
{
"epoch": 0.6845950068234727,
"grad_norm": 0.15275554232726055,
"learning_rate": 9.206935760969444e-06,
"loss": 0.669,
"step": 3198
},
{
"epoch": 0.6848090765566883,
"grad_norm": 0.16113244165672033,
"learning_rate": 9.195498515414039e-06,
"loss": 0.6919,
"step": 3199
},
{
"epoch": 0.6850231462899039,
"grad_norm": 0.1520911816424395,
"learning_rate": 9.18406625719674e-06,
"loss": 0.689,
"step": 3200
},
{
"epoch": 0.6852372160231195,
"grad_norm": 0.16296861432524246,
"learning_rate": 9.172638991594664e-06,
"loss": 0.7292,
"step": 3201
},
{
"epoch": 0.6854512857563351,
"grad_norm": 0.1556187840342968,
"learning_rate": 9.161216723882602e-06,
"loss": 0.6927,
"step": 3202
},
{
"epoch": 0.6856653554895508,
"grad_norm": 0.14984107826594323,
"learning_rate": 9.14979945933307e-06,
"loss": 0.6939,
"step": 3203
},
{
"epoch": 0.6858794252227663,
"grad_norm": 0.15465119761763227,
"learning_rate": 9.138387203216235e-06,
"loss": 0.6731,
"step": 3204
},
{
"epoch": 0.6860934949559819,
"grad_norm": 0.1538868912566821,
"learning_rate": 9.126979960799984e-06,
"loss": 0.6888,
"step": 3205
},
{
"epoch": 0.6863075646891975,
"grad_norm": 0.15184321224933015,
"learning_rate": 9.115577737349887e-06,
"loss": 0.705,
"step": 3206
},
{
"epoch": 0.6865216344224131,
"grad_norm": 0.16052072893154717,
"learning_rate": 9.104180538129175e-06,
"loss": 0.7003,
"step": 3207
},
{
"epoch": 0.6867357041556287,
"grad_norm": 0.15060654352400643,
"learning_rate": 9.092788368398785e-06,
"loss": 0.678,
"step": 3208
},
{
"epoch": 0.6869497738888443,
"grad_norm": 0.1610353272806887,
"learning_rate": 9.081401233417315e-06,
"loss": 0.7286,
"step": 3209
},
{
"epoch": 0.6871638436220598,
"grad_norm": 0.15840630458392266,
"learning_rate": 9.070019138441054e-06,
"loss": 0.7406,
"step": 3210
},
{
"epoch": 0.6873779133552754,
"grad_norm": 0.24777232575067695,
"learning_rate": 9.058642088723943e-06,
"loss": 0.6667,
"step": 3211
},
{
"epoch": 0.6875919830884911,
"grad_norm": 0.15377440152663294,
"learning_rate": 9.047270089517615e-06,
"loss": 0.7053,
"step": 3212
},
{
"epoch": 0.6878060528217067,
"grad_norm": 0.16044122375214126,
"learning_rate": 9.035903146071371e-06,
"loss": 0.6988,
"step": 3213
},
{
"epoch": 0.6880201225549223,
"grad_norm": 0.15458704704568665,
"learning_rate": 9.024541263632156e-06,
"loss": 0.7298,
"step": 3214
},
{
"epoch": 0.6882341922881379,
"grad_norm": 0.19605947630624748,
"learning_rate": 9.013184447444612e-06,
"loss": 0.7159,
"step": 3215
},
{
"epoch": 0.6884482620213535,
"grad_norm": 0.15326392484139542,
"learning_rate": 9.001832702751005e-06,
"loss": 0.6825,
"step": 3216
},
{
"epoch": 0.688662331754569,
"grad_norm": 0.27379772412603165,
"learning_rate": 8.990486034791292e-06,
"loss": 0.7022,
"step": 3217
},
{
"epoch": 0.6888764014877846,
"grad_norm": 0.15215053768188203,
"learning_rate": 8.979144448803079e-06,
"loss": 0.7045,
"step": 3218
},
{
"epoch": 0.6890904712210002,
"grad_norm": 0.15324266580610485,
"learning_rate": 8.967807950021603e-06,
"loss": 0.6812,
"step": 3219
},
{
"epoch": 0.6893045409542158,
"grad_norm": 0.15535988865067632,
"learning_rate": 8.956476543679787e-06,
"loss": 0.6849,
"step": 3220
},
{
"epoch": 0.6895186106874315,
"grad_norm": 0.15531044354422383,
"learning_rate": 8.945150235008187e-06,
"loss": 0.6995,
"step": 3221
},
{
"epoch": 0.6897326804206471,
"grad_norm": 0.1606915624181014,
"learning_rate": 8.933829029234993e-06,
"loss": 0.684,
"step": 3222
},
{
"epoch": 0.6899467501538626,
"grad_norm": 0.16621197805211987,
"learning_rate": 8.922512931586066e-06,
"loss": 0.7035,
"step": 3223
},
{
"epoch": 0.6901608198870782,
"grad_norm": 0.15844074323764407,
"learning_rate": 8.911201947284893e-06,
"loss": 0.6878,
"step": 3224
},
{
"epoch": 0.6903748896202938,
"grad_norm": 0.15585698595264963,
"learning_rate": 8.8998960815526e-06,
"loss": 0.7059,
"step": 3225
},
{
"epoch": 0.6905889593535094,
"grad_norm": 0.15776095578882104,
"learning_rate": 8.888595339607961e-06,
"loss": 0.6982,
"step": 3226
},
{
"epoch": 0.690803029086725,
"grad_norm": 0.15176073051273042,
"learning_rate": 8.877299726667368e-06,
"loss": 0.6645,
"step": 3227
},
{
"epoch": 0.6910170988199406,
"grad_norm": 0.1641609940908201,
"learning_rate": 8.866009247944857e-06,
"loss": 0.6647,
"step": 3228
},
{
"epoch": 0.6912311685531561,
"grad_norm": 0.1504639867144409,
"learning_rate": 8.854723908652105e-06,
"loss": 0.6849,
"step": 3229
},
{
"epoch": 0.6914452382863718,
"grad_norm": 0.15357323871562542,
"learning_rate": 8.843443713998388e-06,
"loss": 0.7071,
"step": 3230
},
{
"epoch": 0.6916593080195874,
"grad_norm": 0.15690131119394382,
"learning_rate": 8.832168669190629e-06,
"loss": 0.7268,
"step": 3231
},
{
"epoch": 0.691873377752803,
"grad_norm": 0.14867224619401806,
"learning_rate": 8.820898779433374e-06,
"loss": 0.6911,
"step": 3232
},
{
"epoch": 0.6920874474860186,
"grad_norm": 0.1548704703614642,
"learning_rate": 8.809634049928773e-06,
"loss": 0.7196,
"step": 3233
},
{
"epoch": 0.6923015172192342,
"grad_norm": 0.15226528973226203,
"learning_rate": 8.798374485876609e-06,
"loss": 0.6886,
"step": 3234
},
{
"epoch": 0.6925155869524497,
"grad_norm": 0.15634992465558403,
"learning_rate": 8.787120092474286e-06,
"loss": 0.6935,
"step": 3235
},
{
"epoch": 0.6927296566856653,
"grad_norm": 0.15286390914464468,
"learning_rate": 8.775870874916792e-06,
"loss": 0.7091,
"step": 3236
},
{
"epoch": 0.6929437264188809,
"grad_norm": 0.15767169052468524,
"learning_rate": 8.764626838396757e-06,
"loss": 0.6807,
"step": 3237
},
{
"epoch": 0.6931577961520965,
"grad_norm": 0.15721142554207457,
"learning_rate": 8.753387988104415e-06,
"loss": 0.7197,
"step": 3238
},
{
"epoch": 0.6933718658853122,
"grad_norm": 0.1631163203891092,
"learning_rate": 8.742154329227581e-06,
"loss": 0.6637,
"step": 3239
},
{
"epoch": 0.6935859356185278,
"grad_norm": 0.15770942046593717,
"learning_rate": 8.73092586695171e-06,
"loss": 0.6653,
"step": 3240
},
{
"epoch": 0.6938000053517434,
"grad_norm": 0.16094273294550562,
"learning_rate": 8.71970260645982e-06,
"loss": 0.7003,
"step": 3241
},
{
"epoch": 0.6940140750849589,
"grad_norm": 0.168150831931923,
"learning_rate": 8.708484552932557e-06,
"loss": 0.7094,
"step": 3242
},
{
"epoch": 0.6942281448181745,
"grad_norm": 0.16187957424719024,
"learning_rate": 8.697271711548163e-06,
"loss": 0.6946,
"step": 3243
},
{
"epoch": 0.6944422145513901,
"grad_norm": 0.1685080945617025,
"learning_rate": 8.686064087482448e-06,
"loss": 0.6792,
"step": 3244
},
{
"epoch": 0.6946562842846057,
"grad_norm": 0.17103143099079773,
"learning_rate": 8.674861685908848e-06,
"loss": 0.7131,
"step": 3245
},
{
"epoch": 0.6948703540178213,
"grad_norm": 0.15904423375335486,
"learning_rate": 8.663664511998355e-06,
"loss": 0.7085,
"step": 3246
},
{
"epoch": 0.6950844237510369,
"grad_norm": 0.19989578516301848,
"learning_rate": 8.652472570919579e-06,
"loss": 0.7223,
"step": 3247
},
{
"epoch": 0.6952984934842525,
"grad_norm": 0.1551725663755147,
"learning_rate": 8.641285867838682e-06,
"loss": 0.6606,
"step": 3248
},
{
"epoch": 0.6955125632174681,
"grad_norm": 0.15329960859729763,
"learning_rate": 8.630104407919438e-06,
"loss": 0.6741,
"step": 3249
},
{
"epoch": 0.6957266329506837,
"grad_norm": 0.15652680347529477,
"learning_rate": 8.618928196323192e-06,
"loss": 0.6879,
"step": 3250
},
{
"epoch": 0.6959407026838993,
"grad_norm": 0.1551127764792699,
"learning_rate": 8.60775723820885e-06,
"loss": 0.7047,
"step": 3251
},
{
"epoch": 0.6961547724171149,
"grad_norm": 0.16006532064357237,
"learning_rate": 8.59659153873292e-06,
"loss": 0.6819,
"step": 3252
},
{
"epoch": 0.6963688421503305,
"grad_norm": 0.15633178720675306,
"learning_rate": 8.585431103049453e-06,
"loss": 0.7087,
"step": 3253
},
{
"epoch": 0.696582911883546,
"grad_norm": 0.15692161645710714,
"learning_rate": 8.574275936310095e-06,
"loss": 0.6891,
"step": 3254
},
{
"epoch": 0.6967969816167616,
"grad_norm": 0.1566846526405795,
"learning_rate": 8.563126043664054e-06,
"loss": 0.685,
"step": 3255
},
{
"epoch": 0.6970110513499772,
"grad_norm": 0.16482919007789454,
"learning_rate": 8.55198143025809e-06,
"loss": 0.6927,
"step": 3256
},
{
"epoch": 0.6972251210831929,
"grad_norm": 0.1598377174862076,
"learning_rate": 8.540842101236549e-06,
"loss": 0.6744,
"step": 3257
},
{
"epoch": 0.6974391908164085,
"grad_norm": 0.16261354296708044,
"learning_rate": 8.529708061741306e-06,
"loss": 0.689,
"step": 3258
},
{
"epoch": 0.6976532605496241,
"grad_norm": 0.17474731991633885,
"learning_rate": 8.518579316911828e-06,
"loss": 0.7015,
"step": 3259
},
{
"epoch": 0.6978673302828396,
"grad_norm": 0.1581379365573815,
"learning_rate": 8.507455871885126e-06,
"loss": 0.7059,
"step": 3260
},
{
"epoch": 0.6980814000160552,
"grad_norm": 0.16172568947803476,
"learning_rate": 8.49633773179575e-06,
"loss": 0.6835,
"step": 3261
},
{
"epoch": 0.6982954697492708,
"grad_norm": 0.158648978385029,
"learning_rate": 8.485224901775823e-06,
"loss": 0.7139,
"step": 3262
},
{
"epoch": 0.6985095394824864,
"grad_norm": 0.16222491003334794,
"learning_rate": 8.474117386954998e-06,
"loss": 0.6862,
"step": 3263
},
{
"epoch": 0.698723609215702,
"grad_norm": 0.16584229699293707,
"learning_rate": 8.463015192460488e-06,
"loss": 0.7049,
"step": 3264
},
{
"epoch": 0.6989376789489176,
"grad_norm": 0.15216134303928552,
"learning_rate": 8.451918323417053e-06,
"loss": 0.6973,
"step": 3265
},
{
"epoch": 0.6991517486821333,
"grad_norm": 0.15987361850664245,
"learning_rate": 8.440826784946972e-06,
"loss": 0.6871,
"step": 3266
},
{
"epoch": 0.6993658184153488,
"grad_norm": 0.1508799694115644,
"learning_rate": 8.429740582170094e-06,
"loss": 0.6829,
"step": 3267
},
{
"epoch": 0.6995798881485644,
"grad_norm": 0.1556035125570702,
"learning_rate": 8.418659720203777e-06,
"loss": 0.6947,
"step": 3268
},
{
"epoch": 0.69979395788178,
"grad_norm": 0.15935844557397552,
"learning_rate": 8.407584204162933e-06,
"loss": 0.6948,
"step": 3269
},
{
"epoch": 0.7000080276149956,
"grad_norm": 0.15164055886152145,
"learning_rate": 8.396514039160007e-06,
"loss": 0.7085,
"step": 3270
},
{
"epoch": 0.7002220973482112,
"grad_norm": 0.15745134574952785,
"learning_rate": 8.38544923030495e-06,
"loss": 0.6901,
"step": 3271
},
{
"epoch": 0.7004361670814268,
"grad_norm": 0.16747816355778503,
"learning_rate": 8.374389782705276e-06,
"loss": 0.7361,
"step": 3272
},
{
"epoch": 0.7006502368146423,
"grad_norm": 0.14676182178891728,
"learning_rate": 8.363335701465989e-06,
"loss": 0.696,
"step": 3273
},
{
"epoch": 0.7008643065478579,
"grad_norm": 0.16553099968837984,
"learning_rate": 8.352286991689642e-06,
"loss": 0.6989,
"step": 3274
},
{
"epoch": 0.7010783762810736,
"grad_norm": 0.15052109228717409,
"learning_rate": 8.341243658476303e-06,
"loss": 0.6999,
"step": 3275
},
{
"epoch": 0.7012924460142892,
"grad_norm": 0.1480200012041235,
"learning_rate": 8.330205706923543e-06,
"loss": 0.6853,
"step": 3276
},
{
"epoch": 0.7015065157475048,
"grad_norm": 0.15865158359551787,
"learning_rate": 8.319173142126473e-06,
"loss": 0.682,
"step": 3277
},
{
"epoch": 0.7017205854807204,
"grad_norm": 0.1590859019280151,
"learning_rate": 8.30814596917769e-06,
"loss": 0.7098,
"step": 3278
},
{
"epoch": 0.7019346552139359,
"grad_norm": 0.152983472158898,
"learning_rate": 8.297124193167325e-06,
"loss": 0.7197,
"step": 3279
},
{
"epoch": 0.7021487249471515,
"grad_norm": 0.1586510475568293,
"learning_rate": 8.286107819183018e-06,
"loss": 0.7014,
"step": 3280
},
{
"epoch": 0.7023627946803671,
"grad_norm": 0.15122253761799054,
"learning_rate": 8.27509685230989e-06,
"loss": 0.7142,
"step": 3281
},
{
"epoch": 0.7025768644135827,
"grad_norm": 0.1548462353061408,
"learning_rate": 8.264091297630601e-06,
"loss": 0.6958,
"step": 3282
},
{
"epoch": 0.7027909341467983,
"grad_norm": 0.15058402413909766,
"learning_rate": 8.253091160225275e-06,
"loss": 0.6909,
"step": 3283
},
{
"epoch": 0.703005003880014,
"grad_norm": 0.15312143235504205,
"learning_rate": 8.242096445171568e-06,
"loss": 0.664,
"step": 3284
},
{
"epoch": 0.7032190736132296,
"grad_norm": 0.16032369384110995,
"learning_rate": 8.231107157544627e-06,
"loss": 0.7078,
"step": 3285
},
{
"epoch": 0.7034331433464451,
"grad_norm": 0.16338575824528243,
"learning_rate": 8.220123302417068e-06,
"loss": 0.685,
"step": 3286
},
{
"epoch": 0.7036472130796607,
"grad_norm": 0.15133135918381588,
"learning_rate": 8.209144884859038e-06,
"loss": 0.6944,
"step": 3287
},
{
"epoch": 0.7038612828128763,
"grad_norm": 0.16186220984160274,
"learning_rate": 8.198171909938135e-06,
"loss": 0.6995,
"step": 3288
},
{
"epoch": 0.7040753525460919,
"grad_norm": 0.16182674921422807,
"learning_rate": 8.187204382719485e-06,
"loss": 0.701,
"step": 3289
},
{
"epoch": 0.7042894222793075,
"grad_norm": 0.15291505984125645,
"learning_rate": 8.176242308265659e-06,
"loss": 0.6945,
"step": 3290
},
{
"epoch": 0.704503492012523,
"grad_norm": 0.15712566845642592,
"learning_rate": 8.16528569163674e-06,
"loss": 0.7011,
"step": 3291
},
{
"epoch": 0.7047175617457386,
"grad_norm": 0.1533780516375367,
"learning_rate": 8.154334537890288e-06,
"loss": 0.7048,
"step": 3292
},
{
"epoch": 0.7049316314789543,
"grad_norm": 0.15025676200541188,
"learning_rate": 8.143388852081319e-06,
"loss": 0.6932,
"step": 3293
},
{
"epoch": 0.7051457012121699,
"grad_norm": 0.15997870935165437,
"learning_rate": 8.132448639262362e-06,
"loss": 0.682,
"step": 3294
},
{
"epoch": 0.7053597709453855,
"grad_norm": 0.15125068583237963,
"learning_rate": 8.121513904483383e-06,
"loss": 0.6946,
"step": 3295
},
{
"epoch": 0.7055738406786011,
"grad_norm": 0.15393005915538988,
"learning_rate": 8.110584652791837e-06,
"loss": 0.6878,
"step": 3296
},
{
"epoch": 0.7057879104118167,
"grad_norm": 0.16056953788845163,
"learning_rate": 8.099660889232661e-06,
"loss": 0.7207,
"step": 3297
},
{
"epoch": 0.7060019801450322,
"grad_norm": 0.15322709592882022,
"learning_rate": 8.088742618848227e-06,
"loss": 0.6877,
"step": 3298
},
{
"epoch": 0.7062160498782478,
"grad_norm": 0.20836632658402116,
"learning_rate": 8.077829846678401e-06,
"loss": 0.7085,
"step": 3299
},
{
"epoch": 0.7064301196114634,
"grad_norm": 0.23459825862196712,
"learning_rate": 8.066922577760488e-06,
"loss": 0.7036,
"step": 3300
},
{
"epoch": 0.706644189344679,
"grad_norm": 0.16557335826929756,
"learning_rate": 8.056020817129269e-06,
"loss": 0.7171,
"step": 3301
},
{
"epoch": 0.7068582590778947,
"grad_norm": 0.15962824680752044,
"learning_rate": 8.045124569816983e-06,
"loss": 0.6942,
"step": 3302
},
{
"epoch": 0.7070723288111103,
"grad_norm": 0.1562427851842794,
"learning_rate": 8.034233840853304e-06,
"loss": 0.6977,
"step": 3303
},
{
"epoch": 0.7072863985443258,
"grad_norm": 0.15336135090779646,
"learning_rate": 8.023348635265377e-06,
"loss": 0.6992,
"step": 3304
},
{
"epoch": 0.7075004682775414,
"grad_norm": 0.1559817613170699,
"learning_rate": 8.012468958077805e-06,
"loss": 0.6823,
"step": 3305
},
{
"epoch": 0.707714538010757,
"grad_norm": 0.151487855424781,
"learning_rate": 8.001594814312612e-06,
"loss": 0.6633,
"step": 3306
},
{
"epoch": 0.7079286077439726,
"grad_norm": 0.15145904827595,
"learning_rate": 7.990726208989289e-06,
"loss": 0.7021,
"step": 3307
},
{
"epoch": 0.7081426774771882,
"grad_norm": 0.15019125814155232,
"learning_rate": 7.979863147124771e-06,
"loss": 0.6683,
"step": 3308
},
{
"epoch": 0.7083567472104038,
"grad_norm": 0.16162331545892966,
"learning_rate": 7.969005633733412e-06,
"loss": 0.7502,
"step": 3309
},
{
"epoch": 0.7085708169436193,
"grad_norm": 0.15798457096325713,
"learning_rate": 7.95815367382703e-06,
"loss": 0.7138,
"step": 3310
},
{
"epoch": 0.7087848866768349,
"grad_norm": 0.15290488841322952,
"learning_rate": 7.947307272414874e-06,
"loss": 0.679,
"step": 3311
},
{
"epoch": 0.7089989564100506,
"grad_norm": 0.15531253190558253,
"learning_rate": 7.936466434503614e-06,
"loss": 0.681,
"step": 3312
},
{
"epoch": 0.7092130261432662,
"grad_norm": 0.15074385446016486,
"learning_rate": 7.925631165097362e-06,
"loss": 0.6814,
"step": 3313
},
{
"epoch": 0.7094270958764818,
"grad_norm": 0.16414183475105307,
"learning_rate": 7.914801469197669e-06,
"loss": 0.6879,
"step": 3314
},
{
"epoch": 0.7096411656096974,
"grad_norm": 0.15444895914184442,
"learning_rate": 7.903977351803488e-06,
"loss": 0.6813,
"step": 3315
},
{
"epoch": 0.709855235342913,
"grad_norm": 0.15600202300130273,
"learning_rate": 7.893158817911225e-06,
"loss": 0.6943,
"step": 3316
},
{
"epoch": 0.7100693050761285,
"grad_norm": 0.29906092755354347,
"learning_rate": 7.882345872514682e-06,
"loss": 0.7171,
"step": 3317
},
{
"epoch": 0.7102833748093441,
"grad_norm": 0.15706329338725833,
"learning_rate": 7.871538520605104e-06,
"loss": 0.7027,
"step": 3318
},
{
"epoch": 0.7104974445425597,
"grad_norm": 0.15247037005381606,
"learning_rate": 7.860736767171148e-06,
"loss": 0.6959,
"step": 3319
},
{
"epoch": 0.7107115142757753,
"grad_norm": 0.16360515419120714,
"learning_rate": 7.849940617198872e-06,
"loss": 0.7192,
"step": 3320
},
{
"epoch": 0.710925584008991,
"grad_norm": 0.1494733107927237,
"learning_rate": 7.839150075671766e-06,
"loss": 0.7096,
"step": 3321
},
{
"epoch": 0.7111396537422066,
"grad_norm": 0.15651951456030722,
"learning_rate": 7.828365147570731e-06,
"loss": 0.691,
"step": 3322
},
{
"epoch": 0.7113537234754221,
"grad_norm": 0.16220546679217188,
"learning_rate": 7.817585837874055e-06,
"loss": 0.6959,
"step": 3323
},
{
"epoch": 0.7115677932086377,
"grad_norm": 0.14801509523348158,
"learning_rate": 7.806812151557463e-06,
"loss": 0.6822,
"step": 3324
},
{
"epoch": 0.7117818629418533,
"grad_norm": 0.16681944991433031,
"learning_rate": 7.796044093594056e-06,
"loss": 0.7127,
"step": 3325
},
{
"epoch": 0.7119959326750689,
"grad_norm": 0.15712317398624448,
"learning_rate": 7.785281668954353e-06,
"loss": 0.691,
"step": 3326
},
{
"epoch": 0.7122100024082845,
"grad_norm": 0.15789069431373431,
"learning_rate": 7.774524882606278e-06,
"loss": 0.7135,
"step": 3327
},
{
"epoch": 0.7124240721415,
"grad_norm": 0.16442824904434017,
"learning_rate": 7.76377373951513e-06,
"loss": 0.6983,
"step": 3328
},
{
"epoch": 0.7126381418747156,
"grad_norm": 0.16359429737990552,
"learning_rate": 7.753028244643634e-06,
"loss": 0.6985,
"step": 3329
},
{
"epoch": 0.7128522116079313,
"grad_norm": 0.16502710970219736,
"learning_rate": 7.742288402951875e-06,
"loss": 0.6842,
"step": 3330
},
{
"epoch": 0.7130662813411469,
"grad_norm": 0.16350182802163685,
"learning_rate": 7.731554219397354e-06,
"loss": 0.7213,
"step": 3331
},
{
"epoch": 0.7132803510743625,
"grad_norm": 0.15803065686106776,
"learning_rate": 7.720825698934941e-06,
"loss": 0.6936,
"step": 3332
},
{
"epoch": 0.7134944208075781,
"grad_norm": 0.16427002342058883,
"learning_rate": 7.710102846516909e-06,
"loss": 0.7221,
"step": 3333
},
{
"epoch": 0.7137084905407937,
"grad_norm": 0.1507168336640026,
"learning_rate": 7.699385667092914e-06,
"loss": 0.681,
"step": 3334
},
{
"epoch": 0.7139225602740092,
"grad_norm": 0.15888700028778466,
"learning_rate": 7.688674165609968e-06,
"loss": 0.6694,
"step": 3335
},
{
"epoch": 0.7141366300072248,
"grad_norm": 0.16434463130603383,
"learning_rate": 7.6779683470125e-06,
"loss": 0.6848,
"step": 3336
},
{
"epoch": 0.7143506997404404,
"grad_norm": 0.14962235689266584,
"learning_rate": 7.667268216242276e-06,
"loss": 0.6797,
"step": 3337
},
{
"epoch": 0.714564769473656,
"grad_norm": 0.2539172984903302,
"learning_rate": 7.65657377823847e-06,
"loss": 0.6945,
"step": 3338
},
{
"epoch": 0.7147788392068717,
"grad_norm": 0.15894632150544735,
"learning_rate": 7.645885037937618e-06,
"loss": 0.7146,
"step": 3339
},
{
"epoch": 0.7149929089400873,
"grad_norm": 0.15303574716754773,
"learning_rate": 7.635202000273612e-06,
"loss": 0.6851,
"step": 3340
},
{
"epoch": 0.7152069786733029,
"grad_norm": 0.17956860287237303,
"learning_rate": 7.624524670177733e-06,
"loss": 0.6893,
"step": 3341
},
{
"epoch": 0.7154210484065184,
"grad_norm": 0.16486662999374005,
"learning_rate": 7.613853052578606e-06,
"loss": 0.6997,
"step": 3342
},
{
"epoch": 0.715635118139734,
"grad_norm": 0.15758835418243866,
"learning_rate": 7.603187152402236e-06,
"loss": 0.6888,
"step": 3343
},
{
"epoch": 0.7158491878729496,
"grad_norm": 0.15339903183700115,
"learning_rate": 7.592526974571992e-06,
"loss": 0.6829,
"step": 3344
},
{
"epoch": 0.7160632576061652,
"grad_norm": 0.7017295495675703,
"learning_rate": 7.581872524008574e-06,
"loss": 0.7461,
"step": 3345
},
{
"epoch": 0.7162773273393808,
"grad_norm": 0.14876238787415644,
"learning_rate": 7.571223805630074e-06,
"loss": 0.6823,
"step": 3346
},
{
"epoch": 0.7164913970725963,
"grad_norm": 0.1526503072162832,
"learning_rate": 7.560580824351908e-06,
"loss": 0.672,
"step": 3347
},
{
"epoch": 0.716705466805812,
"grad_norm": 0.15917182905910648,
"learning_rate": 7.549943585086863e-06,
"loss": 0.691,
"step": 3348
},
{
"epoch": 0.7169195365390276,
"grad_norm": 0.16069115875788337,
"learning_rate": 7.539312092745072e-06,
"loss": 0.6967,
"step": 3349
},
{
"epoch": 0.7171336062722432,
"grad_norm": 0.16353040198864685,
"learning_rate": 7.528686352234005e-06,
"loss": 0.6717,
"step": 3350
},
{
"epoch": 0.7173476760054588,
"grad_norm": 0.16357354033857646,
"learning_rate": 7.518066368458494e-06,
"loss": 0.6989,
"step": 3351
},
{
"epoch": 0.7175617457386744,
"grad_norm": 0.1535502885079072,
"learning_rate": 7.5074521463206904e-06,
"loss": 0.6872,
"step": 3352
},
{
"epoch": 0.71777581547189,
"grad_norm": 0.15944656493142786,
"learning_rate": 7.49684369072011e-06,
"loss": 0.6963,
"step": 3353
},
{
"epoch": 0.7179898852051055,
"grad_norm": 0.15582152848457353,
"learning_rate": 7.486241006553598e-06,
"loss": 0.7141,
"step": 3354
},
{
"epoch": 0.7182039549383211,
"grad_norm": 0.15180057158042523,
"learning_rate": 7.475644098715324e-06,
"loss": 0.7161,
"step": 3355
},
{
"epoch": 0.7184180246715367,
"grad_norm": 0.15303103099972895,
"learning_rate": 7.465052972096816e-06,
"loss": 0.6799,
"step": 3356
},
{
"epoch": 0.7186320944047524,
"grad_norm": 0.14673479453292337,
"learning_rate": 7.454467631586901e-06,
"loss": 0.7051,
"step": 3357
},
{
"epoch": 0.718846164137968,
"grad_norm": 0.1599869420272919,
"learning_rate": 7.443888082071764e-06,
"loss": 0.7064,
"step": 3358
},
{
"epoch": 0.7190602338711836,
"grad_norm": 0.15461538319716817,
"learning_rate": 7.433314328434908e-06,
"loss": 0.7072,
"step": 3359
},
{
"epoch": 0.7192743036043991,
"grad_norm": 0.15386624927594827,
"learning_rate": 7.422746375557148e-06,
"loss": 0.6646,
"step": 3360
},
{
"epoch": 0.7194883733376147,
"grad_norm": 0.1545566190984167,
"learning_rate": 7.412184228316644e-06,
"loss": 0.7063,
"step": 3361
},
{
"epoch": 0.7197024430708303,
"grad_norm": 0.15626497527671113,
"learning_rate": 7.40162789158885e-06,
"loss": 0.7081,
"step": 3362
},
{
"epoch": 0.7199165128040459,
"grad_norm": 0.15450757007304494,
"learning_rate": 7.3910773702465596e-06,
"loss": 0.7157,
"step": 3363
},
{
"epoch": 0.7201305825372615,
"grad_norm": 0.16973833457855797,
"learning_rate": 7.380532669159881e-06,
"loss": 0.6915,
"step": 3364
},
{
"epoch": 0.7203446522704771,
"grad_norm": 0.15458051922297733,
"learning_rate": 7.369993793196213e-06,
"loss": 0.731,
"step": 3365
},
{
"epoch": 0.7205587220036928,
"grad_norm": 0.14870759179833373,
"learning_rate": 7.359460747220298e-06,
"loss": 0.6992,
"step": 3366
},
{
"epoch": 0.7207727917369083,
"grad_norm": 0.1505420260072728,
"learning_rate": 7.348933536094156e-06,
"loss": 0.6831,
"step": 3367
},
{
"epoch": 0.7209868614701239,
"grad_norm": 0.15286444108432226,
"learning_rate": 7.338412164677133e-06,
"loss": 0.7078,
"step": 3368
},
{
"epoch": 0.7212009312033395,
"grad_norm": 0.14654828021530483,
"learning_rate": 7.327896637825886e-06,
"loss": 0.715,
"step": 3369
},
{
"epoch": 0.7214150009365551,
"grad_norm": 0.1519082171866383,
"learning_rate": 7.317386960394346e-06,
"loss": 0.691,
"step": 3370
},
{
"epoch": 0.7216290706697707,
"grad_norm": 0.1533500100122846,
"learning_rate": 7.306883137233776e-06,
"loss": 0.703,
"step": 3371
},
{
"epoch": 0.7218431404029862,
"grad_norm": 0.15430001585354824,
"learning_rate": 7.296385173192708e-06,
"loss": 0.6862,
"step": 3372
},
{
"epoch": 0.7220572101362018,
"grad_norm": 0.14863032108344576,
"learning_rate": 7.2858930731169945e-06,
"loss": 0.6909,
"step": 3373
},
{
"epoch": 0.7222712798694174,
"grad_norm": 0.1637095261720954,
"learning_rate": 7.275406841849757e-06,
"loss": 0.6923,
"step": 3374
},
{
"epoch": 0.7224853496026331,
"grad_norm": 0.1503527045643267,
"learning_rate": 7.264926484231429e-06,
"loss": 0.6571,
"step": 3375
},
{
"epoch": 0.7226994193358487,
"grad_norm": 0.1500767315268531,
"learning_rate": 7.2544520050997305e-06,
"loss": 0.6934,
"step": 3376
},
{
"epoch": 0.7229134890690643,
"grad_norm": 0.15289633280915021,
"learning_rate": 7.243983409289648e-06,
"loss": 0.6921,
"step": 3377
},
{
"epoch": 0.7231275588022799,
"grad_norm": 0.1563676658019505,
"learning_rate": 7.233520701633479e-06,
"loss": 0.7074,
"step": 3378
},
{
"epoch": 0.7233416285354954,
"grad_norm": 0.14779967180706038,
"learning_rate": 7.223063886960779e-06,
"loss": 0.7217,
"step": 3379
},
{
"epoch": 0.723555698268711,
"grad_norm": 0.1584082916968249,
"learning_rate": 7.2126129700983986e-06,
"loss": 0.728,
"step": 3380
},
{
"epoch": 0.7237697680019266,
"grad_norm": 0.15063823251962052,
"learning_rate": 7.20216795587047e-06,
"loss": 0.7113,
"step": 3381
},
{
"epoch": 0.7239838377351422,
"grad_norm": 0.1550126643318111,
"learning_rate": 7.191728849098379e-06,
"loss": 0.6939,
"step": 3382
},
{
"epoch": 0.7241979074683578,
"grad_norm": 0.15214817709964662,
"learning_rate": 7.1812956546008105e-06,
"loss": 0.7081,
"step": 3383
},
{
"epoch": 0.7244119772015735,
"grad_norm": 0.14938195835100643,
"learning_rate": 7.170868377193696e-06,
"loss": 0.6981,
"step": 3384
},
{
"epoch": 0.724626046934789,
"grad_norm": 0.15038555055654673,
"learning_rate": 7.160447021690253e-06,
"loss": 0.7076,
"step": 3385
},
{
"epoch": 0.7248401166680046,
"grad_norm": 0.15309372635862914,
"learning_rate": 7.150031592900968e-06,
"loss": 0.6889,
"step": 3386
},
{
"epoch": 0.7250541864012202,
"grad_norm": 0.1524824433175752,
"learning_rate": 7.139622095633572e-06,
"loss": 0.7322,
"step": 3387
},
{
"epoch": 0.7252682561344358,
"grad_norm": 0.1599372986538526,
"learning_rate": 7.1292185346930745e-06,
"loss": 0.7222,
"step": 3388
},
{
"epoch": 0.7254823258676514,
"grad_norm": 0.15133341854378823,
"learning_rate": 7.118820914881746e-06,
"loss": 0.6981,
"step": 3389
},
{
"epoch": 0.725696395600867,
"grad_norm": 0.1446864509483478,
"learning_rate": 7.108429240999097e-06,
"loss": 0.683,
"step": 3390
},
{
"epoch": 0.7259104653340825,
"grad_norm": 0.14883310883327594,
"learning_rate": 7.098043517841911e-06,
"loss": 0.6818,
"step": 3391
},
{
"epoch": 0.7261245350672981,
"grad_norm": 0.1555072200378894,
"learning_rate": 7.0876637502042255e-06,
"loss": 0.7017,
"step": 3392
},
{
"epoch": 0.7263386048005138,
"grad_norm": 0.1536401070649317,
"learning_rate": 7.07728994287731e-06,
"loss": 0.7172,
"step": 3393
},
{
"epoch": 0.7265526745337294,
"grad_norm": 0.14689587361987888,
"learning_rate": 7.066922100649702e-06,
"loss": 0.6965,
"step": 3394
},
{
"epoch": 0.726766744266945,
"grad_norm": 0.15310481914422255,
"learning_rate": 7.056560228307183e-06,
"loss": 0.7084,
"step": 3395
},
{
"epoch": 0.7269808140001606,
"grad_norm": 0.15197732025207406,
"learning_rate": 7.046204330632762e-06,
"loss": 0.6819,
"step": 3396
},
{
"epoch": 0.7271948837333762,
"grad_norm": 0.14805906907025784,
"learning_rate": 7.035854412406709e-06,
"loss": 0.6983,
"step": 3397
},
{
"epoch": 0.7274089534665917,
"grad_norm": 0.1483724804144164,
"learning_rate": 7.025510478406534e-06,
"loss": 0.695,
"step": 3398
},
{
"epoch": 0.7276230231998073,
"grad_norm": 0.15873187132890057,
"learning_rate": 7.015172533406964e-06,
"loss": 0.6991,
"step": 3399
},
{
"epoch": 0.7278370929330229,
"grad_norm": 0.14800913953356853,
"learning_rate": 7.0048405821799855e-06,
"loss": 0.724,
"step": 3400
},
{
"epoch": 0.7280511626662385,
"grad_norm": 0.15134973949863306,
"learning_rate": 6.9945146294948105e-06,
"loss": 0.6858,
"step": 3401
},
{
"epoch": 0.7282652323994542,
"grad_norm": 0.15248909677906117,
"learning_rate": 6.984194680117868e-06,
"loss": 0.7221,
"step": 3402
},
{
"epoch": 0.7284793021326698,
"grad_norm": 0.15342124727909373,
"learning_rate": 6.973880738812844e-06,
"loss": 0.7029,
"step": 3403
},
{
"epoch": 0.7286933718658853,
"grad_norm": 0.14576049701522734,
"learning_rate": 6.963572810340616e-06,
"loss": 0.7224,
"step": 3404
},
{
"epoch": 0.7289074415991009,
"grad_norm": 0.1498282622939985,
"learning_rate": 6.953270899459317e-06,
"loss": 0.6969,
"step": 3405
},
{
"epoch": 0.7291215113323165,
"grad_norm": 0.1512993692592409,
"learning_rate": 6.942975010924291e-06,
"loss": 0.7149,
"step": 3406
},
{
"epoch": 0.7293355810655321,
"grad_norm": 0.16659082919254012,
"learning_rate": 6.932685149488094e-06,
"loss": 0.6801,
"step": 3407
},
{
"epoch": 0.7295496507987477,
"grad_norm": 0.147521830530763,
"learning_rate": 6.922401319900518e-06,
"loss": 0.7229,
"step": 3408
},
{
"epoch": 0.7297637205319633,
"grad_norm": 0.24749044889111424,
"learning_rate": 6.912123526908547e-06,
"loss": 0.7052,
"step": 3409
},
{
"epoch": 0.7299777902651788,
"grad_norm": 0.15457958137989353,
"learning_rate": 6.901851775256396e-06,
"loss": 0.7045,
"step": 3410
},
{
"epoch": 0.7301918599983945,
"grad_norm": 0.1460763916119327,
"learning_rate": 6.8915860696854965e-06,
"loss": 0.7014,
"step": 3411
},
{
"epoch": 0.7304059297316101,
"grad_norm": 0.148052040371279,
"learning_rate": 6.881326414934464e-06,
"loss": 0.6878,
"step": 3412
},
{
"epoch": 0.7306199994648257,
"grad_norm": 0.14774034247572365,
"learning_rate": 6.87107281573915e-06,
"loss": 0.6603,
"step": 3413
},
{
"epoch": 0.7308340691980413,
"grad_norm": 0.14849805484254816,
"learning_rate": 6.860825276832585e-06,
"loss": 0.6801,
"step": 3414
},
{
"epoch": 0.7310481389312569,
"grad_norm": 0.17338069488266583,
"learning_rate": 6.8505838029450275e-06,
"loss": 0.688,
"step": 3415
},
{
"epoch": 0.7312622086644724,
"grad_norm": 0.1538822474317705,
"learning_rate": 6.840348398803906e-06,
"loss": 0.7164,
"step": 3416
},
{
"epoch": 0.731476278397688,
"grad_norm": 0.15209063029325054,
"learning_rate": 6.830119069133878e-06,
"loss": 0.7129,
"step": 3417
},
{
"epoch": 0.7316903481309036,
"grad_norm": 0.15285611898125917,
"learning_rate": 6.819895818656783e-06,
"loss": 0.7178,
"step": 3418
},
{
"epoch": 0.7319044178641192,
"grad_norm": 0.15493790996850904,
"learning_rate": 6.809678652091645e-06,
"loss": 0.6951,
"step": 3419
},
{
"epoch": 0.7321184875973348,
"grad_norm": 0.1433556319415999,
"learning_rate": 6.7994675741547014e-06,
"loss": 0.677,
"step": 3420
},
{
"epoch": 0.7323325573305505,
"grad_norm": 0.1477034357517413,
"learning_rate": 6.789262589559355e-06,
"loss": 0.6864,
"step": 3421
},
{
"epoch": 0.732546627063766,
"grad_norm": 0.15183324778132398,
"learning_rate": 6.779063703016216e-06,
"loss": 0.683,
"step": 3422
},
{
"epoch": 0.7327606967969816,
"grad_norm": 0.15255717181585005,
"learning_rate": 6.768870919233073e-06,
"loss": 0.6892,
"step": 3423
},
{
"epoch": 0.7329747665301972,
"grad_norm": 0.14965734972611663,
"learning_rate": 6.758684242914888e-06,
"loss": 0.6942,
"step": 3424
},
{
"epoch": 0.7331888362634128,
"grad_norm": 0.15405644325158754,
"learning_rate": 6.7485036787638245e-06,
"loss": 0.7072,
"step": 3425
},
{
"epoch": 0.7334029059966284,
"grad_norm": 0.15227504324107274,
"learning_rate": 6.738329231479197e-06,
"loss": 0.7054,
"step": 3426
},
{
"epoch": 0.733616975729844,
"grad_norm": 0.14675505260563773,
"learning_rate": 6.728160905757521e-06,
"loss": 0.6963,
"step": 3427
},
{
"epoch": 0.7338310454630596,
"grad_norm": 0.14652893843082374,
"learning_rate": 6.717998706292481e-06,
"loss": 0.7229,
"step": 3428
},
{
"epoch": 0.7340451151962751,
"grad_norm": 0.1509029876280325,
"learning_rate": 6.70784263777492e-06,
"loss": 0.703,
"step": 3429
},
{
"epoch": 0.7342591849294908,
"grad_norm": 0.15094441942247902,
"learning_rate": 6.697692704892871e-06,
"loss": 0.7041,
"step": 3430
},
{
"epoch": 0.7344732546627064,
"grad_norm": 0.1506906511359724,
"learning_rate": 6.687548912331512e-06,
"loss": 0.7032,
"step": 3431
},
{
"epoch": 0.734687324395922,
"grad_norm": 0.1505700428178517,
"learning_rate": 6.677411264773204e-06,
"loss": 0.7044,
"step": 3432
},
{
"epoch": 0.7349013941291376,
"grad_norm": 0.1562400944894484,
"learning_rate": 6.6672797668974765e-06,
"loss": 0.6775,
"step": 3433
},
{
"epoch": 0.7351154638623532,
"grad_norm": 0.15451972419948504,
"learning_rate": 6.657154423380996e-06,
"loss": 0.6834,
"step": 3434
},
{
"epoch": 0.7353295335955687,
"grad_norm": 0.14973106909517378,
"learning_rate": 6.6470352388976146e-06,
"loss": 0.6923,
"step": 3435
},
{
"epoch": 0.7355436033287843,
"grad_norm": 0.1505642357852099,
"learning_rate": 6.636922218118316e-06,
"loss": 0.691,
"step": 3436
},
{
"epoch": 0.7357576730619999,
"grad_norm": 0.1513512979605357,
"learning_rate": 6.626815365711259e-06,
"loss": 0.6969,
"step": 3437
},
{
"epoch": 0.7359717427952155,
"grad_norm": 0.15932898538625465,
"learning_rate": 6.6167146863417564e-06,
"loss": 0.6706,
"step": 3438
},
{
"epoch": 0.7361858125284312,
"grad_norm": 0.1461255327804242,
"learning_rate": 6.60662018467225e-06,
"loss": 0.6555,
"step": 3439
},
{
"epoch": 0.7363998822616468,
"grad_norm": 0.15223104873445534,
"learning_rate": 6.596531865362354e-06,
"loss": 0.7068,
"step": 3440
},
{
"epoch": 0.7366139519948623,
"grad_norm": 0.15299475056670553,
"learning_rate": 6.5864497330688045e-06,
"loss": 0.6863,
"step": 3441
},
{
"epoch": 0.7368280217280779,
"grad_norm": 0.14539464281086412,
"learning_rate": 6.576373792445507e-06,
"loss": 0.7074,
"step": 3442
},
{
"epoch": 0.7370420914612935,
"grad_norm": 0.15091681358692513,
"learning_rate": 6.566304048143499e-06,
"loss": 0.6906,
"step": 3443
},
{
"epoch": 0.7372561611945091,
"grad_norm": 0.15680469206084852,
"learning_rate": 6.556240504810945e-06,
"loss": 0.7087,
"step": 3444
},
{
"epoch": 0.7374702309277247,
"grad_norm": 0.15354007935878433,
"learning_rate": 6.54618316709317e-06,
"loss": 0.6972,
"step": 3445
},
{
"epoch": 0.7376843006609403,
"grad_norm": 0.15215215918153305,
"learning_rate": 6.53613203963261e-06,
"loss": 0.7038,
"step": 3446
},
{
"epoch": 0.7378983703941558,
"grad_norm": 0.14887076958114878,
"learning_rate": 6.526087127068857e-06,
"loss": 0.7332,
"step": 3447
},
{
"epoch": 0.7381124401273715,
"grad_norm": 0.15696229039598944,
"learning_rate": 6.516048434038624e-06,
"loss": 0.6826,
"step": 3448
},
{
"epoch": 0.7383265098605871,
"grad_norm": 0.154293062938764,
"learning_rate": 6.506015965175745e-06,
"loss": 0.6952,
"step": 3449
},
{
"epoch": 0.7385405795938027,
"grad_norm": 0.14669425585749374,
"learning_rate": 6.495989725111203e-06,
"loss": 0.6866,
"step": 3450
},
{
"epoch": 0.7387546493270183,
"grad_norm": 0.1542738165770645,
"learning_rate": 6.485969718473075e-06,
"loss": 0.7225,
"step": 3451
},
{
"epoch": 0.7389687190602339,
"grad_norm": 0.15307150042183137,
"learning_rate": 6.475955949886587e-06,
"loss": 0.6793,
"step": 3452
},
{
"epoch": 0.7391827887934495,
"grad_norm": 0.1483389967735417,
"learning_rate": 6.465948423974085e-06,
"loss": 0.7074,
"step": 3453
},
{
"epoch": 0.739396858526665,
"grad_norm": 0.14907186217462975,
"learning_rate": 6.455947145355006e-06,
"loss": 0.7193,
"step": 3454
},
{
"epoch": 0.7396109282598806,
"grad_norm": 0.18214754122585616,
"learning_rate": 6.445952118645937e-06,
"loss": 0.6676,
"step": 3455
},
{
"epoch": 0.7398249979930962,
"grad_norm": 0.1482875992962516,
"learning_rate": 6.435963348460554e-06,
"loss": 0.6898,
"step": 3456
},
{
"epoch": 0.7400390677263119,
"grad_norm": 0.1427804193861355,
"learning_rate": 6.4259808394096645e-06,
"loss": 0.6947,
"step": 3457
},
{
"epoch": 0.7402531374595275,
"grad_norm": 0.1518872142755112,
"learning_rate": 6.4160045961011664e-06,
"loss": 0.6959,
"step": 3458
},
{
"epoch": 0.7404672071927431,
"grad_norm": 0.15155101550663358,
"learning_rate": 6.406034623140078e-06,
"loss": 0.7016,
"step": 3459
},
{
"epoch": 0.7406812769259586,
"grad_norm": 0.1517575123736666,
"learning_rate": 6.396070925128532e-06,
"loss": 0.6925,
"step": 3460
},
{
"epoch": 0.7408953466591742,
"grad_norm": 0.14763084958866032,
"learning_rate": 6.386113506665737e-06,
"loss": 0.6997,
"step": 3461
},
{
"epoch": 0.7411094163923898,
"grad_norm": 0.15559156433787158,
"learning_rate": 6.376162372348032e-06,
"loss": 0.6639,
"step": 3462
},
{
"epoch": 0.7413234861256054,
"grad_norm": 0.15121447903508906,
"learning_rate": 6.36621752676883e-06,
"loss": 0.701,
"step": 3463
},
{
"epoch": 0.741537555858821,
"grad_norm": 0.15143520224027426,
"learning_rate": 6.356278974518659e-06,
"loss": 0.6859,
"step": 3464
},
{
"epoch": 0.7417516255920366,
"grad_norm": 0.15200496557556217,
"learning_rate": 6.346346720185146e-06,
"loss": 0.6891,
"step": 3465
},
{
"epoch": 0.7419656953252523,
"grad_norm": 0.15178547620092442,
"learning_rate": 6.336420768352984e-06,
"loss": 0.7108,
"step": 3466
},
{
"epoch": 0.7421797650584678,
"grad_norm": 0.1455060733596948,
"learning_rate": 6.326501123603986e-06,
"loss": 0.6763,
"step": 3467
},
{
"epoch": 0.7423938347916834,
"grad_norm": 0.15051287681302894,
"learning_rate": 6.316587790517044e-06,
"loss": 0.7349,
"step": 3468
},
{
"epoch": 0.742607904524899,
"grad_norm": 0.14343932085981198,
"learning_rate": 6.3066807736681215e-06,
"loss": 0.6908,
"step": 3469
},
{
"epoch": 0.7428219742581146,
"grad_norm": 0.1503341811816976,
"learning_rate": 6.296780077630289e-06,
"loss": 0.6822,
"step": 3470
},
{
"epoch": 0.7430360439913302,
"grad_norm": 0.14770233398877097,
"learning_rate": 6.2868857069736935e-06,
"loss": 0.6986,
"step": 3471
},
{
"epoch": 0.7432501137245457,
"grad_norm": 0.14953962426904005,
"learning_rate": 6.276997666265547e-06,
"loss": 0.6895,
"step": 3472
},
{
"epoch": 0.7434641834577613,
"grad_norm": 0.15064435272164417,
"learning_rate": 6.267115960070165e-06,
"loss": 0.7043,
"step": 3473
},
{
"epoch": 0.7436782531909769,
"grad_norm": 0.14900797917823344,
"learning_rate": 6.257240592948908e-06,
"loss": 0.7116,
"step": 3474
},
{
"epoch": 0.7438923229241926,
"grad_norm": 0.14859530069751684,
"learning_rate": 6.247371569460236e-06,
"loss": 0.6833,
"step": 3475
},
{
"epoch": 0.7441063926574082,
"grad_norm": 0.1536351887710474,
"learning_rate": 6.23750889415968e-06,
"loss": 0.6794,
"step": 3476
},
{
"epoch": 0.7443204623906238,
"grad_norm": 0.14901655132083652,
"learning_rate": 6.2276525715998184e-06,
"loss": 0.6881,
"step": 3477
},
{
"epoch": 0.7445345321238394,
"grad_norm": 0.14949396606953977,
"learning_rate": 6.217802606330319e-06,
"loss": 0.698,
"step": 3478
},
{
"epoch": 0.7447486018570549,
"grad_norm": 0.1479567503230999,
"learning_rate": 6.207959002897912e-06,
"loss": 0.6676,
"step": 3479
},
{
"epoch": 0.7449626715902705,
"grad_norm": 0.1461896823252663,
"learning_rate": 6.1981217658463766e-06,
"loss": 0.69,
"step": 3480
},
{
"epoch": 0.7451767413234861,
"grad_norm": 0.14459359084047935,
"learning_rate": 6.188290899716569e-06,
"loss": 0.6888,
"step": 3481
},
{
"epoch": 0.7453908110567017,
"grad_norm": 0.1486442535448886,
"learning_rate": 6.1784664090464045e-06,
"loss": 0.6891,
"step": 3482
},
{
"epoch": 0.7456048807899173,
"grad_norm": 0.15391385653026315,
"learning_rate": 6.168648298370839e-06,
"loss": 0.7018,
"step": 3483
},
{
"epoch": 0.745818950523133,
"grad_norm": 0.13931330424670904,
"learning_rate": 6.1588365722218975e-06,
"loss": 0.6633,
"step": 3484
},
{
"epoch": 0.7460330202563485,
"grad_norm": 0.14574270828205177,
"learning_rate": 6.149031235128667e-06,
"loss": 0.7149,
"step": 3485
},
{
"epoch": 0.7462470899895641,
"grad_norm": 0.14688827931739226,
"learning_rate": 6.139232291617254e-06,
"loss": 0.6902,
"step": 3486
},
{
"epoch": 0.7464611597227797,
"grad_norm": 0.1503196751740802,
"learning_rate": 6.129439746210848e-06,
"loss": 0.7141,
"step": 3487
},
{
"epoch": 0.7466752294559953,
"grad_norm": 0.14990597542397502,
"learning_rate": 6.119653603429659e-06,
"loss": 0.7168,
"step": 3488
},
{
"epoch": 0.7468892991892109,
"grad_norm": 0.14781381138706368,
"learning_rate": 6.109873867790957e-06,
"loss": 0.6865,
"step": 3489
},
{
"epoch": 0.7471033689224265,
"grad_norm": 0.1512508172162291,
"learning_rate": 6.100100543809057e-06,
"loss": 0.6991,
"step": 3490
},
{
"epoch": 0.747317438655642,
"grad_norm": 0.14971574620785155,
"learning_rate": 6.090333635995296e-06,
"loss": 0.7168,
"step": 3491
},
{
"epoch": 0.7475315083888576,
"grad_norm": 0.14662669096817155,
"learning_rate": 6.080573148858071e-06,
"loss": 0.6971,
"step": 3492
},
{
"epoch": 0.7477455781220733,
"grad_norm": 0.1477528433065089,
"learning_rate": 6.070819086902795e-06,
"loss": 0.6814,
"step": 3493
},
{
"epoch": 0.7479596478552889,
"grad_norm": 0.14861693160736386,
"learning_rate": 6.06107145463193e-06,
"loss": 0.6977,
"step": 3494
},
{
"epoch": 0.7481737175885045,
"grad_norm": 0.15123908792964869,
"learning_rate": 6.051330256544971e-06,
"loss": 0.6637,
"step": 3495
},
{
"epoch": 0.7483877873217201,
"grad_norm": 0.14176940130106536,
"learning_rate": 6.041595497138424e-06,
"loss": 0.704,
"step": 3496
},
{
"epoch": 0.7486018570549356,
"grad_norm": 0.15762788664350239,
"learning_rate": 6.031867180905852e-06,
"loss": 0.7146,
"step": 3497
},
{
"epoch": 0.7488159267881512,
"grad_norm": 0.14694967764663688,
"learning_rate": 6.022145312337812e-06,
"loss": 0.6589,
"step": 3498
},
{
"epoch": 0.7490299965213668,
"grad_norm": 0.14380758565341722,
"learning_rate": 6.0124298959219165e-06,
"loss": 0.6629,
"step": 3499
},
{
"epoch": 0.7492440662545824,
"grad_norm": 0.15062939581345539,
"learning_rate": 6.002720936142767e-06,
"loss": 0.6876,
"step": 3500
},
{
"epoch": 0.749458135987798,
"grad_norm": 0.15310027955477912,
"learning_rate": 5.9930184374820125e-06,
"loss": 0.7018,
"step": 3501
},
{
"epoch": 0.7496722057210137,
"grad_norm": 0.14450896111185574,
"learning_rate": 5.98332240441831e-06,
"loss": 0.6619,
"step": 3502
},
{
"epoch": 0.7498862754542293,
"grad_norm": 0.15373891111271507,
"learning_rate": 5.973632841427324e-06,
"loss": 0.7045,
"step": 3503
},
{
"epoch": 0.7501003451874448,
"grad_norm": 0.15075910232264875,
"learning_rate": 5.963949752981746e-06,
"loss": 0.6976,
"step": 3504
},
{
"epoch": 0.7503144149206604,
"grad_norm": 0.14762413292381302,
"learning_rate": 5.954273143551264e-06,
"loss": 0.676,
"step": 3505
},
{
"epoch": 0.750528484653876,
"grad_norm": 0.1509472898776307,
"learning_rate": 5.944603017602586e-06,
"loss": 0.705,
"step": 3506
},
{
"epoch": 0.7507425543870916,
"grad_norm": 0.1598886546934672,
"learning_rate": 5.934939379599431e-06,
"loss": 0.7103,
"step": 3507
},
{
"epoch": 0.7509566241203072,
"grad_norm": 0.14834173668501452,
"learning_rate": 5.925282234002505e-06,
"loss": 0.6667,
"step": 3508
},
{
"epoch": 0.7511706938535228,
"grad_norm": 0.14871353345344307,
"learning_rate": 5.915631585269543e-06,
"loss": 0.677,
"step": 3509
},
{
"epoch": 0.7513847635867383,
"grad_norm": 0.1455205068481264,
"learning_rate": 5.905987437855252e-06,
"loss": 0.694,
"step": 3510
},
{
"epoch": 0.751598833319954,
"grad_norm": 0.1433319837719959,
"learning_rate": 5.896349796211358e-06,
"loss": 0.6931,
"step": 3511
},
{
"epoch": 0.7518129030531696,
"grad_norm": 0.14993155880692113,
"learning_rate": 5.8867186647865885e-06,
"loss": 0.6669,
"step": 3512
},
{
"epoch": 0.7520269727863852,
"grad_norm": 0.15340387858874688,
"learning_rate": 5.877094048026641e-06,
"loss": 0.6857,
"step": 3513
},
{
"epoch": 0.7522410425196008,
"grad_norm": 0.14781702867549093,
"learning_rate": 5.867475950374233e-06,
"loss": 0.6903,
"step": 3514
},
{
"epoch": 0.7524551122528164,
"grad_norm": 0.15672579984067628,
"learning_rate": 5.857864376269051e-06,
"loss": 0.6975,
"step": 3515
},
{
"epoch": 0.7526691819860319,
"grad_norm": 0.15518680035157378,
"learning_rate": 5.848259330147785e-06,
"loss": 0.7203,
"step": 3516
},
{
"epoch": 0.7528832517192475,
"grad_norm": 0.14822640603276857,
"learning_rate": 5.83866081644411e-06,
"loss": 0.6938,
"step": 3517
},
{
"epoch": 0.7530973214524631,
"grad_norm": 0.15473743140564133,
"learning_rate": 5.829068839588676e-06,
"loss": 0.7144,
"step": 3518
},
{
"epoch": 0.7533113911856787,
"grad_norm": 0.15380435570663145,
"learning_rate": 5.81948340400913e-06,
"loss": 0.6952,
"step": 3519
},
{
"epoch": 0.7535254609188944,
"grad_norm": 0.149614534669644,
"learning_rate": 5.809904514130078e-06,
"loss": 0.6814,
"step": 3520
},
{
"epoch": 0.75373953065211,
"grad_norm": 0.1516527425931899,
"learning_rate": 5.800332174373129e-06,
"loss": 0.6785,
"step": 3521
},
{
"epoch": 0.7539536003853256,
"grad_norm": 0.14958597241174026,
"learning_rate": 5.790766389156859e-06,
"loss": 0.6863,
"step": 3522
},
{
"epoch": 0.7541676701185411,
"grad_norm": 0.1455880171340645,
"learning_rate": 5.781207162896807e-06,
"loss": 0.6779,
"step": 3523
},
{
"epoch": 0.7543817398517567,
"grad_norm": 0.1481194485704188,
"learning_rate": 5.7716545000055056e-06,
"loss": 0.6966,
"step": 3524
},
{
"epoch": 0.7545958095849723,
"grad_norm": 0.1453422681248648,
"learning_rate": 5.762108404892437e-06,
"loss": 0.6788,
"step": 3525
},
{
"epoch": 0.7548098793181879,
"grad_norm": 0.141303964684086,
"learning_rate": 5.752568881964065e-06,
"loss": 0.6647,
"step": 3526
},
{
"epoch": 0.7550239490514035,
"grad_norm": 0.1465381438690067,
"learning_rate": 5.74303593562382e-06,
"loss": 0.7006,
"step": 3527
},
{
"epoch": 0.755238018784619,
"grad_norm": 0.14717689421418012,
"learning_rate": 5.733509570272085e-06,
"loss": 0.706,
"step": 3528
},
{
"epoch": 0.7554520885178346,
"grad_norm": 0.145311171036877,
"learning_rate": 5.7239897903062195e-06,
"loss": 0.685,
"step": 3529
},
{
"epoch": 0.7556661582510503,
"grad_norm": 0.13975117168116696,
"learning_rate": 5.714476600120531e-06,
"loss": 0.6734,
"step": 3530
},
{
"epoch": 0.7558802279842659,
"grad_norm": 0.15204346152954393,
"learning_rate": 5.7049700041062896e-06,
"loss": 0.7228,
"step": 3531
},
{
"epoch": 0.7560942977174815,
"grad_norm": 0.14964212802659002,
"learning_rate": 5.695470006651736e-06,
"loss": 0.7265,
"step": 3532
},
{
"epoch": 0.7563083674506971,
"grad_norm": 0.14141466999538752,
"learning_rate": 5.685976612142033e-06,
"loss": 0.693,
"step": 3533
},
{
"epoch": 0.7565224371839127,
"grad_norm": 0.1428511091199509,
"learning_rate": 5.67648982495933e-06,
"loss": 0.6766,
"step": 3534
},
{
"epoch": 0.7567365069171282,
"grad_norm": 0.15061562224777444,
"learning_rate": 5.667009649482698e-06,
"loss": 0.6989,
"step": 3535
},
{
"epoch": 0.7569505766503438,
"grad_norm": 0.14920844910110417,
"learning_rate": 5.65753609008818e-06,
"loss": 0.7145,
"step": 3536
},
{
"epoch": 0.7571646463835594,
"grad_norm": 0.1862597364757916,
"learning_rate": 5.6480691511487404e-06,
"loss": 0.6871,
"step": 3537
},
{
"epoch": 0.757378716116775,
"grad_norm": 0.15082101946973378,
"learning_rate": 5.638608837034309e-06,
"loss": 0.7031,
"step": 3538
},
{
"epoch": 0.7575927858499907,
"grad_norm": 0.14885222190568342,
"learning_rate": 5.629155152111756e-06,
"loss": 0.6708,
"step": 3539
},
{
"epoch": 0.7578068555832063,
"grad_norm": 0.14946031097612494,
"learning_rate": 5.619708100744871e-06,
"loss": 0.6998,
"step": 3540
},
{
"epoch": 0.7580209253164218,
"grad_norm": 0.15150139368910043,
"learning_rate": 5.6102676872944105e-06,
"loss": 0.6862,
"step": 3541
},
{
"epoch": 0.7582349950496374,
"grad_norm": 0.14528120780814796,
"learning_rate": 5.600833916118036e-06,
"loss": 0.6926,
"step": 3542
},
{
"epoch": 0.758449064782853,
"grad_norm": 0.1478792522908929,
"learning_rate": 5.591406791570368e-06,
"loss": 0.6757,
"step": 3543
},
{
"epoch": 0.7586631345160686,
"grad_norm": 0.15479992966447959,
"learning_rate": 5.581986318002954e-06,
"loss": 0.7115,
"step": 3544
},
{
"epoch": 0.7588772042492842,
"grad_norm": 0.14778186516095093,
"learning_rate": 5.572572499764258e-06,
"loss": 0.6631,
"step": 3545
},
{
"epoch": 0.7590912739824998,
"grad_norm": 0.15514968870863632,
"learning_rate": 5.56316534119969e-06,
"loss": 0.723,
"step": 3546
},
{
"epoch": 0.7593053437157153,
"grad_norm": 0.15399317871950186,
"learning_rate": 5.553764846651568e-06,
"loss": 0.6834,
"step": 3547
},
{
"epoch": 0.759519413448931,
"grad_norm": 0.14610405195629494,
"learning_rate": 5.544371020459147e-06,
"loss": 0.6949,
"step": 3548
},
{
"epoch": 0.7597334831821466,
"grad_norm": 0.1550757297618627,
"learning_rate": 5.534983866958608e-06,
"loss": 0.7034,
"step": 3549
},
{
"epoch": 0.7599475529153622,
"grad_norm": 0.15249826727832982,
"learning_rate": 5.52560339048303e-06,
"loss": 0.6802,
"step": 3550
},
{
"epoch": 0.7601616226485778,
"grad_norm": 0.15990361646423798,
"learning_rate": 5.51622959536243e-06,
"loss": 0.6665,
"step": 3551
},
{
"epoch": 0.7603756923817934,
"grad_norm": 0.15675674451816746,
"learning_rate": 5.506862485923743e-06,
"loss": 0.7085,
"step": 3552
},
{
"epoch": 0.760589762115009,
"grad_norm": 0.15425741849393068,
"learning_rate": 5.497502066490794e-06,
"loss": 0.7043,
"step": 3553
},
{
"epoch": 0.7608038318482245,
"grad_norm": 0.14732362439572336,
"learning_rate": 5.488148341384343e-06,
"loss": 0.6942,
"step": 3554
},
{
"epoch": 0.7610179015814401,
"grad_norm": 0.15462024258481727,
"learning_rate": 5.47880131492206e-06,
"loss": 0.6877,
"step": 3555
},
{
"epoch": 0.7612319713146557,
"grad_norm": 0.1488457724029461,
"learning_rate": 5.469460991418501e-06,
"loss": 0.6778,
"step": 3556
},
{
"epoch": 0.7614460410478714,
"grad_norm": 0.1463214204438405,
"learning_rate": 5.460127375185149e-06,
"loss": 0.7052,
"step": 3557
},
{
"epoch": 0.761660110781087,
"grad_norm": 0.15787772253742774,
"learning_rate": 5.450800470530391e-06,
"loss": 0.7364,
"step": 3558
},
{
"epoch": 0.7618741805143026,
"grad_norm": 0.15108225763316432,
"learning_rate": 5.441480281759497e-06,
"loss": 0.692,
"step": 3559
},
{
"epoch": 0.7620882502475181,
"grad_norm": 0.14680756999056407,
"learning_rate": 5.43216681317466e-06,
"loss": 0.6819,
"step": 3560
},
{
"epoch": 0.7623023199807337,
"grad_norm": 0.15136912613020453,
"learning_rate": 5.422860069074949e-06,
"loss": 0.7046,
"step": 3561
},
{
"epoch": 0.7625163897139493,
"grad_norm": 0.14723621183647193,
"learning_rate": 5.413560053756344e-06,
"loss": 0.6712,
"step": 3562
},
{
"epoch": 0.7627304594471649,
"grad_norm": 0.14575649289043383,
"learning_rate": 5.404266771511724e-06,
"loss": 0.6831,
"step": 3563
},
{
"epoch": 0.7629445291803805,
"grad_norm": 0.14985340867888913,
"learning_rate": 5.394980226630837e-06,
"loss": 0.6907,
"step": 3564
},
{
"epoch": 0.763158598913596,
"grad_norm": 0.21399373667467642,
"learning_rate": 5.385700423400342e-06,
"loss": 0.6851,
"step": 3565
},
{
"epoch": 0.7633726686468117,
"grad_norm": 0.14587068917445511,
"learning_rate": 5.376427366103785e-06,
"loss": 0.6746,
"step": 3566
},
{
"epoch": 0.7635867383800273,
"grad_norm": 0.14772214126312302,
"learning_rate": 5.367161059021579e-06,
"loss": 0.6807,
"step": 3567
},
{
"epoch": 0.7638008081132429,
"grad_norm": 0.15104290541918627,
"learning_rate": 5.357901506431045e-06,
"loss": 0.6925,
"step": 3568
},
{
"epoch": 0.7640148778464585,
"grad_norm": 0.1460490001623541,
"learning_rate": 5.348648712606377e-06,
"loss": 0.6606,
"step": 3569
},
{
"epoch": 0.7642289475796741,
"grad_norm": 0.14996309101222452,
"learning_rate": 5.339402681818635e-06,
"loss": 0.6921,
"step": 3570
},
{
"epoch": 0.7644430173128897,
"grad_norm": 0.15161998327531107,
"learning_rate": 5.330163418335785e-06,
"loss": 0.6887,
"step": 3571
},
{
"epoch": 0.7646570870461052,
"grad_norm": 0.15116850976620883,
"learning_rate": 5.3209309264226405e-06,
"loss": 0.6967,
"step": 3572
},
{
"epoch": 0.7648711567793208,
"grad_norm": 0.1525044548119896,
"learning_rate": 5.311705210340909e-06,
"loss": 0.6929,
"step": 3573
},
{
"epoch": 0.7650852265125364,
"grad_norm": 0.14448107558451886,
"learning_rate": 5.302486274349172e-06,
"loss": 0.6904,
"step": 3574
},
{
"epoch": 0.7652992962457521,
"grad_norm": 0.15287788455081394,
"learning_rate": 5.293274122702858e-06,
"loss": 0.6758,
"step": 3575
},
{
"epoch": 0.7655133659789677,
"grad_norm": 0.144367658280424,
"learning_rate": 5.284068759654295e-06,
"loss": 0.7035,
"step": 3576
},
{
"epoch": 0.7657274357121833,
"grad_norm": 0.14739219572103643,
"learning_rate": 5.274870189452648e-06,
"loss": 0.7131,
"step": 3577
},
{
"epoch": 0.7659415054453989,
"grad_norm": 0.15153468382661264,
"learning_rate": 5.2656784163439715e-06,
"loss": 0.6855,
"step": 3578
},
{
"epoch": 0.7661555751786144,
"grad_norm": 0.14472458441935257,
"learning_rate": 5.25649344457116e-06,
"loss": 0.6678,
"step": 3579
},
{
"epoch": 0.76636964491183,
"grad_norm": 0.1423686618276698,
"learning_rate": 5.247315278373983e-06,
"loss": 0.6645,
"step": 3580
},
{
"epoch": 0.7665837146450456,
"grad_norm": 0.1561284936350141,
"learning_rate": 5.238143921989076e-06,
"loss": 0.7006,
"step": 3581
},
{
"epoch": 0.7667977843782612,
"grad_norm": 0.14694574730059415,
"learning_rate": 5.228979379649906e-06,
"loss": 0.6965,
"step": 3582
},
{
"epoch": 0.7670118541114768,
"grad_norm": 0.14965368879489102,
"learning_rate": 5.219821655586821e-06,
"loss": 0.6786,
"step": 3583
},
{
"epoch": 0.7672259238446925,
"grad_norm": 0.15153940337772823,
"learning_rate": 5.210670754026996e-06,
"loss": 0.69,
"step": 3584
},
{
"epoch": 0.767439993577908,
"grad_norm": 0.1476311981634055,
"learning_rate": 5.20152667919448e-06,
"loss": 0.7052,
"step": 3585
},
{
"epoch": 0.7676540633111236,
"grad_norm": 0.14386225508385378,
"learning_rate": 5.192389435310165e-06,
"loss": 0.6789,
"step": 3586
},
{
"epoch": 0.7678681330443392,
"grad_norm": 0.16161601365398967,
"learning_rate": 5.183259026591774e-06,
"loss": 0.7124,
"step": 3587
},
{
"epoch": 0.7680822027775548,
"grad_norm": 0.1429500331562117,
"learning_rate": 5.174135457253899e-06,
"loss": 0.6885,
"step": 3588
},
{
"epoch": 0.7682962725107704,
"grad_norm": 0.14901454804148337,
"learning_rate": 5.1650187315079495e-06,
"loss": 0.6823,
"step": 3589
},
{
"epoch": 0.768510342243986,
"grad_norm": 0.15060970553205538,
"learning_rate": 5.155908853562199e-06,
"loss": 0.6605,
"step": 3590
},
{
"epoch": 0.7687244119772015,
"grad_norm": 0.15125526613464874,
"learning_rate": 5.146805827621755e-06,
"loss": 0.6704,
"step": 3591
},
{
"epoch": 0.7689384817104171,
"grad_norm": 0.1452066765752053,
"learning_rate": 5.137709657888543e-06,
"loss": 0.6759,
"step": 3592
},
{
"epoch": 0.7691525514436328,
"grad_norm": 0.1515118732805631,
"learning_rate": 5.1286203485613525e-06,
"loss": 0.6783,
"step": 3593
},
{
"epoch": 0.7693666211768484,
"grad_norm": 0.1538291655720675,
"learning_rate": 5.1195379038357825e-06,
"loss": 0.6862,
"step": 3594
},
{
"epoch": 0.769580690910064,
"grad_norm": 0.14378164556618328,
"learning_rate": 5.110462327904275e-06,
"loss": 0.6944,
"step": 3595
},
{
"epoch": 0.7697947606432796,
"grad_norm": 0.15453963403424745,
"learning_rate": 5.101393624956106e-06,
"loss": 0.7054,
"step": 3596
},
{
"epoch": 0.7700088303764951,
"grad_norm": 0.15560221937755805,
"learning_rate": 5.092331799177361e-06,
"loss": 0.7042,
"step": 3597
},
{
"epoch": 0.7702229001097107,
"grad_norm": 0.14812052992953495,
"learning_rate": 5.083276854750974e-06,
"loss": 0.6854,
"step": 3598
},
{
"epoch": 0.7704369698429263,
"grad_norm": 0.14777507490410358,
"learning_rate": 5.074228795856679e-06,
"loss": 0.6728,
"step": 3599
},
{
"epoch": 0.7706510395761419,
"grad_norm": 0.15288474595976217,
"learning_rate": 5.065187626671048e-06,
"loss": 0.7063,
"step": 3600
},
{
"epoch": 0.7708651093093575,
"grad_norm": 0.1502097967002784,
"learning_rate": 5.056153351367477e-06,
"loss": 0.7021,
"step": 3601
},
{
"epoch": 0.7710791790425732,
"grad_norm": 0.17267344323599498,
"learning_rate": 5.047125974116156e-06,
"loss": 0.6868,
"step": 3602
},
{
"epoch": 0.7712932487757888,
"grad_norm": 0.1481410798670096,
"learning_rate": 5.038105499084119e-06,
"loss": 0.6715,
"step": 3603
},
{
"epoch": 0.7715073185090043,
"grad_norm": 0.14471274871786427,
"learning_rate": 5.02909193043519e-06,
"loss": 0.6961,
"step": 3604
},
{
"epoch": 0.7717213882422199,
"grad_norm": 0.14253817472206023,
"learning_rate": 5.02008527233002e-06,
"loss": 0.6856,
"step": 3605
},
{
"epoch": 0.7719354579754355,
"grad_norm": 0.1454712497078495,
"learning_rate": 5.0110855289260715e-06,
"loss": 0.6811,
"step": 3606
},
{
"epoch": 0.7721495277086511,
"grad_norm": 0.14372760835458082,
"learning_rate": 5.002092704377599e-06,
"loss": 0.6977,
"step": 3607
},
{
"epoch": 0.7723635974418667,
"grad_norm": 0.14137880497796756,
"learning_rate": 4.993106802835686e-06,
"loss": 0.6872,
"step": 3608
},
{
"epoch": 0.7725776671750822,
"grad_norm": 0.1443860306366555,
"learning_rate": 4.984127828448196e-06,
"loss": 0.6845,
"step": 3609
},
{
"epoch": 0.7727917369082978,
"grad_norm": 0.1495619821833103,
"learning_rate": 4.9751557853598105e-06,
"loss": 0.7199,
"step": 3610
},
{
"epoch": 0.7730058066415135,
"grad_norm": 0.14246740084591147,
"learning_rate": 4.966190677712019e-06,
"loss": 0.6526,
"step": 3611
},
{
"epoch": 0.7732198763747291,
"grad_norm": 0.14280730977428982,
"learning_rate": 4.957232509643082e-06,
"loss": 0.6958,
"step": 3612
},
{
"epoch": 0.7734339461079447,
"grad_norm": 0.1444098725201292,
"learning_rate": 4.94828128528809e-06,
"loss": 0.6879,
"step": 3613
},
{
"epoch": 0.7736480158411603,
"grad_norm": 0.15274940038812987,
"learning_rate": 4.939337008778895e-06,
"loss": 0.6712,
"step": 3614
},
{
"epoch": 0.7738620855743759,
"grad_norm": 0.14534139764508106,
"learning_rate": 4.9303996842441695e-06,
"loss": 0.6927,
"step": 3615
},
{
"epoch": 0.7740761553075914,
"grad_norm": 0.16815580329923768,
"learning_rate": 4.921469315809369e-06,
"loss": 0.7049,
"step": 3616
},
{
"epoch": 0.774290225040807,
"grad_norm": 0.14971565575094634,
"learning_rate": 4.912545907596722e-06,
"loss": 0.71,
"step": 3617
},
{
"epoch": 0.7745042947740226,
"grad_norm": 0.14539530061684675,
"learning_rate": 4.903629463725274e-06,
"loss": 0.6774,
"step": 3618
},
{
"epoch": 0.7747183645072382,
"grad_norm": 0.14651811235383488,
"learning_rate": 4.894719988310823e-06,
"loss": 0.7002,
"step": 3619
},
{
"epoch": 0.7749324342404539,
"grad_norm": 0.1449687047863747,
"learning_rate": 4.8858174854659804e-06,
"loss": 0.6979,
"step": 3620
},
{
"epoch": 0.7751465039736695,
"grad_norm": 0.14878444311342331,
"learning_rate": 4.8769219593001135e-06,
"loss": 0.6834,
"step": 3621
},
{
"epoch": 0.775360573706885,
"grad_norm": 0.14637069318606644,
"learning_rate": 4.868033413919386e-06,
"loss": 0.7114,
"step": 3622
},
{
"epoch": 0.7755746434401006,
"grad_norm": 0.14602603522843896,
"learning_rate": 4.85915185342674e-06,
"loss": 0.7031,
"step": 3623
},
{
"epoch": 0.7757887131733162,
"grad_norm": 0.14681099392977884,
"learning_rate": 4.850277281921876e-06,
"loss": 0.712,
"step": 3624
},
{
"epoch": 0.7760027829065318,
"grad_norm": 0.1454575177981704,
"learning_rate": 4.841409703501292e-06,
"loss": 0.6961,
"step": 3625
},
{
"epoch": 0.7762168526397474,
"grad_norm": 0.14564331273396205,
"learning_rate": 4.832549122258234e-06,
"loss": 0.6725,
"step": 3626
},
{
"epoch": 0.776430922372963,
"grad_norm": 0.15854855672454377,
"learning_rate": 4.823695542282738e-06,
"loss": 0.7169,
"step": 3627
},
{
"epoch": 0.7766449921061785,
"grad_norm": 0.14785777387749188,
"learning_rate": 4.8148489676616025e-06,
"loss": 0.679,
"step": 3628
},
{
"epoch": 0.7768590618393941,
"grad_norm": 0.1412872034267171,
"learning_rate": 4.80600940247838e-06,
"loss": 0.6825,
"step": 3629
},
{
"epoch": 0.7770731315726098,
"grad_norm": 0.16532232539661437,
"learning_rate": 4.79717685081341e-06,
"loss": 0.7056,
"step": 3630
},
{
"epoch": 0.7772872013058254,
"grad_norm": 0.14848016296659505,
"learning_rate": 4.788351316743769e-06,
"loss": 0.6657,
"step": 3631
},
{
"epoch": 0.777501271039041,
"grad_norm": 0.14375141099044916,
"learning_rate": 4.7795328043433166e-06,
"loss": 0.6826,
"step": 3632
},
{
"epoch": 0.7777153407722566,
"grad_norm": 0.15246816576983932,
"learning_rate": 4.770721317682663e-06,
"loss": 0.6778,
"step": 3633
},
{
"epoch": 0.7779294105054722,
"grad_norm": 0.14619019412293158,
"learning_rate": 4.7619168608291655e-06,
"loss": 0.7208,
"step": 3634
},
{
"epoch": 0.7781434802386877,
"grad_norm": 0.14514720848243715,
"learning_rate": 4.753119437846951e-06,
"loss": 0.683,
"step": 3635
},
{
"epoch": 0.7783575499719033,
"grad_norm": 0.13991836419741208,
"learning_rate": 4.744329052796899e-06,
"loss": 0.706,
"step": 3636
},
{
"epoch": 0.7785716197051189,
"grad_norm": 0.14645596734362765,
"learning_rate": 4.735545709736624e-06,
"loss": 0.6869,
"step": 3637
},
{
"epoch": 0.7787856894383345,
"grad_norm": 0.14443721378800692,
"learning_rate": 4.726769412720506e-06,
"loss": 0.6845,
"step": 3638
},
{
"epoch": 0.7789997591715502,
"grad_norm": 0.14559633643361072,
"learning_rate": 4.7180001657996745e-06,
"loss": 0.6921,
"step": 3639
},
{
"epoch": 0.7792138289047658,
"grad_norm": 0.14773278671770956,
"learning_rate": 4.7092379730219874e-06,
"loss": 0.6891,
"step": 3640
},
{
"epoch": 0.7794278986379813,
"grad_norm": 0.13550999742590666,
"learning_rate": 4.700482838432059e-06,
"loss": 0.68,
"step": 3641
},
{
"epoch": 0.7796419683711969,
"grad_norm": 0.14702684097174573,
"learning_rate": 4.691734766071252e-06,
"loss": 0.6797,
"step": 3642
},
{
"epoch": 0.7798560381044125,
"grad_norm": 0.14445549825886153,
"learning_rate": 4.682993759977648e-06,
"loss": 0.6889,
"step": 3643
},
{
"epoch": 0.7800701078376281,
"grad_norm": 0.14631527443199774,
"learning_rate": 4.6742598241860875e-06,
"loss": 0.7227,
"step": 3644
},
{
"epoch": 0.7802841775708437,
"grad_norm": 0.14902162597253918,
"learning_rate": 4.665532962728141e-06,
"loss": 0.6964,
"step": 3645
},
{
"epoch": 0.7804982473040593,
"grad_norm": 0.14566734344008586,
"learning_rate": 4.656813179632102e-06,
"loss": 0.6993,
"step": 3646
},
{
"epoch": 0.7807123170372748,
"grad_norm": 0.1440681304391279,
"learning_rate": 4.648100478923014e-06,
"loss": 0.7002,
"step": 3647
},
{
"epoch": 0.7809263867704905,
"grad_norm": 0.14337501249023385,
"learning_rate": 4.639394864622646e-06,
"loss": 0.6801,
"step": 3648
},
{
"epoch": 0.7811404565037061,
"grad_norm": 0.14444266543344936,
"learning_rate": 4.6306963407494855e-06,
"loss": 0.6754,
"step": 3649
},
{
"epoch": 0.7813545262369217,
"grad_norm": 0.14726334608406041,
"learning_rate": 4.6220049113187644e-06,
"loss": 0.6977,
"step": 3650
},
{
"epoch": 0.7815685959701373,
"grad_norm": 0.14200863975157607,
"learning_rate": 4.613320580342422e-06,
"loss": 0.6766,
"step": 3651
},
{
"epoch": 0.7817826657033529,
"grad_norm": 0.14223219680662605,
"learning_rate": 4.60464335182913e-06,
"loss": 0.6895,
"step": 3652
},
{
"epoch": 0.7819967354365684,
"grad_norm": 0.143881909479297,
"learning_rate": 4.595973229784291e-06,
"loss": 0.6703,
"step": 3653
},
{
"epoch": 0.782210805169784,
"grad_norm": 0.14389924843123095,
"learning_rate": 4.587310218210008e-06,
"loss": 0.6677,
"step": 3654
},
{
"epoch": 0.7824248749029996,
"grad_norm": 0.1521399356603649,
"learning_rate": 4.578654321105118e-06,
"loss": 0.6975,
"step": 3655
},
{
"epoch": 0.7826389446362152,
"grad_norm": 0.15069367842892362,
"learning_rate": 4.5700055424651594e-06,
"loss": 0.7117,
"step": 3656
},
{
"epoch": 0.7828530143694309,
"grad_norm": 0.14967374826431096,
"learning_rate": 4.561363886282393e-06,
"loss": 0.6847,
"step": 3657
},
{
"epoch": 0.7830670841026465,
"grad_norm": 0.1441688220108929,
"learning_rate": 4.552729356545804e-06,
"loss": 0.6967,
"step": 3658
},
{
"epoch": 0.783281153835862,
"grad_norm": 0.14632044198323957,
"learning_rate": 4.54410195724106e-06,
"loss": 0.6826,
"step": 3659
},
{
"epoch": 0.7834952235690776,
"grad_norm": 0.14638171754928977,
"learning_rate": 4.535481692350565e-06,
"loss": 0.6952,
"step": 3660
},
{
"epoch": 0.7837092933022932,
"grad_norm": 0.14501938688256968,
"learning_rate": 4.526868565853406e-06,
"loss": 0.7029,
"step": 3661
},
{
"epoch": 0.7839233630355088,
"grad_norm": 0.1469192786686249,
"learning_rate": 4.518262581725399e-06,
"loss": 0.7042,
"step": 3662
},
{
"epoch": 0.7841374327687244,
"grad_norm": 0.22081669649556304,
"learning_rate": 4.5096637439390365e-06,
"loss": 0.6984,
"step": 3663
},
{
"epoch": 0.78435150250194,
"grad_norm": 0.14377410961623974,
"learning_rate": 4.501072056463536e-06,
"loss": 0.6945,
"step": 3664
},
{
"epoch": 0.7845655722351556,
"grad_norm": 0.13757389187353808,
"learning_rate": 4.492487523264806e-06,
"loss": 0.6571,
"step": 3665
},
{
"epoch": 0.7847796419683712,
"grad_norm": 0.1395342436130252,
"learning_rate": 4.483910148305441e-06,
"loss": 0.6856,
"step": 3666
},
{
"epoch": 0.7849937117015868,
"grad_norm": 0.14693427866391,
"learning_rate": 4.4753399355447556e-06,
"loss": 0.6679,
"step": 3667
},
{
"epoch": 0.7852077814348024,
"grad_norm": 0.1455743255241307,
"learning_rate": 4.466776888938731e-06,
"loss": 0.694,
"step": 3668
},
{
"epoch": 0.785421851168018,
"grad_norm": 0.19106409949001513,
"learning_rate": 4.45822101244006e-06,
"loss": 0.6908,
"step": 3669
},
{
"epoch": 0.7856359209012336,
"grad_norm": 0.14056268608495565,
"learning_rate": 4.449672309998125e-06,
"loss": 0.6956,
"step": 3670
},
{
"epoch": 0.7858499906344492,
"grad_norm": 0.1398491797699803,
"learning_rate": 4.441130785558981e-06,
"loss": 0.7018,
"step": 3671
},
{
"epoch": 0.7860640603676647,
"grad_norm": 0.18205320695841806,
"learning_rate": 4.432596443065389e-06,
"loss": 0.6693,
"step": 3672
},
{
"epoch": 0.7862781301008803,
"grad_norm": 0.14921981320338992,
"learning_rate": 4.4240692864567755e-06,
"loss": 0.7079,
"step": 3673
},
{
"epoch": 0.7864921998340959,
"grad_norm": 0.14413740783108842,
"learning_rate": 4.415549319669268e-06,
"loss": 0.7093,
"step": 3674
},
{
"epoch": 0.7867062695673116,
"grad_norm": 0.1446170870345179,
"learning_rate": 4.40703654663567e-06,
"loss": 0.697,
"step": 3675
},
{
"epoch": 0.7869203393005272,
"grad_norm": 0.14357160578361775,
"learning_rate": 4.398530971285453e-06,
"loss": 0.6662,
"step": 3676
},
{
"epoch": 0.7871344090337428,
"grad_norm": 0.14400881820519595,
"learning_rate": 4.390032597544787e-06,
"loss": 0.7033,
"step": 3677
},
{
"epoch": 0.7873484787669583,
"grad_norm": 0.1451046800820446,
"learning_rate": 4.381541429336491e-06,
"loss": 0.6656,
"step": 3678
},
{
"epoch": 0.7875625485001739,
"grad_norm": 0.137583112108047,
"learning_rate": 4.373057470580082e-06,
"loss": 0.6596,
"step": 3679
},
{
"epoch": 0.7877766182333895,
"grad_norm": 0.14469552651510104,
"learning_rate": 4.364580725191743e-06,
"loss": 0.6877,
"step": 3680
},
{
"epoch": 0.7879906879666051,
"grad_norm": 0.14247038821039348,
"learning_rate": 4.356111197084317e-06,
"loss": 0.6792,
"step": 3681
},
{
"epoch": 0.7882047576998207,
"grad_norm": 0.1468007516027795,
"learning_rate": 4.347648890167326e-06,
"loss": 0.6646,
"step": 3682
},
{
"epoch": 0.7884188274330363,
"grad_norm": 0.1402996787144692,
"learning_rate": 4.339193808346951e-06,
"loss": 0.6779,
"step": 3683
},
{
"epoch": 0.788632897166252,
"grad_norm": 0.14254646459308967,
"learning_rate": 4.330745955526045e-06,
"loss": 0.6596,
"step": 3684
},
{
"epoch": 0.7888469668994675,
"grad_norm": 0.14631651078258634,
"learning_rate": 4.3223053356041315e-06,
"loss": 0.6739,
"step": 3685
},
{
"epoch": 0.7890610366326831,
"grad_norm": 0.16083493154870732,
"learning_rate": 4.313871952477367e-06,
"loss": 0.6578,
"step": 3686
},
{
"epoch": 0.7892751063658987,
"grad_norm": 0.19634635903316577,
"learning_rate": 4.3054458100385996e-06,
"loss": 0.7058,
"step": 3687
},
{
"epoch": 0.7894891760991143,
"grad_norm": 0.17472902085358755,
"learning_rate": 4.2970269121773135e-06,
"loss": 0.6827,
"step": 3688
},
{
"epoch": 0.7897032458323299,
"grad_norm": 0.1443233124034339,
"learning_rate": 4.288615262779656e-06,
"loss": 0.688,
"step": 3689
},
{
"epoch": 0.7899173155655455,
"grad_norm": 0.1458702048708569,
"learning_rate": 4.28021086572844e-06,
"loss": 0.6996,
"step": 3690
},
{
"epoch": 0.790131385298761,
"grad_norm": 0.14680820313500526,
"learning_rate": 4.271813724903106e-06,
"loss": 0.6925,
"step": 3691
},
{
"epoch": 0.7903454550319766,
"grad_norm": 0.14795281151426107,
"learning_rate": 4.26342384417977e-06,
"loss": 0.6841,
"step": 3692
},
{
"epoch": 0.7905595247651923,
"grad_norm": 0.14621841645765707,
"learning_rate": 4.255041227431178e-06,
"loss": 0.7052,
"step": 3693
},
{
"epoch": 0.7907735944984079,
"grad_norm": 0.14424606937255216,
"learning_rate": 4.2466658785267304e-06,
"loss": 0.6895,
"step": 3694
},
{
"epoch": 0.7909876642316235,
"grad_norm": 0.1442743584183758,
"learning_rate": 4.238297801332483e-06,
"loss": 0.6983,
"step": 3695
},
{
"epoch": 0.7912017339648391,
"grad_norm": 0.1516338560548041,
"learning_rate": 4.22993699971111e-06,
"loss": 0.6732,
"step": 3696
},
{
"epoch": 0.7914158036980546,
"grad_norm": 0.1525905454680027,
"learning_rate": 4.221583477521956e-06,
"loss": 0.6873,
"step": 3697
},
{
"epoch": 0.7916298734312702,
"grad_norm": 0.14743315246458855,
"learning_rate": 4.21323723862098e-06,
"loss": 0.7065,
"step": 3698
},
{
"epoch": 0.7918439431644858,
"grad_norm": 0.17158018204275188,
"learning_rate": 4.204898286860795e-06,
"loss": 0.7114,
"step": 3699
},
{
"epoch": 0.7920580128977014,
"grad_norm": 0.14403492705947102,
"learning_rate": 4.1965666260906525e-06,
"loss": 0.6848,
"step": 3700
},
{
"epoch": 0.792272082630917,
"grad_norm": 0.14858960809544725,
"learning_rate": 4.188242260156421e-06,
"loss": 0.7141,
"step": 3701
},
{
"epoch": 0.7924861523641327,
"grad_norm": 0.1475919588083457,
"learning_rate": 4.1799251929006225e-06,
"loss": 0.7067,
"step": 3702
},
{
"epoch": 0.7927002220973483,
"grad_norm": 0.14173596377722034,
"learning_rate": 4.17161542816239e-06,
"loss": 0.6942,
"step": 3703
},
{
"epoch": 0.7929142918305638,
"grad_norm": 0.14673351814957863,
"learning_rate": 4.163312969777506e-06,
"loss": 0.7086,
"step": 3704
},
{
"epoch": 0.7931283615637794,
"grad_norm": 0.14623364280415502,
"learning_rate": 4.155017821578362e-06,
"loss": 0.7105,
"step": 3705
},
{
"epoch": 0.793342431296995,
"grad_norm": 0.1471445843756772,
"learning_rate": 4.146729987393982e-06,
"loss": 0.6972,
"step": 3706
},
{
"epoch": 0.7935565010302106,
"grad_norm": 0.14515934720938398,
"learning_rate": 4.138449471050028e-06,
"loss": 0.693,
"step": 3707
},
{
"epoch": 0.7937705707634262,
"grad_norm": 0.14547294711745223,
"learning_rate": 4.1301762763687556e-06,
"loss": 0.7054,
"step": 3708
},
{
"epoch": 0.7939846404966417,
"grad_norm": 0.14358553265992055,
"learning_rate": 4.12191040716907e-06,
"loss": 0.69,
"step": 3709
},
{
"epoch": 0.7941987102298573,
"grad_norm": 0.14066686916955223,
"learning_rate": 4.113651867266468e-06,
"loss": 0.7061,
"step": 3710
},
{
"epoch": 0.794412779963073,
"grad_norm": 0.14646119589548262,
"learning_rate": 4.105400660473082e-06,
"loss": 0.7019,
"step": 3711
},
{
"epoch": 0.7946268496962886,
"grad_norm": 0.1506096934484361,
"learning_rate": 4.09715679059766e-06,
"loss": 0.6811,
"step": 3712
},
{
"epoch": 0.7948409194295042,
"grad_norm": 0.13838487767472074,
"learning_rate": 4.088920261445548e-06,
"loss": 0.6626,
"step": 3713
},
{
"epoch": 0.7950549891627198,
"grad_norm": 0.15062904687313375,
"learning_rate": 4.080691076818719e-06,
"loss": 0.7285,
"step": 3714
},
{
"epoch": 0.7952690588959354,
"grad_norm": 0.14022059738133055,
"learning_rate": 4.0724692405157505e-06,
"loss": 0.6551,
"step": 3715
},
{
"epoch": 0.7954831286291509,
"grad_norm": 0.1390790251765366,
"learning_rate": 4.064254756331818e-06,
"loss": 0.6612,
"step": 3716
},
{
"epoch": 0.7956971983623665,
"grad_norm": 0.17527178930194584,
"learning_rate": 4.056047628058726e-06,
"loss": 0.6712,
"step": 3717
},
{
"epoch": 0.7959112680955821,
"grad_norm": 0.14177816726424428,
"learning_rate": 4.047847859484855e-06,
"loss": 0.665,
"step": 3718
},
{
"epoch": 0.7961253378287977,
"grad_norm": 0.1462702274505592,
"learning_rate": 4.03965545439521e-06,
"loss": 0.6947,
"step": 3719
},
{
"epoch": 0.7963394075620134,
"grad_norm": 0.14625636632028322,
"learning_rate": 4.031470416571397e-06,
"loss": 0.6842,
"step": 3720
},
{
"epoch": 0.796553477295229,
"grad_norm": 0.1479395399533281,
"learning_rate": 4.023292749791603e-06,
"loss": 0.7117,
"step": 3721
},
{
"epoch": 0.7967675470284445,
"grad_norm": 0.14294806232725346,
"learning_rate": 4.015122457830631e-06,
"loss": 0.6782,
"step": 3722
},
{
"epoch": 0.7969816167616601,
"grad_norm": 0.14391605199157265,
"learning_rate": 4.006959544459874e-06,
"loss": 0.6805,
"step": 3723
},
{
"epoch": 0.7971956864948757,
"grad_norm": 0.1453150979786913,
"learning_rate": 3.99880401344731e-06,
"loss": 0.6634,
"step": 3724
},
{
"epoch": 0.7974097562280913,
"grad_norm": 0.1453991275400545,
"learning_rate": 3.990655868557522e-06,
"loss": 0.6869,
"step": 3725
},
{
"epoch": 0.7976238259613069,
"grad_norm": 0.1437959290913186,
"learning_rate": 3.982515113551684e-06,
"loss": 0.7075,
"step": 3726
},
{
"epoch": 0.7978378956945225,
"grad_norm": 0.1460419031747847,
"learning_rate": 3.9743817521875436e-06,
"loss": 0.6918,
"step": 3727
},
{
"epoch": 0.798051965427738,
"grad_norm": 0.14625730461489223,
"learning_rate": 3.966255788219451e-06,
"loss": 0.6822,
"step": 3728
},
{
"epoch": 0.7982660351609537,
"grad_norm": 0.14315116744583717,
"learning_rate": 3.958137225398339e-06,
"loss": 0.6788,
"step": 3729
},
{
"epoch": 0.7984801048941693,
"grad_norm": 0.14548552457502054,
"learning_rate": 3.950026067471713e-06,
"loss": 0.6888,
"step": 3730
},
{
"epoch": 0.7986941746273849,
"grad_norm": 0.14504314778964464,
"learning_rate": 3.941922318183675e-06,
"loss": 0.6891,
"step": 3731
},
{
"epoch": 0.7989082443606005,
"grad_norm": 0.14054258332500452,
"learning_rate": 3.933825981274903e-06,
"loss": 0.688,
"step": 3732
},
{
"epoch": 0.7991223140938161,
"grad_norm": 0.14416063829175654,
"learning_rate": 3.925737060482644e-06,
"loss": 0.6849,
"step": 3733
},
{
"epoch": 0.7993363838270316,
"grad_norm": 0.15232123685251384,
"learning_rate": 3.917655559540738e-06,
"loss": 0.6712,
"step": 3734
},
{
"epoch": 0.7995504535602472,
"grad_norm": 0.14910757725890156,
"learning_rate": 3.9095814821795805e-06,
"loss": 0.7175,
"step": 3735
},
{
"epoch": 0.7997645232934628,
"grad_norm": 0.14494711098002688,
"learning_rate": 3.901514832126154e-06,
"loss": 0.6852,
"step": 3736
},
{
"epoch": 0.7999785930266784,
"grad_norm": 0.1453708082480703,
"learning_rate": 3.893455613104021e-06,
"loss": 0.6791,
"step": 3737
},
{
"epoch": 0.800192662759894,
"grad_norm": 0.21346208746687895,
"learning_rate": 3.885403828833283e-06,
"loss": 0.6916,
"step": 3738
},
{
"epoch": 0.8004067324931097,
"grad_norm": 0.1446229387767777,
"learning_rate": 3.877359483030647e-06,
"loss": 0.7044,
"step": 3739
},
{
"epoch": 0.8006208022263253,
"grad_norm": 0.1474085393401029,
"learning_rate": 3.8693225794093535e-06,
"loss": 0.6994,
"step": 3740
},
{
"epoch": 0.8008348719595408,
"grad_norm": 0.14436214299259978,
"learning_rate": 3.86129312167923e-06,
"loss": 0.6881,
"step": 3741
},
{
"epoch": 0.8010489416927564,
"grad_norm": 0.14398485106569503,
"learning_rate": 3.853271113546661e-06,
"loss": 0.7168,
"step": 3742
},
{
"epoch": 0.801263011425972,
"grad_norm": 0.1408527844385051,
"learning_rate": 3.845256558714585e-06,
"loss": 0.6899,
"step": 3743
},
{
"epoch": 0.8014770811591876,
"grad_norm": 0.14478090729331433,
"learning_rate": 3.837249460882515e-06,
"loss": 0.6892,
"step": 3744
},
{
"epoch": 0.8016911508924032,
"grad_norm": 0.14571803924285012,
"learning_rate": 3.829249823746502e-06,
"loss": 0.7181,
"step": 3745
},
{
"epoch": 0.8019052206256188,
"grad_norm": 0.14419415353094814,
"learning_rate": 3.821257650999171e-06,
"loss": 0.6955,
"step": 3746
},
{
"epoch": 0.8021192903588343,
"grad_norm": 0.14691963733543656,
"learning_rate": 3.8132729463296892e-06,
"loss": 0.6879,
"step": 3747
},
{
"epoch": 0.80233336009205,
"grad_norm": 0.14784162164039757,
"learning_rate": 3.8052957134237823e-06,
"loss": 0.6946,
"step": 3748
},
{
"epoch": 0.8025474298252656,
"grad_norm": 0.14621945815172976,
"learning_rate": 3.7973259559637353e-06,
"loss": 0.6793,
"step": 3749
},
{
"epoch": 0.8027614995584812,
"grad_norm": 0.14157435080718195,
"learning_rate": 3.7893636776283616e-06,
"loss": 0.6733,
"step": 3750
},
{
"epoch": 0.8029755692916968,
"grad_norm": 0.15148044838644467,
"learning_rate": 3.781408882093045e-06,
"loss": 0.6919,
"step": 3751
},
{
"epoch": 0.8031896390249124,
"grad_norm": 0.14507699440240024,
"learning_rate": 3.773461573029693e-06,
"loss": 0.7086,
"step": 3752
},
{
"epoch": 0.8034037087581279,
"grad_norm": 0.14049868694163933,
"learning_rate": 3.765521754106776e-06,
"loss": 0.6766,
"step": 3753
},
{
"epoch": 0.8036177784913435,
"grad_norm": 0.13681091509930346,
"learning_rate": 3.757589428989303e-06,
"loss": 0.6648,
"step": 3754
},
{
"epoch": 0.8038318482245591,
"grad_norm": 0.14202927095717371,
"learning_rate": 3.7496646013388116e-06,
"loss": 0.6815,
"step": 3755
},
{
"epoch": 0.8040459179577747,
"grad_norm": 0.1423962517603676,
"learning_rate": 3.741747274813399e-06,
"loss": 0.7088,
"step": 3756
},
{
"epoch": 0.8042599876909904,
"grad_norm": 0.1431857263635574,
"learning_rate": 3.733837453067677e-06,
"loss": 0.6978,
"step": 3757
},
{
"epoch": 0.804474057424206,
"grad_norm": 0.13907985974998918,
"learning_rate": 3.7259351397528097e-06,
"loss": 0.67,
"step": 3758
},
{
"epoch": 0.8046881271574216,
"grad_norm": 0.14506125633703043,
"learning_rate": 3.7180403385164955e-06,
"loss": 0.6747,
"step": 3759
},
{
"epoch": 0.8049021968906371,
"grad_norm": 0.14047285450165206,
"learning_rate": 3.710153053002952e-06,
"loss": 0.6958,
"step": 3760
},
{
"epoch": 0.8051162666238527,
"grad_norm": 0.14635988275187017,
"learning_rate": 3.7022732868529444e-06,
"loss": 0.708,
"step": 3761
},
{
"epoch": 0.8053303363570683,
"grad_norm": 0.1389022472999722,
"learning_rate": 3.6944010437037482e-06,
"loss": 0.6785,
"step": 3762
},
{
"epoch": 0.8055444060902839,
"grad_norm": 0.1415424297330864,
"learning_rate": 3.686536327189181e-06,
"loss": 0.6762,
"step": 3763
},
{
"epoch": 0.8057584758234995,
"grad_norm": 0.1453015273944333,
"learning_rate": 3.678679140939587e-06,
"loss": 0.7102,
"step": 3764
},
{
"epoch": 0.805972545556715,
"grad_norm": 0.14185952598928692,
"learning_rate": 3.6708294885818196e-06,
"loss": 0.6924,
"step": 3765
},
{
"epoch": 0.8061866152899307,
"grad_norm": 0.14440778141542995,
"learning_rate": 3.6629873737392727e-06,
"loss": 0.6965,
"step": 3766
},
{
"epoch": 0.8064006850231463,
"grad_norm": 0.14062813659339363,
"learning_rate": 3.6551528000318447e-06,
"loss": 0.6773,
"step": 3767
},
{
"epoch": 0.8066147547563619,
"grad_norm": 0.13929165691530832,
"learning_rate": 3.6473257710759647e-06,
"loss": 0.6825,
"step": 3768
},
{
"epoch": 0.8068288244895775,
"grad_norm": 0.14496243727647162,
"learning_rate": 3.639506290484576e-06,
"loss": 0.699,
"step": 3769
},
{
"epoch": 0.8070428942227931,
"grad_norm": 0.1374901631490186,
"learning_rate": 3.6316943618671306e-06,
"loss": 0.6524,
"step": 3770
},
{
"epoch": 0.8072569639560087,
"grad_norm": 0.14000586225946124,
"learning_rate": 3.6238899888296097e-06,
"loss": 0.6628,
"step": 3771
},
{
"epoch": 0.8074710336892242,
"grad_norm": 0.13749802901242772,
"learning_rate": 3.616093174974489e-06,
"loss": 0.6741,
"step": 3772
},
{
"epoch": 0.8076851034224398,
"grad_norm": 0.13592998452882274,
"learning_rate": 3.6083039239007642e-06,
"loss": 0.6766,
"step": 3773
},
{
"epoch": 0.8078991731556554,
"grad_norm": 0.1422615683326355,
"learning_rate": 3.6005222392039473e-06,
"loss": 0.6986,
"step": 3774
},
{
"epoch": 0.8081132428888711,
"grad_norm": 0.1440892780171938,
"learning_rate": 3.5927481244760397e-06,
"loss": 0.6771,
"step": 3775
},
{
"epoch": 0.8083273126220867,
"grad_norm": 0.14308044955576857,
"learning_rate": 3.584981583305569e-06,
"loss": 0.7121,
"step": 3776
},
{
"epoch": 0.8085413823553023,
"grad_norm": 0.14190129219743178,
"learning_rate": 3.577222619277545e-06,
"loss": 0.6787,
"step": 3777
},
{
"epoch": 0.8087554520885178,
"grad_norm": 0.14460558707942517,
"learning_rate": 3.5694712359734986e-06,
"loss": 0.6994,
"step": 3778
},
{
"epoch": 0.8089695218217334,
"grad_norm": 0.1440595861392093,
"learning_rate": 3.5617274369714538e-06,
"loss": 0.6963,
"step": 3779
},
{
"epoch": 0.809183591554949,
"grad_norm": 0.14904462393643703,
"learning_rate": 3.5539912258459297e-06,
"loss": 0.7145,
"step": 3780
},
{
"epoch": 0.8093976612881646,
"grad_norm": 0.14551197868259458,
"learning_rate": 3.546262606167956e-06,
"loss": 0.6971,
"step": 3781
},
{
"epoch": 0.8096117310213802,
"grad_norm": 0.1421372831117601,
"learning_rate": 3.538541581505037e-06,
"loss": 0.6991,
"step": 3782
},
{
"epoch": 0.8098258007545958,
"grad_norm": 0.1407928226543025,
"learning_rate": 3.530828155421191e-06,
"loss": 0.6928,
"step": 3783
},
{
"epoch": 0.8100398704878115,
"grad_norm": 0.32031682034983805,
"learning_rate": 3.523122331476925e-06,
"loss": 0.7007,
"step": 3784
},
{
"epoch": 0.810253940221027,
"grad_norm": 0.14805351100878913,
"learning_rate": 3.5154241132292223e-06,
"loss": 0.6943,
"step": 3785
},
{
"epoch": 0.8104680099542426,
"grad_norm": 0.1417773605708263,
"learning_rate": 3.507733504231581e-06,
"loss": 0.6973,
"step": 3786
},
{
"epoch": 0.8106820796874582,
"grad_norm": 0.1416352908057546,
"learning_rate": 3.5000505080339565e-06,
"loss": 0.6796,
"step": 3787
},
{
"epoch": 0.8108961494206738,
"grad_norm": 0.14815806986079122,
"learning_rate": 3.4923751281828187e-06,
"loss": 0.6931,
"step": 3788
},
{
"epoch": 0.8111102191538894,
"grad_norm": 0.1526928380763037,
"learning_rate": 3.4847073682210984e-06,
"loss": 0.7021,
"step": 3789
},
{
"epoch": 0.811324288887105,
"grad_norm": 0.14607899098517793,
"learning_rate": 3.4770472316882243e-06,
"loss": 0.7053,
"step": 3790
},
{
"epoch": 0.8115383586203205,
"grad_norm": 0.1445774289626635,
"learning_rate": 3.4693947221201054e-06,
"loss": 0.6879,
"step": 3791
},
{
"epoch": 0.8117524283535361,
"grad_norm": 0.15328098506784177,
"learning_rate": 3.461749843049118e-06,
"loss": 0.695,
"step": 3792
},
{
"epoch": 0.8119664980867518,
"grad_norm": 0.14365666565391966,
"learning_rate": 3.4541125980041355e-06,
"loss": 0.6768,
"step": 3793
},
{
"epoch": 0.8121805678199674,
"grad_norm": 0.13844710847516453,
"learning_rate": 3.4464829905104825e-06,
"loss": 0.6777,
"step": 3794
},
{
"epoch": 0.812394637553183,
"grad_norm": 0.14285951761810192,
"learning_rate": 3.438861024089979e-06,
"loss": 0.6714,
"step": 3795
},
{
"epoch": 0.8126087072863986,
"grad_norm": 0.14182521629810915,
"learning_rate": 3.4312467022609154e-06,
"loss": 0.6774,
"step": 3796
},
{
"epoch": 0.8128227770196141,
"grad_norm": 0.14163504457562062,
"learning_rate": 3.423640028538038e-06,
"loss": 0.6751,
"step": 3797
},
{
"epoch": 0.8130368467528297,
"grad_norm": 0.14090485057485672,
"learning_rate": 3.41604100643258e-06,
"loss": 0.6745,
"step": 3798
},
{
"epoch": 0.8132509164860453,
"grad_norm": 0.14091518590969906,
"learning_rate": 3.4084496394522402e-06,
"loss": 0.6799,
"step": 3799
},
{
"epoch": 0.8134649862192609,
"grad_norm": 0.1458713949321835,
"learning_rate": 3.4008659311011714e-06,
"loss": 0.6755,
"step": 3800
},
{
"epoch": 0.8136790559524765,
"grad_norm": 0.14484410753284657,
"learning_rate": 3.39328988488e-06,
"loss": 0.7068,
"step": 3801
},
{
"epoch": 0.8138931256856922,
"grad_norm": 0.13994096185237495,
"learning_rate": 3.385721504285826e-06,
"loss": 0.66,
"step": 3802
},
{
"epoch": 0.8141071954189077,
"grad_norm": 0.14371049552130213,
"learning_rate": 3.378160792812184e-06,
"loss": 0.7139,
"step": 3803
},
{
"epoch": 0.8143212651521233,
"grad_norm": 0.2219343042840987,
"learning_rate": 3.3706077539490933e-06,
"loss": 0.6669,
"step": 3804
},
{
"epoch": 0.8145353348853389,
"grad_norm": 0.1470722347444197,
"learning_rate": 3.3630623911830274e-06,
"loss": 0.7227,
"step": 3805
},
{
"epoch": 0.8147494046185545,
"grad_norm": 0.14029202030288834,
"learning_rate": 3.355524707996902e-06,
"loss": 0.6925,
"step": 3806
},
{
"epoch": 0.8149634743517701,
"grad_norm": 0.1452785248967079,
"learning_rate": 3.347994707870108e-06,
"loss": 0.7249,
"step": 3807
},
{
"epoch": 0.8151775440849857,
"grad_norm": 0.14116977299861644,
"learning_rate": 3.340472394278469e-06,
"loss": 0.6759,
"step": 3808
},
{
"epoch": 0.8153916138182012,
"grad_norm": 0.14288217961441463,
"learning_rate": 3.332957770694276e-06,
"loss": 0.7011,
"step": 3809
},
{
"epoch": 0.8156056835514168,
"grad_norm": 0.14838926978584027,
"learning_rate": 3.3254508405862706e-06,
"loss": 0.6992,
"step": 3810
},
{
"epoch": 0.8158197532846325,
"grad_norm": 0.14343148160581343,
"learning_rate": 3.317951607419627e-06,
"loss": 0.7141,
"step": 3811
},
{
"epoch": 0.8160338230178481,
"grad_norm": 0.1446930044037509,
"learning_rate": 3.3104600746559856e-06,
"loss": 0.6775,
"step": 3812
},
{
"epoch": 0.8162478927510637,
"grad_norm": 0.1441271187485597,
"learning_rate": 3.3029762457534266e-06,
"loss": 0.6914,
"step": 3813
},
{
"epoch": 0.8164619624842793,
"grad_norm": 0.142953090623808,
"learning_rate": 3.295500124166462e-06,
"loss": 0.6901,
"step": 3814
},
{
"epoch": 0.8166760322174949,
"grad_norm": 0.14546662819737374,
"learning_rate": 3.2880317133460628e-06,
"loss": 0.6952,
"step": 3815
},
{
"epoch": 0.8168901019507104,
"grad_norm": 0.14942845844301347,
"learning_rate": 3.2805710167396354e-06,
"loss": 0.7023,
"step": 3816
},
{
"epoch": 0.817104171683926,
"grad_norm": 0.14434334509441404,
"learning_rate": 3.2731180377910167e-06,
"loss": 0.6676,
"step": 3817
},
{
"epoch": 0.8173182414171416,
"grad_norm": 0.1415830452275537,
"learning_rate": 3.2656727799404962e-06,
"loss": 0.6763,
"step": 3818
},
{
"epoch": 0.8175323111503572,
"grad_norm": 0.1414645007118077,
"learning_rate": 3.2582352466247835e-06,
"loss": 0.7006,
"step": 3819
},
{
"epoch": 0.8177463808835729,
"grad_norm": 0.1470960502396605,
"learning_rate": 3.250805441277032e-06,
"loss": 0.7412,
"step": 3820
},
{
"epoch": 0.8179604506167885,
"grad_norm": 0.1458956073202586,
"learning_rate": 3.2433833673268358e-06,
"loss": 0.7096,
"step": 3821
},
{
"epoch": 0.818174520350004,
"grad_norm": 0.14041562626277815,
"learning_rate": 3.2359690282001944e-06,
"loss": 0.6663,
"step": 3822
},
{
"epoch": 0.8183885900832196,
"grad_norm": 0.1418928073863154,
"learning_rate": 3.2285624273195704e-06,
"loss": 0.6799,
"step": 3823
},
{
"epoch": 0.8186026598164352,
"grad_norm": 0.14221149135132968,
"learning_rate": 3.2211635681038223e-06,
"loss": 0.6633,
"step": 3824
},
{
"epoch": 0.8188167295496508,
"grad_norm": 0.1524517490005059,
"learning_rate": 3.2137724539682603e-06,
"loss": 0.7003,
"step": 3825
},
{
"epoch": 0.8190307992828664,
"grad_norm": 0.14454017156920307,
"learning_rate": 3.2063890883245997e-06,
"loss": 0.6845,
"step": 3826
},
{
"epoch": 0.819244869016082,
"grad_norm": 0.14481563853591464,
"learning_rate": 3.1990134745809966e-06,
"loss": 0.7,
"step": 3827
},
{
"epoch": 0.8194589387492975,
"grad_norm": 0.14738696202549195,
"learning_rate": 3.1916456161420207e-06,
"loss": 0.7076,
"step": 3828
},
{
"epoch": 0.8196730084825132,
"grad_norm": 0.1485838201358138,
"learning_rate": 3.1842855164086563e-06,
"loss": 0.7175,
"step": 3829
},
{
"epoch": 0.8198870782157288,
"grad_norm": 0.13992076520294638,
"learning_rate": 3.1769331787783186e-06,
"loss": 0.696,
"step": 3830
},
{
"epoch": 0.8201011479489444,
"grad_norm": 0.1451141799886476,
"learning_rate": 3.1695886066448268e-06,
"loss": 0.7044,
"step": 3831
},
{
"epoch": 0.82031521768216,
"grad_norm": 0.1383239960460572,
"learning_rate": 3.162251803398422e-06,
"loss": 0.6727,
"step": 3832
},
{
"epoch": 0.8205292874153756,
"grad_norm": 0.14559151524010955,
"learning_rate": 3.15492277242577e-06,
"loss": 0.7084,
"step": 3833
},
{
"epoch": 0.8207433571485911,
"grad_norm": 0.14538113334588182,
"learning_rate": 3.1476015171099237e-06,
"loss": 0.6928,
"step": 3834
},
{
"epoch": 0.8209574268818067,
"grad_norm": 0.13710778957341044,
"learning_rate": 3.1402880408303727e-06,
"loss": 0.6889,
"step": 3835
},
{
"epoch": 0.8211714966150223,
"grad_norm": 0.13968201697210011,
"learning_rate": 3.132982346962994e-06,
"loss": 0.6919,
"step": 3836
},
{
"epoch": 0.8213855663482379,
"grad_norm": 0.14069485094599501,
"learning_rate": 3.1256844388800876e-06,
"loss": 0.6817,
"step": 3837
},
{
"epoch": 0.8215996360814536,
"grad_norm": 0.14162967945251165,
"learning_rate": 3.11839431995036e-06,
"loss": 0.6979,
"step": 3838
},
{
"epoch": 0.8218137058146692,
"grad_norm": 0.24692001012059667,
"learning_rate": 3.1111119935389043e-06,
"loss": 0.7072,
"step": 3839
},
{
"epoch": 0.8220277755478848,
"grad_norm": 0.13860231067556303,
"learning_rate": 3.103837463007244e-06,
"loss": 0.6822,
"step": 3840
},
{
"epoch": 0.8222418452811003,
"grad_norm": 0.13930677073500938,
"learning_rate": 3.0965707317132733e-06,
"loss": 0.7099,
"step": 3841
},
{
"epoch": 0.8224559150143159,
"grad_norm": 0.14020374175989106,
"learning_rate": 3.0893118030113125e-06,
"loss": 0.6762,
"step": 3842
},
{
"epoch": 0.8226699847475315,
"grad_norm": 0.14603073170804046,
"learning_rate": 3.0820606802520704e-06,
"loss": 0.7012,
"step": 3843
},
{
"epoch": 0.8228840544807471,
"grad_norm": 0.14466821681364184,
"learning_rate": 3.074817366782645e-06,
"loss": 0.6595,
"step": 3844
},
{
"epoch": 0.8230981242139627,
"grad_norm": 0.14387890059063177,
"learning_rate": 3.067581865946545e-06,
"loss": 0.7005,
"step": 3845
},
{
"epoch": 0.8233121939471783,
"grad_norm": 0.14111563332915836,
"learning_rate": 3.0603541810836535e-06,
"loss": 0.6766,
"step": 3846
},
{
"epoch": 0.8235262636803938,
"grad_norm": 0.1423762382892295,
"learning_rate": 3.053134315530264e-06,
"loss": 0.7019,
"step": 3847
},
{
"epoch": 0.8237403334136095,
"grad_norm": 0.14122892803872422,
"learning_rate": 3.0459222726190572e-06,
"loss": 0.6715,
"step": 3848
},
{
"epoch": 0.8239544031468251,
"grad_norm": 0.14282683189266973,
"learning_rate": 3.0387180556790885e-06,
"loss": 0.7026,
"step": 3849
},
{
"epoch": 0.8241684728800407,
"grad_norm": 0.1420137311560712,
"learning_rate": 3.0315216680358197e-06,
"loss": 0.7198,
"step": 3850
},
{
"epoch": 0.8243825426132563,
"grad_norm": 0.13842001787256472,
"learning_rate": 3.0243331130110844e-06,
"loss": 0.6911,
"step": 3851
},
{
"epoch": 0.8245966123464719,
"grad_norm": 0.14723954474387052,
"learning_rate": 3.0171523939231085e-06,
"loss": 0.7183,
"step": 3852
},
{
"epoch": 0.8248106820796874,
"grad_norm": 0.14031946831048728,
"learning_rate": 3.009979514086503e-06,
"loss": 0.6949,
"step": 3853
},
{
"epoch": 0.825024751812903,
"grad_norm": 0.13896601011476556,
"learning_rate": 3.002814476812248e-06,
"loss": 0.7005,
"step": 3854
},
{
"epoch": 0.8252388215461186,
"grad_norm": 0.140336415584225,
"learning_rate": 2.9956572854077205e-06,
"loss": 0.7058,
"step": 3855
},
{
"epoch": 0.8254528912793342,
"grad_norm": 0.14237152962164493,
"learning_rate": 2.988507943176657e-06,
"loss": 0.6981,
"step": 3856
},
{
"epoch": 0.8256669610125499,
"grad_norm": 0.14474808939258405,
"learning_rate": 2.981366453419188e-06,
"loss": 0.6757,
"step": 3857
},
{
"epoch": 0.8258810307457655,
"grad_norm": 0.14428721558822039,
"learning_rate": 2.974232819431815e-06,
"loss": 0.6803,
"step": 3858
},
{
"epoch": 0.826095100478981,
"grad_norm": 0.14696435302289199,
"learning_rate": 2.967107044507398e-06,
"loss": 0.7367,
"step": 3859
},
{
"epoch": 0.8263091702121966,
"grad_norm": 0.28037210468520757,
"learning_rate": 2.959989131935197e-06,
"loss": 0.693,
"step": 3860
},
{
"epoch": 0.8265232399454122,
"grad_norm": 0.1448533722417162,
"learning_rate": 2.9528790850008127e-06,
"loss": 0.7079,
"step": 3861
},
{
"epoch": 0.8267373096786278,
"grad_norm": 0.14752679242510425,
"learning_rate": 2.9457769069862395e-06,
"loss": 0.7179,
"step": 3862
},
{
"epoch": 0.8269513794118434,
"grad_norm": 0.13785014223553663,
"learning_rate": 2.9386826011698286e-06,
"loss": 0.6684,
"step": 3863
},
{
"epoch": 0.827165449145059,
"grad_norm": 0.1457392300989148,
"learning_rate": 2.931596170826294e-06,
"loss": 0.7045,
"step": 3864
},
{
"epoch": 0.8273795188782745,
"grad_norm": 0.14128710615788895,
"learning_rate": 2.9245176192267276e-06,
"loss": 0.7002,
"step": 3865
},
{
"epoch": 0.8275935886114902,
"grad_norm": 0.14166476392919206,
"learning_rate": 2.9174469496385648e-06,
"loss": 0.6694,
"step": 3866
},
{
"epoch": 0.8278076583447058,
"grad_norm": 0.14053579966386573,
"learning_rate": 2.9103841653256238e-06,
"loss": 0.6735,
"step": 3867
},
{
"epoch": 0.8280217280779214,
"grad_norm": 0.14390292930132934,
"learning_rate": 2.903329269548063e-06,
"loss": 0.6931,
"step": 3868
},
{
"epoch": 0.828235797811137,
"grad_norm": 0.14094241244359185,
"learning_rate": 2.8962822655624155e-06,
"loss": 0.7051,
"step": 3869
},
{
"epoch": 0.8284498675443526,
"grad_norm": 0.14289625621208796,
"learning_rate": 2.8892431566215685e-06,
"loss": 0.701,
"step": 3870
},
{
"epoch": 0.8286639372775682,
"grad_norm": 0.14272735146508855,
"learning_rate": 2.8822119459747534e-06,
"loss": 0.6844,
"step": 3871
},
{
"epoch": 0.8288780070107837,
"grad_norm": 0.1388613092953752,
"learning_rate": 2.8751886368675742e-06,
"loss": 0.7012,
"step": 3872
},
{
"epoch": 0.8290920767439993,
"grad_norm": 0.13846642983613058,
"learning_rate": 2.8681732325419666e-06,
"loss": 0.6712,
"step": 3873
},
{
"epoch": 0.8293061464772149,
"grad_norm": 0.14623432135979536,
"learning_rate": 2.8611657362362354e-06,
"loss": 0.7462,
"step": 3874
},
{
"epoch": 0.8295202162104306,
"grad_norm": 0.14847826703160524,
"learning_rate": 2.8541661511850295e-06,
"loss": 0.6931,
"step": 3875
},
{
"epoch": 0.8297342859436462,
"grad_norm": 0.14505536712986322,
"learning_rate": 2.8471744806193367e-06,
"loss": 0.7103,
"step": 3876
},
{
"epoch": 0.8299483556768618,
"grad_norm": 0.13743644349343906,
"learning_rate": 2.8401907277665096e-06,
"loss": 0.6591,
"step": 3877
},
{
"epoch": 0.8301624254100773,
"grad_norm": 0.18343752688834064,
"learning_rate": 2.8332148958502247e-06,
"loss": 0.6752,
"step": 3878
},
{
"epoch": 0.8303764951432929,
"grad_norm": 0.17207799635199497,
"learning_rate": 2.82624698809052e-06,
"loss": 0.6539,
"step": 3879
},
{
"epoch": 0.8305905648765085,
"grad_norm": 0.14449348982871577,
"learning_rate": 2.819287007703773e-06,
"loss": 0.6847,
"step": 3880
},
{
"epoch": 0.8308046346097241,
"grad_norm": 0.13911071475543194,
"learning_rate": 2.812334957902685e-06,
"loss": 0.7,
"step": 3881
},
{
"epoch": 0.8310187043429397,
"grad_norm": 0.1455899341813523,
"learning_rate": 2.8053908418963205e-06,
"loss": 0.7362,
"step": 3882
},
{
"epoch": 0.8312327740761553,
"grad_norm": 0.13809931159925534,
"learning_rate": 2.798454662890069e-06,
"loss": 0.6827,
"step": 3883
},
{
"epoch": 0.831446843809371,
"grad_norm": 0.1427053366283101,
"learning_rate": 2.7915264240856554e-06,
"loss": 0.6816,
"step": 3884
},
{
"epoch": 0.8316609135425865,
"grad_norm": 0.143272006365014,
"learning_rate": 2.78460612868114e-06,
"loss": 0.7046,
"step": 3885
},
{
"epoch": 0.8318749832758021,
"grad_norm": 0.14430443102194687,
"learning_rate": 2.777693779870927e-06,
"loss": 0.6726,
"step": 3886
},
{
"epoch": 0.8320890530090177,
"grad_norm": 0.13592084650091263,
"learning_rate": 2.7707893808457355e-06,
"loss": 0.6654,
"step": 3887
},
{
"epoch": 0.8323031227422333,
"grad_norm": 0.14487192191418882,
"learning_rate": 2.7638929347926245e-06,
"loss": 0.7002,
"step": 3888
},
{
"epoch": 0.8325171924754489,
"grad_norm": 0.14223244399105567,
"learning_rate": 2.7570044448949886e-06,
"loss": 0.6767,
"step": 3889
},
{
"epoch": 0.8327312622086644,
"grad_norm": 0.14475070977830992,
"learning_rate": 2.750123914332532e-06,
"loss": 0.6861,
"step": 3890
},
{
"epoch": 0.83294533194188,
"grad_norm": 0.1380380931616058,
"learning_rate": 2.743251346281297e-06,
"loss": 0.6816,
"step": 3891
},
{
"epoch": 0.8331594016750956,
"grad_norm": 0.1439929693576951,
"learning_rate": 2.7363867439136572e-06,
"loss": 0.7053,
"step": 3892
},
{
"epoch": 0.8333734714083113,
"grad_norm": 0.14272699628805557,
"learning_rate": 2.7295301103982906e-06,
"loss": 0.6921,
"step": 3893
},
{
"epoch": 0.8335875411415269,
"grad_norm": 0.14647739149487252,
"learning_rate": 2.722681448900213e-06,
"loss": 0.7005,
"step": 3894
},
{
"epoch": 0.8338016108747425,
"grad_norm": 0.14308375221398376,
"learning_rate": 2.715840762580748e-06,
"loss": 0.6926,
"step": 3895
},
{
"epoch": 0.834015680607958,
"grad_norm": 0.13914110296548118,
"learning_rate": 2.709008054597546e-06,
"loss": 0.6864,
"step": 3896
},
{
"epoch": 0.8342297503411736,
"grad_norm": 0.14306722460179636,
"learning_rate": 2.7021833281045796e-06,
"loss": 0.6922,
"step": 3897
},
{
"epoch": 0.8344438200743892,
"grad_norm": 0.14054438178321524,
"learning_rate": 2.6953665862521174e-06,
"loss": 0.7107,
"step": 3898
},
{
"epoch": 0.8346578898076048,
"grad_norm": 0.14759096236273558,
"learning_rate": 2.688557832186762e-06,
"loss": 0.6842,
"step": 3899
},
{
"epoch": 0.8348719595408204,
"grad_norm": 0.1413651806438682,
"learning_rate": 2.681757069051427e-06,
"loss": 0.6935,
"step": 3900
},
{
"epoch": 0.835086029274036,
"grad_norm": 0.14037485380233253,
"learning_rate": 2.674964299985321e-06,
"loss": 0.6776,
"step": 3901
},
{
"epoch": 0.8353000990072517,
"grad_norm": 0.14262034762857348,
"learning_rate": 2.6681795281239866e-06,
"loss": 0.6992,
"step": 3902
},
{
"epoch": 0.8355141687404672,
"grad_norm": 0.14717500778750456,
"learning_rate": 2.6614027565992473e-06,
"loss": 0.6866,
"step": 3903
},
{
"epoch": 0.8357282384736828,
"grad_norm": 0.13603128330850944,
"learning_rate": 2.6546339885392568e-06,
"loss": 0.6782,
"step": 3904
},
{
"epoch": 0.8359423082068984,
"grad_norm": 0.14291650916023088,
"learning_rate": 2.647873227068469e-06,
"loss": 0.6913,
"step": 3905
},
{
"epoch": 0.836156377940114,
"grad_norm": 0.13719684474729255,
"learning_rate": 2.6411204753076325e-06,
"loss": 0.6948,
"step": 3906
},
{
"epoch": 0.8363704476733296,
"grad_norm": 0.14062577396152376,
"learning_rate": 2.634375736373811e-06,
"loss": 0.6841,
"step": 3907
},
{
"epoch": 0.8365845174065452,
"grad_norm": 0.14660486128751915,
"learning_rate": 2.6276390133803585e-06,
"loss": 0.7241,
"step": 3908
},
{
"epoch": 0.8367985871397607,
"grad_norm": 0.14093723627470145,
"learning_rate": 2.620910309436937e-06,
"loss": 0.6936,
"step": 3909
},
{
"epoch": 0.8370126568729763,
"grad_norm": 0.14304147247852791,
"learning_rate": 2.6141896276495015e-06,
"loss": 0.7202,
"step": 3910
},
{
"epoch": 0.837226726606192,
"grad_norm": 0.13957784476811416,
"learning_rate": 2.6074769711203062e-06,
"loss": 0.7017,
"step": 3911
},
{
"epoch": 0.8374407963394076,
"grad_norm": 0.1443129027083466,
"learning_rate": 2.600772342947908e-06,
"loss": 0.7173,
"step": 3912
},
{
"epoch": 0.8376548660726232,
"grad_norm": 0.14499795810455413,
"learning_rate": 2.5940757462271405e-06,
"loss": 0.6996,
"step": 3913
},
{
"epoch": 0.8378689358058388,
"grad_norm": 0.13608416467777257,
"learning_rate": 2.5873871840491504e-06,
"loss": 0.6648,
"step": 3914
},
{
"epoch": 0.8380830055390543,
"grad_norm": 0.14263848842725366,
"learning_rate": 2.5807066595013574e-06,
"loss": 0.7054,
"step": 3915
},
{
"epoch": 0.8382970752722699,
"grad_norm": 0.1390153659503591,
"learning_rate": 2.5740341756674813e-06,
"loss": 0.6989,
"step": 3916
},
{
"epoch": 0.8385111450054855,
"grad_norm": 0.14440777083763068,
"learning_rate": 2.5673697356275364e-06,
"loss": 0.7063,
"step": 3917
},
{
"epoch": 0.8387252147387011,
"grad_norm": 0.1370223863651042,
"learning_rate": 2.560713342457806e-06,
"loss": 0.6645,
"step": 3918
},
{
"epoch": 0.8389392844719167,
"grad_norm": 0.14239714949739304,
"learning_rate": 2.554064999230876e-06,
"loss": 0.6911,
"step": 3919
},
{
"epoch": 0.8391533542051324,
"grad_norm": 0.13658205241746624,
"learning_rate": 2.5474247090156025e-06,
"loss": 0.6618,
"step": 3920
},
{
"epoch": 0.839367423938348,
"grad_norm": 0.1414954085156795,
"learning_rate": 2.540792474877134e-06,
"loss": 0.6789,
"step": 3921
},
{
"epoch": 0.8395814936715635,
"grad_norm": 0.13893306653794818,
"learning_rate": 2.5341682998769045e-06,
"loss": 0.6944,
"step": 3922
},
{
"epoch": 0.8397955634047791,
"grad_norm": 0.14402246315833805,
"learning_rate": 2.5275521870726107e-06,
"loss": 0.7252,
"step": 3923
},
{
"epoch": 0.8400096331379947,
"grad_norm": 0.13976737539025286,
"learning_rate": 2.5209441395182444e-06,
"loss": 0.6739,
"step": 3924
},
{
"epoch": 0.8402237028712103,
"grad_norm": 0.13960447481763935,
"learning_rate": 2.5143441602640662e-06,
"loss": 0.6841,
"step": 3925
},
{
"epoch": 0.8404377726044259,
"grad_norm": 0.13697710272548283,
"learning_rate": 2.5077522523566123e-06,
"loss": 0.6965,
"step": 3926
},
{
"epoch": 0.8406518423376415,
"grad_norm": 0.1474640548603916,
"learning_rate": 2.5011684188387044e-06,
"loss": 0.6872,
"step": 3927
},
{
"epoch": 0.840865912070857,
"grad_norm": 0.1406357364898473,
"learning_rate": 2.4945926627494154e-06,
"loss": 0.7,
"step": 3928
},
{
"epoch": 0.8410799818040727,
"grad_norm": 0.1406116770497962,
"learning_rate": 2.4880249871241135e-06,
"loss": 0.6694,
"step": 3929
},
{
"epoch": 0.8412940515372883,
"grad_norm": 0.16032841037283024,
"learning_rate": 2.4814653949944157e-06,
"loss": 0.7324,
"step": 3930
},
{
"epoch": 0.8415081212705039,
"grad_norm": 0.13927806938891313,
"learning_rate": 2.474913889388222e-06,
"loss": 0.7026,
"step": 3931
},
{
"epoch": 0.8417221910037195,
"grad_norm": 0.14112975682958137,
"learning_rate": 2.468370473329702e-06,
"loss": 0.6777,
"step": 3932
},
{
"epoch": 0.8419362607369351,
"grad_norm": 0.13985389682414093,
"learning_rate": 2.4618351498392735e-06,
"loss": 0.6811,
"step": 3933
},
{
"epoch": 0.8421503304701506,
"grad_norm": 0.1354473401959006,
"learning_rate": 2.4553079219336385e-06,
"loss": 0.6678,
"step": 3934
},
{
"epoch": 0.8423644002033662,
"grad_norm": 0.13276337595163643,
"learning_rate": 2.448788792625747e-06,
"loss": 0.6616,
"step": 3935
},
{
"epoch": 0.8425784699365818,
"grad_norm": 0.1401646446893855,
"learning_rate": 2.4422777649248186e-06,
"loss": 0.685,
"step": 3936
},
{
"epoch": 0.8427925396697974,
"grad_norm": 0.1422983340353118,
"learning_rate": 2.435774841836338e-06,
"loss": 0.6645,
"step": 3937
},
{
"epoch": 0.8430066094030131,
"grad_norm": 0.14091992521337854,
"learning_rate": 2.4292800263620354e-06,
"loss": 0.6835,
"step": 3938
},
{
"epoch": 0.8432206791362287,
"grad_norm": 0.155299653523915,
"learning_rate": 2.42279332149991e-06,
"loss": 0.7074,
"step": 3939
},
{
"epoch": 0.8434347488694443,
"grad_norm": 0.14304814131572857,
"learning_rate": 2.416314730244207e-06,
"loss": 0.6992,
"step": 3940
},
{
"epoch": 0.8436488186026598,
"grad_norm": 0.138437758159917,
"learning_rate": 2.4098442555854386e-06,
"loss": 0.6718,
"step": 3941
},
{
"epoch": 0.8438628883358754,
"grad_norm": 0.1437415045430234,
"learning_rate": 2.403381900510364e-06,
"loss": 0.6888,
"step": 3942
},
{
"epoch": 0.844076958069091,
"grad_norm": 0.14302757384348427,
"learning_rate": 2.396927668001987e-06,
"loss": 0.6965,
"step": 3943
},
{
"epoch": 0.8442910278023066,
"grad_norm": 0.14530139265461148,
"learning_rate": 2.3904815610395816e-06,
"loss": 0.6862,
"step": 3944
},
{
"epoch": 0.8445050975355222,
"grad_norm": 0.16416906748599933,
"learning_rate": 2.384043582598645e-06,
"loss": 0.6675,
"step": 3945
},
{
"epoch": 0.8447191672687377,
"grad_norm": 0.14360171170288485,
"learning_rate": 2.3776137356509455e-06,
"loss": 0.6786,
"step": 3946
},
{
"epoch": 0.8449332370019534,
"grad_norm": 0.21768664667533322,
"learning_rate": 2.3711920231644902e-06,
"loss": 0.6778,
"step": 3947
},
{
"epoch": 0.845147306735169,
"grad_norm": 0.13587049456150121,
"learning_rate": 2.364778448103524e-06,
"loss": 0.6682,
"step": 3948
},
{
"epoch": 0.8453613764683846,
"grad_norm": 0.1422460530792501,
"learning_rate": 2.3583730134285453e-06,
"loss": 0.6773,
"step": 3949
},
{
"epoch": 0.8455754462016002,
"grad_norm": 0.1397936301287952,
"learning_rate": 2.3519757220962847e-06,
"loss": 0.6886,
"step": 3950
},
{
"epoch": 0.8457895159348158,
"grad_norm": 0.1398299869684245,
"learning_rate": 2.345586577059731e-06,
"loss": 0.6643,
"step": 3951
},
{
"epoch": 0.8460035856680314,
"grad_norm": 0.139695615788581,
"learning_rate": 2.339205581268089e-06,
"loss": 0.6986,
"step": 3952
},
{
"epoch": 0.8462176554012469,
"grad_norm": 0.14577144515660195,
"learning_rate": 2.3328327376668237e-06,
"loss": 0.6841,
"step": 3953
},
{
"epoch": 0.8464317251344625,
"grad_norm": 0.1384862785022817,
"learning_rate": 2.32646804919763e-06,
"loss": 0.6623,
"step": 3954
},
{
"epoch": 0.8466457948676781,
"grad_norm": 0.14203223741428794,
"learning_rate": 2.320111518798427e-06,
"loss": 0.6709,
"step": 3955
},
{
"epoch": 0.8468598646008937,
"grad_norm": 0.1378325964817421,
"learning_rate": 2.3137631494033853e-06,
"loss": 0.7027,
"step": 3956
},
{
"epoch": 0.8470739343341094,
"grad_norm": 0.13838009661720402,
"learning_rate": 2.3074229439428964e-06,
"loss": 0.6772,
"step": 3957
},
{
"epoch": 0.847288004067325,
"grad_norm": 0.14114303651345592,
"learning_rate": 2.301090905343586e-06,
"loss": 0.7014,
"step": 3958
},
{
"epoch": 0.8475020738005405,
"grad_norm": 0.1398378489868303,
"learning_rate": 2.29476703652832e-06,
"loss": 0.6861,
"step": 3959
},
{
"epoch": 0.8477161435337561,
"grad_norm": 0.16828877180724813,
"learning_rate": 2.288451340416178e-06,
"loss": 0.6852,
"step": 3960
},
{
"epoch": 0.8479302132669717,
"grad_norm": 0.1415004020669396,
"learning_rate": 2.2821438199224756e-06,
"loss": 0.6754,
"step": 3961
},
{
"epoch": 0.8481442830001873,
"grad_norm": 0.13877695917097002,
"learning_rate": 2.2758444779587487e-06,
"loss": 0.6752,
"step": 3962
},
{
"epoch": 0.8483583527334029,
"grad_norm": 0.14433666441866894,
"learning_rate": 2.2695533174327667e-06,
"loss": 0.7113,
"step": 3963
},
{
"epoch": 0.8485724224666185,
"grad_norm": 0.14307448115144958,
"learning_rate": 2.263270341248518e-06,
"loss": 0.6886,
"step": 3964
},
{
"epoch": 0.848786492199834,
"grad_norm": 0.13571935435252858,
"learning_rate": 2.2569955523062093e-06,
"loss": 0.6711,
"step": 3965
},
{
"epoch": 0.8490005619330497,
"grad_norm": 0.13748358307770517,
"learning_rate": 2.2507289535022747e-06,
"loss": 0.6417,
"step": 3966
},
{
"epoch": 0.8492146316662653,
"grad_norm": 0.13798070078416347,
"learning_rate": 2.244470547729365e-06,
"loss": 0.6861,
"step": 3967
},
{
"epoch": 0.8494287013994809,
"grad_norm": 0.14618650486624518,
"learning_rate": 2.2382203378763466e-06,
"loss": 0.6687,
"step": 3968
},
{
"epoch": 0.8496427711326965,
"grad_norm": 0.13862003490689262,
"learning_rate": 2.2319783268283037e-06,
"loss": 0.6556,
"step": 3969
},
{
"epoch": 0.8498568408659121,
"grad_norm": 0.18601606585048341,
"learning_rate": 2.225744517466546e-06,
"loss": 0.7012,
"step": 3970
},
{
"epoch": 0.8500709105991276,
"grad_norm": 0.13825824549501115,
"learning_rate": 2.2195189126685746e-06,
"loss": 0.6855,
"step": 3971
},
{
"epoch": 0.8502849803323432,
"grad_norm": 0.2125788653793348,
"learning_rate": 2.2133015153081283e-06,
"loss": 0.6751,
"step": 3972
},
{
"epoch": 0.8504990500655588,
"grad_norm": 0.14006030798093297,
"learning_rate": 2.2070923282551447e-06,
"loss": 0.686,
"step": 3973
},
{
"epoch": 0.8507131197987744,
"grad_norm": 0.14281909317579342,
"learning_rate": 2.2008913543757673e-06,
"loss": 0.6904,
"step": 3974
},
{
"epoch": 0.8509271895319901,
"grad_norm": 0.14249985452681854,
"learning_rate": 2.1946985965323584e-06,
"loss": 0.6949,
"step": 3975
},
{
"epoch": 0.8511412592652057,
"grad_norm": 0.14261949527007434,
"learning_rate": 2.1885140575834862e-06,
"loss": 0.701,
"step": 3976
},
{
"epoch": 0.8513553289984213,
"grad_norm": 0.1353749900762074,
"learning_rate": 2.1823377403839176e-06,
"loss": 0.6786,
"step": 3977
},
{
"epoch": 0.8515693987316368,
"grad_norm": 0.14062008652826552,
"learning_rate": 2.1761696477846296e-06,
"loss": 0.6875,
"step": 3978
},
{
"epoch": 0.8517834684648524,
"grad_norm": 0.14372691293050957,
"learning_rate": 2.1700097826328116e-06,
"loss": 0.7095,
"step": 3979
},
{
"epoch": 0.851997538198068,
"grad_norm": 0.14027054705726305,
"learning_rate": 2.1638581477718313e-06,
"loss": 0.6967,
"step": 3980
},
{
"epoch": 0.8522116079312836,
"grad_norm": 0.1438960621108349,
"learning_rate": 2.157714746041286e-06,
"loss": 0.6933,
"step": 3981
},
{
"epoch": 0.8524256776644992,
"grad_norm": 0.1368611912657314,
"learning_rate": 2.151579580276948e-06,
"loss": 0.6867,
"step": 3982
},
{
"epoch": 0.8526397473977148,
"grad_norm": 0.143154276663767,
"learning_rate": 2.1454526533108024e-06,
"loss": 0.6693,
"step": 3983
},
{
"epoch": 0.8528538171309304,
"grad_norm": 0.14045446781032014,
"learning_rate": 2.139333967971031e-06,
"loss": 0.7029,
"step": 3984
},
{
"epoch": 0.853067886864146,
"grad_norm": 0.140086411666049,
"learning_rate": 2.133223527082002e-06,
"loss": 0.6762,
"step": 3985
},
{
"epoch": 0.8532819565973616,
"grad_norm": 0.1418318463075373,
"learning_rate": 2.1271213334642902e-06,
"loss": 0.7049,
"step": 3986
},
{
"epoch": 0.8534960263305772,
"grad_norm": 0.13948421996434165,
"learning_rate": 2.121027389934649e-06,
"loss": 0.6896,
"step": 3987
},
{
"epoch": 0.8537100960637928,
"grad_norm": 0.13862761525730624,
"learning_rate": 2.114941699306037e-06,
"loss": 0.7058,
"step": 3988
},
{
"epoch": 0.8539241657970084,
"grad_norm": 0.13840602652849981,
"learning_rate": 2.108864264387598e-06,
"loss": 0.6903,
"step": 3989
},
{
"epoch": 0.8541382355302239,
"grad_norm": 0.14001467256463854,
"learning_rate": 2.1027950879846615e-06,
"loss": 0.7052,
"step": 3990
},
{
"epoch": 0.8543523052634395,
"grad_norm": 0.13582098398945472,
"learning_rate": 2.0967341728987554e-06,
"loss": 0.6872,
"step": 3991
},
{
"epoch": 0.8545663749966551,
"grad_norm": 0.13998492598632672,
"learning_rate": 2.0906815219275756e-06,
"loss": 0.6864,
"step": 3992
},
{
"epoch": 0.8547804447298708,
"grad_norm": 0.1409050434646671,
"learning_rate": 2.0846371378650267e-06,
"loss": 0.6602,
"step": 3993
},
{
"epoch": 0.8549945144630864,
"grad_norm": 0.14229180526897744,
"learning_rate": 2.0786010235011745e-06,
"loss": 0.6781,
"step": 3994
},
{
"epoch": 0.855208584196302,
"grad_norm": 0.14197897212796198,
"learning_rate": 2.0725731816222836e-06,
"loss": 0.717,
"step": 3995
},
{
"epoch": 0.8554226539295176,
"grad_norm": 0.13879021687546958,
"learning_rate": 2.0665536150108e-06,
"loss": 0.6677,
"step": 3996
},
{
"epoch": 0.8556367236627331,
"grad_norm": 0.15826009454899004,
"learning_rate": 2.060542326445334e-06,
"loss": 0.6991,
"step": 3997
},
{
"epoch": 0.8558507933959487,
"grad_norm": 0.13594920436812258,
"learning_rate": 2.0545393187006945e-06,
"loss": 0.676,
"step": 3998
},
{
"epoch": 0.8560648631291643,
"grad_norm": 0.1410472132816311,
"learning_rate": 2.04854459454785e-06,
"loss": 0.6826,
"step": 3999
},
{
"epoch": 0.8562789328623799,
"grad_norm": 0.13966756865622357,
"learning_rate": 2.0425581567539597e-06,
"loss": 0.6944,
"step": 4000
},
{
"epoch": 0.8564930025955955,
"grad_norm": 0.1405657923960637,
"learning_rate": 2.0365800080823583e-06,
"loss": 0.6926,
"step": 4001
},
{
"epoch": 0.8567070723288112,
"grad_norm": 0.13870469508337874,
"learning_rate": 2.0306101512925357e-06,
"loss": 0.6523,
"step": 4002
},
{
"epoch": 0.8569211420620267,
"grad_norm": 0.1373519683059727,
"learning_rate": 2.0246485891401768e-06,
"loss": 0.7187,
"step": 4003
},
{
"epoch": 0.8571352117952423,
"grad_norm": 0.14028813574062063,
"learning_rate": 2.01869532437712e-06,
"loss": 0.6978,
"step": 4004
},
{
"epoch": 0.8573492815284579,
"grad_norm": 0.13781274742678487,
"learning_rate": 2.0127503597513877e-06,
"loss": 0.7051,
"step": 4005
},
{
"epoch": 0.8575633512616735,
"grad_norm": 0.13973132085958173,
"learning_rate": 2.006813698007164e-06,
"loss": 0.6982,
"step": 4006
},
{
"epoch": 0.8577774209948891,
"grad_norm": 0.14016676711135564,
"learning_rate": 2.0008853418847952e-06,
"loss": 0.6933,
"step": 4007
},
{
"epoch": 0.8579914907281047,
"grad_norm": 0.1461285120595108,
"learning_rate": 1.99496529412081e-06,
"loss": 0.7013,
"step": 4008
},
{
"epoch": 0.8582055604613202,
"grad_norm": 0.13781601790971343,
"learning_rate": 1.98905355744788e-06,
"loss": 0.6718,
"step": 4009
},
{
"epoch": 0.8584196301945358,
"grad_norm": 0.1375702362955899,
"learning_rate": 1.9831501345948578e-06,
"loss": 0.6823,
"step": 4010
},
{
"epoch": 0.8586336999277515,
"grad_norm": 0.14192092140244358,
"learning_rate": 1.9772550282867554e-06,
"loss": 0.6916,
"step": 4011
},
{
"epoch": 0.8588477696609671,
"grad_norm": 0.1423477032392777,
"learning_rate": 1.9713682412447377e-06,
"loss": 0.6693,
"step": 4012
},
{
"epoch": 0.8590618393941827,
"grad_norm": 0.14418711694126216,
"learning_rate": 1.9654897761861404e-06,
"loss": 0.7048,
"step": 4013
},
{
"epoch": 0.8592759091273983,
"grad_norm": 0.13848213193437015,
"learning_rate": 1.9596196358244434e-06,
"loss": 0.6694,
"step": 4014
},
{
"epoch": 0.8594899788606138,
"grad_norm": 0.13818260095241675,
"learning_rate": 1.9537578228693e-06,
"loss": 0.6819,
"step": 4015
},
{
"epoch": 0.8597040485938294,
"grad_norm": 0.14230002086414403,
"learning_rate": 1.947904340026514e-06,
"loss": 0.6929,
"step": 4016
},
{
"epoch": 0.859918118327045,
"grad_norm": 0.1362836947439643,
"learning_rate": 1.9420591899980357e-06,
"loss": 0.6675,
"step": 4017
},
{
"epoch": 0.8601321880602606,
"grad_norm": 0.1342696395775042,
"learning_rate": 1.936222375481982e-06,
"loss": 0.6619,
"step": 4018
},
{
"epoch": 0.8603462577934762,
"grad_norm": 0.1392639990911854,
"learning_rate": 1.930393899172611e-06,
"loss": 0.6682,
"step": 4019
},
{
"epoch": 0.8605603275266919,
"grad_norm": 0.14557138905120687,
"learning_rate": 1.9245737637603357e-06,
"loss": 0.6903,
"step": 4020
},
{
"epoch": 0.8607743972599075,
"grad_norm": 0.13880560912456477,
"learning_rate": 1.9187619719317286e-06,
"loss": 0.6616,
"step": 4021
},
{
"epoch": 0.860988466993123,
"grad_norm": 0.13649539154107124,
"learning_rate": 1.9129585263694904e-06,
"loss": 0.6835,
"step": 4022
},
{
"epoch": 0.8612025367263386,
"grad_norm": 0.1411681513042414,
"learning_rate": 1.9071634297524921e-06,
"loss": 0.7097,
"step": 4023
},
{
"epoch": 0.8614166064595542,
"grad_norm": 0.2129660008473166,
"learning_rate": 1.9013766847557292e-06,
"loss": 0.6706,
"step": 4024
},
{
"epoch": 0.8616306761927698,
"grad_norm": 0.13977040417070688,
"learning_rate": 1.895598294050358e-06,
"loss": 0.6828,
"step": 4025
},
{
"epoch": 0.8618447459259854,
"grad_norm": 0.1418421415881163,
"learning_rate": 1.8898282603036788e-06,
"loss": 0.7129,
"step": 4026
},
{
"epoch": 0.862058815659201,
"grad_norm": 0.1380293165353464,
"learning_rate": 1.8840665861791164e-06,
"loss": 0.6716,
"step": 4027
},
{
"epoch": 0.8622728853924165,
"grad_norm": 1.1070614596771704,
"learning_rate": 1.8783132743362608e-06,
"loss": 0.7131,
"step": 4028
},
{
"epoch": 0.8624869551256322,
"grad_norm": 0.13706533146576322,
"learning_rate": 1.8725683274308192e-06,
"loss": 0.6791,
"step": 4029
},
{
"epoch": 0.8627010248588478,
"grad_norm": 0.13937686157761398,
"learning_rate": 1.8668317481146546e-06,
"loss": 0.6935,
"step": 4030
},
{
"epoch": 0.8629150945920634,
"grad_norm": 0.1407002418282179,
"learning_rate": 1.8611035390357667e-06,
"loss": 0.6827,
"step": 4031
},
{
"epoch": 0.863129164325279,
"grad_norm": 0.1395748879602978,
"learning_rate": 1.8553837028382738e-06,
"loss": 0.6962,
"step": 4032
},
{
"epoch": 0.8633432340584946,
"grad_norm": 0.14068603560572177,
"learning_rate": 1.8496722421624547e-06,
"loss": 0.6925,
"step": 4033
},
{
"epoch": 0.8635573037917101,
"grad_norm": 0.13945254285115313,
"learning_rate": 1.8439691596446985e-06,
"loss": 0.679,
"step": 4034
},
{
"epoch": 0.8637713735249257,
"grad_norm": 0.13515587483722066,
"learning_rate": 1.838274457917546e-06,
"loss": 0.6769,
"step": 4035
},
{
"epoch": 0.8639854432581413,
"grad_norm": 0.14005354445109036,
"learning_rate": 1.8325881396096546e-06,
"loss": 0.7014,
"step": 4036
},
{
"epoch": 0.8641995129913569,
"grad_norm": 0.3089511311556019,
"learning_rate": 1.82691020734582e-06,
"loss": 0.6876,
"step": 4037
},
{
"epoch": 0.8644135827245726,
"grad_norm": 0.1394864161854626,
"learning_rate": 1.8212406637469704e-06,
"loss": 0.689,
"step": 4038
},
{
"epoch": 0.8646276524577882,
"grad_norm": 0.1359203093816971,
"learning_rate": 1.81557951143015e-06,
"loss": 0.6623,
"step": 4039
},
{
"epoch": 0.8648417221910037,
"grad_norm": 0.13586191488478291,
"learning_rate": 1.8099267530085419e-06,
"loss": 0.6786,
"step": 4040
},
{
"epoch": 0.8650557919242193,
"grad_norm": 0.1379347374636731,
"learning_rate": 1.8042823910914431e-06,
"loss": 0.6899,
"step": 4041
},
{
"epoch": 0.8652698616574349,
"grad_norm": 0.1427146555224502,
"learning_rate": 1.798646428284283e-06,
"loss": 0.7209,
"step": 4042
},
{
"epoch": 0.8654839313906505,
"grad_norm": 0.14033994833419458,
"learning_rate": 1.7930188671886183e-06,
"loss": 0.7096,
"step": 4043
},
{
"epoch": 0.8656980011238661,
"grad_norm": 0.5308944134458937,
"learning_rate": 1.7873997104021111e-06,
"loss": 0.6957,
"step": 4044
},
{
"epoch": 0.8659120708570817,
"grad_norm": 0.14445543716033282,
"learning_rate": 1.7817889605185557e-06,
"loss": 0.7236,
"step": 4045
},
{
"epoch": 0.8661261405902972,
"grad_norm": 0.14008048343161253,
"learning_rate": 1.7761866201278732e-06,
"loss": 0.7184,
"step": 4046
},
{
"epoch": 0.8663402103235129,
"grad_norm": 0.1428713195925786,
"learning_rate": 1.770592691816082e-06,
"loss": 0.7001,
"step": 4047
},
{
"epoch": 0.8665542800567285,
"grad_norm": 0.13973365555288117,
"learning_rate": 1.7650071781653343e-06,
"loss": 0.7003,
"step": 4048
},
{
"epoch": 0.8667683497899441,
"grad_norm": 0.14335127799622926,
"learning_rate": 1.7594300817538945e-06,
"loss": 0.6965,
"step": 4049
},
{
"epoch": 0.8669824195231597,
"grad_norm": 0.13719063963547518,
"learning_rate": 1.7538614051561365e-06,
"loss": 0.6943,
"step": 4050
},
{
"epoch": 0.8671964892563753,
"grad_norm": 0.14118989738150972,
"learning_rate": 1.7483011509425573e-06,
"loss": 0.7035,
"step": 4051
},
{
"epoch": 0.8674105589895909,
"grad_norm": 0.14086567309607934,
"learning_rate": 1.7427493216797509e-06,
"loss": 0.6658,
"step": 4052
},
{
"epoch": 0.8676246287228064,
"grad_norm": 0.14059478051894736,
"learning_rate": 1.7372059199304359e-06,
"loss": 0.6818,
"step": 4053
},
{
"epoch": 0.867838698456022,
"grad_norm": 0.1489502535911441,
"learning_rate": 1.731670948253441e-06,
"loss": 0.7071,
"step": 4054
},
{
"epoch": 0.8680527681892376,
"grad_norm": 0.14474702569714257,
"learning_rate": 1.7261444092036917e-06,
"loss": 0.6896,
"step": 4055
},
{
"epoch": 0.8682668379224533,
"grad_norm": 0.13988495404158932,
"learning_rate": 1.7206263053322314e-06,
"loss": 0.691,
"step": 4056
},
{
"epoch": 0.8684809076556689,
"grad_norm": 0.14392092840912138,
"learning_rate": 1.7151166391862096e-06,
"loss": 0.6893,
"step": 4057
},
{
"epoch": 0.8686949773888845,
"grad_norm": 0.13850915238121347,
"learning_rate": 1.7096154133088738e-06,
"loss": 0.6737,
"step": 4058
},
{
"epoch": 0.8689090471221,
"grad_norm": 0.13610006332881708,
"learning_rate": 1.7041226302395797e-06,
"loss": 0.684,
"step": 4059
},
{
"epoch": 0.8691231168553156,
"grad_norm": 0.13887628728275586,
"learning_rate": 1.69863829251379e-06,
"loss": 0.6932,
"step": 4060
},
{
"epoch": 0.8693371865885312,
"grad_norm": 0.136527775146042,
"learning_rate": 1.6931624026630622e-06,
"loss": 0.6585,
"step": 4061
},
{
"epoch": 0.8695512563217468,
"grad_norm": 0.13775423345621057,
"learning_rate": 1.687694963215054e-06,
"loss": 0.7006,
"step": 4062
},
{
"epoch": 0.8697653260549624,
"grad_norm": 0.1434875909523652,
"learning_rate": 1.6822359766935337e-06,
"loss": 0.6996,
"step": 4063
},
{
"epoch": 0.869979395788178,
"grad_norm": 0.13874008611678676,
"learning_rate": 1.6767854456183519e-06,
"loss": 0.6661,
"step": 4064
},
{
"epoch": 0.8701934655213935,
"grad_norm": 0.1392466463595333,
"learning_rate": 1.6713433725054694e-06,
"loss": 0.6846,
"step": 4065
},
{
"epoch": 0.8704075352546092,
"grad_norm": 0.1463817588381614,
"learning_rate": 1.6659097598669305e-06,
"loss": 0.6963,
"step": 4066
},
{
"epoch": 0.8706216049878248,
"grad_norm": 0.14190399348610305,
"learning_rate": 1.660484610210884e-06,
"loss": 0.7038,
"step": 4067
},
{
"epoch": 0.8708356747210404,
"grad_norm": 0.1417111676401168,
"learning_rate": 1.6550679260415736e-06,
"loss": 0.7028,
"step": 4068
},
{
"epoch": 0.871049744454256,
"grad_norm": 0.13496103825030722,
"learning_rate": 1.6496597098593237e-06,
"loss": 0.6607,
"step": 4069
},
{
"epoch": 0.8712638141874716,
"grad_norm": 0.1385552010527188,
"learning_rate": 1.6442599641605639e-06,
"loss": 0.7213,
"step": 4070
},
{
"epoch": 0.8714778839206871,
"grad_norm": 0.14098537014948517,
"learning_rate": 1.6388686914377982e-06,
"loss": 0.663,
"step": 4071
},
{
"epoch": 0.8716919536539027,
"grad_norm": 0.13869759909861296,
"learning_rate": 1.6334858941796339e-06,
"loss": 0.6673,
"step": 4072
},
{
"epoch": 0.8719060233871183,
"grad_norm": 0.14474368399396312,
"learning_rate": 1.6281115748707632e-06,
"loss": 0.6968,
"step": 4073
},
{
"epoch": 0.8721200931203339,
"grad_norm": 0.13961803789818047,
"learning_rate": 1.6227457359919551e-06,
"loss": 0.6931,
"step": 4074
},
{
"epoch": 0.8723341628535496,
"grad_norm": 0.14183495274306632,
"learning_rate": 1.6173883800200774e-06,
"loss": 0.7127,
"step": 4075
},
{
"epoch": 0.8725482325867652,
"grad_norm": 0.14046329761262472,
"learning_rate": 1.6120395094280693e-06,
"loss": 0.6904,
"step": 4076
},
{
"epoch": 0.8727623023199808,
"grad_norm": 0.1434999428386889,
"learning_rate": 1.6066991266849674e-06,
"loss": 0.677,
"step": 4077
},
{
"epoch": 0.8729763720531963,
"grad_norm": 0.1370578797007404,
"learning_rate": 1.601367234255875e-06,
"loss": 0.6887,
"step": 4078
},
{
"epoch": 0.8731904417864119,
"grad_norm": 0.14041343652782823,
"learning_rate": 1.5960438346019857e-06,
"loss": 0.7075,
"step": 4079
},
{
"epoch": 0.8734045115196275,
"grad_norm": 0.13712033347839142,
"learning_rate": 1.5907289301805783e-06,
"loss": 0.7176,
"step": 4080
},
{
"epoch": 0.8736185812528431,
"grad_norm": 0.13778323332346226,
"learning_rate": 1.5854225234449927e-06,
"loss": 0.6911,
"step": 4081
},
{
"epoch": 0.8738326509860587,
"grad_norm": 0.18035510722500825,
"learning_rate": 1.5801246168446626e-06,
"loss": 0.6808,
"step": 4082
},
{
"epoch": 0.8740467207192743,
"grad_norm": 0.1388176427967325,
"learning_rate": 1.57483521282509e-06,
"loss": 0.6797,
"step": 4083
},
{
"epoch": 0.8742607904524899,
"grad_norm": 0.14253161598562444,
"learning_rate": 1.5695543138278525e-06,
"loss": 0.7061,
"step": 4084
},
{
"epoch": 0.8744748601857055,
"grad_norm": 0.1404884643300318,
"learning_rate": 1.5642819222906092e-06,
"loss": 0.6908,
"step": 4085
},
{
"epoch": 0.8746889299189211,
"grad_norm": 0.13829510031856715,
"learning_rate": 1.55901804064708e-06,
"loss": 0.6763,
"step": 4086
},
{
"epoch": 0.8749029996521367,
"grad_norm": 0.13772618907491851,
"learning_rate": 1.553762671327068e-06,
"loss": 0.6728,
"step": 4087
},
{
"epoch": 0.8751170693853523,
"grad_norm": 0.13122916552432304,
"learning_rate": 1.5485158167564373e-06,
"loss": 0.6588,
"step": 4088
},
{
"epoch": 0.8753311391185679,
"grad_norm": 0.13879769879567272,
"learning_rate": 1.5432774793571282e-06,
"loss": 0.709,
"step": 4089
},
{
"epoch": 0.8755452088517834,
"grad_norm": 0.1372467912384837,
"learning_rate": 1.538047661547153e-06,
"loss": 0.6692,
"step": 4090
},
{
"epoch": 0.875759278584999,
"grad_norm": 0.14808122985403258,
"learning_rate": 1.5328263657405761e-06,
"loss": 0.7073,
"step": 4091
},
{
"epoch": 0.8759733483182146,
"grad_norm": 0.4852315448503471,
"learning_rate": 1.527613594347548e-06,
"loss": 0.6736,
"step": 4092
},
{
"epoch": 0.8761874180514303,
"grad_norm": 0.13678621068358662,
"learning_rate": 1.5224093497742654e-06,
"loss": 0.6904,
"step": 4093
},
{
"epoch": 0.8764014877846459,
"grad_norm": 0.13625031090508333,
"learning_rate": 1.5172136344230027e-06,
"loss": 0.6743,
"step": 4094
},
{
"epoch": 0.8766155575178615,
"grad_norm": 0.13679272543310853,
"learning_rate": 1.5120264506920968e-06,
"loss": 0.6595,
"step": 4095
},
{
"epoch": 0.876829627251077,
"grad_norm": 0.14166687897944127,
"learning_rate": 1.5068478009759324e-06,
"loss": 0.6986,
"step": 4096
},
{
"epoch": 0.8770436969842926,
"grad_norm": 0.14201800916806356,
"learning_rate": 1.5016776876649753e-06,
"loss": 0.6796,
"step": 4097
},
{
"epoch": 0.8772577667175082,
"grad_norm": 0.13590090128662613,
"learning_rate": 1.4965161131457296e-06,
"loss": 0.6799,
"step": 4098
},
{
"epoch": 0.8774718364507238,
"grad_norm": 0.1349365279421223,
"learning_rate": 1.491363079800776e-06,
"loss": 0.6893,
"step": 4099
},
{
"epoch": 0.8776859061839394,
"grad_norm": 0.1350254244677767,
"learning_rate": 1.4862185900087456e-06,
"loss": 0.6881,
"step": 4100
},
{
"epoch": 0.877899975917155,
"grad_norm": 0.14145139974077467,
"learning_rate": 1.4810826461443184e-06,
"loss": 0.7005,
"step": 4101
},
{
"epoch": 0.8781140456503707,
"grad_norm": 0.140456229812318,
"learning_rate": 1.475955250578247e-06,
"loss": 0.726,
"step": 4102
},
{
"epoch": 0.8783281153835862,
"grad_norm": 0.13748759121736628,
"learning_rate": 1.4708364056773182e-06,
"loss": 0.68,
"step": 4103
},
{
"epoch": 0.8785421851168018,
"grad_norm": 0.13777697754413418,
"learning_rate": 1.4657261138043865e-06,
"loss": 0.658,
"step": 4104
},
{
"epoch": 0.8787562548500174,
"grad_norm": 0.13468060059659814,
"learning_rate": 1.460624377318356e-06,
"loss": 0.667,
"step": 4105
},
{
"epoch": 0.878970324583233,
"grad_norm": 0.14844673351629187,
"learning_rate": 1.4555311985741716e-06,
"loss": 0.7201,
"step": 4106
},
{
"epoch": 0.8791843943164486,
"grad_norm": 0.13935277258856124,
"learning_rate": 1.4504465799228396e-06,
"loss": 0.7081,
"step": 4107
},
{
"epoch": 0.8793984640496642,
"grad_norm": 0.1450966754147359,
"learning_rate": 1.445370523711409e-06,
"loss": 0.7075,
"step": 4108
},
{
"epoch": 0.8796125337828797,
"grad_norm": 0.1417894459156667,
"learning_rate": 1.440303032282979e-06,
"loss": 0.7013,
"step": 4109
},
{
"epoch": 0.8798266035160953,
"grad_norm": 0.14065170619765627,
"learning_rate": 1.4352441079766987e-06,
"loss": 0.6946,
"step": 4110
},
{
"epoch": 0.880040673249311,
"grad_norm": 0.13767217083294478,
"learning_rate": 1.4301937531277489e-06,
"loss": 0.6947,
"step": 4111
},
{
"epoch": 0.8802547429825266,
"grad_norm": 0.13631349764913517,
"learning_rate": 1.4251519700673732e-06,
"loss": 0.6756,
"step": 4112
},
{
"epoch": 0.8804688127157422,
"grad_norm": 0.1377537748795002,
"learning_rate": 1.4201187611228417e-06,
"loss": 0.6948,
"step": 4113
},
{
"epoch": 0.8806828824489578,
"grad_norm": 0.13662299320895158,
"learning_rate": 1.4150941286174825e-06,
"loss": 0.6744,
"step": 4114
},
{
"epoch": 0.8808969521821733,
"grad_norm": 0.13632037699573227,
"learning_rate": 1.4100780748706488e-06,
"loss": 0.7033,
"step": 4115
},
{
"epoch": 0.8811110219153889,
"grad_norm": 0.1424545895794344,
"learning_rate": 1.4050706021977468e-06,
"loss": 0.7033,
"step": 4116
},
{
"epoch": 0.8813250916486045,
"grad_norm": 0.13688964624674085,
"learning_rate": 1.400071712910216e-06,
"loss": 0.6853,
"step": 4117
},
{
"epoch": 0.8815391613818201,
"grad_norm": 0.1415596920705807,
"learning_rate": 1.395081409315533e-06,
"loss": 0.6975,
"step": 4118
},
{
"epoch": 0.8817532311150357,
"grad_norm": 0.13687888237919926,
"learning_rate": 1.390099693717215e-06,
"loss": 0.6809,
"step": 4119
},
{
"epoch": 0.8819673008482514,
"grad_norm": 0.13959374865085333,
"learning_rate": 1.3851265684148097e-06,
"loss": 0.6793,
"step": 4120
},
{
"epoch": 0.882181370581467,
"grad_norm": 0.13775021606462998,
"learning_rate": 1.3801620357039047e-06,
"loss": 0.6996,
"step": 4121
},
{
"epoch": 0.8823954403146825,
"grad_norm": 0.13536273849605482,
"learning_rate": 1.3752060978761228e-06,
"loss": 0.6658,
"step": 4122
},
{
"epoch": 0.8826095100478981,
"grad_norm": 0.13694380189645117,
"learning_rate": 1.3702587572191073e-06,
"loss": 0.6757,
"step": 4123
},
{
"epoch": 0.8828235797811137,
"grad_norm": 0.13367124100384561,
"learning_rate": 1.3653200160165513e-06,
"loss": 0.6432,
"step": 4124
},
{
"epoch": 0.8830376495143293,
"grad_norm": 0.13486752085843698,
"learning_rate": 1.3603898765481604e-06,
"loss": 0.6597,
"step": 4125
},
{
"epoch": 0.8832517192475449,
"grad_norm": 0.1400954872084322,
"learning_rate": 1.3554683410896807e-06,
"loss": 0.6945,
"step": 4126
},
{
"epoch": 0.8834657889807604,
"grad_norm": 0.14690038632643201,
"learning_rate": 1.3505554119128861e-06,
"loss": 0.6943,
"step": 4127
},
{
"epoch": 0.883679858713976,
"grad_norm": 0.1377368646762418,
"learning_rate": 1.3456510912855736e-06,
"loss": 0.7124,
"step": 4128
},
{
"epoch": 0.8838939284471917,
"grad_norm": 0.13851820089899997,
"learning_rate": 1.340755381471568e-06,
"loss": 0.7009,
"step": 4129
},
{
"epoch": 0.8841079981804073,
"grad_norm": 0.14402806675625066,
"learning_rate": 1.3358682847307236e-06,
"loss": 0.6993,
"step": 4130
},
{
"epoch": 0.8843220679136229,
"grad_norm": 0.1368628935738042,
"learning_rate": 1.3309898033189117e-06,
"loss": 0.6932,
"step": 4131
},
{
"epoch": 0.8845361376468385,
"grad_norm": 0.13682052086400184,
"learning_rate": 1.3261199394880309e-06,
"loss": 0.6873,
"step": 4132
},
{
"epoch": 0.884750207380054,
"grad_norm": 0.13636934574803303,
"learning_rate": 1.3212586954860052e-06,
"loss": 0.6868,
"step": 4133
},
{
"epoch": 0.8849642771132696,
"grad_norm": 0.1417087412086574,
"learning_rate": 1.3164060735567684e-06,
"loss": 0.6856,
"step": 4134
},
{
"epoch": 0.8851783468464852,
"grad_norm": 0.13701695852595525,
"learning_rate": 1.3115620759402892e-06,
"loss": 0.6953,
"step": 4135
},
{
"epoch": 0.8853924165797008,
"grad_norm": 0.14133718939419468,
"learning_rate": 1.3067267048725452e-06,
"loss": 0.6936,
"step": 4136
},
{
"epoch": 0.8856064863129164,
"grad_norm": 0.1383476056781188,
"learning_rate": 1.3018999625855334e-06,
"loss": 0.6876,
"step": 4137
},
{
"epoch": 0.8858205560461321,
"grad_norm": 0.13815354089297535,
"learning_rate": 1.2970818513072737e-06,
"loss": 0.6724,
"step": 4138
},
{
"epoch": 0.8860346257793477,
"grad_norm": 0.13943089852733764,
"learning_rate": 1.2922723732617914e-06,
"loss": 0.6629,
"step": 4139
},
{
"epoch": 0.8862486955125632,
"grad_norm": 0.1356520486119351,
"learning_rate": 1.2874715306691355e-06,
"loss": 0.6774,
"step": 4140
},
{
"epoch": 0.8864627652457788,
"grad_norm": 0.13682394251305746,
"learning_rate": 1.2826793257453707e-06,
"loss": 0.6862,
"step": 4141
},
{
"epoch": 0.8866768349789944,
"grad_norm": 0.13425737975549762,
"learning_rate": 1.277895760702561e-06,
"loss": 0.6702,
"step": 4142
},
{
"epoch": 0.88689090471221,
"grad_norm": 0.13715559124946522,
"learning_rate": 1.2731208377487958e-06,
"loss": 0.6717,
"step": 4143
},
{
"epoch": 0.8871049744454256,
"grad_norm": 0.13791502849871914,
"learning_rate": 1.268354559088174e-06,
"loss": 0.6867,
"step": 4144
},
{
"epoch": 0.8873190441786412,
"grad_norm": 0.1376700480430702,
"learning_rate": 1.2635969269207959e-06,
"loss": 0.6871,
"step": 4145
},
{
"epoch": 0.8875331139118567,
"grad_norm": 0.13558269481742685,
"learning_rate": 1.258847943442778e-06,
"loss": 0.6786,
"step": 4146
},
{
"epoch": 0.8877471836450724,
"grad_norm": 0.14059255609283336,
"learning_rate": 1.254107610846247e-06,
"loss": 0.695,
"step": 4147
},
{
"epoch": 0.887961253378288,
"grad_norm": 0.1337203655304122,
"learning_rate": 1.249375931319321e-06,
"loss": 0.656,
"step": 4148
},
{
"epoch": 0.8881753231115036,
"grad_norm": 0.13911079351124517,
"learning_rate": 1.2446529070461443e-06,
"loss": 0.686,
"step": 4149
},
{
"epoch": 0.8883893928447192,
"grad_norm": 0.1327340678772263,
"learning_rate": 1.239938540206851e-06,
"loss": 0.6667,
"step": 4150
},
{
"epoch": 0.8886034625779348,
"grad_norm": 0.32176166095405384,
"learning_rate": 1.2352328329775865e-06,
"loss": 0.7068,
"step": 4151
},
{
"epoch": 0.8888175323111503,
"grad_norm": 0.1371432945479455,
"learning_rate": 1.230535787530498e-06,
"loss": 0.688,
"step": 4152
},
{
"epoch": 0.8890316020443659,
"grad_norm": 0.14197367576520264,
"learning_rate": 1.2258474060337267e-06,
"loss": 0.7041,
"step": 4153
},
{
"epoch": 0.8892456717775815,
"grad_norm": 0.14111039898587507,
"learning_rate": 1.2211676906514303e-06,
"loss": 0.6933,
"step": 4154
},
{
"epoch": 0.8894597415107971,
"grad_norm": 0.13911051066692642,
"learning_rate": 1.2164966435437474e-06,
"loss": 0.6829,
"step": 4155
},
{
"epoch": 0.8896738112440128,
"grad_norm": 0.1362048186205355,
"learning_rate": 1.2118342668668336e-06,
"loss": 0.6876,
"step": 4156
},
{
"epoch": 0.8898878809772284,
"grad_norm": 0.13818062029644287,
"learning_rate": 1.207180562772825e-06,
"loss": 0.6973,
"step": 4157
},
{
"epoch": 0.890101950710444,
"grad_norm": 0.1317278216703112,
"learning_rate": 1.2025355334098676e-06,
"loss": 0.6626,
"step": 4158
},
{
"epoch": 0.8903160204436595,
"grad_norm": 0.13939131334518293,
"learning_rate": 1.1978991809221019e-06,
"loss": 0.6978,
"step": 4159
},
{
"epoch": 0.8905300901768751,
"grad_norm": 0.13289617835808065,
"learning_rate": 1.1932715074496514e-06,
"loss": 0.6731,
"step": 4160
},
{
"epoch": 0.8907441599100907,
"grad_norm": 0.13587332632333382,
"learning_rate": 1.1886525151286477e-06,
"loss": 0.687,
"step": 4161
},
{
"epoch": 0.8909582296433063,
"grad_norm": 0.13414299970098992,
"learning_rate": 1.184042206091207e-06,
"loss": 0.6729,
"step": 4162
},
{
"epoch": 0.8911722993765219,
"grad_norm": 0.13301210870013755,
"learning_rate": 1.1794405824654386e-06,
"loss": 0.6868,
"step": 4163
},
{
"epoch": 0.8913863691097375,
"grad_norm": 0.1381687560987605,
"learning_rate": 1.1748476463754478e-06,
"loss": 0.7018,
"step": 4164
},
{
"epoch": 0.8916004388429531,
"grad_norm": 0.14151043043864178,
"learning_rate": 1.1702633999413204e-06,
"loss": 0.6632,
"step": 4165
},
{
"epoch": 0.8918145085761687,
"grad_norm": 0.1375264682306881,
"learning_rate": 1.165687845279142e-06,
"loss": 0.6857,
"step": 4166
},
{
"epoch": 0.8920285783093843,
"grad_norm": 0.1349179406097826,
"learning_rate": 1.1611209845009718e-06,
"loss": 0.6808,
"step": 4167
},
{
"epoch": 0.8922426480425999,
"grad_norm": 0.1349911400223185,
"learning_rate": 1.1565628197148704e-06,
"loss": 0.6556,
"step": 4168
},
{
"epoch": 0.8924567177758155,
"grad_norm": 0.13829912234677266,
"learning_rate": 1.1520133530248812e-06,
"loss": 0.6713,
"step": 4169
},
{
"epoch": 0.8926707875090311,
"grad_norm": 0.1362169824803463,
"learning_rate": 1.1474725865310199e-06,
"loss": 0.6762,
"step": 4170
},
{
"epoch": 0.8928848572422466,
"grad_norm": 0.13349799694666242,
"learning_rate": 1.1429405223293056e-06,
"loss": 0.6562,
"step": 4171
},
{
"epoch": 0.8930989269754622,
"grad_norm": 0.1352831346317916,
"learning_rate": 1.1384171625117246e-06,
"loss": 0.7042,
"step": 4172
},
{
"epoch": 0.8933129967086778,
"grad_norm": 0.13249036648854318,
"learning_rate": 1.1339025091662537e-06,
"loss": 0.6611,
"step": 4173
},
{
"epoch": 0.8935270664418934,
"grad_norm": 0.1340249258222099,
"learning_rate": 1.1293965643768523e-06,
"loss": 0.6863,
"step": 4174
},
{
"epoch": 0.8937411361751091,
"grad_norm": 0.1349001998761694,
"learning_rate": 1.1248993302234502e-06,
"loss": 0.6907,
"step": 4175
},
{
"epoch": 0.8939552059083247,
"grad_norm": 0.1318757777674753,
"learning_rate": 1.1204108087819666e-06,
"loss": 0.6667,
"step": 4176
},
{
"epoch": 0.8941692756415403,
"grad_norm": 0.13632503156690215,
"learning_rate": 1.1159310021242909e-06,
"loss": 0.7022,
"step": 4177
},
{
"epoch": 0.8943833453747558,
"grad_norm": 0.13556018752789825,
"learning_rate": 1.1114599123182956e-06,
"loss": 0.6734,
"step": 4178
},
{
"epoch": 0.8945974151079714,
"grad_norm": 0.135742094137929,
"learning_rate": 1.1069975414278321e-06,
"loss": 0.7064,
"step": 4179
},
{
"epoch": 0.894811484841187,
"grad_norm": 0.1359650650535543,
"learning_rate": 1.102543891512715e-06,
"loss": 0.69,
"step": 4180
},
{
"epoch": 0.8950255545744026,
"grad_norm": 0.1373653086794688,
"learning_rate": 1.0980989646287466e-06,
"loss": 0.69,
"step": 4181
},
{
"epoch": 0.8952396243076182,
"grad_norm": 0.13816654457909658,
"learning_rate": 1.0936627628276918e-06,
"loss": 0.721,
"step": 4182
},
{
"epoch": 0.8954536940408337,
"grad_norm": 0.13615854922586995,
"learning_rate": 1.0892352881572976e-06,
"loss": 0.6636,
"step": 4183
},
{
"epoch": 0.8956677637740494,
"grad_norm": 0.14082358750022633,
"learning_rate": 1.0848165426612778e-06,
"loss": 0.6976,
"step": 4184
},
{
"epoch": 0.895881833507265,
"grad_norm": 0.13391560669031236,
"learning_rate": 1.080406528379314e-06,
"loss": 0.7248,
"step": 4185
},
{
"epoch": 0.8960959032404806,
"grad_norm": 0.2292997829312597,
"learning_rate": 1.0760052473470673e-06,
"loss": 0.6818,
"step": 4186
},
{
"epoch": 0.8963099729736962,
"grad_norm": 0.13541889686321737,
"learning_rate": 1.0716127015961541e-06,
"loss": 0.6891,
"step": 4187
},
{
"epoch": 0.8965240427069118,
"grad_norm": 0.13648025986660156,
"learning_rate": 1.0672288931541664e-06,
"loss": 0.6687,
"step": 4188
},
{
"epoch": 0.8967381124401274,
"grad_norm": 0.13716114841950905,
"learning_rate": 1.0628538240446672e-06,
"loss": 0.6657,
"step": 4189
},
{
"epoch": 0.8969521821733429,
"grad_norm": 0.13591264002806472,
"learning_rate": 1.0584874962871728e-06,
"loss": 0.6845,
"step": 4190
},
{
"epoch": 0.8971662519065585,
"grad_norm": 0.1363589779239559,
"learning_rate": 1.0541299118971815e-06,
"loss": 0.6907,
"step": 4191
},
{
"epoch": 0.8973803216397741,
"grad_norm": 0.13668582559807566,
"learning_rate": 1.049781072886138e-06,
"loss": 0.7095,
"step": 4192
},
{
"epoch": 0.8975943913729898,
"grad_norm": 0.13323433252010586,
"learning_rate": 1.0454409812614586e-06,
"loss": 0.6692,
"step": 4193
},
{
"epoch": 0.8978084611062054,
"grad_norm": 0.13716866600975855,
"learning_rate": 1.0411096390265297e-06,
"loss": 0.6993,
"step": 4194
},
{
"epoch": 0.898022530839421,
"grad_norm": 0.13613843088374558,
"learning_rate": 1.036787048180683e-06,
"loss": 0.6777,
"step": 4195
},
{
"epoch": 0.8982366005726365,
"grad_norm": 0.13370154993136393,
"learning_rate": 1.0324732107192249e-06,
"loss": 0.6754,
"step": 4196
},
{
"epoch": 0.8984506703058521,
"grad_norm": 0.13288978948956706,
"learning_rate": 1.0281681286334068e-06,
"loss": 0.6555,
"step": 4197
},
{
"epoch": 0.8986647400390677,
"grad_norm": 0.13626230149594187,
"learning_rate": 1.0238718039104545e-06,
"loss": 0.6984,
"step": 4198
},
{
"epoch": 0.8988788097722833,
"grad_norm": 0.133980931294781,
"learning_rate": 1.0195842385335375e-06,
"loss": 0.6742,
"step": 4199
},
{
"epoch": 0.8990928795054989,
"grad_norm": 0.13860021319839488,
"learning_rate": 1.0153054344817926e-06,
"loss": 0.6932,
"step": 4200
},
{
"epoch": 0.8993069492387145,
"grad_norm": 0.13518658916061166,
"learning_rate": 1.0110353937303064e-06,
"loss": 0.6876,
"step": 4201
},
{
"epoch": 0.8995210189719302,
"grad_norm": 0.13478101610443857,
"learning_rate": 1.0067741182501201e-06,
"loss": 0.6889,
"step": 4202
},
{
"epoch": 0.8997350887051457,
"grad_norm": 0.13807951269005206,
"learning_rate": 1.0025216100082359e-06,
"loss": 0.707,
"step": 4203
},
{
"epoch": 0.8999491584383613,
"grad_norm": 0.13839079339829718,
"learning_rate": 9.982778709675967e-07,
"loss": 0.6835,
"step": 4204
},
{
"epoch": 0.9001632281715769,
"grad_norm": 0.13869051306125363,
"learning_rate": 9.94042903087109e-07,
"loss": 0.6854,
"step": 4205
},
{
"epoch": 0.9003772979047925,
"grad_norm": 0.13586026032485238,
"learning_rate": 9.89816708321627e-07,
"loss": 0.6706,
"step": 4206
},
{
"epoch": 0.9005913676380081,
"grad_norm": 0.13934395073413944,
"learning_rate": 9.855992886219501e-07,
"loss": 0.7135,
"step": 4207
},
{
"epoch": 0.9008054373712236,
"grad_norm": 0.1314625614588388,
"learning_rate": 9.813906459348388e-07,
"loss": 0.6656,
"step": 4208
},
{
"epoch": 0.9010195071044392,
"grad_norm": 0.13528061424125878,
"learning_rate": 9.77190782202986e-07,
"loss": 0.7033,
"step": 4209
},
{
"epoch": 0.9012335768376548,
"grad_norm": 0.1334704367168475,
"learning_rate": 9.72999699365047e-07,
"loss": 0.7018,
"step": 4210
},
{
"epoch": 0.9014476465708705,
"grad_norm": 0.13974577952090841,
"learning_rate": 9.68817399355617e-07,
"loss": 0.6883,
"step": 4211
},
{
"epoch": 0.9016617163040861,
"grad_norm": 0.1420288171507796,
"learning_rate": 9.646438841052364e-07,
"loss": 0.7104,
"step": 4212
},
{
"epoch": 0.9018757860373017,
"grad_norm": 0.20868338774224038,
"learning_rate": 9.604791555403924e-07,
"loss": 0.688,
"step": 4213
},
{
"epoch": 0.9020898557705173,
"grad_norm": 0.13340624567742823,
"learning_rate": 9.56323215583521e-07,
"loss": 0.6617,
"step": 4214
},
{
"epoch": 0.9023039255037328,
"grad_norm": 0.13582068993976912,
"learning_rate": 9.521760661529878e-07,
"loss": 0.6887,
"step": 4215
},
{
"epoch": 0.9025179952369484,
"grad_norm": 0.1405173650563728,
"learning_rate": 9.480377091631166e-07,
"loss": 0.7009,
"step": 4216
},
{
"epoch": 0.902732064970164,
"grad_norm": 0.13691065637710362,
"learning_rate": 9.43908146524164e-07,
"loss": 0.7067,
"step": 4217
},
{
"epoch": 0.9029461347033796,
"grad_norm": 0.13657292298918064,
"learning_rate": 9.397873801423252e-07,
"loss": 0.656,
"step": 4218
},
{
"epoch": 0.9031602044365952,
"grad_norm": 0.13747198064130112,
"learning_rate": 9.356754119197386e-07,
"loss": 0.6953,
"step": 4219
},
{
"epoch": 0.9033742741698109,
"grad_norm": 0.13493345557005731,
"learning_rate": 9.315722437544883e-07,
"loss": 0.6782,
"step": 4220
},
{
"epoch": 0.9035883439030264,
"grad_norm": 0.1360362388580472,
"learning_rate": 9.274778775405813e-07,
"loss": 0.6813,
"step": 4221
},
{
"epoch": 0.903802413636242,
"grad_norm": 0.13648955678738503,
"learning_rate": 9.233923151679724e-07,
"loss": 0.6967,
"step": 4222
},
{
"epoch": 0.9040164833694576,
"grad_norm": 0.13909357343446574,
"learning_rate": 9.193155585225511e-07,
"loss": 0.7042,
"step": 4223
},
{
"epoch": 0.9042305531026732,
"grad_norm": 0.13887076469319895,
"learning_rate": 9.152476094861384e-07,
"loss": 0.7095,
"step": 4224
},
{
"epoch": 0.9044446228358888,
"grad_norm": 0.13716160358883434,
"learning_rate": 9.111884699364926e-07,
"loss": 0.7037,
"step": 4225
},
{
"epoch": 0.9046586925691044,
"grad_norm": 0.1401960258811389,
"learning_rate": 9.07138141747308e-07,
"loss": 0.6599,
"step": 4226
},
{
"epoch": 0.9048727623023199,
"grad_norm": 0.19759839121290612,
"learning_rate": 9.030966267882024e-07,
"loss": 0.6862,
"step": 4227
},
{
"epoch": 0.9050868320355355,
"grad_norm": 0.1380400568062713,
"learning_rate": 8.990639269247392e-07,
"loss": 0.7016,
"step": 4228
},
{
"epoch": 0.9053009017687512,
"grad_norm": 0.13798859019694037,
"learning_rate": 8.950400440184004e-07,
"loss": 0.7022,
"step": 4229
},
{
"epoch": 0.9055149715019668,
"grad_norm": 0.13820047638881433,
"learning_rate": 8.910249799266024e-07,
"loss": 0.6957,
"step": 4230
},
{
"epoch": 0.9057290412351824,
"grad_norm": 0.13423981271998803,
"learning_rate": 8.870187365026961e-07,
"loss": 0.6714,
"step": 4231
},
{
"epoch": 0.905943110968398,
"grad_norm": 0.13946111140730902,
"learning_rate": 8.830213155959511e-07,
"loss": 0.709,
"step": 4232
},
{
"epoch": 0.9061571807016136,
"grad_norm": 0.13548709371465542,
"learning_rate": 8.790327190515757e-07,
"loss": 0.7107,
"step": 4233
},
{
"epoch": 0.9063712504348291,
"grad_norm": 0.13351014447234621,
"learning_rate": 8.750529487106907e-07,
"loss": 0.6611,
"step": 4234
},
{
"epoch": 0.9065853201680447,
"grad_norm": 0.19082707639063,
"learning_rate": 8.710820064103553e-07,
"loss": 0.7009,
"step": 4235
},
{
"epoch": 0.9067993899012603,
"grad_norm": 0.14032475995120708,
"learning_rate": 8.671198939835523e-07,
"loss": 0.7091,
"step": 4236
},
{
"epoch": 0.9070134596344759,
"grad_norm": 0.13718192279792774,
"learning_rate": 8.631666132591787e-07,
"loss": 0.6787,
"step": 4237
},
{
"epoch": 0.9072275293676916,
"grad_norm": 0.1369111108111374,
"learning_rate": 8.592221660620681e-07,
"loss": 0.6836,
"step": 4238
},
{
"epoch": 0.9074415991009072,
"grad_norm": 0.1381243286863606,
"learning_rate": 8.55286554212964e-07,
"loss": 0.6635,
"step": 4239
},
{
"epoch": 0.9076556688341227,
"grad_norm": 0.13582615247440263,
"learning_rate": 8.513597795285422e-07,
"loss": 0.7128,
"step": 4240
},
{
"epoch": 0.9078697385673383,
"grad_norm": 0.1345352526826563,
"learning_rate": 8.474418438213927e-07,
"loss": 0.6654,
"step": 4241
},
{
"epoch": 0.9080838083005539,
"grad_norm": 0.13392776665001568,
"learning_rate": 8.435327489000267e-07,
"loss": 0.6855,
"step": 4242
},
{
"epoch": 0.9082978780337695,
"grad_norm": 0.13830397496776295,
"learning_rate": 8.396324965688785e-07,
"loss": 0.6834,
"step": 4243
},
{
"epoch": 0.9085119477669851,
"grad_norm": 0.1345719563921343,
"learning_rate": 8.357410886282946e-07,
"loss": 0.6882,
"step": 4244
},
{
"epoch": 0.9087260175002007,
"grad_norm": 0.13969201427154823,
"learning_rate": 8.318585268745449e-07,
"loss": 0.7141,
"step": 4245
},
{
"epoch": 0.9089400872334162,
"grad_norm": 0.13621902698302082,
"learning_rate": 8.27984813099807e-07,
"loss": 0.6861,
"step": 4246
},
{
"epoch": 0.9091541569666319,
"grad_norm": 0.13470107612327034,
"learning_rate": 8.241199490921836e-07,
"loss": 0.6779,
"step": 4247
},
{
"epoch": 0.9093682266998475,
"grad_norm": 0.13756096212331864,
"learning_rate": 8.202639366356923e-07,
"loss": 0.6805,
"step": 4248
},
{
"epoch": 0.9095822964330631,
"grad_norm": 0.13487815443914544,
"learning_rate": 8.16416777510256e-07,
"loss": 0.7133,
"step": 4249
},
{
"epoch": 0.9097963661662787,
"grad_norm": 0.6780640255438278,
"learning_rate": 8.125784734917186e-07,
"loss": 0.6821,
"step": 4250
},
{
"epoch": 0.9100104358994943,
"grad_norm": 0.1397252595980418,
"learning_rate": 8.087490263518338e-07,
"loss": 0.7032,
"step": 4251
},
{
"epoch": 0.9102245056327098,
"grad_norm": 0.13391700082774008,
"learning_rate": 8.049284378582656e-07,
"loss": 0.6939,
"step": 4252
},
{
"epoch": 0.9104385753659254,
"grad_norm": 0.1392238017282743,
"learning_rate": 8.011167097745943e-07,
"loss": 0.6917,
"step": 4253
},
{
"epoch": 0.910652645099141,
"grad_norm": 0.13765354847185426,
"learning_rate": 7.973138438603034e-07,
"loss": 0.6743,
"step": 4254
},
{
"epoch": 0.9108667148323566,
"grad_norm": 0.1371561090435617,
"learning_rate": 7.935198418707935e-07,
"loss": 0.6724,
"step": 4255
},
{
"epoch": 0.9110807845655723,
"grad_norm": 0.14308962400311664,
"learning_rate": 7.897347055573634e-07,
"loss": 0.6678,
"step": 4256
},
{
"epoch": 0.9112948542987879,
"grad_norm": 0.13712832529790503,
"learning_rate": 7.859584366672268e-07,
"loss": 0.7004,
"step": 4257
},
{
"epoch": 0.9115089240320035,
"grad_norm": 0.13657913703164848,
"learning_rate": 7.821910369435048e-07,
"loss": 0.7099,
"step": 4258
},
{
"epoch": 0.911722993765219,
"grad_norm": 0.13626676720993744,
"learning_rate": 7.784325081252197e-07,
"loss": 0.6735,
"step": 4259
},
{
"epoch": 0.9119370634984346,
"grad_norm": 0.1406899067908914,
"learning_rate": 7.746828519473038e-07,
"loss": 0.7023,
"step": 4260
},
{
"epoch": 0.9121511332316502,
"grad_norm": 0.13602732059821163,
"learning_rate": 7.709420701405878e-07,
"loss": 0.7045,
"step": 4261
},
{
"epoch": 0.9123652029648658,
"grad_norm": 0.14004791769388897,
"learning_rate": 7.67210164431813e-07,
"loss": 0.6811,
"step": 4262
},
{
"epoch": 0.9125792726980814,
"grad_norm": 0.18193442284236228,
"learning_rate": 7.634871365436192e-07,
"loss": 0.6869,
"step": 4263
},
{
"epoch": 0.912793342431297,
"grad_norm": 0.1365871204195117,
"learning_rate": 7.597729881945492e-07,
"loss": 0.6855,
"step": 4264
},
{
"epoch": 0.9130074121645126,
"grad_norm": 0.13805740320721904,
"learning_rate": 7.560677210990497e-07,
"loss": 0.701,
"step": 4265
},
{
"epoch": 0.9132214818977282,
"grad_norm": 0.13376363466133837,
"learning_rate": 7.52371336967459e-07,
"loss": 0.6902,
"step": 4266
},
{
"epoch": 0.9134355516309438,
"grad_norm": 0.13443944421728973,
"learning_rate": 7.486838375060257e-07,
"loss": 0.6761,
"step": 4267
},
{
"epoch": 0.9136496213641594,
"grad_norm": 0.13479866072362381,
"learning_rate": 7.450052244168949e-07,
"loss": 0.6515,
"step": 4268
},
{
"epoch": 0.913863691097375,
"grad_norm": 0.13890310816730966,
"learning_rate": 7.413354993981015e-07,
"loss": 0.6787,
"step": 4269
},
{
"epoch": 0.9140777608305906,
"grad_norm": 0.13452385813391135,
"learning_rate": 7.376746641435883e-07,
"loss": 0.6948,
"step": 4270
},
{
"epoch": 0.9142918305638061,
"grad_norm": 0.1348138019183146,
"learning_rate": 7.340227203431882e-07,
"loss": 0.7141,
"step": 4271
},
{
"epoch": 0.9145059002970217,
"grad_norm": 0.14191670066971723,
"learning_rate": 7.303796696826348e-07,
"loss": 0.7142,
"step": 4272
},
{
"epoch": 0.9147199700302373,
"grad_norm": 0.134099494770952,
"learning_rate": 7.267455138435497e-07,
"loss": 0.6903,
"step": 4273
},
{
"epoch": 0.914934039763453,
"grad_norm": 0.13583121927338482,
"learning_rate": 7.231202545034554e-07,
"loss": 0.7117,
"step": 4274
},
{
"epoch": 0.9151481094966686,
"grad_norm": 0.13886885883411906,
"learning_rate": 7.195038933357645e-07,
"loss": 0.7012,
"step": 4275
},
{
"epoch": 0.9153621792298842,
"grad_norm": 0.13770266378658522,
"learning_rate": 7.158964320097794e-07,
"loss": 0.7132,
"step": 4276
},
{
"epoch": 0.9155762489630997,
"grad_norm": 0.13461741783486814,
"learning_rate": 7.122978721907015e-07,
"loss": 0.6913,
"step": 4277
},
{
"epoch": 0.9157903186963153,
"grad_norm": 0.1424576873922085,
"learning_rate": 7.087082155396196e-07,
"loss": 0.6816,
"step": 4278
},
{
"epoch": 0.9160043884295309,
"grad_norm": 0.14082754894010235,
"learning_rate": 7.051274637135108e-07,
"loss": 0.7105,
"step": 4279
},
{
"epoch": 0.9162184581627465,
"grad_norm": 0.14077410771621526,
"learning_rate": 7.015556183652439e-07,
"loss": 0.7326,
"step": 4280
},
{
"epoch": 0.9164325278959621,
"grad_norm": 0.1384443980516193,
"learning_rate": 6.979926811435755e-07,
"loss": 0.6861,
"step": 4281
},
{
"epoch": 0.9166465976291777,
"grad_norm": 0.13739173872270724,
"learning_rate": 6.944386536931547e-07,
"loss": 0.6947,
"step": 4282
},
{
"epoch": 0.9168606673623932,
"grad_norm": 0.13832205645117335,
"learning_rate": 6.908935376545067e-07,
"loss": 0.7074,
"step": 4283
},
{
"epoch": 0.9170747370956089,
"grad_norm": 0.13279344935726523,
"learning_rate": 6.87357334664056e-07,
"loss": 0.6893,
"step": 4284
},
{
"epoch": 0.9172888068288245,
"grad_norm": 0.17245526487864973,
"learning_rate": 6.838300463541103e-07,
"loss": 0.7016,
"step": 4285
},
{
"epoch": 0.9175028765620401,
"grad_norm": 0.13536991850625252,
"learning_rate": 6.803116743528516e-07,
"loss": 0.6945,
"step": 4286
},
{
"epoch": 0.9177169462952557,
"grad_norm": 0.13495945295599548,
"learning_rate": 6.768022202843605e-07,
"loss": 0.7181,
"step": 4287
},
{
"epoch": 0.9179310160284713,
"grad_norm": 0.15546373828749133,
"learning_rate": 6.733016857685903e-07,
"loss": 0.666,
"step": 4288
},
{
"epoch": 0.9181450857616869,
"grad_norm": 0.13600256434721386,
"learning_rate": 6.698100724213819e-07,
"loss": 0.6778,
"step": 4289
},
{
"epoch": 0.9183591554949024,
"grad_norm": 0.13338886836676647,
"learning_rate": 6.663273818544613e-07,
"loss": 0.6731,
"step": 4290
},
{
"epoch": 0.918573225228118,
"grad_norm": 0.13957994583414532,
"learning_rate": 6.628536156754273e-07,
"loss": 0.6838,
"step": 4291
},
{
"epoch": 0.9187872949613336,
"grad_norm": 0.14212469513701823,
"learning_rate": 6.59388775487766e-07,
"loss": 0.6983,
"step": 4292
},
{
"epoch": 0.9190013646945493,
"grad_norm": 0.13658478466758459,
"learning_rate": 6.559328628908446e-07,
"loss": 0.671,
"step": 4293
},
{
"epoch": 0.9192154344277649,
"grad_norm": 0.13367364859649347,
"learning_rate": 6.524858794799005e-07,
"loss": 0.688,
"step": 4294
},
{
"epoch": 0.9194295041609805,
"grad_norm": 0.13800794729231464,
"learning_rate": 6.49047826846061e-07,
"loss": 0.6902,
"step": 4295
},
{
"epoch": 0.919643573894196,
"grad_norm": 0.13528922517669267,
"learning_rate": 6.456187065763165e-07,
"loss": 0.6924,
"step": 4296
},
{
"epoch": 0.9198576436274116,
"grad_norm": 0.13582332354188012,
"learning_rate": 6.421985202535497e-07,
"loss": 0.6987,
"step": 4297
},
{
"epoch": 0.9200717133606272,
"grad_norm": 0.13086211665040132,
"learning_rate": 6.387872694565112e-07,
"loss": 0.666,
"step": 4298
},
{
"epoch": 0.9202857830938428,
"grad_norm": 0.13560469131861522,
"learning_rate": 6.353849557598235e-07,
"loss": 0.691,
"step": 4299
},
{
"epoch": 0.9204998528270584,
"grad_norm": 0.1396926440578085,
"learning_rate": 6.319915807339927e-07,
"loss": 0.7283,
"step": 4300
},
{
"epoch": 0.920713922560274,
"grad_norm": 0.13773179089997417,
"learning_rate": 6.286071459453969e-07,
"loss": 0.6897,
"step": 4301
},
{
"epoch": 0.9209279922934897,
"grad_norm": 0.13891185958429028,
"learning_rate": 6.252316529562797e-07,
"loss": 0.7037,
"step": 4302
},
{
"epoch": 0.9211420620267052,
"grad_norm": 0.133402661825684,
"learning_rate": 6.218651033247636e-07,
"loss": 0.6747,
"step": 4303
},
{
"epoch": 0.9213561317599208,
"grad_norm": 0.13430283133525148,
"learning_rate": 6.185074986048456e-07,
"loss": 0.6973,
"step": 4304
},
{
"epoch": 0.9215702014931364,
"grad_norm": 0.13605896074993906,
"learning_rate": 6.151588403463838e-07,
"loss": 0.7241,
"step": 4305
},
{
"epoch": 0.921784271226352,
"grad_norm": 0.1336695624237095,
"learning_rate": 6.118191300951171e-07,
"loss": 0.6703,
"step": 4306
},
{
"epoch": 0.9219983409595676,
"grad_norm": 0.137823698276276,
"learning_rate": 6.084883693926502e-07,
"loss": 0.6895,
"step": 4307
},
{
"epoch": 0.9222124106927831,
"grad_norm": 0.13543544897243118,
"learning_rate": 6.051665597764534e-07,
"loss": 0.6947,
"step": 4308
},
{
"epoch": 0.9224264804259987,
"grad_norm": 0.1403307461248502,
"learning_rate": 6.018537027798665e-07,
"loss": 0.7043,
"step": 4309
},
{
"epoch": 0.9226405501592143,
"grad_norm": 0.13496220856618912,
"learning_rate": 5.985497999321044e-07,
"loss": 0.6826,
"step": 4310
},
{
"epoch": 0.92285461989243,
"grad_norm": 0.13833307562725605,
"learning_rate": 5.952548527582358e-07,
"loss": 0.6961,
"step": 4311
},
{
"epoch": 0.9230686896256456,
"grad_norm": 0.13497327975776574,
"learning_rate": 5.919688627792086e-07,
"loss": 0.682,
"step": 4312
},
{
"epoch": 0.9232827593588612,
"grad_norm": 0.1346846618255718,
"learning_rate": 5.88691831511825e-07,
"loss": 0.6823,
"step": 4313
},
{
"epoch": 0.9234968290920768,
"grad_norm": 0.13567372056383306,
"learning_rate": 5.854237604687574e-07,
"loss": 0.6834,
"step": 4314
},
{
"epoch": 0.9237108988252923,
"grad_norm": 0.13482695070791353,
"learning_rate": 5.821646511585433e-07,
"loss": 0.6617,
"step": 4315
},
{
"epoch": 0.9239249685585079,
"grad_norm": 0.13795490822378076,
"learning_rate": 5.789145050855793e-07,
"loss": 0.7108,
"step": 4316
},
{
"epoch": 0.9241390382917235,
"grad_norm": 0.1379904292501817,
"learning_rate": 5.7567332375013e-07,
"loss": 0.6928,
"step": 4317
},
{
"epoch": 0.9243531080249391,
"grad_norm": 0.1343495493065187,
"learning_rate": 5.724411086483139e-07,
"loss": 0.662,
"step": 4318
},
{
"epoch": 0.9245671777581547,
"grad_norm": 0.1349392778611965,
"learning_rate": 5.6921786127212e-07,
"loss": 0.6817,
"step": 4319
},
{
"epoch": 0.9247812474913704,
"grad_norm": 0.22708194026805,
"learning_rate": 5.660035831093935e-07,
"loss": 0.698,
"step": 4320
},
{
"epoch": 0.924995317224586,
"grad_norm": 0.13672210511292796,
"learning_rate": 5.627982756438344e-07,
"loss": 0.7111,
"step": 4321
},
{
"epoch": 0.9252093869578015,
"grad_norm": 0.1363800057454497,
"learning_rate": 5.596019403550145e-07,
"loss": 0.6684,
"step": 4322
},
{
"epoch": 0.9254234566910171,
"grad_norm": 0.13840339865274745,
"learning_rate": 5.564145787183473e-07,
"loss": 0.6758,
"step": 4323
},
{
"epoch": 0.9256375264242327,
"grad_norm": 0.13425733861343547,
"learning_rate": 5.532361922051221e-07,
"loss": 0.6809,
"step": 4324
},
{
"epoch": 0.9258515961574483,
"grad_norm": 0.13930949914002347,
"learning_rate": 5.500667822824679e-07,
"loss": 0.7167,
"step": 4325
},
{
"epoch": 0.9260656658906639,
"grad_norm": 0.1344001305617284,
"learning_rate": 5.469063504133832e-07,
"loss": 0.6875,
"step": 4326
},
{
"epoch": 0.9262797356238794,
"grad_norm": 0.13340477883012816,
"learning_rate": 5.437548980567187e-07,
"loss": 0.6786,
"step": 4327
},
{
"epoch": 0.926493805357095,
"grad_norm": 0.13378110904294804,
"learning_rate": 5.406124266671753e-07,
"loss": 0.6913,
"step": 4328
},
{
"epoch": 0.9267078750903107,
"grad_norm": 0.134446707366625,
"learning_rate": 5.374789376953149e-07,
"loss": 0.6963,
"step": 4329
},
{
"epoch": 0.9269219448235263,
"grad_norm": 0.1338691969218982,
"learning_rate": 5.343544325875494e-07,
"loss": 0.6669,
"step": 4330
},
{
"epoch": 0.9271360145567419,
"grad_norm": 0.13841565087395938,
"learning_rate": 5.312389127861428e-07,
"loss": 0.6902,
"step": 4331
},
{
"epoch": 0.9273500842899575,
"grad_norm": 0.13213259093858248,
"learning_rate": 5.281323797292182e-07,
"loss": 0.6485,
"step": 4332
},
{
"epoch": 0.927564154023173,
"grad_norm": 0.13820700523065554,
"learning_rate": 5.250348348507395e-07,
"loss": 0.7012,
"step": 4333
},
{
"epoch": 0.9277782237563886,
"grad_norm": 0.13884337919224177,
"learning_rate": 5.219462795805341e-07,
"loss": 0.6931,
"step": 4334
},
{
"epoch": 0.9279922934896042,
"grad_norm": 0.14160605232931436,
"learning_rate": 5.188667153442661e-07,
"loss": 0.7401,
"step": 4335
},
{
"epoch": 0.9282063632228198,
"grad_norm": 0.13562775573795124,
"learning_rate": 5.157961435634628e-07,
"loss": 0.6852,
"step": 4336
},
{
"epoch": 0.9284204329560354,
"grad_norm": 0.13329570849461553,
"learning_rate": 5.127345656554928e-07,
"loss": 0.6655,
"step": 4337
},
{
"epoch": 0.9286345026892511,
"grad_norm": 0.13411666000495157,
"learning_rate": 5.09681983033572e-07,
"loss": 0.6786,
"step": 4338
},
{
"epoch": 0.9288485724224667,
"grad_norm": 0.13226173142356434,
"learning_rate": 5.066383971067735e-07,
"loss": 0.6784,
"step": 4339
},
{
"epoch": 0.9290626421556822,
"grad_norm": 0.1389445083227217,
"learning_rate": 5.036038092800044e-07,
"loss": 0.7004,
"step": 4340
},
{
"epoch": 0.9292767118888978,
"grad_norm": 0.138040961006918,
"learning_rate": 5.005782209540267e-07,
"loss": 0.6915,
"step": 4341
},
{
"epoch": 0.9294907816221134,
"grad_norm": 0.1399110270978668,
"learning_rate": 4.975616335254474e-07,
"loss": 0.7115,
"step": 4342
},
{
"epoch": 0.929704851355329,
"grad_norm": 0.1367055044829687,
"learning_rate": 4.945540483867173e-07,
"loss": 0.6983,
"step": 4343
},
{
"epoch": 0.9299189210885446,
"grad_norm": 0.13359304363733251,
"learning_rate": 4.915554669261346e-07,
"loss": 0.7019,
"step": 4344
},
{
"epoch": 0.9301329908217602,
"grad_norm": 0.1341565940715023,
"learning_rate": 4.885658905278345e-07,
"loss": 0.6756,
"step": 4345
},
{
"epoch": 0.9303470605549757,
"grad_norm": 0.13292871340605816,
"learning_rate": 4.855853205718019e-07,
"loss": 0.6851,
"step": 4346
},
{
"epoch": 0.9305611302881914,
"grad_norm": 0.13233325821628328,
"learning_rate": 4.826137584338653e-07,
"loss": 0.6881,
"step": 4347
},
{
"epoch": 0.930775200021407,
"grad_norm": 0.134368961841193,
"learning_rate": 4.796512054856872e-07,
"loss": 0.7141,
"step": 4348
},
{
"epoch": 0.9309892697546226,
"grad_norm": 0.13955584155179407,
"learning_rate": 4.766976630947806e-07,
"loss": 0.7054,
"step": 4349
},
{
"epoch": 0.9312033394878382,
"grad_norm": 0.1343345661614504,
"learning_rate": 4.737531326244926e-07,
"loss": 0.67,
"step": 4350
},
{
"epoch": 0.9314174092210538,
"grad_norm": 0.13838895579839383,
"learning_rate": 4.7081761543401604e-07,
"loss": 0.6818,
"step": 4351
},
{
"epoch": 0.9316314789542693,
"grad_norm": 0.13649714084423922,
"learning_rate": 4.678911128783781e-07,
"loss": 0.7375,
"step": 4352
},
{
"epoch": 0.9318455486874849,
"grad_norm": 0.13867464505550625,
"learning_rate": 4.64973626308447e-07,
"loss": 0.6888,
"step": 4353
},
{
"epoch": 0.9320596184207005,
"grad_norm": 0.13662169417139988,
"learning_rate": 4.6206515707093e-07,
"loss": 0.7053,
"step": 4354
},
{
"epoch": 0.9322736881539161,
"grad_norm": 0.1346330008188449,
"learning_rate": 4.59165706508371e-07,
"loss": 0.6918,
"step": 4355
},
{
"epoch": 0.9324877578871318,
"grad_norm": 0.13594427563867142,
"learning_rate": 4.5627527595915043e-07,
"loss": 0.6741,
"step": 4356
},
{
"epoch": 0.9327018276203474,
"grad_norm": 0.13626106039314084,
"learning_rate": 4.5339386675748775e-07,
"loss": 0.6916,
"step": 4357
},
{
"epoch": 0.932915897353563,
"grad_norm": 0.13426337054089502,
"learning_rate": 4.5052148023343234e-07,
"loss": 0.6959,
"step": 4358
},
{
"epoch": 0.9331299670867785,
"grad_norm": 0.1350433586386857,
"learning_rate": 4.4765811771287693e-07,
"loss": 0.685,
"step": 4359
},
{
"epoch": 0.9333440368199941,
"grad_norm": 0.14065317528715676,
"learning_rate": 4.44803780517542e-07,
"loss": 0.7046,
"step": 4360
},
{
"epoch": 0.9335581065532097,
"grad_norm": 0.13425958907066096,
"learning_rate": 4.419584699649826e-07,
"loss": 0.666,
"step": 4361
},
{
"epoch": 0.9337721762864253,
"grad_norm": 0.13427276238033473,
"learning_rate": 4.3912218736859467e-07,
"loss": 0.6703,
"step": 4362
},
{
"epoch": 0.9339862460196409,
"grad_norm": 0.1309313948057116,
"learning_rate": 4.362949340375955e-07,
"loss": 0.6681,
"step": 4363
},
{
"epoch": 0.9342003157528564,
"grad_norm": 0.1309459584048416,
"learning_rate": 4.3347671127704327e-07,
"loss": 0.6758,
"step": 4364
},
{
"epoch": 0.9344143854860721,
"grad_norm": 0.13306099235595414,
"learning_rate": 4.306675203878219e-07,
"loss": 0.6685,
"step": 4365
},
{
"epoch": 0.9346284552192877,
"grad_norm": 0.13658798500494218,
"learning_rate": 4.2786736266664965e-07,
"loss": 0.6837,
"step": 4366
},
{
"epoch": 0.9348425249525033,
"grad_norm": 0.13481011599257658,
"learning_rate": 4.250762394060748e-07,
"loss": 0.6928,
"step": 4367
},
{
"epoch": 0.9350565946857189,
"grad_norm": 0.13428232860094064,
"learning_rate": 4.2229415189447344e-07,
"loss": 0.6809,
"step": 4368
},
{
"epoch": 0.9352706644189345,
"grad_norm": 0.13680713632093935,
"learning_rate": 4.195211014160561e-07,
"loss": 0.6959,
"step": 4369
},
{
"epoch": 0.93548473415215,
"grad_norm": 0.1347440990639614,
"learning_rate": 4.167570892508521e-07,
"loss": 0.6776,
"step": 4370
},
{
"epoch": 0.9356988038853656,
"grad_norm": 0.14506695808540718,
"learning_rate": 4.140021166747299e-07,
"loss": 0.6971,
"step": 4371
},
{
"epoch": 0.9359128736185812,
"grad_norm": 0.1444892034589848,
"learning_rate": 4.112561849593766e-07,
"loss": 0.6971,
"step": 4372
},
{
"epoch": 0.9361269433517968,
"grad_norm": 0.1357028449587074,
"learning_rate": 4.085192953723072e-07,
"loss": 0.6765,
"step": 4373
},
{
"epoch": 0.9363410130850125,
"grad_norm": 0.14539471081296,
"learning_rate": 4.0579144917686884e-07,
"loss": 0.6844,
"step": 4374
},
{
"epoch": 0.9365550828182281,
"grad_norm": 0.13454942448465235,
"learning_rate": 4.0307264763223e-07,
"loss": 0.6732,
"step": 4375
},
{
"epoch": 0.9367691525514437,
"grad_norm": 0.13730075304577238,
"learning_rate": 4.0036289199338e-07,
"loss": 0.7159,
"step": 4376
},
{
"epoch": 0.9369832222846592,
"grad_norm": 0.13724928461966887,
"learning_rate": 3.9766218351114495e-07,
"loss": 0.7087,
"step": 4377
},
{
"epoch": 0.9371972920178748,
"grad_norm": 0.13364995741210184,
"learning_rate": 3.949705234321588e-07,
"loss": 0.6863,
"step": 4378
},
{
"epoch": 0.9374113617510904,
"grad_norm": 0.13579667041546697,
"learning_rate": 3.922879129988921e-07,
"loss": 0.6778,
"step": 4379
},
{
"epoch": 0.937625431484306,
"grad_norm": 0.13256914121735144,
"learning_rate": 3.8961435344963216e-07,
"loss": 0.6849,
"step": 4380
},
{
"epoch": 0.9378395012175216,
"grad_norm": 0.13199552332290715,
"learning_rate": 3.8694984601848727e-07,
"loss": 0.6923,
"step": 4381
},
{
"epoch": 0.9380535709507372,
"grad_norm": 0.13486365897241173,
"learning_rate": 3.842943919353914e-07,
"loss": 0.6542,
"step": 4382
},
{
"epoch": 0.9382676406839529,
"grad_norm": 0.14000675774329724,
"learning_rate": 3.8164799242609516e-07,
"loss": 0.7297,
"step": 4383
},
{
"epoch": 0.9384817104171684,
"grad_norm": 0.1370351927551661,
"learning_rate": 3.790106487121725e-07,
"loss": 0.6944,
"step": 4384
},
{
"epoch": 0.938695780150384,
"grad_norm": 0.1339984287202759,
"learning_rate": 3.763823620110207e-07,
"loss": 0.6994,
"step": 4385
},
{
"epoch": 0.9389098498835996,
"grad_norm": 0.21962560537470135,
"learning_rate": 3.737631335358427e-07,
"loss": 0.6857,
"step": 4386
},
{
"epoch": 0.9391239196168152,
"grad_norm": 0.1341062847667069,
"learning_rate": 3.7115296449567795e-07,
"loss": 0.6686,
"step": 4387
},
{
"epoch": 0.9393379893500308,
"grad_norm": 0.15318823133532672,
"learning_rate": 3.685518560953738e-07,
"loss": 0.709,
"step": 4388
},
{
"epoch": 0.9395520590832463,
"grad_norm": 0.13484647147667309,
"learning_rate": 3.659598095355921e-07,
"loss": 0.6827,
"step": 4389
},
{
"epoch": 0.9397661288164619,
"grad_norm": 0.13107261887840005,
"learning_rate": 3.633768260128223e-07,
"loss": 0.6734,
"step": 4390
},
{
"epoch": 0.9399801985496775,
"grad_norm": 0.12984179382874467,
"learning_rate": 3.6080290671936635e-07,
"loss": 0.6622,
"step": 4391
},
{
"epoch": 0.9401942682828931,
"grad_norm": 0.1436758111172662,
"learning_rate": 3.582380528433338e-07,
"loss": 0.6966,
"step": 4392
},
{
"epoch": 0.9404083380161088,
"grad_norm": 0.13607666299249183,
"learning_rate": 3.5568226556866206e-07,
"loss": 0.6861,
"step": 4393
},
{
"epoch": 0.9406224077493244,
"grad_norm": 0.1362418372533418,
"learning_rate": 3.5313554607509846e-07,
"loss": 0.6968,
"step": 4394
},
{
"epoch": 0.94083647748254,
"grad_norm": 0.13870892474796043,
"learning_rate": 3.5059789553819835e-07,
"loss": 0.7313,
"step": 4395
},
{
"epoch": 0.9410505472157555,
"grad_norm": 0.13294179159147404,
"learning_rate": 3.480693151293424e-07,
"loss": 0.6911,
"step": 4396
},
{
"epoch": 0.9412646169489711,
"grad_norm": 0.13480810563573517,
"learning_rate": 3.4554980601571474e-07,
"loss": 0.6941,
"step": 4397
},
{
"epoch": 0.9414786866821867,
"grad_norm": 0.13178243274206683,
"learning_rate": 3.4303936936031624e-07,
"loss": 0.6635,
"step": 4398
},
{
"epoch": 0.9416927564154023,
"grad_norm": 0.1327853646381699,
"learning_rate": 3.4053800632196434e-07,
"loss": 0.67,
"step": 4399
},
{
"epoch": 0.9419068261486179,
"grad_norm": 0.13921429858113968,
"learning_rate": 3.380457180552799e-07,
"loss": 0.6904,
"step": 4400
},
{
"epoch": 0.9421208958818335,
"grad_norm": 0.1353642741588206,
"learning_rate": 3.3556250571069813e-07,
"loss": 0.671,
"step": 4401
},
{
"epoch": 0.9423349656150491,
"grad_norm": 0.13596869594698763,
"learning_rate": 3.3308837043446897e-07,
"loss": 0.7122,
"step": 4402
},
{
"epoch": 0.9425490353482647,
"grad_norm": 0.1361060299958215,
"learning_rate": 3.306233133686454e-07,
"loss": 0.696,
"step": 4403
},
{
"epoch": 0.9427631050814803,
"grad_norm": 0.13932448088370206,
"learning_rate": 3.281673356510928e-07,
"loss": 0.6978,
"step": 4404
},
{
"epoch": 0.9429771748146959,
"grad_norm": 0.13320871046081048,
"learning_rate": 3.2572043841548664e-07,
"loss": 0.688,
"step": 4405
},
{
"epoch": 0.9431912445479115,
"grad_norm": 0.2034995414611716,
"learning_rate": 3.232826227913144e-07,
"loss": 0.7143,
"step": 4406
},
{
"epoch": 0.9434053142811271,
"grad_norm": 0.13209390106435284,
"learning_rate": 3.208538899038605e-07,
"loss": 0.6811,
"step": 4407
},
{
"epoch": 0.9436193840143426,
"grad_norm": 0.13962661634831805,
"learning_rate": 3.1843424087422805e-07,
"loss": 0.7176,
"step": 4408
},
{
"epoch": 0.9438334537475582,
"grad_norm": 0.13101465821042343,
"learning_rate": 3.1602367681932146e-07,
"loss": 0.667,
"step": 4409
},
{
"epoch": 0.9440475234807738,
"grad_norm": 0.1344252901880154,
"learning_rate": 3.1362219885185283e-07,
"loss": 0.6861,
"step": 4410
},
{
"epoch": 0.9442615932139895,
"grad_norm": 0.13888038594671412,
"learning_rate": 3.1122980808033997e-07,
"loss": 0.7037,
"step": 4411
},
{
"epoch": 0.9444756629472051,
"grad_norm": 0.13686136284085942,
"learning_rate": 3.088465056091061e-07,
"loss": 0.6975,
"step": 4412
},
{
"epoch": 0.9446897326804207,
"grad_norm": 0.1338173824839266,
"learning_rate": 3.0647229253828014e-07,
"loss": 0.68,
"step": 4413
},
{
"epoch": 0.9449038024136363,
"grad_norm": 0.13403148746534746,
"learning_rate": 3.041071699637921e-07,
"loss": 0.6726,
"step": 4414
},
{
"epoch": 0.9451178721468518,
"grad_norm": 0.13377541711281205,
"learning_rate": 3.017511389773775e-07,
"loss": 0.6628,
"step": 4415
},
{
"epoch": 0.9453319418800674,
"grad_norm": 0.13616913975187378,
"learning_rate": 2.9940420066658204e-07,
"loss": 0.6846,
"step": 4416
},
{
"epoch": 0.945546011613283,
"grad_norm": 0.13699250871645566,
"learning_rate": 2.970663561147413e-07,
"loss": 0.6778,
"step": 4417
},
{
"epoch": 0.9457600813464986,
"grad_norm": 0.14021969737564713,
"learning_rate": 2.9473760640100546e-07,
"loss": 0.7215,
"step": 4418
},
{
"epoch": 0.9459741510797142,
"grad_norm": 0.1345648386139211,
"learning_rate": 2.924179526003168e-07,
"loss": 0.6825,
"step": 4419
},
{
"epoch": 0.9461882208129299,
"grad_norm": 0.13169122305121064,
"learning_rate": 2.901073957834255e-07,
"loss": 0.687,
"step": 4420
},
{
"epoch": 0.9464022905461454,
"grad_norm": 0.1338011288145907,
"learning_rate": 2.8780593701688064e-07,
"loss": 0.6873,
"step": 4421
},
{
"epoch": 0.946616360279361,
"grad_norm": 0.13758208575962663,
"learning_rate": 2.855135773630302e-07,
"loss": 0.7097,
"step": 4422
},
{
"epoch": 0.9468304300125766,
"grad_norm": 0.13515567347664284,
"learning_rate": 2.832303178800233e-07,
"loss": 0.665,
"step": 4423
},
{
"epoch": 0.9470444997457922,
"grad_norm": 0.35350120064438184,
"learning_rate": 2.80956159621808e-07,
"loss": 0.7249,
"step": 4424
},
{
"epoch": 0.9472585694790078,
"grad_norm": 0.1367390259092207,
"learning_rate": 2.7869110363813344e-07,
"loss": 0.7237,
"step": 4425
},
{
"epoch": 0.9474726392122234,
"grad_norm": 0.13325065939442832,
"learning_rate": 2.7643515097454554e-07,
"loss": 0.6605,
"step": 4426
},
{
"epoch": 0.9476867089454389,
"grad_norm": 0.14815206254336985,
"learning_rate": 2.7418830267238463e-07,
"loss": 0.6906,
"step": 4427
},
{
"epoch": 0.9479007786786545,
"grad_norm": 0.13320652457641352,
"learning_rate": 2.719505597687944e-07,
"loss": 0.6752,
"step": 4428
},
{
"epoch": 0.9481148484118702,
"grad_norm": 0.1383322152862038,
"learning_rate": 2.6972192329671077e-07,
"loss": 0.6748,
"step": 4429
},
{
"epoch": 0.9483289181450858,
"grad_norm": 0.13515569302825342,
"learning_rate": 2.675023942848687e-07,
"loss": 0.7014,
"step": 4430
},
{
"epoch": 0.9485429878783014,
"grad_norm": 0.13961809113509327,
"learning_rate": 2.6529197375780414e-07,
"loss": 0.7026,
"step": 4431
},
{
"epoch": 0.948757057611517,
"grad_norm": 0.1332682385828863,
"learning_rate": 2.630906627358343e-07,
"loss": 0.6859,
"step": 4432
},
{
"epoch": 0.9489711273447325,
"grad_norm": 0.1332312600242191,
"learning_rate": 2.6089846223508853e-07,
"loss": 0.6807,
"step": 4433
},
{
"epoch": 0.9491851970779481,
"grad_norm": 0.13678085190562997,
"learning_rate": 2.587153732674752e-07,
"loss": 0.7067,
"step": 4434
},
{
"epoch": 0.9493992668111637,
"grad_norm": 0.13671878737652773,
"learning_rate": 2.5654139684070823e-07,
"loss": 0.7146,
"step": 4435
},
{
"epoch": 0.9496133365443793,
"grad_norm": 0.13399327711840386,
"learning_rate": 2.5437653395829374e-07,
"loss": 0.675,
"step": 4436
},
{
"epoch": 0.9498274062775949,
"grad_norm": 0.13492856584051507,
"learning_rate": 2.5222078561952133e-07,
"loss": 0.6755,
"step": 4437
},
{
"epoch": 0.9500414760108106,
"grad_norm": 0.13405416284934196,
"learning_rate": 2.500741528194883e-07,
"loss": 0.6931,
"step": 4438
},
{
"epoch": 0.9502555457440262,
"grad_norm": 0.1335342587000633,
"learning_rate": 2.4793663654906873e-07,
"loss": 0.6749,
"step": 4439
},
{
"epoch": 0.9504696154772417,
"grad_norm": 0.13363797238302091,
"learning_rate": 2.4580823779494223e-07,
"loss": 0.6909,
"step": 4440
},
{
"epoch": 0.9506836852104573,
"grad_norm": 0.13993036987058308,
"learning_rate": 2.436889575395718e-07,
"loss": 0.7144,
"step": 4441
},
{
"epoch": 0.9508977549436729,
"grad_norm": 0.1350306114647872,
"learning_rate": 2.415787967612127e-07,
"loss": 0.6808,
"step": 4442
},
{
"epoch": 0.9511118246768885,
"grad_norm": 0.132935355904939,
"learning_rate": 2.394777564339146e-07,
"loss": 0.6922,
"step": 4443
},
{
"epoch": 0.9513258944101041,
"grad_norm": 0.13584181055687816,
"learning_rate": 2.373858375275062e-07,
"loss": 0.7099,
"step": 4444
},
{
"epoch": 0.9515399641433196,
"grad_norm": 0.13149826096458744,
"learning_rate": 2.353030410076218e-07,
"loss": 0.6684,
"step": 4445
},
{
"epoch": 0.9517540338765352,
"grad_norm": 0.13398714533216113,
"learning_rate": 2.332293678356723e-07,
"loss": 0.6838,
"step": 4446
},
{
"epoch": 0.9519681036097509,
"grad_norm": 0.13535967046509848,
"learning_rate": 2.311648189688609e-07,
"loss": 0.71,
"step": 4447
},
{
"epoch": 0.9521821733429665,
"grad_norm": 0.13057818672857943,
"learning_rate": 2.2910939536018307e-07,
"loss": 0.6772,
"step": 4448
},
{
"epoch": 0.9523962430761821,
"grad_norm": 0.13637946468727496,
"learning_rate": 2.2706309795841318e-07,
"loss": 0.7041,
"step": 4449
},
{
"epoch": 0.9526103128093977,
"grad_norm": 0.13425348068570012,
"learning_rate": 2.250259277081246e-07,
"loss": 0.683,
"step": 4450
},
{
"epoch": 0.9528243825426133,
"grad_norm": 0.1333832035130265,
"learning_rate": 2.2299788554966507e-07,
"loss": 0.6914,
"step": 4451
},
{
"epoch": 0.9530384522758288,
"grad_norm": 0.1308044899213028,
"learning_rate": 2.209789724191791e-07,
"loss": 0.6722,
"step": 4452
},
{
"epoch": 0.9532525220090444,
"grad_norm": 0.1372767061875979,
"learning_rate": 2.1896918924859457e-07,
"loss": 0.7358,
"step": 4453
},
{
"epoch": 0.95346659174226,
"grad_norm": 0.1371866888592856,
"learning_rate": 2.1696853696562047e-07,
"loss": 0.685,
"step": 4454
},
{
"epoch": 0.9536806614754756,
"grad_norm": 0.13061073792711084,
"learning_rate": 2.149770164937559e-07,
"loss": 0.685,
"step": 4455
},
{
"epoch": 0.9538947312086913,
"grad_norm": 0.13442012214273127,
"learning_rate": 2.1299462875228105e-07,
"loss": 0.6849,
"step": 4456
},
{
"epoch": 0.9541088009419069,
"grad_norm": 0.15943456154091798,
"learning_rate": 2.1102137465626615e-07,
"loss": 0.6692,
"step": 4457
},
{
"epoch": 0.9543228706751224,
"grad_norm": 0.132934193772929,
"learning_rate": 2.0905725511655815e-07,
"loss": 0.6776,
"step": 4458
},
{
"epoch": 0.954536940408338,
"grad_norm": 0.13564113269996744,
"learning_rate": 2.0710227103979186e-07,
"loss": 0.6639,
"step": 4459
},
{
"epoch": 0.9547510101415536,
"grad_norm": 0.13845246517811657,
"learning_rate": 2.0515642332838537e-07,
"loss": 0.7074,
"step": 4460
},
{
"epoch": 0.9549650798747692,
"grad_norm": 0.13403493581662498,
"learning_rate": 2.032197128805402e-07,
"loss": 0.68,
"step": 4461
},
{
"epoch": 0.9551791496079848,
"grad_norm": 0.138020929093702,
"learning_rate": 2.012921405902346e-07,
"loss": 0.7176,
"step": 4462
},
{
"epoch": 0.9553932193412004,
"grad_norm": 0.1363071013158765,
"learning_rate": 1.993737073472324e-07,
"loss": 0.6726,
"step": 4463
},
{
"epoch": 0.9556072890744159,
"grad_norm": 0.13671866164881524,
"learning_rate": 1.9746441403708294e-07,
"loss": 0.7132,
"step": 4464
},
{
"epoch": 0.9558213588076316,
"grad_norm": 0.13271304688009625,
"learning_rate": 1.9556426154110798e-07,
"loss": 0.6677,
"step": 4465
},
{
"epoch": 0.9560354285408472,
"grad_norm": 0.14321486852087098,
"learning_rate": 1.9367325073641695e-07,
"loss": 0.7064,
"step": 4466
},
{
"epoch": 0.9562494982740628,
"grad_norm": 0.1347899281523679,
"learning_rate": 1.9179138249589836e-07,
"loss": 0.6871,
"step": 4467
},
{
"epoch": 0.9564635680072784,
"grad_norm": 0.1350541043284291,
"learning_rate": 1.8991865768821506e-07,
"loss": 0.6617,
"step": 4468
},
{
"epoch": 0.956677637740494,
"grad_norm": 0.1560933060408755,
"learning_rate": 1.8805507717781558e-07,
"loss": 0.6981,
"step": 4469
},
{
"epoch": 0.9568917074737096,
"grad_norm": 0.17697761160628103,
"learning_rate": 1.8620064182492513e-07,
"loss": 0.6937,
"step": 4470
},
{
"epoch": 0.9571057772069251,
"grad_norm": 0.12982871103577237,
"learning_rate": 1.8435535248554792e-07,
"loss": 0.6664,
"step": 4471
},
{
"epoch": 0.9573198469401407,
"grad_norm": 0.31474112827900536,
"learning_rate": 1.825192100114692e-07,
"loss": 0.6866,
"step": 4472
},
{
"epoch": 0.9575339166733563,
"grad_norm": 0.1320021736959052,
"learning_rate": 1.8069221525024217e-07,
"loss": 0.6785,
"step": 4473
},
{
"epoch": 0.957747986406572,
"grad_norm": 0.13296474083529464,
"learning_rate": 1.7887436904520772e-07,
"loss": 0.679,
"step": 4474
},
{
"epoch": 0.9579620561397876,
"grad_norm": 0.131819375973322,
"learning_rate": 1.7706567223548353e-07,
"loss": 0.6693,
"step": 4475
},
{
"epoch": 0.9581761258730032,
"grad_norm": 0.1333344066768023,
"learning_rate": 1.7526612565595513e-07,
"loss": 0.6722,
"step": 4476
},
{
"epoch": 0.9583901956062187,
"grad_norm": 0.13502912752118432,
"learning_rate": 1.7347573013729357e-07,
"loss": 0.7027,
"step": 4477
},
{
"epoch": 0.9586042653394343,
"grad_norm": 0.13916177471834354,
"learning_rate": 1.7169448650594e-07,
"loss": 0.7026,
"step": 4478
},
{
"epoch": 0.9588183350726499,
"grad_norm": 0.13060265900714255,
"learning_rate": 1.6992239558411448e-07,
"loss": 0.6887,
"step": 4479
},
{
"epoch": 0.9590324048058655,
"grad_norm": 0.13025350318471712,
"learning_rate": 1.6815945818981382e-07,
"loss": 0.6729,
"step": 4480
},
{
"epoch": 0.9592464745390811,
"grad_norm": 0.1329791169788774,
"learning_rate": 1.664056751368004e-07,
"loss": 0.6825,
"step": 4481
},
{
"epoch": 0.9594605442722967,
"grad_norm": 0.13406186774432716,
"learning_rate": 1.6466104723461995e-07,
"loss": 0.6926,
"step": 4482
},
{
"epoch": 0.9596746140055123,
"grad_norm": 0.13551817770086835,
"learning_rate": 1.6292557528859276e-07,
"loss": 0.7007,
"step": 4483
},
{
"epoch": 0.9598886837387279,
"grad_norm": 0.13647123698636426,
"learning_rate": 1.6119926009980468e-07,
"loss": 0.6807,
"step": 4484
},
{
"epoch": 0.9601027534719435,
"grad_norm": 0.1363350685771318,
"learning_rate": 1.5948210246512276e-07,
"loss": 0.712,
"step": 4485
},
{
"epoch": 0.9603168232051591,
"grad_norm": 0.14022481649117083,
"learning_rate": 1.57774103177184e-07,
"loss": 0.6867,
"step": 4486
},
{
"epoch": 0.9605308929383747,
"grad_norm": 0.13422756693687948,
"learning_rate": 1.5607526302439558e-07,
"loss": 0.6975,
"step": 4487
},
{
"epoch": 0.9607449626715903,
"grad_norm": 0.13485927075782286,
"learning_rate": 1.5438558279093907e-07,
"loss": 0.6973,
"step": 4488
},
{
"epoch": 0.9609590324048058,
"grad_norm": 0.1337655303690488,
"learning_rate": 1.5270506325676838e-07,
"loss": 0.6923,
"step": 4489
},
{
"epoch": 0.9611731021380214,
"grad_norm": 0.13397662237486613,
"learning_rate": 1.5103370519760963e-07,
"loss": 0.6814,
"step": 4490
},
{
"epoch": 0.961387171871237,
"grad_norm": 0.13626932420697732,
"learning_rate": 1.4937150938495682e-07,
"loss": 0.6974,
"step": 4491
},
{
"epoch": 0.9616012416044527,
"grad_norm": 0.13348708982316596,
"learning_rate": 1.4771847658608063e-07,
"loss": 0.6756,
"step": 4492
},
{
"epoch": 0.9618153113376683,
"grad_norm": 0.13474337487021495,
"learning_rate": 1.460746075640107e-07,
"loss": 0.6977,
"step": 4493
},
{
"epoch": 0.9620293810708839,
"grad_norm": 0.13275729646111029,
"learning_rate": 1.4443990307755784e-07,
"loss": 0.6781,
"step": 4494
},
{
"epoch": 0.9622434508040995,
"grad_norm": 0.1354756638284012,
"learning_rate": 1.4281436388130066e-07,
"loss": 0.6998,
"step": 4495
},
{
"epoch": 0.962457520537315,
"grad_norm": 0.1374455786312147,
"learning_rate": 1.4119799072558339e-07,
"loss": 0.7162,
"step": 4496
},
{
"epoch": 0.9626715902705306,
"grad_norm": 0.13387082697697542,
"learning_rate": 1.395907843565203e-07,
"loss": 0.6874,
"step": 4497
},
{
"epoch": 0.9628856600037462,
"grad_norm": 0.13300187771492367,
"learning_rate": 1.379927455159935e-07,
"loss": 0.6898,
"step": 4498
},
{
"epoch": 0.9630997297369618,
"grad_norm": 0.13649271462598345,
"learning_rate": 1.364038749416574e-07,
"loss": 0.6862,
"step": 4499
},
{
"epoch": 0.9633137994701774,
"grad_norm": 0.13130800543235865,
"learning_rate": 1.3482417336693198e-07,
"loss": 0.6665,
"step": 4500
},
{
"epoch": 0.963527869203393,
"grad_norm": 0.1353247385654163,
"learning_rate": 1.3325364152100063e-07,
"loss": 0.6953,
"step": 4501
},
{
"epoch": 0.9637419389366086,
"grad_norm": 0.13627686124618352,
"learning_rate": 1.316922801288234e-07,
"loss": 0.712,
"step": 4502
},
{
"epoch": 0.9639560086698242,
"grad_norm": 0.13842766847637222,
"learning_rate": 1.3014008991111936e-07,
"loss": 0.6979,
"step": 4503
},
{
"epoch": 0.9641700784030398,
"grad_norm": 0.13539032497831988,
"learning_rate": 1.285970715843754e-07,
"loss": 0.6989,
"step": 4504
},
{
"epoch": 0.9643841481362554,
"grad_norm": 0.13114386394446242,
"learning_rate": 1.270632258608484e-07,
"loss": 0.6835,
"step": 4505
},
{
"epoch": 0.964598217869471,
"grad_norm": 0.1347767192348643,
"learning_rate": 1.2553855344855648e-07,
"loss": 0.6578,
"step": 4506
},
{
"epoch": 0.9648122876026866,
"grad_norm": 0.17564488194509872,
"learning_rate": 1.2402305505128553e-07,
"loss": 0.6902,
"step": 4507
},
{
"epoch": 0.9650263573359021,
"grad_norm": 0.135273284231026,
"learning_rate": 1.2251673136858931e-07,
"loss": 0.6883,
"step": 4508
},
{
"epoch": 0.9652404270691177,
"grad_norm": 0.1377201593590194,
"learning_rate": 1.2101958309578275e-07,
"loss": 0.6963,
"step": 4509
},
{
"epoch": 0.9654544968023333,
"grad_norm": 0.13284051953668552,
"learning_rate": 1.1953161092394637e-07,
"loss": 0.6621,
"step": 4510
},
{
"epoch": 0.965668566535549,
"grad_norm": 0.1335395751103413,
"learning_rate": 1.1805281553992631e-07,
"loss": 0.7218,
"step": 4511
},
{
"epoch": 0.9658826362687646,
"grad_norm": 0.1326742311677986,
"learning_rate": 1.1658319762633207e-07,
"loss": 0.6955,
"step": 4512
},
{
"epoch": 0.9660967060019802,
"grad_norm": 0.1333845974289204,
"learning_rate": 1.1512275786153437e-07,
"loss": 0.6829,
"step": 4513
},
{
"epoch": 0.9663107757351957,
"grad_norm": 0.13277402275570413,
"learning_rate": 1.136714969196695e-07,
"loss": 0.6828,
"step": 4514
},
{
"epoch": 0.9665248454684113,
"grad_norm": 0.13891227491218763,
"learning_rate": 1.1222941547064159e-07,
"loss": 0.6815,
"step": 4515
},
{
"epoch": 0.9667389152016269,
"grad_norm": 0.1325049108312949,
"learning_rate": 1.1079651418010706e-07,
"loss": 0.6569,
"step": 4516
},
{
"epoch": 0.9669529849348425,
"grad_norm": 0.1311403374148213,
"learning_rate": 1.0937279370949461e-07,
"loss": 0.6904,
"step": 4517
},
{
"epoch": 0.9671670546680581,
"grad_norm": 0.13610484449605284,
"learning_rate": 1.0795825471598742e-07,
"loss": 0.6825,
"step": 4518
},
{
"epoch": 0.9673811244012737,
"grad_norm": 0.1357642611537287,
"learning_rate": 1.0655289785253875e-07,
"loss": 0.6813,
"step": 4519
},
{
"epoch": 0.9675951941344894,
"grad_norm": 0.13416760158515398,
"learning_rate": 1.0515672376785413e-07,
"loss": 0.6915,
"step": 4520
},
{
"epoch": 0.9678092638677049,
"grad_norm": 0.13157043480676256,
"learning_rate": 1.0376973310640692e-07,
"loss": 0.6847,
"step": 4521
},
{
"epoch": 0.9680233336009205,
"grad_norm": 0.1312532642516754,
"learning_rate": 1.0239192650842944e-07,
"loss": 0.6819,
"step": 4522
},
{
"epoch": 0.9682374033341361,
"grad_norm": 0.17392334900415451,
"learning_rate": 1.0102330460991516e-07,
"loss": 0.7287,
"step": 4523
},
{
"epoch": 0.9684514730673517,
"grad_norm": 0.13614588847997638,
"learning_rate": 9.966386804261651e-08,
"loss": 0.6857,
"step": 4524
},
{
"epoch": 0.9686655428005673,
"grad_norm": 0.13761547708688895,
"learning_rate": 9.831361743404711e-08,
"loss": 0.6998,
"step": 4525
},
{
"epoch": 0.9688796125337829,
"grad_norm": 0.13365846991376126,
"learning_rate": 9.697255340748169e-08,
"loss": 0.6638,
"step": 4526
},
{
"epoch": 0.9690936822669984,
"grad_norm": 0.13092060620285198,
"learning_rate": 9.564067658195175e-08,
"loss": 0.6685,
"step": 4527
},
{
"epoch": 0.969307752000214,
"grad_norm": 0.133799858697993,
"learning_rate": 9.431798757224775e-08,
"loss": 0.6734,
"step": 4528
},
{
"epoch": 0.9695218217334297,
"grad_norm": 0.13258756362529836,
"learning_rate": 9.300448698892128e-08,
"loss": 0.7031,
"step": 4529
},
{
"epoch": 0.9697358914666453,
"grad_norm": 0.13506452719613907,
"learning_rate": 9.170017543828291e-08,
"loss": 0.6823,
"step": 4530
},
{
"epoch": 0.9699499611998609,
"grad_norm": 0.12968976156955592,
"learning_rate": 9.040505352240215e-08,
"loss": 0.6692,
"step": 4531
},
{
"epoch": 0.9701640309330765,
"grad_norm": 0.1393291618833024,
"learning_rate": 8.911912183910077e-08,
"loss": 0.7383,
"step": 4532
},
{
"epoch": 0.970378100666292,
"grad_norm": 0.13532171209441804,
"learning_rate": 8.784238098196396e-08,
"loss": 0.6859,
"step": 4533
},
{
"epoch": 0.9705921703995076,
"grad_norm": 0.13127295571392864,
"learning_rate": 8.657483154033586e-08,
"loss": 0.6821,
"step": 4534
},
{
"epoch": 0.9708062401327232,
"grad_norm": 0.13111413121684962,
"learning_rate": 8.531647409931065e-08,
"loss": 0.6674,
"step": 4535
},
{
"epoch": 0.9710203098659388,
"grad_norm": 0.13673708125171508,
"learning_rate": 8.406730923974593e-08,
"loss": 0.689,
"step": 4536
},
{
"epoch": 0.9712343795991544,
"grad_norm": 0.1346365686237501,
"learning_rate": 8.282733753825378e-08,
"loss": 0.7005,
"step": 4537
},
{
"epoch": 0.9714484493323701,
"grad_norm": 0.13564020677220007,
"learning_rate": 8.159655956720303e-08,
"loss": 0.6937,
"step": 4538
},
{
"epoch": 0.9716625190655857,
"grad_norm": 0.13294099028119316,
"learning_rate": 8.037497589471699e-08,
"loss": 0.6826,
"step": 4539
},
{
"epoch": 0.9718765887988012,
"grad_norm": 0.13263632674633796,
"learning_rate": 7.916258708468016e-08,
"loss": 0.6925,
"step": 4540
},
{
"epoch": 0.9720906585320168,
"grad_norm": 0.3907402055413767,
"learning_rate": 7.79593936967249e-08,
"loss": 0.6641,
"step": 4541
},
{
"epoch": 0.9723047282652324,
"grad_norm": 0.1345438436171536,
"learning_rate": 7.676539628624469e-08,
"loss": 0.6759,
"step": 4542
},
{
"epoch": 0.972518797998448,
"grad_norm": 0.13517071987349472,
"learning_rate": 7.558059540438755e-08,
"loss": 0.7079,
"step": 4543
},
{
"epoch": 0.9727328677316636,
"grad_norm": 0.12940672250537424,
"learning_rate": 7.440499159805381e-08,
"loss": 0.6713,
"step": 4544
},
{
"epoch": 0.9729469374648791,
"grad_norm": 0.13381568999577242,
"learning_rate": 7.323858540990047e-08,
"loss": 0.6828,
"step": 4545
},
{
"epoch": 0.9731610071980947,
"grad_norm": 0.13679608309033056,
"learning_rate": 7.208137737833908e-08,
"loss": 0.701,
"step": 4546
},
{
"epoch": 0.9733750769313104,
"grad_norm": 0.1354627134399797,
"learning_rate": 7.093336803753347e-08,
"loss": 0.6907,
"step": 4547
},
{
"epoch": 0.973589146664526,
"grad_norm": 0.22164076577938704,
"learning_rate": 6.979455791740641e-08,
"loss": 0.6972,
"step": 4548
},
{
"epoch": 0.9738032163977416,
"grad_norm": 0.13698734603919305,
"learning_rate": 6.86649475436263e-08,
"loss": 0.7268,
"step": 4549
},
{
"epoch": 0.9740172861309572,
"grad_norm": 0.1358707421234845,
"learning_rate": 6.754453743761824e-08,
"loss": 0.682,
"step": 4550
},
{
"epoch": 0.9742313558641728,
"grad_norm": 0.13640199519497662,
"learning_rate": 6.643332811656633e-08,
"loss": 0.7169,
"step": 4551
},
{
"epoch": 0.9744454255973883,
"grad_norm": 0.13291791306820896,
"learning_rate": 6.533132009340026e-08,
"loss": 0.6747,
"step": 4552
},
{
"epoch": 0.9746594953306039,
"grad_norm": 0.13214013184003487,
"learning_rate": 6.423851387680424e-08,
"loss": 0.6771,
"step": 4553
},
{
"epoch": 0.9748735650638195,
"grad_norm": 0.1382466117805832,
"learning_rate": 6.315490997121698e-08,
"loss": 0.7015,
"step": 4554
},
{
"epoch": 0.9750876347970351,
"grad_norm": 0.13628649993624967,
"learning_rate": 6.208050887682727e-08,
"loss": 0.718,
"step": 4555
},
{
"epoch": 0.9753017045302508,
"grad_norm": 0.13249796108740908,
"learning_rate": 6.101531108957614e-08,
"loss": 0.686,
"step": 4556
},
{
"epoch": 0.9755157742634664,
"grad_norm": 0.1342226678938587,
"learning_rate": 5.995931710115921e-08,
"loss": 0.6844,
"step": 4557
},
{
"epoch": 0.975729843996682,
"grad_norm": 0.13273625153984353,
"learning_rate": 5.891252739901765e-08,
"loss": 0.6894,
"step": 4558
},
{
"epoch": 0.9759439137298975,
"grad_norm": 0.13228113597366548,
"learning_rate": 5.787494246635161e-08,
"loss": 0.7049,
"step": 4559
},
{
"epoch": 0.9761579834631131,
"grad_norm": 0.13027522400683192,
"learning_rate": 5.684656278210687e-08,
"loss": 0.6666,
"step": 4560
},
{
"epoch": 0.9763720531963287,
"grad_norm": 0.16405356281879138,
"learning_rate": 5.5827388820979265e-08,
"loss": 0.6831,
"step": 4561
},
{
"epoch": 0.9765861229295443,
"grad_norm": 0.12884738108177957,
"learning_rate": 5.481742105342136e-08,
"loss": 0.6602,
"step": 4562
},
{
"epoch": 0.9768001926627599,
"grad_norm": 0.13399673236269471,
"learning_rate": 5.3816659945631346e-08,
"loss": 0.6836,
"step": 4563
},
{
"epoch": 0.9770142623959754,
"grad_norm": 0.1356504234992234,
"learning_rate": 5.282510595955748e-08,
"loss": 0.6942,
"step": 4564
},
{
"epoch": 0.9772283321291911,
"grad_norm": 0.13302492348170722,
"learning_rate": 5.18427595529003e-08,
"loss": 0.6698,
"step": 4565
},
{
"epoch": 0.9774424018624067,
"grad_norm": 0.12824289024922397,
"learning_rate": 5.086962117910821e-08,
"loss": 0.6618,
"step": 4566
},
{
"epoch": 0.9776564715956223,
"grad_norm": 0.1362923366364546,
"learning_rate": 4.990569128737965e-08,
"loss": 0.6799,
"step": 4567
},
{
"epoch": 0.9778705413288379,
"grad_norm": 0.13261851006172748,
"learning_rate": 4.895097032266538e-08,
"loss": 0.6854,
"step": 4568
},
{
"epoch": 0.9780846110620535,
"grad_norm": 0.13578132918616978,
"learning_rate": 4.800545872566176e-08,
"loss": 0.6907,
"step": 4569
},
{
"epoch": 0.978298680795269,
"grad_norm": 0.13198275226992776,
"learning_rate": 4.7069156932813e-08,
"loss": 0.6945,
"step": 4570
},
{
"epoch": 0.9785127505284846,
"grad_norm": 0.13245423791551902,
"learning_rate": 4.614206537631783e-08,
"loss": 0.6792,
"step": 4571
},
{
"epoch": 0.9787268202617002,
"grad_norm": 0.1342756783960331,
"learning_rate": 4.522418448411614e-08,
"loss": 0.6971,
"step": 4572
},
{
"epoch": 0.9789408899949158,
"grad_norm": 0.13353541275258607,
"learning_rate": 4.431551467990458e-08,
"loss": 0.6806,
"step": 4573
},
{
"epoch": 0.9791549597281315,
"grad_norm": 0.1368572601063207,
"learning_rate": 4.3416056383120964e-08,
"loss": 0.7087,
"step": 4574
},
{
"epoch": 0.9793690294613471,
"grad_norm": 0.1348599817122092,
"learning_rate": 4.252581000895095e-08,
"loss": 0.672,
"step": 4575
},
{
"epoch": 0.9795830991945627,
"grad_norm": 0.1345546877940355,
"learning_rate": 4.164477596833694e-08,
"loss": 0.7196,
"step": 4576
},
{
"epoch": 0.9797971689277782,
"grad_norm": 0.13187991954492437,
"learning_rate": 4.0772954667958055e-08,
"loss": 0.6767,
"step": 4577
},
{
"epoch": 0.9800112386609938,
"grad_norm": 0.13743497793794174,
"learning_rate": 3.991034651024572e-08,
"loss": 0.7028,
"step": 4578
},
{
"epoch": 0.9802253083942094,
"grad_norm": 0.13462939869563612,
"learning_rate": 3.905695189337921e-08,
"loss": 0.6947,
"step": 4579
},
{
"epoch": 0.980439378127425,
"grad_norm": 0.13259974964785384,
"learning_rate": 3.821277121128342e-08,
"loss": 0.6691,
"step": 4580
},
{
"epoch": 0.9806534478606406,
"grad_norm": 0.13137142332961674,
"learning_rate": 3.737780485363107e-08,
"loss": 0.6888,
"step": 4581
},
{
"epoch": 0.9808675175938562,
"grad_norm": 0.1324316788555167,
"learning_rate": 3.6552053205842766e-08,
"loss": 0.691,
"step": 4582
},
{
"epoch": 0.9810815873270718,
"grad_norm": 0.13704081128650358,
"learning_rate": 3.5735516649080257e-08,
"loss": 0.7123,
"step": 4583
},
{
"epoch": 0.9812956570602874,
"grad_norm": 0.13262662939039688,
"learning_rate": 3.4928195560257614e-08,
"loss": 0.6789,
"step": 4584
},
{
"epoch": 0.981509726793503,
"grad_norm": 0.13835703252968434,
"learning_rate": 3.413009031203229e-08,
"loss": 0.701,
"step": 4585
},
{
"epoch": 0.9817237965267186,
"grad_norm": 0.13324173506990608,
"learning_rate": 3.334120127280738e-08,
"loss": 0.6722,
"step": 4586
},
{
"epoch": 0.9819378662599342,
"grad_norm": 0.13913532729510986,
"learning_rate": 3.256152880673602e-08,
"loss": 0.7032,
"step": 4587
},
{
"epoch": 0.9821519359931498,
"grad_norm": 0.13414261865764993,
"learning_rate": 3.179107327370812e-08,
"loss": 0.6785,
"step": 4588
},
{
"epoch": 0.9823660057263653,
"grad_norm": 0.13619968507042535,
"learning_rate": 3.102983502937029e-08,
"loss": 0.6929,
"step": 4589
},
{
"epoch": 0.9825800754595809,
"grad_norm": 0.1333380174811774,
"learning_rate": 3.027781442510369e-08,
"loss": 0.6589,
"step": 4590
},
{
"epoch": 0.9827941451927965,
"grad_norm": 0.13474858725447048,
"learning_rate": 2.9535011808043967e-08,
"loss": 0.6876,
"step": 4591
},
{
"epoch": 0.9830082149260122,
"grad_norm": 0.12802733758990661,
"learning_rate": 2.880142752106574e-08,
"loss": 0.6623,
"step": 4592
},
{
"epoch": 0.9832222846592278,
"grad_norm": 0.1326494823025216,
"learning_rate": 2.8077061902787028e-08,
"loss": 0.6812,
"step": 4593
},
{
"epoch": 0.9834363543924434,
"grad_norm": 0.1305521513669887,
"learning_rate": 2.7361915287578144e-08,
"loss": 0.6596,
"step": 4594
},
{
"epoch": 0.983650424125659,
"grad_norm": 0.13359993975925222,
"learning_rate": 2.665598800554836e-08,
"loss": 0.6826,
"step": 4595
},
{
"epoch": 0.9838644938588745,
"grad_norm": 0.13421851061639803,
"learning_rate": 2.5959280382550355e-08,
"loss": 0.6894,
"step": 4596
},
{
"epoch": 0.9840785635920901,
"grad_norm": 0.13241344494829288,
"learning_rate": 2.5271792740186874e-08,
"loss": 0.6654,
"step": 4597
},
{
"epoch": 0.9842926333253057,
"grad_norm": 0.13343662178139085,
"learning_rate": 2.4593525395797402e-08,
"loss": 0.6888,
"step": 4598
},
{
"epoch": 0.9845067030585213,
"grad_norm": 0.14384676987663056,
"learning_rate": 2.3924478662469275e-08,
"loss": 0.7035,
"step": 4599
},
{
"epoch": 0.9847207727917369,
"grad_norm": 0.13538969155135305,
"learning_rate": 2.326465284903545e-08,
"loss": 0.6904,
"step": 4600
},
{
"epoch": 0.9849348425249526,
"grad_norm": 0.13219703601808955,
"learning_rate": 2.2614048260067856e-08,
"loss": 0.6757,
"step": 4601
},
{
"epoch": 0.9851489122581681,
"grad_norm": 0.1336655266590498,
"learning_rate": 2.1972665195886256e-08,
"loss": 0.6883,
"step": 4602
},
{
"epoch": 0.9853629819913837,
"grad_norm": 0.13353985092154214,
"learning_rate": 2.1340503952551606e-08,
"loss": 0.69,
"step": 4603
},
{
"epoch": 0.9855770517245993,
"grad_norm": 0.13625638470157295,
"learning_rate": 2.0717564821868264e-08,
"loss": 0.7155,
"step": 4604
},
{
"epoch": 0.9857911214578149,
"grad_norm": 0.1369602486021807,
"learning_rate": 2.0103848091381773e-08,
"loss": 0.6773,
"step": 4605
},
{
"epoch": 0.9860051911910305,
"grad_norm": 0.13519674482160918,
"learning_rate": 1.949935404438552e-08,
"loss": 0.6888,
"step": 4606
},
{
"epoch": 0.986219260924246,
"grad_norm": 0.13288621786661212,
"learning_rate": 1.890408295990964e-08,
"loss": 0.6626,
"step": 4607
},
{
"epoch": 0.9864333306574616,
"grad_norm": 0.13460149976880195,
"learning_rate": 1.8318035112734335e-08,
"loss": 0.6882,
"step": 4608
},
{
"epoch": 0.9866474003906772,
"grad_norm": 0.13397113179129877,
"learning_rate": 1.7741210773376538e-08,
"loss": 0.6898,
"step": 4609
},
{
"epoch": 0.9868614701238928,
"grad_norm": 0.13250612058289976,
"learning_rate": 1.7173610208096603e-08,
"loss": 0.656,
"step": 4610
},
{
"epoch": 0.9870755398571085,
"grad_norm": 0.1330792982515392,
"learning_rate": 1.661523367889606e-08,
"loss": 0.6772,
"step": 4611
},
{
"epoch": 0.9872896095903241,
"grad_norm": 0.1367407065381511,
"learning_rate": 1.6066081443524284e-08,
"loss": 0.7211,
"step": 4612
},
{
"epoch": 0.9875036793235397,
"grad_norm": 0.12994364033838343,
"learning_rate": 1.55261537554674e-08,
"loss": 0.6624,
"step": 4613
},
{
"epoch": 0.9877177490567552,
"grad_norm": 0.1386557268862067,
"learning_rate": 1.499545086395493e-08,
"loss": 0.6876,
"step": 4614
},
{
"epoch": 0.9879318187899708,
"grad_norm": 0.14067556190485303,
"learning_rate": 1.4473973013957587e-08,
"loss": 0.6934,
"step": 4615
},
{
"epoch": 0.9881458885231864,
"grad_norm": 0.13844331925045963,
"learning_rate": 1.3961720446191707e-08,
"loss": 0.7093,
"step": 4616
},
{
"epoch": 0.988359958256402,
"grad_norm": 0.13448001465852008,
"learning_rate": 1.3458693397105926e-08,
"loss": 0.6757,
"step": 4617
},
{
"epoch": 0.9885740279896176,
"grad_norm": 0.16708772475882527,
"learning_rate": 1.2964892098903393e-08,
"loss": 0.6857,
"step": 4618
},
{
"epoch": 0.9887880977228332,
"grad_norm": 0.3096230916909998,
"learning_rate": 1.2480316779517332e-08,
"loss": 0.7003,
"step": 4619
},
{
"epoch": 0.9890021674560489,
"grad_norm": 0.1353265563541458,
"learning_rate": 1.2004967662628819e-08,
"loss": 0.6971,
"step": 4620
},
{
"epoch": 0.9892162371892644,
"grad_norm": 0.13128722461174966,
"learning_rate": 1.1538844967660112e-08,
"loss": 0.6693,
"step": 4621
},
{
"epoch": 0.98943030692248,
"grad_norm": 0.13025951588726517,
"learning_rate": 1.1081948909767992e-08,
"loss": 0.6588,
"step": 4622
},
{
"epoch": 0.9896443766556956,
"grad_norm": 0.13357595007751152,
"learning_rate": 1.0634279699857086e-08,
"loss": 0.6686,
"step": 4623
},
{
"epoch": 0.9898584463889112,
"grad_norm": 0.13467638747773059,
"learning_rate": 1.0195837544570986e-08,
"loss": 0.6897,
"step": 4624
},
{
"epoch": 0.9900725161221268,
"grad_norm": 0.1318685880619982,
"learning_rate": 9.766622646292246e-09,
"loss": 0.6883,
"step": 4625
},
{
"epoch": 0.9902865858553423,
"grad_norm": 0.13587796907019287,
"learning_rate": 9.346635203149046e-09,
"loss": 0.6952,
"step": 4626
},
{
"epoch": 0.9905006555885579,
"grad_norm": 0.13822458130193174,
"learning_rate": 8.93587540900409e-09,
"loss": 0.7011,
"step": 4627
},
{
"epoch": 0.9907147253217735,
"grad_norm": 0.13549919290199122,
"learning_rate": 8.53434345346349e-09,
"loss": 0.6856,
"step": 4628
},
{
"epoch": 0.9909287950549892,
"grad_norm": 0.13443503225334064,
"learning_rate": 8.142039521874534e-09,
"loss": 0.7048,
"step": 4629
},
{
"epoch": 0.9911428647882048,
"grad_norm": 0.13682513693725026,
"learning_rate": 7.758963795321262e-09,
"loss": 0.7063,
"step": 4630
},
{
"epoch": 0.9913569345214204,
"grad_norm": 0.1337141407342257,
"learning_rate": 7.385116450635555e-09,
"loss": 0.6911,
"step": 4631
},
{
"epoch": 0.991571004254636,
"grad_norm": 0.13226056373114575,
"learning_rate": 7.020497660381598e-09,
"loss": 0.6892,
"step": 4632
},
{
"epoch": 0.9917850739878515,
"grad_norm": 0.1332366376295061,
"learning_rate": 6.665107592866982e-09,
"loss": 0.6951,
"step": 4633
},
{
"epoch": 0.9919991437210671,
"grad_norm": 0.13468292491388104,
"learning_rate": 6.318946412140481e-09,
"loss": 0.7092,
"step": 4634
},
{
"epoch": 0.9922132134542827,
"grad_norm": 0.1334168070312017,
"learning_rate": 5.982014277987614e-09,
"loss": 0.6828,
"step": 4635
},
{
"epoch": 0.9924272831874983,
"grad_norm": 0.1332326702101735,
"learning_rate": 5.654311345937302e-09,
"loss": 0.697,
"step": 4636
},
{
"epoch": 0.9926413529207139,
"grad_norm": 0.13201991385115935,
"learning_rate": 5.335837767255214e-09,
"loss": 0.6745,
"step": 4637
},
{
"epoch": 0.9928554226539296,
"grad_norm": 0.13479723742553412,
"learning_rate": 5.0265936889482e-09,
"loss": 0.7088,
"step": 4638
},
{
"epoch": 0.9930694923871451,
"grad_norm": 0.138370397655778,
"learning_rate": 4.726579253764296e-09,
"loss": 0.6977,
"step": 4639
},
{
"epoch": 0.9932835621203607,
"grad_norm": 0.13221492488593217,
"learning_rate": 4.435794600188281e-09,
"loss": 0.6939,
"step": 4640
},
{
"epoch": 0.9934976318535763,
"grad_norm": 0.13330176726420126,
"learning_rate": 4.154239862446119e-09,
"loss": 0.6735,
"step": 4641
},
{
"epoch": 0.9937117015867919,
"grad_norm": 0.13338245456308295,
"learning_rate": 3.881915170502737e-09,
"loss": 0.676,
"step": 4642
},
{
"epoch": 0.9939257713200075,
"grad_norm": 0.13408047808814588,
"learning_rate": 3.6188206500620273e-09,
"loss": 0.6977,
"step": 4643
},
{
"epoch": 0.9941398410532231,
"grad_norm": 0.13091899343953667,
"learning_rate": 3.3649564225690655e-09,
"loss": 0.6772,
"step": 4644
},
{
"epoch": 0.9943539107864386,
"grad_norm": 0.13246650687304787,
"learning_rate": 3.1203226052078926e-09,
"loss": 0.6993,
"step": 4645
},
{
"epoch": 0.9945679805196542,
"grad_norm": 0.13530972931274127,
"learning_rate": 2.8849193109015127e-09,
"loss": 0.7144,
"step": 4646
},
{
"epoch": 0.9947820502528699,
"grad_norm": 0.13605103683147698,
"learning_rate": 2.658746648307453e-09,
"loss": 0.7093,
"step": 4647
},
{
"epoch": 0.9949961199860855,
"grad_norm": 0.13233322258862,
"learning_rate": 2.441804721831087e-09,
"loss": 0.6689,
"step": 4648
},
{
"epoch": 0.9952101897193011,
"grad_norm": 0.13295750468869433,
"learning_rate": 2.2340936316100904e-09,
"loss": 0.6726,
"step": 4649
},
{
"epoch": 0.9954242594525167,
"grad_norm": 0.13218341043847168,
"learning_rate": 2.0356134735233234e-09,
"loss": 0.6944,
"step": 4650
},
{
"epoch": 0.9956383291857323,
"grad_norm": 0.6752122162301123,
"learning_rate": 1.8463643391908314e-09,
"loss": 0.7099,
"step": 4651
},
{
"epoch": 0.9958523989189478,
"grad_norm": 0.13282059775678576,
"learning_rate": 1.6663463159671821e-09,
"loss": 0.678,
"step": 4652
},
{
"epoch": 0.9960664686521634,
"grad_norm": 0.13069347851367272,
"learning_rate": 1.4955594869525692e-09,
"loss": 0.671,
"step": 4653
},
{
"epoch": 0.996280538385379,
"grad_norm": 0.13686805337972868,
"learning_rate": 1.3340039309750475e-09,
"loss": 0.6952,
"step": 4654
},
{
"epoch": 0.9964946081185946,
"grad_norm": 0.13554420337436102,
"learning_rate": 1.181679722614959e-09,
"loss": 0.7047,
"step": 4655
},
{
"epoch": 0.9967086778518103,
"grad_norm": 0.13092668613792285,
"learning_rate": 1.038586932182728e-09,
"loss": 0.6706,
"step": 4656
},
{
"epoch": 0.9969227475850259,
"grad_norm": 0.13492033405363069,
"learning_rate": 9.047256257277426e-10,
"loss": 0.6837,
"step": 4657
},
{
"epoch": 0.9971368173182414,
"grad_norm": 0.1359172406873756,
"learning_rate": 7.800958650405754e-10,
"loss": 0.7018,
"step": 4658
},
{
"epoch": 0.997350887051457,
"grad_norm": 0.13351807830800608,
"learning_rate": 6.646977076529837e-10,
"loss": 0.6969,
"step": 4659
},
{
"epoch": 0.9975649567846726,
"grad_norm": 0.12618875789265696,
"learning_rate": 5.585312068312476e-10,
"loss": 0.6465,
"step": 4660
},
{
"epoch": 0.9977790265178882,
"grad_norm": 0.13713215611649313,
"learning_rate": 4.6159641157839107e-10,
"loss": 0.7069,
"step": 4661
},
{
"epoch": 0.9979930962511038,
"grad_norm": 0.14591205614842845,
"learning_rate": 3.738933666430633e-10,
"loss": 0.6816,
"step": 4662
},
{
"epoch": 0.9982071659843194,
"grad_norm": 0.13525007504585135,
"learning_rate": 2.954221125084367e-10,
"loss": 0.6965,
"step": 4663
},
{
"epoch": 0.9984212357175349,
"grad_norm": 0.13344615803605883,
"learning_rate": 2.2618268539664756e-10,
"loss": 0.6604,
"step": 4664
},
{
"epoch": 0.9986353054507506,
"grad_norm": 0.13354118073820614,
"learning_rate": 1.6617511726657597e-10,
"loss": 0.6893,
"step": 4665
},
{
"epoch": 0.9988493751839662,
"grad_norm": 0.13045906658111772,
"learning_rate": 1.1539943582050683e-10,
"loss": 0.6664,
"step": 4666
},
{
"epoch": 0.9990634449171818,
"grad_norm": 0.13023994655516052,
"learning_rate": 7.385566449302773e-11,
"loss": 0.6659,
"step": 4667
},
{
"epoch": 0.9992775146503974,
"grad_norm": 0.12962065845272616,
"learning_rate": 4.154382246435162e-11,
"loss": 0.6698,
"step": 4668
},
{
"epoch": 0.999491584383613,
"grad_norm": 0.16239162310804545,
"learning_rate": 1.8463924646994202e-11,
"loss": 0.6786,
"step": 4669
},
{
"epoch": 0.9997056541168285,
"grad_norm": 0.1344152267995376,
"learning_rate": 4.615981694655603e-12,
"loss": 0.6913,
"step": 4670
},
{
"epoch": 0.9999197238500441,
"grad_norm": 0.13351560734418214,
"learning_rate": 0.0,
"loss": 0.6768,
"step": 4671
},
{
"epoch": 0.9999197238500441,
"step": 4671,
"total_flos": 9004996003627008.0,
"train_loss": 0.728125217524414,
"train_runtime": 85870.4917,
"train_samples_per_second": 24.371,
"train_steps_per_second": 0.054
}
],
"logging_steps": 1,
"max_steps": 4671,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9004996003627008.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}