Files
Artificial-llama3_1_8B_inst…/trainer_state.json
ModelHub XC dfc5039340 初始化项目,由ModelHub XC社区提供模型
Model: BAAI/Artificial-llama3_1_8B_instruct
Source: Original Platform
2026-05-18 11:38:45 +08:00

48361 lines
1.1 MiB

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 6889,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00029031789809841774,
"grad_norm": 12.98371410369873,
"learning_rate": 1.1614401858304298e-08,
"loss": 1.5049,
"step": 1
},
{
"epoch": 0.0005806357961968355,
"grad_norm": 15.011346817016602,
"learning_rate": 2.3228803716608597e-08,
"loss": 1.5849,
"step": 2
},
{
"epoch": 0.0008709536942952533,
"grad_norm": 11.487916946411133,
"learning_rate": 3.484320557491289e-08,
"loss": 1.3669,
"step": 3
},
{
"epoch": 0.001161271592393671,
"grad_norm": 10.780348777770996,
"learning_rate": 4.645760743321719e-08,
"loss": 1.4929,
"step": 4
},
{
"epoch": 0.0014515894904920889,
"grad_norm": 8.864033699035645,
"learning_rate": 5.807200929152149e-08,
"loss": 1.3715,
"step": 5
},
{
"epoch": 0.0017419073885905066,
"grad_norm": 11.861454010009766,
"learning_rate": 6.968641114982578e-08,
"loss": 1.3279,
"step": 6
},
{
"epoch": 0.0020322252866889243,
"grad_norm": 12.796159744262695,
"learning_rate": 8.130081300813009e-08,
"loss": 1.5399,
"step": 7
},
{
"epoch": 0.002322543184787342,
"grad_norm": 14.083832740783691,
"learning_rate": 9.291521486643439e-08,
"loss": 1.5013,
"step": 8
},
{
"epoch": 0.00261286108288576,
"grad_norm": 14.128660202026367,
"learning_rate": 1.045296167247387e-07,
"loss": 1.4663,
"step": 9
},
{
"epoch": 0.0029031789809841778,
"grad_norm": 13.425607681274414,
"learning_rate": 1.1614401858304298e-07,
"loss": 1.5617,
"step": 10
},
{
"epoch": 0.0031934968790825954,
"grad_norm": 13.83140754699707,
"learning_rate": 1.277584204413473e-07,
"loss": 1.5362,
"step": 11
},
{
"epoch": 0.003483814777181013,
"grad_norm": 13.65449047088623,
"learning_rate": 1.3937282229965157e-07,
"loss": 1.5805,
"step": 12
},
{
"epoch": 0.003774132675279431,
"grad_norm": 12.831888198852539,
"learning_rate": 1.509872241579559e-07,
"loss": 1.4625,
"step": 13
},
{
"epoch": 0.0040644505733778485,
"grad_norm": 11.353224754333496,
"learning_rate": 1.6260162601626018e-07,
"loss": 1.3862,
"step": 14
},
{
"epoch": 0.004354768471476266,
"grad_norm": 12.183592796325684,
"learning_rate": 1.7421602787456448e-07,
"loss": 1.4039,
"step": 15
},
{
"epoch": 0.004645086369574684,
"grad_norm": 10.883825302124023,
"learning_rate": 1.8583042973286877e-07,
"loss": 1.4896,
"step": 16
},
{
"epoch": 0.0049354042676731024,
"grad_norm": 11.417899131774902,
"learning_rate": 1.9744483159117307e-07,
"loss": 1.4953,
"step": 17
},
{
"epoch": 0.00522572216577152,
"grad_norm": 12.301745414733887,
"learning_rate": 2.090592334494774e-07,
"loss": 1.5397,
"step": 18
},
{
"epoch": 0.005516040063869938,
"grad_norm": 12.54859733581543,
"learning_rate": 2.2067363530778166e-07,
"loss": 1.4311,
"step": 19
},
{
"epoch": 0.0058063579619683555,
"grad_norm": 11.264630317687988,
"learning_rate": 2.3228803716608595e-07,
"loss": 1.4226,
"step": 20
},
{
"epoch": 0.006096675860066773,
"grad_norm": 9.510553359985352,
"learning_rate": 2.439024390243903e-07,
"loss": 1.2677,
"step": 21
},
{
"epoch": 0.006386993758165191,
"grad_norm": 10.799087524414062,
"learning_rate": 2.555168408826946e-07,
"loss": 1.2587,
"step": 22
},
{
"epoch": 0.006677311656263609,
"grad_norm": 12.985727310180664,
"learning_rate": 2.6713124274099886e-07,
"loss": 1.5219,
"step": 23
},
{
"epoch": 0.006967629554362026,
"grad_norm": 11.036988258361816,
"learning_rate": 2.7874564459930313e-07,
"loss": 1.2812,
"step": 24
},
{
"epoch": 0.007257947452460444,
"grad_norm": 12.813385009765625,
"learning_rate": 2.9036004645760745e-07,
"loss": 1.4295,
"step": 25
},
{
"epoch": 0.007548265350558862,
"grad_norm": 11.850518226623535,
"learning_rate": 3.019744483159118e-07,
"loss": 1.5312,
"step": 26
},
{
"epoch": 0.00783858324865728,
"grad_norm": 10.156636238098145,
"learning_rate": 3.1358885017421604e-07,
"loss": 1.335,
"step": 27
},
{
"epoch": 0.008128901146755697,
"grad_norm": 13.199593544006348,
"learning_rate": 3.2520325203252037e-07,
"loss": 1.4746,
"step": 28
},
{
"epoch": 0.008419219044854116,
"grad_norm": 11.168906211853027,
"learning_rate": 3.3681765389082463e-07,
"loss": 1.4153,
"step": 29
},
{
"epoch": 0.008709536942952532,
"grad_norm": 10.479500770568848,
"learning_rate": 3.4843205574912896e-07,
"loss": 1.4663,
"step": 30
},
{
"epoch": 0.008999854841050951,
"grad_norm": 9.586933135986328,
"learning_rate": 3.600464576074333e-07,
"loss": 1.3906,
"step": 31
},
{
"epoch": 0.009290172739149368,
"grad_norm": 8.640244483947754,
"learning_rate": 3.7166085946573755e-07,
"loss": 1.2824,
"step": 32
},
{
"epoch": 0.009580490637247786,
"grad_norm": 9.352594375610352,
"learning_rate": 3.832752613240418e-07,
"loss": 1.5556,
"step": 33
},
{
"epoch": 0.009870808535346205,
"grad_norm": 9.151625633239746,
"learning_rate": 3.9488966318234614e-07,
"loss": 1.4723,
"step": 34
},
{
"epoch": 0.010161126433444622,
"grad_norm": 8.76069164276123,
"learning_rate": 4.0650406504065046e-07,
"loss": 1.3395,
"step": 35
},
{
"epoch": 0.01045144433154304,
"grad_norm": 8.062403678894043,
"learning_rate": 4.181184668989548e-07,
"loss": 1.3208,
"step": 36
},
{
"epoch": 0.010741762229641457,
"grad_norm": 7.3440117835998535,
"learning_rate": 4.2973286875725905e-07,
"loss": 1.3127,
"step": 37
},
{
"epoch": 0.011032080127739876,
"grad_norm": 8.806610107421875,
"learning_rate": 4.413472706155633e-07,
"loss": 1.4392,
"step": 38
},
{
"epoch": 0.011322398025838292,
"grad_norm": 7.228657245635986,
"learning_rate": 4.5296167247386764e-07,
"loss": 1.2803,
"step": 39
},
{
"epoch": 0.011612715923936711,
"grad_norm": 6.03056001663208,
"learning_rate": 4.645760743321719e-07,
"loss": 1.4196,
"step": 40
},
{
"epoch": 0.011903033822035128,
"grad_norm": 6.1218414306640625,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.3285,
"step": 41
},
{
"epoch": 0.012193351720133546,
"grad_norm": 5.98799467086792,
"learning_rate": 4.878048780487805e-07,
"loss": 1.4567,
"step": 42
},
{
"epoch": 0.012483669618231963,
"grad_norm": 6.054631233215332,
"learning_rate": 4.994192799070848e-07,
"loss": 1.167,
"step": 43
},
{
"epoch": 0.012773987516330382,
"grad_norm": 6.573026657104492,
"learning_rate": 5.110336817653892e-07,
"loss": 1.3017,
"step": 44
},
{
"epoch": 0.0130643054144288,
"grad_norm": 6.461268424987793,
"learning_rate": 5.226480836236935e-07,
"loss": 1.3128,
"step": 45
},
{
"epoch": 0.013354623312527217,
"grad_norm": 6.853832721710205,
"learning_rate": 5.342624854819977e-07,
"loss": 1.3945,
"step": 46
},
{
"epoch": 0.013644941210625636,
"grad_norm": 6.029784202575684,
"learning_rate": 5.45876887340302e-07,
"loss": 1.2451,
"step": 47
},
{
"epoch": 0.013935259108724053,
"grad_norm": 5.88099479675293,
"learning_rate": 5.574912891986063e-07,
"loss": 1.3445,
"step": 48
},
{
"epoch": 0.014225577006822471,
"grad_norm": 6.424746513366699,
"learning_rate": 5.691056910569106e-07,
"loss": 1.3422,
"step": 49
},
{
"epoch": 0.014515894904920888,
"grad_norm": 6.097443103790283,
"learning_rate": 5.807200929152149e-07,
"loss": 1.2858,
"step": 50
},
{
"epoch": 0.014806212803019306,
"grad_norm": 5.770637035369873,
"learning_rate": 5.923344947735192e-07,
"loss": 1.3279,
"step": 51
},
{
"epoch": 0.015096530701117723,
"grad_norm": 5.4142255783081055,
"learning_rate": 6.039488966318236e-07,
"loss": 1.0764,
"step": 52
},
{
"epoch": 0.015386848599216142,
"grad_norm": 5.637355327606201,
"learning_rate": 6.155632984901278e-07,
"loss": 1.3842,
"step": 53
},
{
"epoch": 0.01567716649731456,
"grad_norm": 5.330609321594238,
"learning_rate": 6.271777003484321e-07,
"loss": 1.378,
"step": 54
},
{
"epoch": 0.015967484395412977,
"grad_norm": 5.984035015106201,
"learning_rate": 6.387921022067365e-07,
"loss": 1.3356,
"step": 55
},
{
"epoch": 0.016257802293511394,
"grad_norm": 5.351655006408691,
"learning_rate": 6.504065040650407e-07,
"loss": 1.2484,
"step": 56
},
{
"epoch": 0.016548120191609814,
"grad_norm": 5.876321315765381,
"learning_rate": 6.62020905923345e-07,
"loss": 1.3715,
"step": 57
},
{
"epoch": 0.01683843808970823,
"grad_norm": 5.923037052154541,
"learning_rate": 6.736353077816493e-07,
"loss": 1.2894,
"step": 58
},
{
"epoch": 0.017128755987806648,
"grad_norm": 6.991525650024414,
"learning_rate": 6.852497096399536e-07,
"loss": 1.2768,
"step": 59
},
{
"epoch": 0.017419073885905065,
"grad_norm": 6.0120415687561035,
"learning_rate": 6.968641114982579e-07,
"loss": 1.3504,
"step": 60
},
{
"epoch": 0.017709391784003485,
"grad_norm": 5.738192081451416,
"learning_rate": 7.084785133565622e-07,
"loss": 1.2148,
"step": 61
},
{
"epoch": 0.017999709682101902,
"grad_norm": 5.9735565185546875,
"learning_rate": 7.200929152148666e-07,
"loss": 1.2813,
"step": 62
},
{
"epoch": 0.01829002758020032,
"grad_norm": 5.3724045753479,
"learning_rate": 7.317073170731707e-07,
"loss": 1.2319,
"step": 63
},
{
"epoch": 0.018580345478298736,
"grad_norm": 5.448958873748779,
"learning_rate": 7.433217189314751e-07,
"loss": 1.3275,
"step": 64
},
{
"epoch": 0.018870663376397156,
"grad_norm": 5.127908229827881,
"learning_rate": 7.549361207897795e-07,
"loss": 1.1126,
"step": 65
},
{
"epoch": 0.019160981274495573,
"grad_norm": 5.0388078689575195,
"learning_rate": 7.665505226480836e-07,
"loss": 1.1897,
"step": 66
},
{
"epoch": 0.01945129917259399,
"grad_norm": 5.600452423095703,
"learning_rate": 7.78164924506388e-07,
"loss": 1.2433,
"step": 67
},
{
"epoch": 0.01974161707069241,
"grad_norm": 5.887912750244141,
"learning_rate": 7.897793263646923e-07,
"loss": 1.3351,
"step": 68
},
{
"epoch": 0.020031934968790827,
"grad_norm": 5.299606800079346,
"learning_rate": 8.013937282229965e-07,
"loss": 1.3575,
"step": 69
},
{
"epoch": 0.020322252866889243,
"grad_norm": 5.105284214019775,
"learning_rate": 8.130081300813009e-07,
"loss": 1.3878,
"step": 70
},
{
"epoch": 0.02061257076498766,
"grad_norm": 5.7982611656188965,
"learning_rate": 8.246225319396052e-07,
"loss": 1.3991,
"step": 71
},
{
"epoch": 0.02090288866308608,
"grad_norm": 4.94700288772583,
"learning_rate": 8.362369337979096e-07,
"loss": 1.1393,
"step": 72
},
{
"epoch": 0.021193206561184497,
"grad_norm": 5.303609848022461,
"learning_rate": 8.478513356562137e-07,
"loss": 1.2964,
"step": 73
},
{
"epoch": 0.021483524459282914,
"grad_norm": 5.495324611663818,
"learning_rate": 8.594657375145181e-07,
"loss": 1.2954,
"step": 74
},
{
"epoch": 0.02177384235738133,
"grad_norm": 5.6491618156433105,
"learning_rate": 8.710801393728225e-07,
"loss": 1.2611,
"step": 75
},
{
"epoch": 0.02206416025547975,
"grad_norm": 5.240455627441406,
"learning_rate": 8.826945412311266e-07,
"loss": 1.2958,
"step": 76
},
{
"epoch": 0.022354478153578168,
"grad_norm": 5.199842929840088,
"learning_rate": 8.94308943089431e-07,
"loss": 1.2819,
"step": 77
},
{
"epoch": 0.022644796051676585,
"grad_norm": 5.2357563972473145,
"learning_rate": 9.059233449477353e-07,
"loss": 1.4676,
"step": 78
},
{
"epoch": 0.022935113949775005,
"grad_norm": 5.108120918273926,
"learning_rate": 9.175377468060395e-07,
"loss": 1.2405,
"step": 79
},
{
"epoch": 0.023225431847873422,
"grad_norm": 5.231665134429932,
"learning_rate": 9.291521486643438e-07,
"loss": 1.4379,
"step": 80
},
{
"epoch": 0.02351574974597184,
"grad_norm": 5.028713226318359,
"learning_rate": 9.407665505226482e-07,
"loss": 1.2525,
"step": 81
},
{
"epoch": 0.023806067644070256,
"grad_norm": 4.963902473449707,
"learning_rate": 9.523809523809525e-07,
"loss": 1.3688,
"step": 82
},
{
"epoch": 0.024096385542168676,
"grad_norm": 5.339515209197998,
"learning_rate": 9.639953542392568e-07,
"loss": 1.3483,
"step": 83
},
{
"epoch": 0.024386703440267093,
"grad_norm": 4.661757946014404,
"learning_rate": 9.75609756097561e-07,
"loss": 1.2315,
"step": 84
},
{
"epoch": 0.02467702133836551,
"grad_norm": 5.013600826263428,
"learning_rate": 9.872241579558654e-07,
"loss": 1.1119,
"step": 85
},
{
"epoch": 0.024967339236463926,
"grad_norm": 5.247570514678955,
"learning_rate": 9.988385598141696e-07,
"loss": 1.2577,
"step": 86
},
{
"epoch": 0.025257657134562347,
"grad_norm": 4.793586254119873,
"learning_rate": 1.010452961672474e-06,
"loss": 1.1647,
"step": 87
},
{
"epoch": 0.025547975032660764,
"grad_norm": 5.240910530090332,
"learning_rate": 1.0220673635307784e-06,
"loss": 1.2794,
"step": 88
},
{
"epoch": 0.02583829293075918,
"grad_norm": 5.577494144439697,
"learning_rate": 1.0336817653890824e-06,
"loss": 1.2242,
"step": 89
},
{
"epoch": 0.0261286108288576,
"grad_norm": 5.272243976593018,
"learning_rate": 1.045296167247387e-06,
"loss": 1.2782,
"step": 90
},
{
"epoch": 0.026418928726956017,
"grad_norm": 4.999173164367676,
"learning_rate": 1.0569105691056912e-06,
"loss": 1.2035,
"step": 91
},
{
"epoch": 0.026709246625054434,
"grad_norm": 5.285266399383545,
"learning_rate": 1.0685249709639955e-06,
"loss": 1.2585,
"step": 92
},
{
"epoch": 0.02699956452315285,
"grad_norm": 4.789974212646484,
"learning_rate": 1.0801393728222997e-06,
"loss": 1.1728,
"step": 93
},
{
"epoch": 0.02728988242125127,
"grad_norm": 4.92954158782959,
"learning_rate": 1.091753774680604e-06,
"loss": 1.0652,
"step": 94
},
{
"epoch": 0.027580200319349688,
"grad_norm": 5.096219062805176,
"learning_rate": 1.1033681765389083e-06,
"loss": 1.1034,
"step": 95
},
{
"epoch": 0.027870518217448105,
"grad_norm": 4.46090030670166,
"learning_rate": 1.1149825783972125e-06,
"loss": 1.1724,
"step": 96
},
{
"epoch": 0.028160836115546522,
"grad_norm": 4.940242767333984,
"learning_rate": 1.126596980255517e-06,
"loss": 1.2639,
"step": 97
},
{
"epoch": 0.028451154013644942,
"grad_norm": 5.028652667999268,
"learning_rate": 1.1382113821138213e-06,
"loss": 1.2364,
"step": 98
},
{
"epoch": 0.02874147191174336,
"grad_norm": 4.718242645263672,
"learning_rate": 1.1498257839721255e-06,
"loss": 1.1937,
"step": 99
},
{
"epoch": 0.029031789809841776,
"grad_norm": 4.432032585144043,
"learning_rate": 1.1614401858304298e-06,
"loss": 1.1513,
"step": 100
},
{
"epoch": 0.029322107707940196,
"grad_norm": 4.936924934387207,
"learning_rate": 1.173054587688734e-06,
"loss": 1.2232,
"step": 101
},
{
"epoch": 0.029612425606038613,
"grad_norm": 5.688957214355469,
"learning_rate": 1.1846689895470384e-06,
"loss": 1.1863,
"step": 102
},
{
"epoch": 0.02990274350413703,
"grad_norm": 5.0107598304748535,
"learning_rate": 1.1962833914053428e-06,
"loss": 1.3997,
"step": 103
},
{
"epoch": 0.030193061402235447,
"grad_norm": 5.119560241699219,
"learning_rate": 1.207897793263647e-06,
"loss": 1.2213,
"step": 104
},
{
"epoch": 0.030483379300333867,
"grad_norm": 4.578975677490234,
"learning_rate": 1.2195121951219514e-06,
"loss": 1.146,
"step": 105
},
{
"epoch": 0.030773697198432284,
"grad_norm": 4.886281490325928,
"learning_rate": 1.2311265969802556e-06,
"loss": 1.0512,
"step": 106
},
{
"epoch": 0.0310640150965307,
"grad_norm": 5.57105827331543,
"learning_rate": 1.24274099883856e-06,
"loss": 1.1915,
"step": 107
},
{
"epoch": 0.03135433299462912,
"grad_norm": 4.908017158508301,
"learning_rate": 1.2543554006968642e-06,
"loss": 1.3416,
"step": 108
},
{
"epoch": 0.031644650892727534,
"grad_norm": 4.684658050537109,
"learning_rate": 1.2659698025551684e-06,
"loss": 1.2278,
"step": 109
},
{
"epoch": 0.031934968790825954,
"grad_norm": 4.777091026306152,
"learning_rate": 1.277584204413473e-06,
"loss": 1.2465,
"step": 110
},
{
"epoch": 0.032225286688924375,
"grad_norm": 5.166219234466553,
"learning_rate": 1.289198606271777e-06,
"loss": 1.2505,
"step": 111
},
{
"epoch": 0.03251560458702279,
"grad_norm": 4.726422309875488,
"learning_rate": 1.3008130081300815e-06,
"loss": 1.0856,
"step": 112
},
{
"epoch": 0.03280592248512121,
"grad_norm": 5.305611610412598,
"learning_rate": 1.3124274099883857e-06,
"loss": 1.1678,
"step": 113
},
{
"epoch": 0.03309624038321963,
"grad_norm": 5.176076889038086,
"learning_rate": 1.32404181184669e-06,
"loss": 1.3583,
"step": 114
},
{
"epoch": 0.03338655828131804,
"grad_norm": 5.078863143920898,
"learning_rate": 1.3356562137049945e-06,
"loss": 1.2707,
"step": 115
},
{
"epoch": 0.03367687617941646,
"grad_norm": 4.867222785949707,
"learning_rate": 1.3472706155632985e-06,
"loss": 1.233,
"step": 116
},
{
"epoch": 0.033967194077514876,
"grad_norm": 5.004298210144043,
"learning_rate": 1.3588850174216028e-06,
"loss": 1.1668,
"step": 117
},
{
"epoch": 0.034257511975613296,
"grad_norm": 5.02892541885376,
"learning_rate": 1.3704994192799073e-06,
"loss": 1.3564,
"step": 118
},
{
"epoch": 0.034547829873711716,
"grad_norm": 5.394801616668701,
"learning_rate": 1.3821138211382116e-06,
"loss": 1.2523,
"step": 119
},
{
"epoch": 0.03483814777181013,
"grad_norm": 5.628437042236328,
"learning_rate": 1.3937282229965158e-06,
"loss": 1.4381,
"step": 120
},
{
"epoch": 0.03512846566990855,
"grad_norm": 4.8691229820251465,
"learning_rate": 1.4053426248548203e-06,
"loss": 1.25,
"step": 121
},
{
"epoch": 0.03541878356800697,
"grad_norm": 5.313623428344727,
"learning_rate": 1.4169570267131244e-06,
"loss": 1.2792,
"step": 122
},
{
"epoch": 0.035709101466105383,
"grad_norm": 4.7696943283081055,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.2598,
"step": 123
},
{
"epoch": 0.035999419364203804,
"grad_norm": 5.520746231079102,
"learning_rate": 1.4401858304297331e-06,
"loss": 1.3088,
"step": 124
},
{
"epoch": 0.036289737262302224,
"grad_norm": 4.788918495178223,
"learning_rate": 1.4518002322880374e-06,
"loss": 1.1841,
"step": 125
},
{
"epoch": 0.03658005516040064,
"grad_norm": 4.718968868255615,
"learning_rate": 1.4634146341463414e-06,
"loss": 1.1215,
"step": 126
},
{
"epoch": 0.03687037305849906,
"grad_norm": 4.99876070022583,
"learning_rate": 1.475029036004646e-06,
"loss": 1.174,
"step": 127
},
{
"epoch": 0.03716069095659747,
"grad_norm": 5.314165115356445,
"learning_rate": 1.4866434378629502e-06,
"loss": 1.2494,
"step": 128
},
{
"epoch": 0.03745100885469589,
"grad_norm": 4.882414817810059,
"learning_rate": 1.4982578397212545e-06,
"loss": 1.2868,
"step": 129
},
{
"epoch": 0.03774132675279431,
"grad_norm": 4.856612682342529,
"learning_rate": 1.509872241579559e-06,
"loss": 1.1583,
"step": 130
},
{
"epoch": 0.038031644650892725,
"grad_norm": 5.177412986755371,
"learning_rate": 1.521486643437863e-06,
"loss": 1.1967,
"step": 131
},
{
"epoch": 0.038321962548991145,
"grad_norm": 5.465760231018066,
"learning_rate": 1.5331010452961673e-06,
"loss": 1.2635,
"step": 132
},
{
"epoch": 0.038612280447089566,
"grad_norm": 4.9557342529296875,
"learning_rate": 1.5447154471544717e-06,
"loss": 1.1736,
"step": 133
},
{
"epoch": 0.03890259834518798,
"grad_norm": 5.2583537101745605,
"learning_rate": 1.556329849012776e-06,
"loss": 1.1179,
"step": 134
},
{
"epoch": 0.0391929162432864,
"grad_norm": 5.05612325668335,
"learning_rate": 1.56794425087108e-06,
"loss": 1.2193,
"step": 135
},
{
"epoch": 0.03948323414138482,
"grad_norm": 5.267907619476318,
"learning_rate": 1.5795586527293845e-06,
"loss": 1.2736,
"step": 136
},
{
"epoch": 0.03977355203948323,
"grad_norm": 4.456612586975098,
"learning_rate": 1.5911730545876888e-06,
"loss": 1.1091,
"step": 137
},
{
"epoch": 0.04006386993758165,
"grad_norm": 4.886338710784912,
"learning_rate": 1.602787456445993e-06,
"loss": 1.2456,
"step": 138
},
{
"epoch": 0.040354187835680067,
"grad_norm": 4.77720308303833,
"learning_rate": 1.6144018583042976e-06,
"loss": 1.0892,
"step": 139
},
{
"epoch": 0.04064450573377849,
"grad_norm": 5.040073394775391,
"learning_rate": 1.6260162601626018e-06,
"loss": 1.1725,
"step": 140
},
{
"epoch": 0.04093482363187691,
"grad_norm": 4.47899055480957,
"learning_rate": 1.6376306620209059e-06,
"loss": 1.0944,
"step": 141
},
{
"epoch": 0.04122514152997532,
"grad_norm": 4.960933208465576,
"learning_rate": 1.6492450638792104e-06,
"loss": 1.1804,
"step": 142
},
{
"epoch": 0.04151545942807374,
"grad_norm": 4.783790111541748,
"learning_rate": 1.6608594657375146e-06,
"loss": 1.2976,
"step": 143
},
{
"epoch": 0.04180577732617216,
"grad_norm": 4.320231914520264,
"learning_rate": 1.6724738675958191e-06,
"loss": 1.2064,
"step": 144
},
{
"epoch": 0.042096095224270574,
"grad_norm": 4.767696380615234,
"learning_rate": 1.6840882694541234e-06,
"loss": 1.1187,
"step": 145
},
{
"epoch": 0.042386413122368995,
"grad_norm": 4.700660228729248,
"learning_rate": 1.6957026713124274e-06,
"loss": 1.0453,
"step": 146
},
{
"epoch": 0.042676731020467415,
"grad_norm": 4.928901195526123,
"learning_rate": 1.707317073170732e-06,
"loss": 1.1402,
"step": 147
},
{
"epoch": 0.04296704891856583,
"grad_norm": 5.0144758224487305,
"learning_rate": 1.7189314750290362e-06,
"loss": 1.2565,
"step": 148
},
{
"epoch": 0.04325736681666425,
"grad_norm": 5.311608791351318,
"learning_rate": 1.7305458768873405e-06,
"loss": 1.3904,
"step": 149
},
{
"epoch": 0.04354768471476266,
"grad_norm": 5.366107940673828,
"learning_rate": 1.742160278745645e-06,
"loss": 1.2125,
"step": 150
},
{
"epoch": 0.04383800261286108,
"grad_norm": 5.120449066162109,
"learning_rate": 1.753774680603949e-06,
"loss": 1.2111,
"step": 151
},
{
"epoch": 0.0441283205109595,
"grad_norm": 4.783287525177002,
"learning_rate": 1.7653890824622533e-06,
"loss": 1.309,
"step": 152
},
{
"epoch": 0.044418638409057916,
"grad_norm": 5.0367751121521,
"learning_rate": 1.7770034843205577e-06,
"loss": 1.1008,
"step": 153
},
{
"epoch": 0.044708956307156336,
"grad_norm": 4.646999835968018,
"learning_rate": 1.788617886178862e-06,
"loss": 1.1683,
"step": 154
},
{
"epoch": 0.044999274205254756,
"grad_norm": 4.942159175872803,
"learning_rate": 1.800232288037166e-06,
"loss": 1.1253,
"step": 155
},
{
"epoch": 0.04528959210335317,
"grad_norm": 5.135502815246582,
"learning_rate": 1.8118466898954705e-06,
"loss": 1.4273,
"step": 156
},
{
"epoch": 0.04557991000145159,
"grad_norm": 4.905440330505371,
"learning_rate": 1.8234610917537748e-06,
"loss": 1.3051,
"step": 157
},
{
"epoch": 0.04587022789955001,
"grad_norm": 4.9893798828125,
"learning_rate": 1.835075493612079e-06,
"loss": 1.308,
"step": 158
},
{
"epoch": 0.046160545797648424,
"grad_norm": 4.7659759521484375,
"learning_rate": 1.8466898954703836e-06,
"loss": 1.1285,
"step": 159
},
{
"epoch": 0.046450863695746844,
"grad_norm": 4.867801189422607,
"learning_rate": 1.8583042973286876e-06,
"loss": 1.1024,
"step": 160
},
{
"epoch": 0.04674118159384526,
"grad_norm": 5.107170104980469,
"learning_rate": 1.8699186991869919e-06,
"loss": 1.2959,
"step": 161
},
{
"epoch": 0.04703149949194368,
"grad_norm": 5.213975429534912,
"learning_rate": 1.8815331010452964e-06,
"loss": 1.2973,
"step": 162
},
{
"epoch": 0.0473218173900421,
"grad_norm": 4.60981559753418,
"learning_rate": 1.8931475029036006e-06,
"loss": 1.2259,
"step": 163
},
{
"epoch": 0.04761213528814051,
"grad_norm": 4.348560333251953,
"learning_rate": 1.904761904761905e-06,
"loss": 1.0382,
"step": 164
},
{
"epoch": 0.04790245318623893,
"grad_norm": 4.841989517211914,
"learning_rate": 1.916376306620209e-06,
"loss": 1.2645,
"step": 165
},
{
"epoch": 0.04819277108433735,
"grad_norm": 4.736576557159424,
"learning_rate": 1.9279907084785137e-06,
"loss": 1.2034,
"step": 166
},
{
"epoch": 0.048483088982435765,
"grad_norm": 5.0388383865356445,
"learning_rate": 1.9396051103368177e-06,
"loss": 1.2515,
"step": 167
},
{
"epoch": 0.048773406880534186,
"grad_norm": 4.488497257232666,
"learning_rate": 1.951219512195122e-06,
"loss": 1.1909,
"step": 168
},
{
"epoch": 0.049063724778632606,
"grad_norm": 4.383110523223877,
"learning_rate": 1.9628339140534263e-06,
"loss": 1.0721,
"step": 169
},
{
"epoch": 0.04935404267673102,
"grad_norm": 5.338650703430176,
"learning_rate": 1.9744483159117307e-06,
"loss": 1.2284,
"step": 170
},
{
"epoch": 0.04964436057482944,
"grad_norm": 4.445425033569336,
"learning_rate": 1.986062717770035e-06,
"loss": 1.1499,
"step": 171
},
{
"epoch": 0.04993467847292785,
"grad_norm": 4.984339237213135,
"learning_rate": 1.9976771196283393e-06,
"loss": 1.3605,
"step": 172
},
{
"epoch": 0.05022499637102627,
"grad_norm": 4.657524585723877,
"learning_rate": 2.0092915214866433e-06,
"loss": 1.2488,
"step": 173
},
{
"epoch": 0.05051531426912469,
"grad_norm": 4.822662353515625,
"learning_rate": 2.020905923344948e-06,
"loss": 1.1577,
"step": 174
},
{
"epoch": 0.05080563216722311,
"grad_norm": 4.718631744384766,
"learning_rate": 2.0325203252032523e-06,
"loss": 1.0862,
"step": 175
},
{
"epoch": 0.05109595006532153,
"grad_norm": 4.929813861846924,
"learning_rate": 2.0441347270615568e-06,
"loss": 1.2659,
"step": 176
},
{
"epoch": 0.05138626796341995,
"grad_norm": 5.136166572570801,
"learning_rate": 2.055749128919861e-06,
"loss": 1.2169,
"step": 177
},
{
"epoch": 0.05167658586151836,
"grad_norm": 4.956854343414307,
"learning_rate": 2.067363530778165e-06,
"loss": 1.1328,
"step": 178
},
{
"epoch": 0.05196690375961678,
"grad_norm": 4.586047649383545,
"learning_rate": 2.0789779326364694e-06,
"loss": 1.1756,
"step": 179
},
{
"epoch": 0.0522572216577152,
"grad_norm": 4.752535820007324,
"learning_rate": 2.090592334494774e-06,
"loss": 1.3709,
"step": 180
},
{
"epoch": 0.052547539555813615,
"grad_norm": 5.013321876525879,
"learning_rate": 2.102206736353078e-06,
"loss": 1.1806,
"step": 181
},
{
"epoch": 0.052837857453912035,
"grad_norm": 4.766448020935059,
"learning_rate": 2.1138211382113824e-06,
"loss": 0.9959,
"step": 182
},
{
"epoch": 0.05312817535201045,
"grad_norm": 4.972908020019531,
"learning_rate": 2.1254355400696864e-06,
"loss": 1.2942,
"step": 183
},
{
"epoch": 0.05341849325010887,
"grad_norm": 4.858799934387207,
"learning_rate": 2.137049941927991e-06,
"loss": 1.1823,
"step": 184
},
{
"epoch": 0.05370881114820729,
"grad_norm": 4.911069393157959,
"learning_rate": 2.1486643437862954e-06,
"loss": 1.2269,
"step": 185
},
{
"epoch": 0.0539991290463057,
"grad_norm": 4.7894368171691895,
"learning_rate": 2.1602787456445995e-06,
"loss": 1.2492,
"step": 186
},
{
"epoch": 0.05428944694440412,
"grad_norm": 4.717777729034424,
"learning_rate": 2.1718931475029035e-06,
"loss": 1.2164,
"step": 187
},
{
"epoch": 0.05457976484250254,
"grad_norm": 4.9674763679504395,
"learning_rate": 2.183507549361208e-06,
"loss": 1.2069,
"step": 188
},
{
"epoch": 0.054870082740600956,
"grad_norm": 5.091649532318115,
"learning_rate": 2.1951219512195125e-06,
"loss": 1.1534,
"step": 189
},
{
"epoch": 0.055160400638699376,
"grad_norm": 4.965774059295654,
"learning_rate": 2.2067363530778165e-06,
"loss": 1.3424,
"step": 190
},
{
"epoch": 0.0554507185367978,
"grad_norm": 5.256765842437744,
"learning_rate": 2.218350754936121e-06,
"loss": 1.2947,
"step": 191
},
{
"epoch": 0.05574103643489621,
"grad_norm": 5.316900730133057,
"learning_rate": 2.229965156794425e-06,
"loss": 1.3133,
"step": 192
},
{
"epoch": 0.05603135433299463,
"grad_norm": 5.373122692108154,
"learning_rate": 2.2415795586527295e-06,
"loss": 1.0931,
"step": 193
},
{
"epoch": 0.056321672231093044,
"grad_norm": 5.171296119689941,
"learning_rate": 2.253193960511034e-06,
"loss": 1.2404,
"step": 194
},
{
"epoch": 0.056611990129191464,
"grad_norm": 5.496878147125244,
"learning_rate": 2.264808362369338e-06,
"loss": 1.1381,
"step": 195
},
{
"epoch": 0.056902308027289884,
"grad_norm": 5.244287014007568,
"learning_rate": 2.2764227642276426e-06,
"loss": 1.3365,
"step": 196
},
{
"epoch": 0.0571926259253883,
"grad_norm": 5.100976943969727,
"learning_rate": 2.288037166085947e-06,
"loss": 1.2419,
"step": 197
},
{
"epoch": 0.05748294382348672,
"grad_norm": 5.562692642211914,
"learning_rate": 2.299651567944251e-06,
"loss": 1.3099,
"step": 198
},
{
"epoch": 0.05777326172158514,
"grad_norm": 5.311895370483398,
"learning_rate": 2.311265969802555e-06,
"loss": 1.1572,
"step": 199
},
{
"epoch": 0.05806357961968355,
"grad_norm": 5.604903221130371,
"learning_rate": 2.3228803716608596e-06,
"loss": 1.3625,
"step": 200
},
{
"epoch": 0.05835389751778197,
"grad_norm": 5.023021697998047,
"learning_rate": 2.334494773519164e-06,
"loss": 1.2232,
"step": 201
},
{
"epoch": 0.05864421541588039,
"grad_norm": 5.409064769744873,
"learning_rate": 2.346109175377468e-06,
"loss": 1.2896,
"step": 202
},
{
"epoch": 0.058934533313978806,
"grad_norm": 4.986676216125488,
"learning_rate": 2.3577235772357727e-06,
"loss": 1.1278,
"step": 203
},
{
"epoch": 0.059224851212077226,
"grad_norm": 4.7012128829956055,
"learning_rate": 2.3693379790940767e-06,
"loss": 1.2292,
"step": 204
},
{
"epoch": 0.05951516911017564,
"grad_norm": 4.756272315979004,
"learning_rate": 2.380952380952381e-06,
"loss": 1.1426,
"step": 205
},
{
"epoch": 0.05980548700827406,
"grad_norm": 4.644824504852295,
"learning_rate": 2.3925667828106857e-06,
"loss": 1.1133,
"step": 206
},
{
"epoch": 0.06009580490637248,
"grad_norm": 4.655545234680176,
"learning_rate": 2.4041811846689897e-06,
"loss": 1.1316,
"step": 207
},
{
"epoch": 0.06038612280447089,
"grad_norm": 5.067546844482422,
"learning_rate": 2.415795586527294e-06,
"loss": 1.1613,
"step": 208
},
{
"epoch": 0.06067644070256931,
"grad_norm": 5.918067455291748,
"learning_rate": 2.4274099883855983e-06,
"loss": 1.3104,
"step": 209
},
{
"epoch": 0.060966758600667734,
"grad_norm": 4.958433151245117,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.1905,
"step": 210
},
{
"epoch": 0.06125707649876615,
"grad_norm": 4.635531902313232,
"learning_rate": 2.4506387921022072e-06,
"loss": 1.1553,
"step": 211
},
{
"epoch": 0.06154739439686457,
"grad_norm": 4.515402793884277,
"learning_rate": 2.4622531939605113e-06,
"loss": 1.1648,
"step": 212
},
{
"epoch": 0.06183771229496299,
"grad_norm": 4.840621471405029,
"learning_rate": 2.4738675958188153e-06,
"loss": 1.3244,
"step": 213
},
{
"epoch": 0.0621280301930614,
"grad_norm": 4.515079498291016,
"learning_rate": 2.48548199767712e-06,
"loss": 1.0585,
"step": 214
},
{
"epoch": 0.06241834809115982,
"grad_norm": 4.8643693923950195,
"learning_rate": 2.4970963995354243e-06,
"loss": 1.2397,
"step": 215
},
{
"epoch": 0.06270866598925824,
"grad_norm": 5.038429260253906,
"learning_rate": 2.5087108013937284e-06,
"loss": 1.1628,
"step": 216
},
{
"epoch": 0.06299898388735665,
"grad_norm": 5.393674373626709,
"learning_rate": 2.5203252032520324e-06,
"loss": 1.1503,
"step": 217
},
{
"epoch": 0.06328930178545507,
"grad_norm": 4.6619038581848145,
"learning_rate": 2.531939605110337e-06,
"loss": 1.2291,
"step": 218
},
{
"epoch": 0.0635796196835535,
"grad_norm": 4.9958648681640625,
"learning_rate": 2.5435540069686414e-06,
"loss": 1.1938,
"step": 219
},
{
"epoch": 0.06386993758165191,
"grad_norm": 4.516469955444336,
"learning_rate": 2.555168408826946e-06,
"loss": 1.0742,
"step": 220
},
{
"epoch": 0.06416025547975032,
"grad_norm": 4.328372001647949,
"learning_rate": 2.56678281068525e-06,
"loss": 1.2531,
"step": 221
},
{
"epoch": 0.06445057337784875,
"grad_norm": 4.436943054199219,
"learning_rate": 2.578397212543554e-06,
"loss": 1.2107,
"step": 222
},
{
"epoch": 0.06474089127594716,
"grad_norm": 4.877750396728516,
"learning_rate": 2.5900116144018584e-06,
"loss": 1.4222,
"step": 223
},
{
"epoch": 0.06503120917404558,
"grad_norm": 5.479437828063965,
"learning_rate": 2.601626016260163e-06,
"loss": 1.1668,
"step": 224
},
{
"epoch": 0.065321527072144,
"grad_norm": 4.4991583824157715,
"learning_rate": 2.6132404181184674e-06,
"loss": 0.9982,
"step": 225
},
{
"epoch": 0.06561184497024242,
"grad_norm": 4.993007183074951,
"learning_rate": 2.6248548199767715e-06,
"loss": 1.1666,
"step": 226
},
{
"epoch": 0.06590216286834083,
"grad_norm": 4.814315319061279,
"learning_rate": 2.6364692218350755e-06,
"loss": 1.2113,
"step": 227
},
{
"epoch": 0.06619248076643926,
"grad_norm": 4.64751672744751,
"learning_rate": 2.64808362369338e-06,
"loss": 1.1168,
"step": 228
},
{
"epoch": 0.06648279866453767,
"grad_norm": 4.427606582641602,
"learning_rate": 2.659698025551684e-06,
"loss": 1.05,
"step": 229
},
{
"epoch": 0.06677311656263608,
"grad_norm": 5.613397121429443,
"learning_rate": 2.671312427409989e-06,
"loss": 1.1802,
"step": 230
},
{
"epoch": 0.06706343446073451,
"grad_norm": 5.010979652404785,
"learning_rate": 2.682926829268293e-06,
"loss": 1.376,
"step": 231
},
{
"epoch": 0.06735375235883292,
"grad_norm": 4.853494644165039,
"learning_rate": 2.694541231126597e-06,
"loss": 1.006,
"step": 232
},
{
"epoch": 0.06764407025693134,
"grad_norm": 4.468390464782715,
"learning_rate": 2.7061556329849016e-06,
"loss": 1.2792,
"step": 233
},
{
"epoch": 0.06793438815502975,
"grad_norm": 4.853550910949707,
"learning_rate": 2.7177700348432056e-06,
"loss": 1.2201,
"step": 234
},
{
"epoch": 0.06822470605312818,
"grad_norm": 4.637911319732666,
"learning_rate": 2.7293844367015097e-06,
"loss": 1.1786,
"step": 235
},
{
"epoch": 0.06851502395122659,
"grad_norm": 4.544745922088623,
"learning_rate": 2.7409988385598146e-06,
"loss": 1.1948,
"step": 236
},
{
"epoch": 0.068805341849325,
"grad_norm": 4.622826099395752,
"learning_rate": 2.7526132404181186e-06,
"loss": 1.0758,
"step": 237
},
{
"epoch": 0.06909565974742343,
"grad_norm": 4.711224555969238,
"learning_rate": 2.764227642276423e-06,
"loss": 1.0424,
"step": 238
},
{
"epoch": 0.06938597764552185,
"grad_norm": 4.914583206176758,
"learning_rate": 2.775842044134727e-06,
"loss": 1.2045,
"step": 239
},
{
"epoch": 0.06967629554362026,
"grad_norm": 4.866950511932373,
"learning_rate": 2.7874564459930316e-06,
"loss": 1.1709,
"step": 240
},
{
"epoch": 0.06996661344171869,
"grad_norm": 4.3304123878479,
"learning_rate": 2.7990708478513357e-06,
"loss": 1.1456,
"step": 241
},
{
"epoch": 0.0702569313398171,
"grad_norm": 5.298426628112793,
"learning_rate": 2.8106852497096406e-06,
"loss": 1.3114,
"step": 242
},
{
"epoch": 0.07054724923791551,
"grad_norm": 4.610419750213623,
"learning_rate": 2.8222996515679447e-06,
"loss": 1.0748,
"step": 243
},
{
"epoch": 0.07083756713601394,
"grad_norm": 5.127123832702637,
"learning_rate": 2.8339140534262487e-06,
"loss": 1.1137,
"step": 244
},
{
"epoch": 0.07112788503411235,
"grad_norm": 4.717776775360107,
"learning_rate": 2.845528455284553e-06,
"loss": 1.286,
"step": 245
},
{
"epoch": 0.07141820293221077,
"grad_norm": 4.651844024658203,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.1565,
"step": 246
},
{
"epoch": 0.0717085208303092,
"grad_norm": 4.493513584136963,
"learning_rate": 2.8687572590011613e-06,
"loss": 1.1787,
"step": 247
},
{
"epoch": 0.07199883872840761,
"grad_norm": 4.902074813842773,
"learning_rate": 2.8803716608594662e-06,
"loss": 1.232,
"step": 248
},
{
"epoch": 0.07228915662650602,
"grad_norm": 4.760148048400879,
"learning_rate": 2.8919860627177703e-06,
"loss": 1.1347,
"step": 249
},
{
"epoch": 0.07257947452460445,
"grad_norm": 5.101321697235107,
"learning_rate": 2.9036004645760748e-06,
"loss": 1.067,
"step": 250
},
{
"epoch": 0.07286979242270286,
"grad_norm": 5.148083209991455,
"learning_rate": 2.915214866434379e-06,
"loss": 1.3102,
"step": 251
},
{
"epoch": 0.07316011032080127,
"grad_norm": 4.3725152015686035,
"learning_rate": 2.926829268292683e-06,
"loss": 1.0622,
"step": 252
},
{
"epoch": 0.0734504282188997,
"grad_norm": 5.068384170532227,
"learning_rate": 2.9384436701509873e-06,
"loss": 1.1661,
"step": 253
},
{
"epoch": 0.07374074611699812,
"grad_norm": 4.736722946166992,
"learning_rate": 2.950058072009292e-06,
"loss": 1.2684,
"step": 254
},
{
"epoch": 0.07403106401509653,
"grad_norm": 4.653499603271484,
"learning_rate": 2.9616724738675963e-06,
"loss": 1.1995,
"step": 255
},
{
"epoch": 0.07432138191319494,
"grad_norm": 4.878271102905273,
"learning_rate": 2.9732868757259004e-06,
"loss": 1.1359,
"step": 256
},
{
"epoch": 0.07461169981129337,
"grad_norm": 4.5596923828125,
"learning_rate": 2.9849012775842044e-06,
"loss": 1.2118,
"step": 257
},
{
"epoch": 0.07490201770939178,
"grad_norm": 4.714583873748779,
"learning_rate": 2.996515679442509e-06,
"loss": 1.1838,
"step": 258
},
{
"epoch": 0.0751923356074902,
"grad_norm": 4.619505405426025,
"learning_rate": 3.0081300813008134e-06,
"loss": 1.005,
"step": 259
},
{
"epoch": 0.07548265350558862,
"grad_norm": 4.827937602996826,
"learning_rate": 3.019744483159118e-06,
"loss": 1.2427,
"step": 260
},
{
"epoch": 0.07577297140368704,
"grad_norm": 4.799366474151611,
"learning_rate": 3.031358885017422e-06,
"loss": 1.2602,
"step": 261
},
{
"epoch": 0.07606328930178545,
"grad_norm": 4.541555404663086,
"learning_rate": 3.042973286875726e-06,
"loss": 1.2551,
"step": 262
},
{
"epoch": 0.07635360719988388,
"grad_norm": 4.521805286407471,
"learning_rate": 3.0545876887340305e-06,
"loss": 1.1664,
"step": 263
},
{
"epoch": 0.07664392509798229,
"grad_norm": 4.505204677581787,
"learning_rate": 3.0662020905923345e-06,
"loss": 1.119,
"step": 264
},
{
"epoch": 0.0769342429960807,
"grad_norm": 4.231343746185303,
"learning_rate": 3.0778164924506394e-06,
"loss": 1.2252,
"step": 265
},
{
"epoch": 0.07722456089417913,
"grad_norm": 4.726437568664551,
"learning_rate": 3.0894308943089435e-06,
"loss": 1.3634,
"step": 266
},
{
"epoch": 0.07751487879227754,
"grad_norm": 4.76708984375,
"learning_rate": 3.1010452961672475e-06,
"loss": 1.3045,
"step": 267
},
{
"epoch": 0.07780519669037596,
"grad_norm": 4.557008743286133,
"learning_rate": 3.112659698025552e-06,
"loss": 1.1581,
"step": 268
},
{
"epoch": 0.07809551458847439,
"grad_norm": 5.199429512023926,
"learning_rate": 3.124274099883856e-06,
"loss": 1.2774,
"step": 269
},
{
"epoch": 0.0783858324865728,
"grad_norm": 5.509277820587158,
"learning_rate": 3.13588850174216e-06,
"loss": 1.3066,
"step": 270
},
{
"epoch": 0.07867615038467121,
"grad_norm": 4.698461055755615,
"learning_rate": 3.147502903600465e-06,
"loss": 1.1477,
"step": 271
},
{
"epoch": 0.07896646828276964,
"grad_norm": 4.983335494995117,
"learning_rate": 3.159117305458769e-06,
"loss": 1.0314,
"step": 272
},
{
"epoch": 0.07925678618086805,
"grad_norm": 4.76466703414917,
"learning_rate": 3.1707317073170736e-06,
"loss": 1.2794,
"step": 273
},
{
"epoch": 0.07954710407896647,
"grad_norm": 4.861992359161377,
"learning_rate": 3.1823461091753776e-06,
"loss": 1.2291,
"step": 274
},
{
"epoch": 0.07983742197706489,
"grad_norm": 5.327348709106445,
"learning_rate": 3.1939605110336817e-06,
"loss": 1.1653,
"step": 275
},
{
"epoch": 0.0801277398751633,
"grad_norm": 4.695688247680664,
"learning_rate": 3.205574912891986e-06,
"loss": 1.2847,
"step": 276
},
{
"epoch": 0.08041805777326172,
"grad_norm": 4.913061141967773,
"learning_rate": 3.2171893147502906e-06,
"loss": 1.1864,
"step": 277
},
{
"epoch": 0.08070837567136013,
"grad_norm": 4.667782306671143,
"learning_rate": 3.228803716608595e-06,
"loss": 1.1751,
"step": 278
},
{
"epoch": 0.08099869356945856,
"grad_norm": 4.723694324493408,
"learning_rate": 3.240418118466899e-06,
"loss": 1.3455,
"step": 279
},
{
"epoch": 0.08128901146755697,
"grad_norm": 5.621630668640137,
"learning_rate": 3.2520325203252037e-06,
"loss": 1.156,
"step": 280
},
{
"epoch": 0.08157932936565539,
"grad_norm": 4.824314117431641,
"learning_rate": 3.2636469221835077e-06,
"loss": 1.2029,
"step": 281
},
{
"epoch": 0.08186964726375381,
"grad_norm": 4.6834025382995605,
"learning_rate": 3.2752613240418118e-06,
"loss": 1.1729,
"step": 282
},
{
"epoch": 0.08215996516185223,
"grad_norm": 4.411752223968506,
"learning_rate": 3.2868757259001167e-06,
"loss": 1.099,
"step": 283
},
{
"epoch": 0.08245028305995064,
"grad_norm": 4.955481052398682,
"learning_rate": 3.2984901277584207e-06,
"loss": 1.3098,
"step": 284
},
{
"epoch": 0.08274060095804907,
"grad_norm": 4.61010217666626,
"learning_rate": 3.310104529616725e-06,
"loss": 0.9964,
"step": 285
},
{
"epoch": 0.08303091885614748,
"grad_norm": 4.8403000831604,
"learning_rate": 3.3217189314750293e-06,
"loss": 1.2399,
"step": 286
},
{
"epoch": 0.0833212367542459,
"grad_norm": 4.739892482757568,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.2173,
"step": 287
},
{
"epoch": 0.08361155465234432,
"grad_norm": 4.817641735076904,
"learning_rate": 3.3449477351916382e-06,
"loss": 1.1471,
"step": 288
},
{
"epoch": 0.08390187255044274,
"grad_norm": 4.951786518096924,
"learning_rate": 3.3565621370499423e-06,
"loss": 1.3252,
"step": 289
},
{
"epoch": 0.08419219044854115,
"grad_norm": 4.856020927429199,
"learning_rate": 3.3681765389082468e-06,
"loss": 1.2437,
"step": 290
},
{
"epoch": 0.08448250834663958,
"grad_norm": 4.223579406738281,
"learning_rate": 3.379790940766551e-06,
"loss": 1.0679,
"step": 291
},
{
"epoch": 0.08477282624473799,
"grad_norm": 4.6746344566345215,
"learning_rate": 3.391405342624855e-06,
"loss": 1.2439,
"step": 292
},
{
"epoch": 0.0850631441428364,
"grad_norm": 4.416624069213867,
"learning_rate": 3.4030197444831594e-06,
"loss": 1.1559,
"step": 293
},
{
"epoch": 0.08535346204093483,
"grad_norm": 4.347145080566406,
"learning_rate": 3.414634146341464e-06,
"loss": 1.2274,
"step": 294
},
{
"epoch": 0.08564377993903324,
"grad_norm": 4.638583660125732,
"learning_rate": 3.4262485481997683e-06,
"loss": 1.1497,
"step": 295
},
{
"epoch": 0.08593409783713166,
"grad_norm": 4.834431171417236,
"learning_rate": 3.4378629500580724e-06,
"loss": 1.2264,
"step": 296
},
{
"epoch": 0.08622441573523008,
"grad_norm": 4.830117225646973,
"learning_rate": 3.4494773519163764e-06,
"loss": 1.1305,
"step": 297
},
{
"epoch": 0.0865147336333285,
"grad_norm": 4.986152172088623,
"learning_rate": 3.461091753774681e-06,
"loss": 1.258,
"step": 298
},
{
"epoch": 0.08680505153142691,
"grad_norm": 4.623694896697998,
"learning_rate": 3.472706155632985e-06,
"loss": 1.1584,
"step": 299
},
{
"epoch": 0.08709536942952532,
"grad_norm": 4.773608207702637,
"learning_rate": 3.48432055749129e-06,
"loss": 1.2913,
"step": 300
},
{
"epoch": 0.08738568732762375,
"grad_norm": 4.353751182556152,
"learning_rate": 3.495934959349594e-06,
"loss": 1.2131,
"step": 301
},
{
"epoch": 0.08767600522572216,
"grad_norm": 4.784504413604736,
"learning_rate": 3.507549361207898e-06,
"loss": 1.145,
"step": 302
},
{
"epoch": 0.08796632312382058,
"grad_norm": 4.825213432312012,
"learning_rate": 3.5191637630662025e-06,
"loss": 1.154,
"step": 303
},
{
"epoch": 0.088256641021919,
"grad_norm": 5.358443737030029,
"learning_rate": 3.5307781649245065e-06,
"loss": 1.2215,
"step": 304
},
{
"epoch": 0.08854695892001742,
"grad_norm": 4.255599021911621,
"learning_rate": 3.5423925667828106e-06,
"loss": 1.1419,
"step": 305
},
{
"epoch": 0.08883727681811583,
"grad_norm": 4.947575092315674,
"learning_rate": 3.5540069686411155e-06,
"loss": 1.2882,
"step": 306
},
{
"epoch": 0.08912759471621426,
"grad_norm": 5.248209476470947,
"learning_rate": 3.5656213704994195e-06,
"loss": 1.1513,
"step": 307
},
{
"epoch": 0.08941791261431267,
"grad_norm": 4.874551773071289,
"learning_rate": 3.577235772357724e-06,
"loss": 1.3386,
"step": 308
},
{
"epoch": 0.08970823051241109,
"grad_norm": 4.576282978057861,
"learning_rate": 3.588850174216028e-06,
"loss": 1.1586,
"step": 309
},
{
"epoch": 0.08999854841050951,
"grad_norm": 4.958520889282227,
"learning_rate": 3.600464576074332e-06,
"loss": 1.3224,
"step": 310
},
{
"epoch": 0.09028886630860793,
"grad_norm": 4.927209854125977,
"learning_rate": 3.6120789779326366e-06,
"loss": 1.2893,
"step": 311
},
{
"epoch": 0.09057918420670634,
"grad_norm": 4.564126968383789,
"learning_rate": 3.623693379790941e-06,
"loss": 1.1545,
"step": 312
},
{
"epoch": 0.09086950210480477,
"grad_norm": 4.723407745361328,
"learning_rate": 3.6353077816492456e-06,
"loss": 1.2735,
"step": 313
},
{
"epoch": 0.09115982000290318,
"grad_norm": 4.90524435043335,
"learning_rate": 3.6469221835075496e-06,
"loss": 1.2508,
"step": 314
},
{
"epoch": 0.0914501379010016,
"grad_norm": 4.609728813171387,
"learning_rate": 3.6585365853658537e-06,
"loss": 1.055,
"step": 315
},
{
"epoch": 0.09174045579910002,
"grad_norm": 4.467485427856445,
"learning_rate": 3.670150987224158e-06,
"loss": 1.118,
"step": 316
},
{
"epoch": 0.09203077369719843,
"grad_norm": 4.879512310028076,
"learning_rate": 3.6817653890824622e-06,
"loss": 1.1635,
"step": 317
},
{
"epoch": 0.09232109159529685,
"grad_norm": 4.552756309509277,
"learning_rate": 3.693379790940767e-06,
"loss": 1.2854,
"step": 318
},
{
"epoch": 0.09261140949339527,
"grad_norm": 4.794209003448486,
"learning_rate": 3.704994192799071e-06,
"loss": 1.2284,
"step": 319
},
{
"epoch": 0.09290172739149369,
"grad_norm": 4.714296340942383,
"learning_rate": 3.7166085946573752e-06,
"loss": 1.1289,
"step": 320
},
{
"epoch": 0.0931920452895921,
"grad_norm": 4.3302106857299805,
"learning_rate": 3.7282229965156797e-06,
"loss": 1.2047,
"step": 321
},
{
"epoch": 0.09348236318769051,
"grad_norm": 4.78410005569458,
"learning_rate": 3.7398373983739838e-06,
"loss": 1.2851,
"step": 322
},
{
"epoch": 0.09377268108578894,
"grad_norm": 4.550713539123535,
"learning_rate": 3.7514518002322887e-06,
"loss": 1.1176,
"step": 323
},
{
"epoch": 0.09406299898388736,
"grad_norm": 4.67529821395874,
"learning_rate": 3.7630662020905927e-06,
"loss": 1.1582,
"step": 324
},
{
"epoch": 0.09435331688198577,
"grad_norm": 5.0789361000061035,
"learning_rate": 3.7746806039488972e-06,
"loss": 1.1994,
"step": 325
},
{
"epoch": 0.0946436347800842,
"grad_norm": 4.371364593505859,
"learning_rate": 3.7862950058072013e-06,
"loss": 1.185,
"step": 326
},
{
"epoch": 0.09493395267818261,
"grad_norm": 4.266092300415039,
"learning_rate": 3.7979094076655053e-06,
"loss": 1.0634,
"step": 327
},
{
"epoch": 0.09522427057628102,
"grad_norm": 4.3022141456604,
"learning_rate": 3.80952380952381e-06,
"loss": 1.0949,
"step": 328
},
{
"epoch": 0.09551458847437945,
"grad_norm": 4.752735137939453,
"learning_rate": 3.821138211382115e-06,
"loss": 1.1035,
"step": 329
},
{
"epoch": 0.09580490637247786,
"grad_norm": 4.965222358703613,
"learning_rate": 3.832752613240418e-06,
"loss": 1.1323,
"step": 330
},
{
"epoch": 0.09609522427057628,
"grad_norm": 5.181162357330322,
"learning_rate": 3.844367015098723e-06,
"loss": 1.0707,
"step": 331
},
{
"epoch": 0.0963855421686747,
"grad_norm": 5.318249225616455,
"learning_rate": 3.855981416957027e-06,
"loss": 1.3433,
"step": 332
},
{
"epoch": 0.09667586006677312,
"grad_norm": 4.770782470703125,
"learning_rate": 3.867595818815331e-06,
"loss": 1.2307,
"step": 333
},
{
"epoch": 0.09696617796487153,
"grad_norm": 4.776768207550049,
"learning_rate": 3.8792102206736354e-06,
"loss": 1.2659,
"step": 334
},
{
"epoch": 0.09725649586296996,
"grad_norm": 4.787647724151611,
"learning_rate": 3.89082462253194e-06,
"loss": 1.149,
"step": 335
},
{
"epoch": 0.09754681376106837,
"grad_norm": 4.631190299987793,
"learning_rate": 3.902439024390244e-06,
"loss": 1.0426,
"step": 336
},
{
"epoch": 0.09783713165916678,
"grad_norm": 4.632266044616699,
"learning_rate": 3.914053426248549e-06,
"loss": 1.2512,
"step": 337
},
{
"epoch": 0.09812744955726521,
"grad_norm": 4.575108528137207,
"learning_rate": 3.9256678281068525e-06,
"loss": 1.1754,
"step": 338
},
{
"epoch": 0.09841776745536363,
"grad_norm": 4.5373454093933105,
"learning_rate": 3.937282229965157e-06,
"loss": 1.0264,
"step": 339
},
{
"epoch": 0.09870808535346204,
"grad_norm": 4.490976333618164,
"learning_rate": 3.9488966318234615e-06,
"loss": 1.1909,
"step": 340
},
{
"epoch": 0.09899840325156047,
"grad_norm": 4.690683841705322,
"learning_rate": 3.960511033681766e-06,
"loss": 1.2999,
"step": 341
},
{
"epoch": 0.09928872114965888,
"grad_norm": 5.3299479484558105,
"learning_rate": 3.97212543554007e-06,
"loss": 1.3982,
"step": 342
},
{
"epoch": 0.09957903904775729,
"grad_norm": 4.69218635559082,
"learning_rate": 3.983739837398374e-06,
"loss": 1.2105,
"step": 343
},
{
"epoch": 0.0998693569458557,
"grad_norm": 4.691149711608887,
"learning_rate": 3.9953542392566785e-06,
"loss": 1.1759,
"step": 344
},
{
"epoch": 0.10015967484395413,
"grad_norm": 4.793273448944092,
"learning_rate": 4.006968641114983e-06,
"loss": 1.3496,
"step": 345
},
{
"epoch": 0.10044999274205255,
"grad_norm": 4.364034652709961,
"learning_rate": 4.018583042973287e-06,
"loss": 1.1204,
"step": 346
},
{
"epoch": 0.10074031064015096,
"grad_norm": 4.571069240570068,
"learning_rate": 4.030197444831592e-06,
"loss": 1.0896,
"step": 347
},
{
"epoch": 0.10103062853824939,
"grad_norm": 4.80451774597168,
"learning_rate": 4.041811846689896e-06,
"loss": 1.1484,
"step": 348
},
{
"epoch": 0.1013209464363478,
"grad_norm": 5.162931442260742,
"learning_rate": 4.0534262485482e-06,
"loss": 1.2662,
"step": 349
},
{
"epoch": 0.10161126433444621,
"grad_norm": 4.779268264770508,
"learning_rate": 4.0650406504065046e-06,
"loss": 1.1183,
"step": 350
},
{
"epoch": 0.10190158223254464,
"grad_norm": 4.979952812194824,
"learning_rate": 4.076655052264808e-06,
"loss": 1.4163,
"step": 351
},
{
"epoch": 0.10219190013064305,
"grad_norm": 4.158762454986572,
"learning_rate": 4.0882694541231135e-06,
"loss": 1.0298,
"step": 352
},
{
"epoch": 0.10248221802874147,
"grad_norm": 4.852020740509033,
"learning_rate": 4.099883855981417e-06,
"loss": 1.2704,
"step": 353
},
{
"epoch": 0.1027725359268399,
"grad_norm": 5.023031234741211,
"learning_rate": 4.111498257839722e-06,
"loss": 1.242,
"step": 354
},
{
"epoch": 0.10306285382493831,
"grad_norm": 5.079054355621338,
"learning_rate": 4.123112659698026e-06,
"loss": 1.1842,
"step": 355
},
{
"epoch": 0.10335317172303672,
"grad_norm": 4.983884811401367,
"learning_rate": 4.13472706155633e-06,
"loss": 1.2416,
"step": 356
},
{
"epoch": 0.10364348962113515,
"grad_norm": 4.6025543212890625,
"learning_rate": 4.146341463414634e-06,
"loss": 1.1068,
"step": 357
},
{
"epoch": 0.10393380751923356,
"grad_norm": 5.108760833740234,
"learning_rate": 4.157955865272939e-06,
"loss": 1.2235,
"step": 358
},
{
"epoch": 0.10422412541733198,
"grad_norm": 4.9223480224609375,
"learning_rate": 4.169570267131243e-06,
"loss": 1.2334,
"step": 359
},
{
"epoch": 0.1045144433154304,
"grad_norm": 4.93304443359375,
"learning_rate": 4.181184668989548e-06,
"loss": 1.2043,
"step": 360
},
{
"epoch": 0.10480476121352882,
"grad_norm": 4.894895553588867,
"learning_rate": 4.192799070847851e-06,
"loss": 1.2835,
"step": 361
},
{
"epoch": 0.10509507911162723,
"grad_norm": 4.598118782043457,
"learning_rate": 4.204413472706156e-06,
"loss": 1.1895,
"step": 362
},
{
"epoch": 0.10538539700972566,
"grad_norm": 4.7202839851379395,
"learning_rate": 4.21602787456446e-06,
"loss": 1.2704,
"step": 363
},
{
"epoch": 0.10567571490782407,
"grad_norm": 4.768918991088867,
"learning_rate": 4.227642276422765e-06,
"loss": 1.2544,
"step": 364
},
{
"epoch": 0.10596603280592248,
"grad_norm": 4.733092784881592,
"learning_rate": 4.239256678281069e-06,
"loss": 1.2009,
"step": 365
},
{
"epoch": 0.1062563507040209,
"grad_norm": 4.309986591339111,
"learning_rate": 4.250871080139373e-06,
"loss": 1.3049,
"step": 366
},
{
"epoch": 0.10654666860211932,
"grad_norm": 4.730205535888672,
"learning_rate": 4.262485481997677e-06,
"loss": 1.2298,
"step": 367
},
{
"epoch": 0.10683698650021774,
"grad_norm": 4.841794013977051,
"learning_rate": 4.274099883855982e-06,
"loss": 1.2284,
"step": 368
},
{
"epoch": 0.10712730439831615,
"grad_norm": 4.516952037811279,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.1675,
"step": 369
},
{
"epoch": 0.10741762229641458,
"grad_norm": 4.625637054443359,
"learning_rate": 4.297328687572591e-06,
"loss": 1.055,
"step": 370
},
{
"epoch": 0.10770794019451299,
"grad_norm": 4.419715881347656,
"learning_rate": 4.308943089430894e-06,
"loss": 1.2107,
"step": 371
},
{
"epoch": 0.1079982580926114,
"grad_norm": 4.896949291229248,
"learning_rate": 4.320557491289199e-06,
"loss": 1.3021,
"step": 372
},
{
"epoch": 0.10828857599070983,
"grad_norm": 4.905848503112793,
"learning_rate": 4.332171893147503e-06,
"loss": 1.2083,
"step": 373
},
{
"epoch": 0.10857889388880824,
"grad_norm": 5.094426155090332,
"learning_rate": 4.343786295005807e-06,
"loss": 1.2681,
"step": 374
},
{
"epoch": 0.10886921178690666,
"grad_norm": 4.462698936462402,
"learning_rate": 4.3554006968641115e-06,
"loss": 1.2045,
"step": 375
},
{
"epoch": 0.10915952968500509,
"grad_norm": 4.681826591491699,
"learning_rate": 4.367015098722416e-06,
"loss": 1.1561,
"step": 376
},
{
"epoch": 0.1094498475831035,
"grad_norm": 4.762950420379639,
"learning_rate": 4.3786295005807205e-06,
"loss": 1.38,
"step": 377
},
{
"epoch": 0.10974016548120191,
"grad_norm": 4.647446155548096,
"learning_rate": 4.390243902439025e-06,
"loss": 1.1523,
"step": 378
},
{
"epoch": 0.11003048337930034,
"grad_norm": 4.403470039367676,
"learning_rate": 4.4018583042973286e-06,
"loss": 1.1952,
"step": 379
},
{
"epoch": 0.11032080127739875,
"grad_norm": 4.534971237182617,
"learning_rate": 4.413472706155633e-06,
"loss": 1.2161,
"step": 380
},
{
"epoch": 0.11061111917549717,
"grad_norm": 4.459516525268555,
"learning_rate": 4.4250871080139375e-06,
"loss": 1.165,
"step": 381
},
{
"epoch": 0.1109014370735956,
"grad_norm": 4.685759544372559,
"learning_rate": 4.436701509872242e-06,
"loss": 1.2302,
"step": 382
},
{
"epoch": 0.111191754971694,
"grad_norm": 4.3947062492370605,
"learning_rate": 4.4483159117305465e-06,
"loss": 1.0562,
"step": 383
},
{
"epoch": 0.11148207286979242,
"grad_norm": 4.368214130401611,
"learning_rate": 4.45993031358885e-06,
"loss": 1.1429,
"step": 384
},
{
"epoch": 0.11177239076789085,
"grad_norm": 4.556305408477783,
"learning_rate": 4.471544715447155e-06,
"loss": 1.2435,
"step": 385
},
{
"epoch": 0.11206270866598926,
"grad_norm": 4.672650337219238,
"learning_rate": 4.483159117305459e-06,
"loss": 1.2102,
"step": 386
},
{
"epoch": 0.11235302656408767,
"grad_norm": 4.5687127113342285,
"learning_rate": 4.4947735191637636e-06,
"loss": 1.0996,
"step": 387
},
{
"epoch": 0.11264334446218609,
"grad_norm": 4.420834064483643,
"learning_rate": 4.506387921022068e-06,
"loss": 1.0808,
"step": 388
},
{
"epoch": 0.11293366236028451,
"grad_norm": 4.193338394165039,
"learning_rate": 4.5180023228803725e-06,
"loss": 1.1776,
"step": 389
},
{
"epoch": 0.11322398025838293,
"grad_norm": 4.821009635925293,
"learning_rate": 4.529616724738676e-06,
"loss": 1.0963,
"step": 390
},
{
"epoch": 0.11351429815648134,
"grad_norm": 4.469620227813721,
"learning_rate": 4.541231126596981e-06,
"loss": 1.2174,
"step": 391
},
{
"epoch": 0.11380461605457977,
"grad_norm": 4.591622352600098,
"learning_rate": 4.552845528455285e-06,
"loss": 1.3208,
"step": 392
},
{
"epoch": 0.11409493395267818,
"grad_norm": 4.99096155166626,
"learning_rate": 4.56445993031359e-06,
"loss": 1.2179,
"step": 393
},
{
"epoch": 0.1143852518507766,
"grad_norm": 4.644974708557129,
"learning_rate": 4.576074332171894e-06,
"loss": 1.2476,
"step": 394
},
{
"epoch": 0.11467556974887502,
"grad_norm": 4.829218864440918,
"learning_rate": 4.587688734030198e-06,
"loss": 1.3546,
"step": 395
},
{
"epoch": 0.11496588764697344,
"grad_norm": 4.542574882507324,
"learning_rate": 4.599303135888502e-06,
"loss": 1.0894,
"step": 396
},
{
"epoch": 0.11525620554507185,
"grad_norm": 4.826246738433838,
"learning_rate": 4.610917537746807e-06,
"loss": 1.1201,
"step": 397
},
{
"epoch": 0.11554652344317028,
"grad_norm": 4.562570095062256,
"learning_rate": 4.62253193960511e-06,
"loss": 1.1192,
"step": 398
},
{
"epoch": 0.11583684134126869,
"grad_norm": 4.720918655395508,
"learning_rate": 4.634146341463416e-06,
"loss": 1.2242,
"step": 399
},
{
"epoch": 0.1161271592393671,
"grad_norm": 4.849851131439209,
"learning_rate": 4.645760743321719e-06,
"loss": 1.1184,
"step": 400
},
{
"epoch": 0.11641747713746553,
"grad_norm": 5.324199199676514,
"learning_rate": 4.657375145180024e-06,
"loss": 1.2376,
"step": 401
},
{
"epoch": 0.11670779503556394,
"grad_norm": 4.4429192543029785,
"learning_rate": 4.668989547038328e-06,
"loss": 1.0613,
"step": 402
},
{
"epoch": 0.11699811293366236,
"grad_norm": 4.644254684448242,
"learning_rate": 4.680603948896632e-06,
"loss": 1.1684,
"step": 403
},
{
"epoch": 0.11728843083176078,
"grad_norm": 5.012441158294678,
"learning_rate": 4.692218350754936e-06,
"loss": 1.2038,
"step": 404
},
{
"epoch": 0.1175787487298592,
"grad_norm": 4.652109622955322,
"learning_rate": 4.703832752613241e-06,
"loss": 1.2779,
"step": 405
},
{
"epoch": 0.11786906662795761,
"grad_norm": 4.548923969268799,
"learning_rate": 4.715447154471545e-06,
"loss": 1.2691,
"step": 406
},
{
"epoch": 0.11815938452605604,
"grad_norm": 4.364345073699951,
"learning_rate": 4.72706155632985e-06,
"loss": 1.1318,
"step": 407
},
{
"epoch": 0.11844970242415445,
"grad_norm": 4.687953472137451,
"learning_rate": 4.738675958188153e-06,
"loss": 1.4441,
"step": 408
},
{
"epoch": 0.11874002032225286,
"grad_norm": 4.44487190246582,
"learning_rate": 4.750290360046458e-06,
"loss": 1.2781,
"step": 409
},
{
"epoch": 0.11903033822035128,
"grad_norm": 4.23728609085083,
"learning_rate": 4.761904761904762e-06,
"loss": 1.0713,
"step": 410
},
{
"epoch": 0.1193206561184497,
"grad_norm": 4.650542736053467,
"learning_rate": 4.773519163763067e-06,
"loss": 1.1529,
"step": 411
},
{
"epoch": 0.11961097401654812,
"grad_norm": 4.119630813598633,
"learning_rate": 4.785133565621371e-06,
"loss": 1.0351,
"step": 412
},
{
"epoch": 0.11990129191464653,
"grad_norm": 4.689528465270996,
"learning_rate": 4.796747967479675e-06,
"loss": 1.27,
"step": 413
},
{
"epoch": 0.12019160981274496,
"grad_norm": 4.582840919494629,
"learning_rate": 4.8083623693379794e-06,
"loss": 1.2461,
"step": 414
},
{
"epoch": 0.12048192771084337,
"grad_norm": 4.441833019256592,
"learning_rate": 4.819976771196284e-06,
"loss": 1.2983,
"step": 415
},
{
"epoch": 0.12077224560894179,
"grad_norm": 4.192812919616699,
"learning_rate": 4.831591173054588e-06,
"loss": 1.1723,
"step": 416
},
{
"epoch": 0.12106256350704021,
"grad_norm": 4.11320686340332,
"learning_rate": 4.843205574912893e-06,
"loss": 1.1548,
"step": 417
},
{
"epoch": 0.12135288140513863,
"grad_norm": 4.811589241027832,
"learning_rate": 4.8548199767711965e-06,
"loss": 1.2443,
"step": 418
},
{
"epoch": 0.12164319930323704,
"grad_norm": 4.167344570159912,
"learning_rate": 4.866434378629501e-06,
"loss": 1.0633,
"step": 419
},
{
"epoch": 0.12193351720133547,
"grad_norm": 4.8188090324401855,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.2695,
"step": 420
},
{
"epoch": 0.12222383509943388,
"grad_norm": 4.46505880355835,
"learning_rate": 4.889663182346109e-06,
"loss": 1.1716,
"step": 421
},
{
"epoch": 0.1225141529975323,
"grad_norm": 4.715555667877197,
"learning_rate": 4.9012775842044144e-06,
"loss": 1.1526,
"step": 422
},
{
"epoch": 0.12280447089563072,
"grad_norm": 4.3485612869262695,
"learning_rate": 4.912891986062718e-06,
"loss": 1.0477,
"step": 423
},
{
"epoch": 0.12309478879372913,
"grad_norm": 4.8962483406066895,
"learning_rate": 4.9245063879210226e-06,
"loss": 1.2028,
"step": 424
},
{
"epoch": 0.12338510669182755,
"grad_norm": 4.331915378570557,
"learning_rate": 4.936120789779327e-06,
"loss": 0.9834,
"step": 425
},
{
"epoch": 0.12367542458992598,
"grad_norm": 4.94401216506958,
"learning_rate": 4.947735191637631e-06,
"loss": 1.2552,
"step": 426
},
{
"epoch": 0.12396574248802439,
"grad_norm": 4.512451648712158,
"learning_rate": 4.959349593495935e-06,
"loss": 1.1289,
"step": 427
},
{
"epoch": 0.1242560603861228,
"grad_norm": 4.4072489738464355,
"learning_rate": 4.97096399535424e-06,
"loss": 1.1327,
"step": 428
},
{
"epoch": 0.12454637828422123,
"grad_norm": 4.699981212615967,
"learning_rate": 4.982578397212544e-06,
"loss": 1.2143,
"step": 429
},
{
"epoch": 0.12483669618231964,
"grad_norm": 4.3754496574401855,
"learning_rate": 4.994192799070849e-06,
"loss": 1.2076,
"step": 430
},
{
"epoch": 0.12512701408041807,
"grad_norm": 4.274416446685791,
"learning_rate": 5.005807200929152e-06,
"loss": 1.2851,
"step": 431
},
{
"epoch": 0.12541733197851648,
"grad_norm": 3.8760673999786377,
"learning_rate": 5.017421602787457e-06,
"loss": 1.0735,
"step": 432
},
{
"epoch": 0.1257076498766149,
"grad_norm": 4.6121015548706055,
"learning_rate": 5.029036004645761e-06,
"loss": 1.1603,
"step": 433
},
{
"epoch": 0.1259979677747133,
"grad_norm": 4.314383506774902,
"learning_rate": 5.040650406504065e-06,
"loss": 1.1134,
"step": 434
},
{
"epoch": 0.12628828567281172,
"grad_norm": 4.7067036628723145,
"learning_rate": 5.052264808362369e-06,
"loss": 1.2772,
"step": 435
},
{
"epoch": 0.12657860357091014,
"grad_norm": 4.6313982009887695,
"learning_rate": 5.063879210220674e-06,
"loss": 1.0923,
"step": 436
},
{
"epoch": 0.12686892146900858,
"grad_norm": 4.323302268981934,
"learning_rate": 5.075493612078979e-06,
"loss": 1.1981,
"step": 437
},
{
"epoch": 0.127159239367107,
"grad_norm": 4.471177101135254,
"learning_rate": 5.087108013937283e-06,
"loss": 1.456,
"step": 438
},
{
"epoch": 0.1274495572652054,
"grad_norm": 4.510197639465332,
"learning_rate": 5.098722415795587e-06,
"loss": 1.256,
"step": 439
},
{
"epoch": 0.12773987516330382,
"grad_norm": 4.906876087188721,
"learning_rate": 5.110336817653892e-06,
"loss": 1.1103,
"step": 440
},
{
"epoch": 0.12803019306140223,
"grad_norm": 4.39389181137085,
"learning_rate": 5.121951219512195e-06,
"loss": 1.1229,
"step": 441
},
{
"epoch": 0.12832051095950064,
"grad_norm": 4.98647403717041,
"learning_rate": 5.1335656213705e-06,
"loss": 1.1612,
"step": 442
},
{
"epoch": 0.12861082885759906,
"grad_norm": 4.218196392059326,
"learning_rate": 5.145180023228804e-06,
"loss": 1.2441,
"step": 443
},
{
"epoch": 0.1289011467556975,
"grad_norm": 4.1096086502075195,
"learning_rate": 5.156794425087108e-06,
"loss": 1.1238,
"step": 444
},
{
"epoch": 0.1291914646537959,
"grad_norm": 4.741826057434082,
"learning_rate": 5.168408826945412e-06,
"loss": 1.3542,
"step": 445
},
{
"epoch": 0.12948178255189433,
"grad_norm": 4.725194454193115,
"learning_rate": 5.180023228803717e-06,
"loss": 1.3447,
"step": 446
},
{
"epoch": 0.12977210044999274,
"grad_norm": 4.7122273445129395,
"learning_rate": 5.1916376306620205e-06,
"loss": 1.1016,
"step": 447
},
{
"epoch": 0.13006241834809115,
"grad_norm": 5.179031848907471,
"learning_rate": 5.203252032520326e-06,
"loss": 1.073,
"step": 448
},
{
"epoch": 0.13035273624618957,
"grad_norm": 4.772004127502441,
"learning_rate": 5.21486643437863e-06,
"loss": 1.2237,
"step": 449
},
{
"epoch": 0.130643054144288,
"grad_norm": 4.839110374450684,
"learning_rate": 5.226480836236935e-06,
"loss": 1.2604,
"step": 450
},
{
"epoch": 0.13093337204238642,
"grad_norm": 4.533593654632568,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.1173,
"step": 451
},
{
"epoch": 0.13122368994048483,
"grad_norm": 4.776732444763184,
"learning_rate": 5.249709639953543e-06,
"loss": 1.2574,
"step": 452
},
{
"epoch": 0.13151400783858325,
"grad_norm": 4.366232872009277,
"learning_rate": 5.261324041811847e-06,
"loss": 1.1167,
"step": 453
},
{
"epoch": 0.13180432573668166,
"grad_norm": 4.264481067657471,
"learning_rate": 5.272938443670151e-06,
"loss": 1.081,
"step": 454
},
{
"epoch": 0.13209464363478007,
"grad_norm": 4.251311302185059,
"learning_rate": 5.2845528455284555e-06,
"loss": 1.094,
"step": 455
},
{
"epoch": 0.13238496153287851,
"grad_norm": 4.391427516937256,
"learning_rate": 5.29616724738676e-06,
"loss": 1.2025,
"step": 456
},
{
"epoch": 0.13267527943097693,
"grad_norm": 4.8531270027160645,
"learning_rate": 5.307781649245064e-06,
"loss": 1.16,
"step": 457
},
{
"epoch": 0.13296559732907534,
"grad_norm": 5.001920223236084,
"learning_rate": 5.319396051103368e-06,
"loss": 1.3174,
"step": 458
},
{
"epoch": 0.13325591522717375,
"grad_norm": 5.8515849113464355,
"learning_rate": 5.331010452961673e-06,
"loss": 1.2568,
"step": 459
},
{
"epoch": 0.13354623312527217,
"grad_norm": 4.972232818603516,
"learning_rate": 5.342624854819978e-06,
"loss": 1.3323,
"step": 460
},
{
"epoch": 0.13383655102337058,
"grad_norm": 4.840256690979004,
"learning_rate": 5.3542392566782816e-06,
"loss": 1.0883,
"step": 461
},
{
"epoch": 0.13412686892146902,
"grad_norm": 4.309145450592041,
"learning_rate": 5.365853658536586e-06,
"loss": 1.1276,
"step": 462
},
{
"epoch": 0.13441718681956744,
"grad_norm": 4.385857582092285,
"learning_rate": 5.3774680603948905e-06,
"loss": 1.0481,
"step": 463
},
{
"epoch": 0.13470750471766585,
"grad_norm": 4.541776180267334,
"learning_rate": 5.389082462253194e-06,
"loss": 1.1545,
"step": 464
},
{
"epoch": 0.13499782261576426,
"grad_norm": 4.9798712730407715,
"learning_rate": 5.400696864111499e-06,
"loss": 1.2534,
"step": 465
},
{
"epoch": 0.13528814051386268,
"grad_norm": 4.9744977951049805,
"learning_rate": 5.412311265969803e-06,
"loss": 1.104,
"step": 466
},
{
"epoch": 0.1355784584119611,
"grad_norm": 4.3919878005981445,
"learning_rate": 5.423925667828107e-06,
"loss": 1.1293,
"step": 467
},
{
"epoch": 0.1358687763100595,
"grad_norm": 4.843119144439697,
"learning_rate": 5.435540069686411e-06,
"loss": 1.2784,
"step": 468
},
{
"epoch": 0.13615909420815794,
"grad_norm": 4.212307453155518,
"learning_rate": 5.447154471544716e-06,
"loss": 1.1175,
"step": 469
},
{
"epoch": 0.13644941210625636,
"grad_norm": 4.107914447784424,
"learning_rate": 5.458768873403019e-06,
"loss": 1.1508,
"step": 470
},
{
"epoch": 0.13673973000435477,
"grad_norm": 4.234799385070801,
"learning_rate": 5.470383275261324e-06,
"loss": 1.1592,
"step": 471
},
{
"epoch": 0.13703004790245318,
"grad_norm": 4.388983726501465,
"learning_rate": 5.481997677119629e-06,
"loss": 1.1882,
"step": 472
},
{
"epoch": 0.1373203658005516,
"grad_norm": 4.463111877441406,
"learning_rate": 5.493612078977934e-06,
"loss": 1.3275,
"step": 473
},
{
"epoch": 0.13761068369865,
"grad_norm": 4.7095255851745605,
"learning_rate": 5.505226480836237e-06,
"loss": 1.3394,
"step": 474
},
{
"epoch": 0.13790100159674845,
"grad_norm": 4.3856024742126465,
"learning_rate": 5.516840882694542e-06,
"loss": 1.2117,
"step": 475
},
{
"epoch": 0.13819131949484686,
"grad_norm": 4.319365978240967,
"learning_rate": 5.528455284552846e-06,
"loss": 1.1883,
"step": 476
},
{
"epoch": 0.13848163739294528,
"grad_norm": 5.07382869720459,
"learning_rate": 5.540069686411151e-06,
"loss": 1.3553,
"step": 477
},
{
"epoch": 0.1387719552910437,
"grad_norm": 4.294496059417725,
"learning_rate": 5.551684088269454e-06,
"loss": 1.1118,
"step": 478
},
{
"epoch": 0.1390622731891421,
"grad_norm": 4.60385274887085,
"learning_rate": 5.563298490127759e-06,
"loss": 1.1757,
"step": 479
},
{
"epoch": 0.13935259108724052,
"grad_norm": 4.500978946685791,
"learning_rate": 5.574912891986063e-06,
"loss": 1.2481,
"step": 480
},
{
"epoch": 0.13964290898533896,
"grad_norm": 4.490742206573486,
"learning_rate": 5.586527293844367e-06,
"loss": 1.1312,
"step": 481
},
{
"epoch": 0.13993322688343737,
"grad_norm": 4.06981086730957,
"learning_rate": 5.598141695702671e-06,
"loss": 1.1169,
"step": 482
},
{
"epoch": 0.14022354478153579,
"grad_norm": 4.395321369171143,
"learning_rate": 5.609756097560977e-06,
"loss": 1.3147,
"step": 483
},
{
"epoch": 0.1405138626796342,
"grad_norm": 4.509646415710449,
"learning_rate": 5.621370499419281e-06,
"loss": 1.2163,
"step": 484
},
{
"epoch": 0.1408041805777326,
"grad_norm": 4.4350175857543945,
"learning_rate": 5.632984901277585e-06,
"loss": 1.2908,
"step": 485
},
{
"epoch": 0.14109449847583103,
"grad_norm": 4.2386698722839355,
"learning_rate": 5.644599303135889e-06,
"loss": 1.1437,
"step": 486
},
{
"epoch": 0.14138481637392944,
"grad_norm": 4.659437656402588,
"learning_rate": 5.656213704994194e-06,
"loss": 1.1965,
"step": 487
},
{
"epoch": 0.14167513427202788,
"grad_norm": 4.744169235229492,
"learning_rate": 5.6678281068524974e-06,
"loss": 1.2036,
"step": 488
},
{
"epoch": 0.1419654521701263,
"grad_norm": 4.265536785125732,
"learning_rate": 5.679442508710802e-06,
"loss": 1.0867,
"step": 489
},
{
"epoch": 0.1422557700682247,
"grad_norm": 5.0157999992370605,
"learning_rate": 5.691056910569106e-06,
"loss": 1.3433,
"step": 490
},
{
"epoch": 0.14254608796632312,
"grad_norm": 4.504755020141602,
"learning_rate": 5.70267131242741e-06,
"loss": 1.1293,
"step": 491
},
{
"epoch": 0.14283640586442153,
"grad_norm": 4.358330726623535,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.138,
"step": 492
},
{
"epoch": 0.14312672376251995,
"grad_norm": 4.373233318328857,
"learning_rate": 5.725900116144019e-06,
"loss": 1.0552,
"step": 493
},
{
"epoch": 0.1434170416606184,
"grad_norm": 5.096903324127197,
"learning_rate": 5.737514518002323e-06,
"loss": 1.3915,
"step": 494
},
{
"epoch": 0.1437073595587168,
"grad_norm": 4.237011432647705,
"learning_rate": 5.749128919860628e-06,
"loss": 1.2089,
"step": 495
},
{
"epoch": 0.14399767745681522,
"grad_norm": 4.598453998565674,
"learning_rate": 5.7607433217189324e-06,
"loss": 1.1485,
"step": 496
},
{
"epoch": 0.14428799535491363,
"grad_norm": 4.585260391235352,
"learning_rate": 5.772357723577237e-06,
"loss": 1.1004,
"step": 497
},
{
"epoch": 0.14457831325301204,
"grad_norm": 4.202107906341553,
"learning_rate": 5.7839721254355405e-06,
"loss": 1.2586,
"step": 498
},
{
"epoch": 0.14486863115111046,
"grad_norm": 4.69224739074707,
"learning_rate": 5.795586527293845e-06,
"loss": 1.2772,
"step": 499
},
{
"epoch": 0.1451589490492089,
"grad_norm": 4.4062323570251465,
"learning_rate": 5.8072009291521495e-06,
"loss": 1.1927,
"step": 500
},
{
"epoch": 0.1451589490492089,
"eval_loss": 1.225874662399292,
"eval_runtime": 11.4881,
"eval_samples_per_second": 34.819,
"eval_steps_per_second": 4.352,
"step": 500
},
{
"epoch": 0.1454492669473073,
"grad_norm": 4.7002739906311035,
"learning_rate": 5.818815331010453e-06,
"loss": 1.2239,
"step": 501
},
{
"epoch": 0.14573958484540572,
"grad_norm": 4.650073528289795,
"learning_rate": 5.830429732868758e-06,
"loss": 1.3074,
"step": 502
},
{
"epoch": 0.14602990274350414,
"grad_norm": 4.497559070587158,
"learning_rate": 5.842044134727062e-06,
"loss": 1.398,
"step": 503
},
{
"epoch": 0.14632022064160255,
"grad_norm": 4.4335174560546875,
"learning_rate": 5.853658536585366e-06,
"loss": 1.1606,
"step": 504
},
{
"epoch": 0.14661053853970096,
"grad_norm": 4.531015396118164,
"learning_rate": 5.86527293844367e-06,
"loss": 1.2087,
"step": 505
},
{
"epoch": 0.1469008564377994,
"grad_norm": 4.521320343017578,
"learning_rate": 5.876887340301975e-06,
"loss": 1.0572,
"step": 506
},
{
"epoch": 0.14719117433589782,
"grad_norm": 4.088536739349365,
"learning_rate": 5.88850174216028e-06,
"loss": 1.1005,
"step": 507
},
{
"epoch": 0.14748149223399623,
"grad_norm": 4.54278039932251,
"learning_rate": 5.900116144018584e-06,
"loss": 1.2687,
"step": 508
},
{
"epoch": 0.14777181013209464,
"grad_norm": 4.390741348266602,
"learning_rate": 5.911730545876888e-06,
"loss": 1.4151,
"step": 509
},
{
"epoch": 0.14806212803019306,
"grad_norm": 4.438811779022217,
"learning_rate": 5.923344947735193e-06,
"loss": 1.3253,
"step": 510
},
{
"epoch": 0.14835244592829147,
"grad_norm": 4.363897800445557,
"learning_rate": 5.934959349593496e-06,
"loss": 1.2319,
"step": 511
},
{
"epoch": 0.14864276382638988,
"grad_norm": 4.362700462341309,
"learning_rate": 5.946573751451801e-06,
"loss": 1.2404,
"step": 512
},
{
"epoch": 0.14893308172448833,
"grad_norm": 4.311462879180908,
"learning_rate": 5.958188153310105e-06,
"loss": 1.1152,
"step": 513
},
{
"epoch": 0.14922339962258674,
"grad_norm": 4.525477886199951,
"learning_rate": 5.969802555168409e-06,
"loss": 1.4097,
"step": 514
},
{
"epoch": 0.14951371752068515,
"grad_norm": 4.645956516265869,
"learning_rate": 5.981416957026713e-06,
"loss": 1.2637,
"step": 515
},
{
"epoch": 0.14980403541878357,
"grad_norm": 4.705561637878418,
"learning_rate": 5.993031358885018e-06,
"loss": 1.23,
"step": 516
},
{
"epoch": 0.15009435331688198,
"grad_norm": 4.898301601409912,
"learning_rate": 6.0046457607433214e-06,
"loss": 1.2903,
"step": 517
},
{
"epoch": 0.1503846712149804,
"grad_norm": 4.390701770782471,
"learning_rate": 6.016260162601627e-06,
"loss": 1.1944,
"step": 518
},
{
"epoch": 0.15067498911307883,
"grad_norm": 4.7379913330078125,
"learning_rate": 6.027874564459931e-06,
"loss": 1.3016,
"step": 519
},
{
"epoch": 0.15096530701117725,
"grad_norm": 4.652884006500244,
"learning_rate": 6.039488966318236e-06,
"loss": 1.3385,
"step": 520
},
{
"epoch": 0.15125562490927566,
"grad_norm": 4.54412317276001,
"learning_rate": 6.051103368176539e-06,
"loss": 1.1898,
"step": 521
},
{
"epoch": 0.15154594280737407,
"grad_norm": 4.629741668701172,
"learning_rate": 6.062717770034844e-06,
"loss": 1.2784,
"step": 522
},
{
"epoch": 0.1518362607054725,
"grad_norm": 4.372036457061768,
"learning_rate": 6.074332171893148e-06,
"loss": 1.223,
"step": 523
},
{
"epoch": 0.1521265786035709,
"grad_norm": 4.209630966186523,
"learning_rate": 6.085946573751452e-06,
"loss": 1.2334,
"step": 524
},
{
"epoch": 0.15241689650166934,
"grad_norm": 4.473580360412598,
"learning_rate": 6.0975609756097564e-06,
"loss": 1.1312,
"step": 525
},
{
"epoch": 0.15270721439976775,
"grad_norm": 4.313533782958984,
"learning_rate": 6.109175377468061e-06,
"loss": 1.2681,
"step": 526
},
{
"epoch": 0.15299753229786617,
"grad_norm": 4.518441200256348,
"learning_rate": 6.1207897793263645e-06,
"loss": 1.2946,
"step": 527
},
{
"epoch": 0.15328785019596458,
"grad_norm": 4.112656593322754,
"learning_rate": 6.132404181184669e-06,
"loss": 1.2083,
"step": 528
},
{
"epoch": 0.153578168094063,
"grad_norm": 4.3622565269470215,
"learning_rate": 6.1440185830429735e-06,
"loss": 1.2937,
"step": 529
},
{
"epoch": 0.1538684859921614,
"grad_norm": 4.5020751953125,
"learning_rate": 6.155632984901279e-06,
"loss": 1.2604,
"step": 530
},
{
"epoch": 0.15415880389025982,
"grad_norm": 4.212316989898682,
"learning_rate": 6.1672473867595825e-06,
"loss": 1.1373,
"step": 531
},
{
"epoch": 0.15444912178835826,
"grad_norm": 4.951518535614014,
"learning_rate": 6.178861788617887e-06,
"loss": 1.3234,
"step": 532
},
{
"epoch": 0.15473943968645668,
"grad_norm": 4.149683475494385,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.191,
"step": 533
},
{
"epoch": 0.1550297575845551,
"grad_norm": 4.293402194976807,
"learning_rate": 6.202090592334495e-06,
"loss": 1.3976,
"step": 534
},
{
"epoch": 0.1553200754826535,
"grad_norm": 4.633920669555664,
"learning_rate": 6.2137049941927995e-06,
"loss": 1.363,
"step": 535
},
{
"epoch": 0.15561039338075192,
"grad_norm": 4.190507888793945,
"learning_rate": 6.225319396051104e-06,
"loss": 1.0997,
"step": 536
},
{
"epoch": 0.15590071127885033,
"grad_norm": 4.259326934814453,
"learning_rate": 6.236933797909408e-06,
"loss": 1.2049,
"step": 537
},
{
"epoch": 0.15619102917694877,
"grad_norm": 4.1629204750061035,
"learning_rate": 6.248548199767712e-06,
"loss": 1.1561,
"step": 538
},
{
"epoch": 0.15648134707504718,
"grad_norm": 4.23039436340332,
"learning_rate": 6.260162601626017e-06,
"loss": 1.1901,
"step": 539
},
{
"epoch": 0.1567716649731456,
"grad_norm": 4.121535778045654,
"learning_rate": 6.27177700348432e-06,
"loss": 1.1737,
"step": 540
},
{
"epoch": 0.157061982871244,
"grad_norm": 4.287825584411621,
"learning_rate": 6.283391405342625e-06,
"loss": 1.3326,
"step": 541
},
{
"epoch": 0.15735230076934242,
"grad_norm": 9.216053009033203,
"learning_rate": 6.29500580720093e-06,
"loss": 1.3105,
"step": 542
},
{
"epoch": 0.15764261866744084,
"grad_norm": 4.486374855041504,
"learning_rate": 6.3066202090592345e-06,
"loss": 1.1196,
"step": 543
},
{
"epoch": 0.15793293656553928,
"grad_norm": 4.181046485900879,
"learning_rate": 6.318234610917538e-06,
"loss": 1.111,
"step": 544
},
{
"epoch": 0.1582232544636377,
"grad_norm": 4.662967205047607,
"learning_rate": 6.329849012775843e-06,
"loss": 1.1715,
"step": 545
},
{
"epoch": 0.1585135723617361,
"grad_norm": 4.380138397216797,
"learning_rate": 6.341463414634147e-06,
"loss": 1.2221,
"step": 546
},
{
"epoch": 0.15880389025983452,
"grad_norm": 4.870767593383789,
"learning_rate": 6.353077816492451e-06,
"loss": 1.1926,
"step": 547
},
{
"epoch": 0.15909420815793293,
"grad_norm": 4.571467876434326,
"learning_rate": 6.364692218350755e-06,
"loss": 1.0445,
"step": 548
},
{
"epoch": 0.15938452605603134,
"grad_norm": 4.919942378997803,
"learning_rate": 6.37630662020906e-06,
"loss": 1.2702,
"step": 549
},
{
"epoch": 0.15967484395412979,
"grad_norm": 4.4647979736328125,
"learning_rate": 6.387921022067363e-06,
"loss": 1.2969,
"step": 550
},
{
"epoch": 0.1599651618522282,
"grad_norm": 4.359588146209717,
"learning_rate": 6.399535423925668e-06,
"loss": 1.3191,
"step": 551
},
{
"epoch": 0.1602554797503266,
"grad_norm": 4.425624370574951,
"learning_rate": 6.411149825783972e-06,
"loss": 1.2345,
"step": 552
},
{
"epoch": 0.16054579764842503,
"grad_norm": 4.439249038696289,
"learning_rate": 6.422764227642278e-06,
"loss": 1.1849,
"step": 553
},
{
"epoch": 0.16083611554652344,
"grad_norm": 4.451704025268555,
"learning_rate": 6.434378629500581e-06,
"loss": 1.2828,
"step": 554
},
{
"epoch": 0.16112643344462185,
"grad_norm": 4.43411922454834,
"learning_rate": 6.445993031358886e-06,
"loss": 1.4051,
"step": 555
},
{
"epoch": 0.16141675134272027,
"grad_norm": 4.6609392166137695,
"learning_rate": 6.45760743321719e-06,
"loss": 1.1596,
"step": 556
},
{
"epoch": 0.1617070692408187,
"grad_norm": 4.231972694396973,
"learning_rate": 6.469221835075494e-06,
"loss": 1.2903,
"step": 557
},
{
"epoch": 0.16199738713891712,
"grad_norm": 4.471492290496826,
"learning_rate": 6.480836236933798e-06,
"loss": 1.2261,
"step": 558
},
{
"epoch": 0.16228770503701553,
"grad_norm": 4.300949573516846,
"learning_rate": 6.492450638792103e-06,
"loss": 1.232,
"step": 559
},
{
"epoch": 0.16257802293511395,
"grad_norm": 4.097339153289795,
"learning_rate": 6.504065040650407e-06,
"loss": 1.1599,
"step": 560
},
{
"epoch": 0.16286834083321236,
"grad_norm": 3.920823097229004,
"learning_rate": 6.515679442508711e-06,
"loss": 1.1565,
"step": 561
},
{
"epoch": 0.16315865873131077,
"grad_norm": 4.790262222290039,
"learning_rate": 6.5272938443670154e-06,
"loss": 1.3051,
"step": 562
},
{
"epoch": 0.16344897662940922,
"grad_norm": 4.490232467651367,
"learning_rate": 6.53890824622532e-06,
"loss": 1.2613,
"step": 563
},
{
"epoch": 0.16373929452750763,
"grad_norm": 4.4714813232421875,
"learning_rate": 6.5505226480836235e-06,
"loss": 1.2043,
"step": 564
},
{
"epoch": 0.16402961242560604,
"grad_norm": 4.994192600250244,
"learning_rate": 6.562137049941929e-06,
"loss": 1.2062,
"step": 565
},
{
"epoch": 0.16431993032370446,
"grad_norm": 4.22312068939209,
"learning_rate": 6.573751451800233e-06,
"loss": 1.2887,
"step": 566
},
{
"epoch": 0.16461024822180287,
"grad_norm": 4.273190975189209,
"learning_rate": 6.585365853658538e-06,
"loss": 1.2889,
"step": 567
},
{
"epoch": 0.16490056611990128,
"grad_norm": 4.727954387664795,
"learning_rate": 6.5969802555168415e-06,
"loss": 1.3973,
"step": 568
},
{
"epoch": 0.16519088401799972,
"grad_norm": 4.461411476135254,
"learning_rate": 6.608594657375146e-06,
"loss": 1.2739,
"step": 569
},
{
"epoch": 0.16548120191609814,
"grad_norm": 4.23778772354126,
"learning_rate": 6.62020905923345e-06,
"loss": 1.1162,
"step": 570
},
{
"epoch": 0.16577151981419655,
"grad_norm": 4.434848785400391,
"learning_rate": 6.631823461091754e-06,
"loss": 1.2089,
"step": 571
},
{
"epoch": 0.16606183771229496,
"grad_norm": 4.056807518005371,
"learning_rate": 6.6434378629500585e-06,
"loss": 1.2375,
"step": 572
},
{
"epoch": 0.16635215561039338,
"grad_norm": 4.4226975440979,
"learning_rate": 6.655052264808363e-06,
"loss": 1.1912,
"step": 573
},
{
"epoch": 0.1666424735084918,
"grad_norm": 4.397589206695557,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2756,
"step": 574
},
{
"epoch": 0.1669327914065902,
"grad_norm": 4.375736236572266,
"learning_rate": 6.678281068524971e-06,
"loss": 1.1205,
"step": 575
},
{
"epoch": 0.16722310930468864,
"grad_norm": 4.373353481292725,
"learning_rate": 6.6898954703832765e-06,
"loss": 1.2309,
"step": 576
},
{
"epoch": 0.16751342720278706,
"grad_norm": 4.392578125,
"learning_rate": 6.701509872241581e-06,
"loss": 1.3111,
"step": 577
},
{
"epoch": 0.16780374510088547,
"grad_norm": 4.608421325683594,
"learning_rate": 6.7131242740998846e-06,
"loss": 1.1501,
"step": 578
},
{
"epoch": 0.16809406299898388,
"grad_norm": 4.548303127288818,
"learning_rate": 6.724738675958189e-06,
"loss": 1.2754,
"step": 579
},
{
"epoch": 0.1683843808970823,
"grad_norm": 4.56739616394043,
"learning_rate": 6.7363530778164935e-06,
"loss": 1.2028,
"step": 580
},
{
"epoch": 0.1686746987951807,
"grad_norm": 4.294614315032959,
"learning_rate": 6.747967479674797e-06,
"loss": 1.1459,
"step": 581
},
{
"epoch": 0.16896501669327915,
"grad_norm": 4.636039733886719,
"learning_rate": 6.759581881533102e-06,
"loss": 1.3814,
"step": 582
},
{
"epoch": 0.16925533459137757,
"grad_norm": 4.619139671325684,
"learning_rate": 6.771196283391406e-06,
"loss": 1.242,
"step": 583
},
{
"epoch": 0.16954565248947598,
"grad_norm": 4.989368915557861,
"learning_rate": 6.78281068524971e-06,
"loss": 1.4686,
"step": 584
},
{
"epoch": 0.1698359703875744,
"grad_norm": 4.284407138824463,
"learning_rate": 6.794425087108014e-06,
"loss": 1.1228,
"step": 585
},
{
"epoch": 0.1701262882856728,
"grad_norm": 4.518624782562256,
"learning_rate": 6.806039488966319e-06,
"loss": 1.0664,
"step": 586
},
{
"epoch": 0.17041660618377122,
"grad_norm": 4.132668495178223,
"learning_rate": 6.817653890824622e-06,
"loss": 1.1725,
"step": 587
},
{
"epoch": 0.17070692408186966,
"grad_norm": 4.393999099731445,
"learning_rate": 6.829268292682928e-06,
"loss": 1.2639,
"step": 588
},
{
"epoch": 0.17099724197996807,
"grad_norm": 4.1911139488220215,
"learning_rate": 6.840882694541232e-06,
"loss": 1.3127,
"step": 589
},
{
"epoch": 0.1712875598780665,
"grad_norm": 4.69661283493042,
"learning_rate": 6.852497096399537e-06,
"loss": 1.2984,
"step": 590
},
{
"epoch": 0.1715778777761649,
"grad_norm": 4.060606956481934,
"learning_rate": 6.86411149825784e-06,
"loss": 1.2638,
"step": 591
},
{
"epoch": 0.1718681956742633,
"grad_norm": 4.7827677726745605,
"learning_rate": 6.875725900116145e-06,
"loss": 1.3978,
"step": 592
},
{
"epoch": 0.17215851357236173,
"grad_norm": 4.189406394958496,
"learning_rate": 6.887340301974449e-06,
"loss": 1.2079,
"step": 593
},
{
"epoch": 0.17244883147046017,
"grad_norm": 4.125210762023926,
"learning_rate": 6.898954703832753e-06,
"loss": 1.2004,
"step": 594
},
{
"epoch": 0.17273914936855858,
"grad_norm": 4.049924373626709,
"learning_rate": 6.910569105691057e-06,
"loss": 1.254,
"step": 595
},
{
"epoch": 0.173029467266657,
"grad_norm": 4.361916542053223,
"learning_rate": 6.922183507549362e-06,
"loss": 1.3253,
"step": 596
},
{
"epoch": 0.1733197851647554,
"grad_norm": 3.9269027709960938,
"learning_rate": 6.9337979094076655e-06,
"loss": 1.114,
"step": 597
},
{
"epoch": 0.17361010306285382,
"grad_norm": 4.094462871551514,
"learning_rate": 6.94541231126597e-06,
"loss": 1.3056,
"step": 598
},
{
"epoch": 0.17390042096095223,
"grad_norm": 4.001208305358887,
"learning_rate": 6.957026713124274e-06,
"loss": 1.2286,
"step": 599
},
{
"epoch": 0.17419073885905065,
"grad_norm": 4.29280948638916,
"learning_rate": 6.96864111498258e-06,
"loss": 1.2494,
"step": 600
},
{
"epoch": 0.1744810567571491,
"grad_norm": 4.355632305145264,
"learning_rate": 6.980255516840883e-06,
"loss": 1.2811,
"step": 601
},
{
"epoch": 0.1747713746552475,
"grad_norm": 4.2747273445129395,
"learning_rate": 6.991869918699188e-06,
"loss": 1.2177,
"step": 602
},
{
"epoch": 0.17506169255334592,
"grad_norm": 4.914125442504883,
"learning_rate": 7.003484320557492e-06,
"loss": 1.2001,
"step": 603
},
{
"epoch": 0.17535201045144433,
"grad_norm": 4.380726337432861,
"learning_rate": 7.015098722415796e-06,
"loss": 1.2322,
"step": 604
},
{
"epoch": 0.17564232834954274,
"grad_norm": 4.1070733070373535,
"learning_rate": 7.0267131242741005e-06,
"loss": 1.0689,
"step": 605
},
{
"epoch": 0.17593264624764116,
"grad_norm": 4.090858459472656,
"learning_rate": 7.038327526132405e-06,
"loss": 1.0399,
"step": 606
},
{
"epoch": 0.1762229641457396,
"grad_norm": 4.439457893371582,
"learning_rate": 7.0499419279907086e-06,
"loss": 1.0798,
"step": 607
},
{
"epoch": 0.176513282043838,
"grad_norm": 4.626300811767578,
"learning_rate": 7.061556329849013e-06,
"loss": 1.1793,
"step": 608
},
{
"epoch": 0.17680359994193642,
"grad_norm": 4.283360481262207,
"learning_rate": 7.0731707317073175e-06,
"loss": 1.1937,
"step": 609
},
{
"epoch": 0.17709391784003484,
"grad_norm": 4.1504669189453125,
"learning_rate": 7.084785133565621e-06,
"loss": 1.0317,
"step": 610
},
{
"epoch": 0.17738423573813325,
"grad_norm": 4.170088768005371,
"learning_rate": 7.0963995354239265e-06,
"loss": 1.1571,
"step": 611
},
{
"epoch": 0.17767455363623166,
"grad_norm": 4.515710353851318,
"learning_rate": 7.108013937282231e-06,
"loss": 1.1888,
"step": 612
},
{
"epoch": 0.1779648715343301,
"grad_norm": 3.985978841781616,
"learning_rate": 7.1196283391405354e-06,
"loss": 1.1603,
"step": 613
},
{
"epoch": 0.17825518943242852,
"grad_norm": 4.436974048614502,
"learning_rate": 7.131242740998839e-06,
"loss": 1.2722,
"step": 614
},
{
"epoch": 0.17854550733052693,
"grad_norm": 4.694450855255127,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.2873,
"step": 615
},
{
"epoch": 0.17883582522862534,
"grad_norm": 4.002849578857422,
"learning_rate": 7.154471544715448e-06,
"loss": 1.2664,
"step": 616
},
{
"epoch": 0.17912614312672376,
"grad_norm": 4.15142822265625,
"learning_rate": 7.166085946573752e-06,
"loss": 1.1794,
"step": 617
},
{
"epoch": 0.17941646102482217,
"grad_norm": 4.208678245544434,
"learning_rate": 7.177700348432056e-06,
"loss": 1.3951,
"step": 618
},
{
"epoch": 0.17970677892292058,
"grad_norm": 4.212402820587158,
"learning_rate": 7.189314750290361e-06,
"loss": 1.2183,
"step": 619
},
{
"epoch": 0.17999709682101903,
"grad_norm": 4.358024597167969,
"learning_rate": 7.200929152148664e-06,
"loss": 1.2951,
"step": 620
},
{
"epoch": 0.18028741471911744,
"grad_norm": 4.831110000610352,
"learning_rate": 7.212543554006969e-06,
"loss": 1.2045,
"step": 621
},
{
"epoch": 0.18057773261721585,
"grad_norm": 4.0317206382751465,
"learning_rate": 7.224157955865273e-06,
"loss": 1.1498,
"step": 622
},
{
"epoch": 0.18086805051531427,
"grad_norm": 4.493712425231934,
"learning_rate": 7.2357723577235786e-06,
"loss": 1.2473,
"step": 623
},
{
"epoch": 0.18115836841341268,
"grad_norm": 4.345702648162842,
"learning_rate": 7.247386759581882e-06,
"loss": 1.1674,
"step": 624
},
{
"epoch": 0.1814486863115111,
"grad_norm": 4.302826404571533,
"learning_rate": 7.259001161440187e-06,
"loss": 1.2631,
"step": 625
},
{
"epoch": 0.18173900420960953,
"grad_norm": 4.829352855682373,
"learning_rate": 7.270615563298491e-06,
"loss": 1.1601,
"step": 626
},
{
"epoch": 0.18202932210770795,
"grad_norm": 4.134838104248047,
"learning_rate": 7.282229965156795e-06,
"loss": 1.1322,
"step": 627
},
{
"epoch": 0.18231964000580636,
"grad_norm": 4.196687698364258,
"learning_rate": 7.293844367015099e-06,
"loss": 1.2701,
"step": 628
},
{
"epoch": 0.18260995790390477,
"grad_norm": 4.318655490875244,
"learning_rate": 7.305458768873404e-06,
"loss": 1.3027,
"step": 629
},
{
"epoch": 0.1829002758020032,
"grad_norm": 4.255601406097412,
"learning_rate": 7.317073170731707e-06,
"loss": 1.1377,
"step": 630
},
{
"epoch": 0.1831905937001016,
"grad_norm": 4.319618225097656,
"learning_rate": 7.328687572590012e-06,
"loss": 1.1671,
"step": 631
},
{
"epoch": 0.18348091159820004,
"grad_norm": 4.360809803009033,
"learning_rate": 7.340301974448316e-06,
"loss": 1.2979,
"step": 632
},
{
"epoch": 0.18377122949629845,
"grad_norm": 3.8124513626098633,
"learning_rate": 7.35191637630662e-06,
"loss": 1.1039,
"step": 633
},
{
"epoch": 0.18406154739439687,
"grad_norm": 4.552162170410156,
"learning_rate": 7.3635307781649245e-06,
"loss": 1.1019,
"step": 634
},
{
"epoch": 0.18435186529249528,
"grad_norm": 3.8770148754119873,
"learning_rate": 7.37514518002323e-06,
"loss": 1.0831,
"step": 635
},
{
"epoch": 0.1846421831905937,
"grad_norm": 4.136161804199219,
"learning_rate": 7.386759581881534e-06,
"loss": 1.1656,
"step": 636
},
{
"epoch": 0.1849325010886921,
"grad_norm": 4.266040802001953,
"learning_rate": 7.398373983739838e-06,
"loss": 1.1633,
"step": 637
},
{
"epoch": 0.18522281898679055,
"grad_norm": 4.174380779266357,
"learning_rate": 7.409988385598142e-06,
"loss": 1.2005,
"step": 638
},
{
"epoch": 0.18551313688488896,
"grad_norm": 4.037458419799805,
"learning_rate": 7.421602787456447e-06,
"loss": 1.1017,
"step": 639
},
{
"epoch": 0.18580345478298738,
"grad_norm": 4.106693744659424,
"learning_rate": 7.4332171893147505e-06,
"loss": 1.1764,
"step": 640
},
{
"epoch": 0.1860937726810858,
"grad_norm": 4.502237319946289,
"learning_rate": 7.444831591173055e-06,
"loss": 1.3775,
"step": 641
},
{
"epoch": 0.1863840905791842,
"grad_norm": 4.384480953216553,
"learning_rate": 7.4564459930313594e-06,
"loss": 1.2214,
"step": 642
},
{
"epoch": 0.18667440847728262,
"grad_norm": 4.051870346069336,
"learning_rate": 7.468060394889663e-06,
"loss": 1.2507,
"step": 643
},
{
"epoch": 0.18696472637538103,
"grad_norm": 3.967947244644165,
"learning_rate": 7.4796747967479676e-06,
"loss": 1.1179,
"step": 644
},
{
"epoch": 0.18725504427347947,
"grad_norm": 4.541753768920898,
"learning_rate": 7.491289198606272e-06,
"loss": 1.3501,
"step": 645
},
{
"epoch": 0.18754536217157788,
"grad_norm": 4.431195259094238,
"learning_rate": 7.502903600464577e-06,
"loss": 1.2707,
"step": 646
},
{
"epoch": 0.1878356800696763,
"grad_norm": 4.027304172515869,
"learning_rate": 7.514518002322881e-06,
"loss": 0.9999,
"step": 647
},
{
"epoch": 0.1881259979677747,
"grad_norm": 4.287905693054199,
"learning_rate": 7.5261324041811855e-06,
"loss": 1.2036,
"step": 648
},
{
"epoch": 0.18841631586587312,
"grad_norm": 4.41646671295166,
"learning_rate": 7.53774680603949e-06,
"loss": 1.3499,
"step": 649
},
{
"epoch": 0.18870663376397154,
"grad_norm": 3.83207106590271,
"learning_rate": 7.5493612078977944e-06,
"loss": 1.0668,
"step": 650
},
{
"epoch": 0.18899695166206998,
"grad_norm": 4.674952507019043,
"learning_rate": 7.560975609756098e-06,
"loss": 1.2712,
"step": 651
},
{
"epoch": 0.1892872695601684,
"grad_norm": 4.142502784729004,
"learning_rate": 7.5725900116144026e-06,
"loss": 1.2139,
"step": 652
},
{
"epoch": 0.1895775874582668,
"grad_norm": 4.170092582702637,
"learning_rate": 7.584204413472707e-06,
"loss": 1.1265,
"step": 653
},
{
"epoch": 0.18986790535636522,
"grad_norm": 4.253111362457275,
"learning_rate": 7.595818815331011e-06,
"loss": 1.3418,
"step": 654
},
{
"epoch": 0.19015822325446363,
"grad_norm": 4.222099781036377,
"learning_rate": 7.607433217189315e-06,
"loss": 1.1752,
"step": 655
},
{
"epoch": 0.19044854115256205,
"grad_norm": 3.9238572120666504,
"learning_rate": 7.61904761904762e-06,
"loss": 1.0777,
"step": 656
},
{
"epoch": 0.1907388590506605,
"grad_norm": 4.306210994720459,
"learning_rate": 7.630662020905924e-06,
"loss": 1.3503,
"step": 657
},
{
"epoch": 0.1910291769487589,
"grad_norm": 4.187571048736572,
"learning_rate": 7.64227642276423e-06,
"loss": 1.1342,
"step": 658
},
{
"epoch": 0.1913194948468573,
"grad_norm": 4.448465824127197,
"learning_rate": 7.653890824622533e-06,
"loss": 1.2355,
"step": 659
},
{
"epoch": 0.19160981274495573,
"grad_norm": 4.302551746368408,
"learning_rate": 7.665505226480837e-06,
"loss": 1.1779,
"step": 660
},
{
"epoch": 0.19190013064305414,
"grad_norm": 4.034951686859131,
"learning_rate": 7.677119628339142e-06,
"loss": 1.1235,
"step": 661
},
{
"epoch": 0.19219044854115255,
"grad_norm": 4.021313190460205,
"learning_rate": 7.688734030197446e-06,
"loss": 1.1306,
"step": 662
},
{
"epoch": 0.19248076643925097,
"grad_norm": 4.604819297790527,
"learning_rate": 7.70034843205575e-06,
"loss": 1.265,
"step": 663
},
{
"epoch": 0.1927710843373494,
"grad_norm": 4.1214189529418945,
"learning_rate": 7.711962833914055e-06,
"loss": 1.2959,
"step": 664
},
{
"epoch": 0.19306140223544782,
"grad_norm": 4.4705047607421875,
"learning_rate": 7.723577235772358e-06,
"loss": 1.3114,
"step": 665
},
{
"epoch": 0.19335172013354623,
"grad_norm": 4.120425701141357,
"learning_rate": 7.735191637630662e-06,
"loss": 1.113,
"step": 666
},
{
"epoch": 0.19364203803164465,
"grad_norm": 3.661496877670288,
"learning_rate": 7.746806039488967e-06,
"loss": 1.1437,
"step": 667
},
{
"epoch": 0.19393235592974306,
"grad_norm": 4.550029277801514,
"learning_rate": 7.758420441347271e-06,
"loss": 1.3003,
"step": 668
},
{
"epoch": 0.19422267382784147,
"grad_norm": 4.394417762756348,
"learning_rate": 7.770034843205574e-06,
"loss": 1.1571,
"step": 669
},
{
"epoch": 0.19451299172593992,
"grad_norm": 4.869671821594238,
"learning_rate": 7.78164924506388e-06,
"loss": 1.1525,
"step": 670
},
{
"epoch": 0.19480330962403833,
"grad_norm": 4.481640815734863,
"learning_rate": 7.793263646922185e-06,
"loss": 1.273,
"step": 671
},
{
"epoch": 0.19509362752213674,
"grad_norm": 4.039763450622559,
"learning_rate": 7.804878048780489e-06,
"loss": 1.1533,
"step": 672
},
{
"epoch": 0.19538394542023516,
"grad_norm": 4.439721584320068,
"learning_rate": 7.816492450638792e-06,
"loss": 1.2893,
"step": 673
},
{
"epoch": 0.19567426331833357,
"grad_norm": 3.8747873306274414,
"learning_rate": 7.828106852497098e-06,
"loss": 1.0562,
"step": 674
},
{
"epoch": 0.19596458121643198,
"grad_norm": 4.250256538391113,
"learning_rate": 7.839721254355401e-06,
"loss": 1.1873,
"step": 675
},
{
"epoch": 0.19625489911453042,
"grad_norm": 4.367439270019531,
"learning_rate": 7.851335656213705e-06,
"loss": 1.2494,
"step": 676
},
{
"epoch": 0.19654521701262884,
"grad_norm": 3.8989996910095215,
"learning_rate": 7.86295005807201e-06,
"loss": 1.2933,
"step": 677
},
{
"epoch": 0.19683553491072725,
"grad_norm": 4.156364917755127,
"learning_rate": 7.874564459930314e-06,
"loss": 1.2346,
"step": 678
},
{
"epoch": 0.19712585280882566,
"grad_norm": 3.9347167015075684,
"learning_rate": 7.886178861788618e-06,
"loss": 1.0208,
"step": 679
},
{
"epoch": 0.19741617070692408,
"grad_norm": 4.1161627769470215,
"learning_rate": 7.897793263646923e-06,
"loss": 1.2088,
"step": 680
},
{
"epoch": 0.1977064886050225,
"grad_norm": 4.2744855880737305,
"learning_rate": 7.909407665505228e-06,
"loss": 1.2502,
"step": 681
},
{
"epoch": 0.19799680650312093,
"grad_norm": 4.033324718475342,
"learning_rate": 7.921022067363532e-06,
"loss": 1.2464,
"step": 682
},
{
"epoch": 0.19828712440121934,
"grad_norm": 4.08077335357666,
"learning_rate": 7.932636469221836e-06,
"loss": 1.2234,
"step": 683
},
{
"epoch": 0.19857744229931776,
"grad_norm": 4.596649646759033,
"learning_rate": 7.94425087108014e-06,
"loss": 1.3688,
"step": 684
},
{
"epoch": 0.19886776019741617,
"grad_norm": 4.569955348968506,
"learning_rate": 7.955865272938444e-06,
"loss": 1.2121,
"step": 685
},
{
"epoch": 0.19915807809551458,
"grad_norm": 4.908385753631592,
"learning_rate": 7.967479674796748e-06,
"loss": 1.3586,
"step": 686
},
{
"epoch": 0.199448395993613,
"grad_norm": 4.093334674835205,
"learning_rate": 7.979094076655053e-06,
"loss": 1.2516,
"step": 687
},
{
"epoch": 0.1997387138917114,
"grad_norm": 4.448044776916504,
"learning_rate": 7.990708478513357e-06,
"loss": 1.1447,
"step": 688
},
{
"epoch": 0.20002903178980985,
"grad_norm": 4.412672519683838,
"learning_rate": 8.00232288037166e-06,
"loss": 1.2134,
"step": 689
},
{
"epoch": 0.20031934968790827,
"grad_norm": 3.8759539127349854,
"learning_rate": 8.013937282229966e-06,
"loss": 1.1278,
"step": 690
},
{
"epoch": 0.20060966758600668,
"grad_norm": 3.993645668029785,
"learning_rate": 8.02555168408827e-06,
"loss": 1.1997,
"step": 691
},
{
"epoch": 0.2008999854841051,
"grad_norm": 4.497583389282227,
"learning_rate": 8.037166085946573e-06,
"loss": 1.2892,
"step": 692
},
{
"epoch": 0.2011903033822035,
"grad_norm": 4.036830425262451,
"learning_rate": 8.048780487804879e-06,
"loss": 1.2577,
"step": 693
},
{
"epoch": 0.20148062128030192,
"grad_norm": 4.649497985839844,
"learning_rate": 8.060394889663184e-06,
"loss": 1.3546,
"step": 694
},
{
"epoch": 0.20177093917840036,
"grad_norm": 4.232790946960449,
"learning_rate": 8.072009291521488e-06,
"loss": 1.0828,
"step": 695
},
{
"epoch": 0.20206125707649877,
"grad_norm": 4.427145481109619,
"learning_rate": 8.083623693379791e-06,
"loss": 1.2565,
"step": 696
},
{
"epoch": 0.2023515749745972,
"grad_norm": 4.624083042144775,
"learning_rate": 8.095238095238097e-06,
"loss": 1.3997,
"step": 697
},
{
"epoch": 0.2026418928726956,
"grad_norm": 4.487246036529541,
"learning_rate": 8.1068524970964e-06,
"loss": 1.2862,
"step": 698
},
{
"epoch": 0.202932210770794,
"grad_norm": 4.557520866394043,
"learning_rate": 8.118466898954704e-06,
"loss": 1.1943,
"step": 699
},
{
"epoch": 0.20322252866889243,
"grad_norm": 4.63982629776001,
"learning_rate": 8.130081300813009e-06,
"loss": 1.1608,
"step": 700
},
{
"epoch": 0.20351284656699087,
"grad_norm": 4.147871017456055,
"learning_rate": 8.141695702671313e-06,
"loss": 1.1881,
"step": 701
},
{
"epoch": 0.20380316446508928,
"grad_norm": 4.48539924621582,
"learning_rate": 8.153310104529616e-06,
"loss": 1.2512,
"step": 702
},
{
"epoch": 0.2040934823631877,
"grad_norm": 4.378758430480957,
"learning_rate": 8.164924506387922e-06,
"loss": 1.2635,
"step": 703
},
{
"epoch": 0.2043838002612861,
"grad_norm": 4.198378562927246,
"learning_rate": 8.176538908246227e-06,
"loss": 1.3167,
"step": 704
},
{
"epoch": 0.20467411815938452,
"grad_norm": 4.351714134216309,
"learning_rate": 8.18815331010453e-06,
"loss": 1.3105,
"step": 705
},
{
"epoch": 0.20496443605748293,
"grad_norm": 3.9941248893737793,
"learning_rate": 8.199767711962834e-06,
"loss": 1.1611,
"step": 706
},
{
"epoch": 0.20525475395558135,
"grad_norm": 4.21259880065918,
"learning_rate": 8.21138211382114e-06,
"loss": 1.2724,
"step": 707
},
{
"epoch": 0.2055450718536798,
"grad_norm": 4.212434768676758,
"learning_rate": 8.222996515679443e-06,
"loss": 1.2564,
"step": 708
},
{
"epoch": 0.2058353897517782,
"grad_norm": 4.102781295776367,
"learning_rate": 8.234610917537747e-06,
"loss": 1.1005,
"step": 709
},
{
"epoch": 0.20612570764987662,
"grad_norm": 4.176314830780029,
"learning_rate": 8.246225319396052e-06,
"loss": 1.3496,
"step": 710
},
{
"epoch": 0.20641602554797503,
"grad_norm": 3.998204469680786,
"learning_rate": 8.257839721254356e-06,
"loss": 1.1549,
"step": 711
},
{
"epoch": 0.20670634344607344,
"grad_norm": 4.177518844604492,
"learning_rate": 8.26945412311266e-06,
"loss": 1.2156,
"step": 712
},
{
"epoch": 0.20699666134417186,
"grad_norm": 3.991353750228882,
"learning_rate": 8.281068524970965e-06,
"loss": 1.247,
"step": 713
},
{
"epoch": 0.2072869792422703,
"grad_norm": 4.021002292633057,
"learning_rate": 8.292682926829268e-06,
"loss": 1.2432,
"step": 714
},
{
"epoch": 0.2075772971403687,
"grad_norm": 4.410247802734375,
"learning_rate": 8.304297328687572e-06,
"loss": 1.4163,
"step": 715
},
{
"epoch": 0.20786761503846712,
"grad_norm": 4.138284683227539,
"learning_rate": 8.315911730545877e-06,
"loss": 1.1089,
"step": 716
},
{
"epoch": 0.20815793293656554,
"grad_norm": 3.8682849407196045,
"learning_rate": 8.327526132404183e-06,
"loss": 1.1813,
"step": 717
},
{
"epoch": 0.20844825083466395,
"grad_norm": 4.133089065551758,
"learning_rate": 8.339140534262486e-06,
"loss": 1.2069,
"step": 718
},
{
"epoch": 0.20873856873276236,
"grad_norm": 4.1765875816345215,
"learning_rate": 8.35075493612079e-06,
"loss": 1.3223,
"step": 719
},
{
"epoch": 0.2090288866308608,
"grad_norm": 4.326620101928711,
"learning_rate": 8.362369337979095e-06,
"loss": 1.1926,
"step": 720
},
{
"epoch": 0.20931920452895922,
"grad_norm": 4.258913993835449,
"learning_rate": 8.373983739837399e-06,
"loss": 1.2684,
"step": 721
},
{
"epoch": 0.20960952242705763,
"grad_norm": 3.9621224403381348,
"learning_rate": 8.385598141695703e-06,
"loss": 1.1473,
"step": 722
},
{
"epoch": 0.20989984032515605,
"grad_norm": 4.3580322265625,
"learning_rate": 8.397212543554008e-06,
"loss": 1.2432,
"step": 723
},
{
"epoch": 0.21019015822325446,
"grad_norm": 4.387808799743652,
"learning_rate": 8.408826945412312e-06,
"loss": 1.395,
"step": 724
},
{
"epoch": 0.21048047612135287,
"grad_norm": 4.777324199676514,
"learning_rate": 8.420441347270615e-06,
"loss": 1.2738,
"step": 725
},
{
"epoch": 0.2107707940194513,
"grad_norm": 3.977665424346924,
"learning_rate": 8.43205574912892e-06,
"loss": 1.2753,
"step": 726
},
{
"epoch": 0.21106111191754973,
"grad_norm": 4.048496246337891,
"learning_rate": 8.443670150987224e-06,
"loss": 1.1514,
"step": 727
},
{
"epoch": 0.21135142981564814,
"grad_norm": 3.8251259326934814,
"learning_rate": 8.45528455284553e-06,
"loss": 1.1101,
"step": 728
},
{
"epoch": 0.21164174771374655,
"grad_norm": 4.291660308837891,
"learning_rate": 8.466898954703833e-06,
"loss": 1.2019,
"step": 729
},
{
"epoch": 0.21193206561184497,
"grad_norm": 4.2600555419921875,
"learning_rate": 8.478513356562138e-06,
"loss": 1.2865,
"step": 730
},
{
"epoch": 0.21222238350994338,
"grad_norm": 3.9936861991882324,
"learning_rate": 8.490127758420442e-06,
"loss": 1.1312,
"step": 731
},
{
"epoch": 0.2125127014080418,
"grad_norm": 4.250927448272705,
"learning_rate": 8.501742160278746e-06,
"loss": 1.2805,
"step": 732
},
{
"epoch": 0.21280301930614023,
"grad_norm": 4.299734592437744,
"learning_rate": 8.513356562137051e-06,
"loss": 1.1194,
"step": 733
},
{
"epoch": 0.21309333720423865,
"grad_norm": 4.459551811218262,
"learning_rate": 8.524970963995355e-06,
"loss": 1.411,
"step": 734
},
{
"epoch": 0.21338365510233706,
"grad_norm": 4.234330654144287,
"learning_rate": 8.536585365853658e-06,
"loss": 1.2569,
"step": 735
},
{
"epoch": 0.21367397300043547,
"grad_norm": 4.489592552185059,
"learning_rate": 8.548199767711964e-06,
"loss": 1.341,
"step": 736
},
{
"epoch": 0.2139642908985339,
"grad_norm": 4.3680739402771,
"learning_rate": 8.559814169570267e-06,
"loss": 1.3785,
"step": 737
},
{
"epoch": 0.2142546087966323,
"grad_norm": 4.33858060836792,
"learning_rate": 8.571428571428571e-06,
"loss": 1.2048,
"step": 738
},
{
"epoch": 0.21454492669473074,
"grad_norm": 4.339114189147949,
"learning_rate": 8.583042973286876e-06,
"loss": 1.289,
"step": 739
},
{
"epoch": 0.21483524459282916,
"grad_norm": 3.8613274097442627,
"learning_rate": 8.594657375145182e-06,
"loss": 1.0864,
"step": 740
},
{
"epoch": 0.21512556249092757,
"grad_norm": 4.468837261199951,
"learning_rate": 8.606271777003485e-06,
"loss": 1.2507,
"step": 741
},
{
"epoch": 0.21541588038902598,
"grad_norm": 4.397847652435303,
"learning_rate": 8.617886178861789e-06,
"loss": 1.3629,
"step": 742
},
{
"epoch": 0.2157061982871244,
"grad_norm": 4.4040303230285645,
"learning_rate": 8.629500580720094e-06,
"loss": 1.2387,
"step": 743
},
{
"epoch": 0.2159965161852228,
"grad_norm": 4.0640130043029785,
"learning_rate": 8.641114982578398e-06,
"loss": 1.1718,
"step": 744
},
{
"epoch": 0.21628683408332125,
"grad_norm": 4.574658393859863,
"learning_rate": 8.652729384436701e-06,
"loss": 1.3895,
"step": 745
},
{
"epoch": 0.21657715198141966,
"grad_norm": 4.159901142120361,
"learning_rate": 8.664343786295007e-06,
"loss": 1.0791,
"step": 746
},
{
"epoch": 0.21686746987951808,
"grad_norm": 4.002528667449951,
"learning_rate": 8.67595818815331e-06,
"loss": 1.2234,
"step": 747
},
{
"epoch": 0.2171577877776165,
"grad_norm": 4.431401252746582,
"learning_rate": 8.687572590011614e-06,
"loss": 1.3391,
"step": 748
},
{
"epoch": 0.2174481056757149,
"grad_norm": 3.9772732257843018,
"learning_rate": 8.69918699186992e-06,
"loss": 1.2431,
"step": 749
},
{
"epoch": 0.21773842357381332,
"grad_norm": 3.6207127571105957,
"learning_rate": 8.710801393728223e-06,
"loss": 1.2068,
"step": 750
},
{
"epoch": 0.21802874147191173,
"grad_norm": 4.086411952972412,
"learning_rate": 8.722415795586528e-06,
"loss": 1.2978,
"step": 751
},
{
"epoch": 0.21831905937001017,
"grad_norm": 3.863708257675171,
"learning_rate": 8.734030197444832e-06,
"loss": 1.2108,
"step": 752
},
{
"epoch": 0.21860937726810858,
"grad_norm": 4.488952159881592,
"learning_rate": 8.745644599303137e-06,
"loss": 1.1774,
"step": 753
},
{
"epoch": 0.218899695166207,
"grad_norm": 4.089755535125732,
"learning_rate": 8.757259001161441e-06,
"loss": 1.2003,
"step": 754
},
{
"epoch": 0.2191900130643054,
"grad_norm": 3.3888041973114014,
"learning_rate": 8.768873403019745e-06,
"loss": 1.0689,
"step": 755
},
{
"epoch": 0.21948033096240382,
"grad_norm": 4.007880687713623,
"learning_rate": 8.78048780487805e-06,
"loss": 1.2168,
"step": 756
},
{
"epoch": 0.21977064886050224,
"grad_norm": 3.9035606384277344,
"learning_rate": 8.792102206736354e-06,
"loss": 1.0885,
"step": 757
},
{
"epoch": 0.22006096675860068,
"grad_norm": 4.004887580871582,
"learning_rate": 8.803716608594657e-06,
"loss": 1.1846,
"step": 758
},
{
"epoch": 0.2203512846566991,
"grad_norm": 4.1913580894470215,
"learning_rate": 8.815331010452962e-06,
"loss": 1.3213,
"step": 759
},
{
"epoch": 0.2206416025547975,
"grad_norm": 4.157240867614746,
"learning_rate": 8.826945412311266e-06,
"loss": 1.2908,
"step": 760
},
{
"epoch": 0.22093192045289592,
"grad_norm": 4.264801979064941,
"learning_rate": 8.83855981416957e-06,
"loss": 1.2615,
"step": 761
},
{
"epoch": 0.22122223835099433,
"grad_norm": 4.292517185211182,
"learning_rate": 8.850174216027875e-06,
"loss": 1.223,
"step": 762
},
{
"epoch": 0.22151255624909275,
"grad_norm": 3.773144245147705,
"learning_rate": 8.86178861788618e-06,
"loss": 1.1587,
"step": 763
},
{
"epoch": 0.2218028741471912,
"grad_norm": 4.224881172180176,
"learning_rate": 8.873403019744484e-06,
"loss": 1.3653,
"step": 764
},
{
"epoch": 0.2220931920452896,
"grad_norm": 4.401252269744873,
"learning_rate": 8.885017421602788e-06,
"loss": 1.2275,
"step": 765
},
{
"epoch": 0.222383509943388,
"grad_norm": 4.408329963684082,
"learning_rate": 8.896631823461093e-06,
"loss": 1.3104,
"step": 766
},
{
"epoch": 0.22267382784148643,
"grad_norm": 4.158458709716797,
"learning_rate": 8.908246225319397e-06,
"loss": 1.1537,
"step": 767
},
{
"epoch": 0.22296414573958484,
"grad_norm": 3.915562868118286,
"learning_rate": 8.9198606271777e-06,
"loss": 1.2162,
"step": 768
},
{
"epoch": 0.22325446363768325,
"grad_norm": 3.9854915142059326,
"learning_rate": 8.931475029036006e-06,
"loss": 1.0471,
"step": 769
},
{
"epoch": 0.2235447815357817,
"grad_norm": 4.040715217590332,
"learning_rate": 8.94308943089431e-06,
"loss": 1.2871,
"step": 770
},
{
"epoch": 0.2238350994338801,
"grad_norm": 4.417214870452881,
"learning_rate": 8.954703832752613e-06,
"loss": 1.2301,
"step": 771
},
{
"epoch": 0.22412541733197852,
"grad_norm": 4.276007175445557,
"learning_rate": 8.966318234610918e-06,
"loss": 1.4429,
"step": 772
},
{
"epoch": 0.22441573523007693,
"grad_norm": 4.329378604888916,
"learning_rate": 8.977932636469222e-06,
"loss": 1.3906,
"step": 773
},
{
"epoch": 0.22470605312817535,
"grad_norm": 4.080763339996338,
"learning_rate": 8.989547038327527e-06,
"loss": 1.1965,
"step": 774
},
{
"epoch": 0.22499637102627376,
"grad_norm": 3.89856219291687,
"learning_rate": 9.00116144018583e-06,
"loss": 1.1666,
"step": 775
},
{
"epoch": 0.22528668892437217,
"grad_norm": 4.193841457366943,
"learning_rate": 9.012775842044136e-06,
"loss": 1.3002,
"step": 776
},
{
"epoch": 0.22557700682247062,
"grad_norm": 4.260502338409424,
"learning_rate": 9.02439024390244e-06,
"loss": 1.2584,
"step": 777
},
{
"epoch": 0.22586732472056903,
"grad_norm": 4.089141845703125,
"learning_rate": 9.036004645760745e-06,
"loss": 1.102,
"step": 778
},
{
"epoch": 0.22615764261866744,
"grad_norm": 4.167725563049316,
"learning_rate": 9.047619047619049e-06,
"loss": 1.2121,
"step": 779
},
{
"epoch": 0.22644796051676586,
"grad_norm": 4.360806941986084,
"learning_rate": 9.059233449477352e-06,
"loss": 1.196,
"step": 780
},
{
"epoch": 0.22673827841486427,
"grad_norm": 4.336724281311035,
"learning_rate": 9.070847851335658e-06,
"loss": 1.142,
"step": 781
},
{
"epoch": 0.22702859631296268,
"grad_norm": 4.499552249908447,
"learning_rate": 9.082462253193961e-06,
"loss": 1.3475,
"step": 782
},
{
"epoch": 0.22731891421106112,
"grad_norm": 3.940721273422241,
"learning_rate": 9.094076655052265e-06,
"loss": 1.1308,
"step": 783
},
{
"epoch": 0.22760923210915954,
"grad_norm": 4.627920150756836,
"learning_rate": 9.10569105691057e-06,
"loss": 1.3685,
"step": 784
},
{
"epoch": 0.22789955000725795,
"grad_norm": 4.070476055145264,
"learning_rate": 9.117305458768874e-06,
"loss": 1.2696,
"step": 785
},
{
"epoch": 0.22818986790535636,
"grad_norm": 3.932196617126465,
"learning_rate": 9.12891986062718e-06,
"loss": 1.1755,
"step": 786
},
{
"epoch": 0.22848018580345478,
"grad_norm": 4.1085968017578125,
"learning_rate": 9.140534262485483e-06,
"loss": 1.3788,
"step": 787
},
{
"epoch": 0.2287705037015532,
"grad_norm": 4.546936511993408,
"learning_rate": 9.152148664343788e-06,
"loss": 1.2131,
"step": 788
},
{
"epoch": 0.22906082159965163,
"grad_norm": 3.854112148284912,
"learning_rate": 9.163763066202092e-06,
"loss": 1.2509,
"step": 789
},
{
"epoch": 0.22935113949775005,
"grad_norm": 3.6372368335723877,
"learning_rate": 9.175377468060395e-06,
"loss": 1.002,
"step": 790
},
{
"epoch": 0.22964145739584846,
"grad_norm": 4.038814067840576,
"learning_rate": 9.1869918699187e-06,
"loss": 1.273,
"step": 791
},
{
"epoch": 0.22993177529394687,
"grad_norm": 4.1536712646484375,
"learning_rate": 9.198606271777004e-06,
"loss": 1.3666,
"step": 792
},
{
"epoch": 0.23022209319204529,
"grad_norm": 4.179312705993652,
"learning_rate": 9.210220673635308e-06,
"loss": 1.2411,
"step": 793
},
{
"epoch": 0.2305124110901437,
"grad_norm": 3.946230411529541,
"learning_rate": 9.221835075493613e-06,
"loss": 1.357,
"step": 794
},
{
"epoch": 0.2308027289882421,
"grad_norm": 4.157958030700684,
"learning_rate": 9.233449477351917e-06,
"loss": 1.1273,
"step": 795
},
{
"epoch": 0.23109304688634055,
"grad_norm": 4.40532922744751,
"learning_rate": 9.24506387921022e-06,
"loss": 1.4211,
"step": 796
},
{
"epoch": 0.23138336478443897,
"grad_norm": 4.301095008850098,
"learning_rate": 9.256678281068526e-06,
"loss": 1.3181,
"step": 797
},
{
"epoch": 0.23167368268253738,
"grad_norm": 3.6317696571350098,
"learning_rate": 9.268292682926831e-06,
"loss": 1.0635,
"step": 798
},
{
"epoch": 0.2319640005806358,
"grad_norm": 4.2273359298706055,
"learning_rate": 9.279907084785135e-06,
"loss": 1.2817,
"step": 799
},
{
"epoch": 0.2322543184787342,
"grad_norm": 4.259072303771973,
"learning_rate": 9.291521486643439e-06,
"loss": 1.2549,
"step": 800
},
{
"epoch": 0.23254463637683262,
"grad_norm": 4.03896951675415,
"learning_rate": 9.303135888501744e-06,
"loss": 1.1359,
"step": 801
},
{
"epoch": 0.23283495427493106,
"grad_norm": 4.3312907218933105,
"learning_rate": 9.314750290360047e-06,
"loss": 1.3102,
"step": 802
},
{
"epoch": 0.23312527217302947,
"grad_norm": 3.9520628452301025,
"learning_rate": 9.326364692218351e-06,
"loss": 1.0503,
"step": 803
},
{
"epoch": 0.2334155900711279,
"grad_norm": 4.0430498123168945,
"learning_rate": 9.337979094076656e-06,
"loss": 1.2876,
"step": 804
},
{
"epoch": 0.2337059079692263,
"grad_norm": 4.059528350830078,
"learning_rate": 9.34959349593496e-06,
"loss": 1.1509,
"step": 805
},
{
"epoch": 0.23399622586732471,
"grad_norm": 3.862774610519409,
"learning_rate": 9.361207897793264e-06,
"loss": 1.1237,
"step": 806
},
{
"epoch": 0.23428654376542313,
"grad_norm": 4.267635345458984,
"learning_rate": 9.372822299651569e-06,
"loss": 1.2307,
"step": 807
},
{
"epoch": 0.23457686166352157,
"grad_norm": 3.8617136478424072,
"learning_rate": 9.384436701509873e-06,
"loss": 1.3029,
"step": 808
},
{
"epoch": 0.23486717956161998,
"grad_norm": 4.106259346008301,
"learning_rate": 9.396051103368178e-06,
"loss": 1.2887,
"step": 809
},
{
"epoch": 0.2351574974597184,
"grad_norm": 3.966156005859375,
"learning_rate": 9.407665505226482e-06,
"loss": 1.1533,
"step": 810
},
{
"epoch": 0.2354478153578168,
"grad_norm": 4.011099338531494,
"learning_rate": 9.419279907084787e-06,
"loss": 1.23,
"step": 811
},
{
"epoch": 0.23573813325591522,
"grad_norm": 3.8420891761779785,
"learning_rate": 9.43089430894309e-06,
"loss": 1.2038,
"step": 812
},
{
"epoch": 0.23602845115401364,
"grad_norm": 3.7966573238372803,
"learning_rate": 9.442508710801394e-06,
"loss": 1.1904,
"step": 813
},
{
"epoch": 0.23631876905211208,
"grad_norm": 3.8873846530914307,
"learning_rate": 9.4541231126597e-06,
"loss": 1.1495,
"step": 814
},
{
"epoch": 0.2366090869502105,
"grad_norm": 4.556484699249268,
"learning_rate": 9.465737514518003e-06,
"loss": 1.3733,
"step": 815
},
{
"epoch": 0.2368994048483089,
"grad_norm": 3.8360376358032227,
"learning_rate": 9.477351916376307e-06,
"loss": 1.1459,
"step": 816
},
{
"epoch": 0.23718972274640732,
"grad_norm": 4.036248683929443,
"learning_rate": 9.488966318234612e-06,
"loss": 1.0859,
"step": 817
},
{
"epoch": 0.23748004064450573,
"grad_norm": 4.281419277191162,
"learning_rate": 9.500580720092916e-06,
"loss": 1.1087,
"step": 818
},
{
"epoch": 0.23777035854260414,
"grad_norm": 4.298630237579346,
"learning_rate": 9.51219512195122e-06,
"loss": 1.2629,
"step": 819
},
{
"epoch": 0.23806067644070256,
"grad_norm": 4.755696773529053,
"learning_rate": 9.523809523809525e-06,
"loss": 1.4463,
"step": 820
},
{
"epoch": 0.238350994338801,
"grad_norm": 4.519877910614014,
"learning_rate": 9.53542392566783e-06,
"loss": 1.23,
"step": 821
},
{
"epoch": 0.2386413122368994,
"grad_norm": 4.5725812911987305,
"learning_rate": 9.547038327526134e-06,
"loss": 1.2735,
"step": 822
},
{
"epoch": 0.23893163013499782,
"grad_norm": 4.227170944213867,
"learning_rate": 9.558652729384437e-06,
"loss": 1.1873,
"step": 823
},
{
"epoch": 0.23922194803309624,
"grad_norm": 4.264405727386475,
"learning_rate": 9.570267131242743e-06,
"loss": 1.1793,
"step": 824
},
{
"epoch": 0.23951226593119465,
"grad_norm": 3.8990113735198975,
"learning_rate": 9.581881533101046e-06,
"loss": 1.2099,
"step": 825
},
{
"epoch": 0.23980258382929306,
"grad_norm": 4.033143997192383,
"learning_rate": 9.59349593495935e-06,
"loss": 1.159,
"step": 826
},
{
"epoch": 0.2400929017273915,
"grad_norm": 3.914243459701538,
"learning_rate": 9.605110336817655e-06,
"loss": 1.147,
"step": 827
},
{
"epoch": 0.24038321962548992,
"grad_norm": 4.004579067230225,
"learning_rate": 9.616724738675959e-06,
"loss": 1.3154,
"step": 828
},
{
"epoch": 0.24067353752358833,
"grad_norm": 4.188416004180908,
"learning_rate": 9.628339140534263e-06,
"loss": 1.2799,
"step": 829
},
{
"epoch": 0.24096385542168675,
"grad_norm": 4.339681148529053,
"learning_rate": 9.639953542392568e-06,
"loss": 1.2475,
"step": 830
},
{
"epoch": 0.24125417331978516,
"grad_norm": 4.40482759475708,
"learning_rate": 9.651567944250871e-06,
"loss": 1.2131,
"step": 831
},
{
"epoch": 0.24154449121788357,
"grad_norm": 3.721519947052002,
"learning_rate": 9.663182346109177e-06,
"loss": 1.1448,
"step": 832
},
{
"epoch": 0.241834809115982,
"grad_norm": 4.03656530380249,
"learning_rate": 9.67479674796748e-06,
"loss": 1.1783,
"step": 833
},
{
"epoch": 0.24212512701408043,
"grad_norm": 3.787747621536255,
"learning_rate": 9.686411149825786e-06,
"loss": 1.2477,
"step": 834
},
{
"epoch": 0.24241544491217884,
"grad_norm": 4.436072826385498,
"learning_rate": 9.69802555168409e-06,
"loss": 1.3761,
"step": 835
},
{
"epoch": 0.24270576281027725,
"grad_norm": 4.418893814086914,
"learning_rate": 9.709639953542393e-06,
"loss": 1.2114,
"step": 836
},
{
"epoch": 0.24299608070837567,
"grad_norm": 4.714204788208008,
"learning_rate": 9.721254355400698e-06,
"loss": 1.1931,
"step": 837
},
{
"epoch": 0.24328639860647408,
"grad_norm": 4.259952545166016,
"learning_rate": 9.732868757259002e-06,
"loss": 1.1285,
"step": 838
},
{
"epoch": 0.2435767165045725,
"grad_norm": 3.6294689178466797,
"learning_rate": 9.744483159117306e-06,
"loss": 1.0827,
"step": 839
},
{
"epoch": 0.24386703440267093,
"grad_norm": 4.037003040313721,
"learning_rate": 9.756097560975611e-06,
"loss": 1.1824,
"step": 840
},
{
"epoch": 0.24415735230076935,
"grad_norm": 4.08364200592041,
"learning_rate": 9.767711962833915e-06,
"loss": 1.1278,
"step": 841
},
{
"epoch": 0.24444767019886776,
"grad_norm": 4.233451843261719,
"learning_rate": 9.779326364692218e-06,
"loss": 1.2704,
"step": 842
},
{
"epoch": 0.24473798809696617,
"grad_norm": 4.0865631103515625,
"learning_rate": 9.790940766550524e-06,
"loss": 1.2111,
"step": 843
},
{
"epoch": 0.2450283059950646,
"grad_norm": 4.192430019378662,
"learning_rate": 9.802555168408829e-06,
"loss": 1.218,
"step": 844
},
{
"epoch": 0.245318623893163,
"grad_norm": 3.8745322227478027,
"learning_rate": 9.814169570267133e-06,
"loss": 1.2443,
"step": 845
},
{
"epoch": 0.24560894179126144,
"grad_norm": 3.955824136734009,
"learning_rate": 9.825783972125436e-06,
"loss": 1.1244,
"step": 846
},
{
"epoch": 0.24589925968935986,
"grad_norm": 4.057941913604736,
"learning_rate": 9.837398373983741e-06,
"loss": 1.1756,
"step": 847
},
{
"epoch": 0.24618957758745827,
"grad_norm": 3.894920587539673,
"learning_rate": 9.849012775842045e-06,
"loss": 1.2709,
"step": 848
},
{
"epoch": 0.24647989548555668,
"grad_norm": 3.87312912940979,
"learning_rate": 9.860627177700349e-06,
"loss": 1.0949,
"step": 849
},
{
"epoch": 0.2467702133836551,
"grad_norm": 3.991598606109619,
"learning_rate": 9.872241579558654e-06,
"loss": 1.0914,
"step": 850
},
{
"epoch": 0.2470605312817535,
"grad_norm": 4.442087650299072,
"learning_rate": 9.883855981416958e-06,
"loss": 1.3785,
"step": 851
},
{
"epoch": 0.24735084917985195,
"grad_norm": 4.167323112487793,
"learning_rate": 9.895470383275261e-06,
"loss": 1.1777,
"step": 852
},
{
"epoch": 0.24764116707795036,
"grad_norm": 3.8976168632507324,
"learning_rate": 9.907084785133567e-06,
"loss": 1.2094,
"step": 853
},
{
"epoch": 0.24793148497604878,
"grad_norm": 4.286924362182617,
"learning_rate": 9.91869918699187e-06,
"loss": 1.3301,
"step": 854
},
{
"epoch": 0.2482218028741472,
"grad_norm": 4.022475242614746,
"learning_rate": 9.930313588850174e-06,
"loss": 1.2017,
"step": 855
},
{
"epoch": 0.2485121207722456,
"grad_norm": 3.858656644821167,
"learning_rate": 9.94192799070848e-06,
"loss": 1.2202,
"step": 856
},
{
"epoch": 0.24880243867034402,
"grad_norm": 3.9576399326324463,
"learning_rate": 9.953542392566785e-06,
"loss": 1.2639,
"step": 857
},
{
"epoch": 0.24909275656844246,
"grad_norm": 3.78914213180542,
"learning_rate": 9.965156794425088e-06,
"loss": 1.0952,
"step": 858
},
{
"epoch": 0.24938307446654087,
"grad_norm": 4.147533893585205,
"learning_rate": 9.976771196283392e-06,
"loss": 1.191,
"step": 859
},
{
"epoch": 0.24967339236463928,
"grad_norm": 4.042754650115967,
"learning_rate": 9.988385598141697e-06,
"loss": 1.2136,
"step": 860
},
{
"epoch": 0.2499637102627377,
"grad_norm": 3.9550065994262695,
"learning_rate": 1e-05,
"loss": 1.1666,
"step": 861
},
{
"epoch": 0.25025402816083614,
"grad_norm": 4.223484516143799,
"learning_rate": 9.999999907800993e-06,
"loss": 1.1374,
"step": 862
},
{
"epoch": 0.25054434605893455,
"grad_norm": 3.993415355682373,
"learning_rate": 9.999999631203973e-06,
"loss": 1.206,
"step": 863
},
{
"epoch": 0.25083466395703297,
"grad_norm": 4.242237091064453,
"learning_rate": 9.99999917020895e-06,
"loss": 1.1703,
"step": 864
},
{
"epoch": 0.2511249818551314,
"grad_norm": 4.252773761749268,
"learning_rate": 9.999998524815943e-06,
"loss": 1.2586,
"step": 865
},
{
"epoch": 0.2514152997532298,
"grad_norm": 3.9203879833221436,
"learning_rate": 9.999997695024973e-06,
"loss": 1.1088,
"step": 866
},
{
"epoch": 0.2517056176513282,
"grad_norm": 4.138311386108398,
"learning_rate": 9.999996680836072e-06,
"loss": 1.2563,
"step": 867
},
{
"epoch": 0.2519959355494266,
"grad_norm": 4.038930416107178,
"learning_rate": 9.999995482249281e-06,
"loss": 1.2899,
"step": 868
},
{
"epoch": 0.25228625344752503,
"grad_norm": 4.0346879959106445,
"learning_rate": 9.999994099264638e-06,
"loss": 1.1238,
"step": 869
},
{
"epoch": 0.25257657134562345,
"grad_norm": 3.8026630878448486,
"learning_rate": 9.999992531882197e-06,
"loss": 1.0621,
"step": 870
},
{
"epoch": 0.25286688924372186,
"grad_norm": 4.135496139526367,
"learning_rate": 9.999990780102015e-06,
"loss": 1.1553,
"step": 871
},
{
"epoch": 0.2531572071418203,
"grad_norm": 3.8665709495544434,
"learning_rate": 9.999988843924158e-06,
"loss": 1.1897,
"step": 872
},
{
"epoch": 0.2534475250399187,
"grad_norm": 3.7282605171203613,
"learning_rate": 9.999986723348697e-06,
"loss": 1.162,
"step": 873
},
{
"epoch": 0.25373784293801716,
"grad_norm": 3.997755765914917,
"learning_rate": 9.99998441837571e-06,
"loss": 1.3417,
"step": 874
},
{
"epoch": 0.25402816083611557,
"grad_norm": 4.263042449951172,
"learning_rate": 9.999981929005281e-06,
"loss": 1.3103,
"step": 875
},
{
"epoch": 0.254318478734214,
"grad_norm": 4.087371826171875,
"learning_rate": 9.999979255237504e-06,
"loss": 1.2355,
"step": 876
},
{
"epoch": 0.2546087966323124,
"grad_norm": 4.311849117279053,
"learning_rate": 9.999976397072474e-06,
"loss": 1.25,
"step": 877
},
{
"epoch": 0.2548991145304108,
"grad_norm": 3.9726626873016357,
"learning_rate": 9.9999733545103e-06,
"loss": 1.2877,
"step": 878
},
{
"epoch": 0.2551894324285092,
"grad_norm": 4.184573173522949,
"learning_rate": 9.999970127551094e-06,
"loss": 1.4488,
"step": 879
},
{
"epoch": 0.25547975032660764,
"grad_norm": 4.292477130889893,
"learning_rate": 9.999966716194973e-06,
"loss": 1.3899,
"step": 880
},
{
"epoch": 0.25577006822470605,
"grad_norm": 3.918590545654297,
"learning_rate": 9.999963120442062e-06,
"loss": 1.2766,
"step": 881
},
{
"epoch": 0.25606038612280446,
"grad_norm": 3.896446466445923,
"learning_rate": 9.999959340292497e-06,
"loss": 1.2409,
"step": 882
},
{
"epoch": 0.2563507040209029,
"grad_norm": 3.7944939136505127,
"learning_rate": 9.999955375746415e-06,
"loss": 1.1856,
"step": 883
},
{
"epoch": 0.2566410219190013,
"grad_norm": 4.00242805480957,
"learning_rate": 9.999951226803963e-06,
"loss": 1.1902,
"step": 884
},
{
"epoch": 0.2569313398170997,
"grad_norm": 3.9395718574523926,
"learning_rate": 9.999946893465294e-06,
"loss": 1.2137,
"step": 885
},
{
"epoch": 0.2572216577151981,
"grad_norm": 3.7727317810058594,
"learning_rate": 9.999942375730568e-06,
"loss": 1.2436,
"step": 886
},
{
"epoch": 0.2575119756132966,
"grad_norm": 3.9272992610931396,
"learning_rate": 9.999937673599951e-06,
"loss": 1.223,
"step": 887
},
{
"epoch": 0.257802293511395,
"grad_norm": 4.122605800628662,
"learning_rate": 9.99993278707362e-06,
"loss": 1.2457,
"step": 888
},
{
"epoch": 0.2580926114094934,
"grad_norm": 3.6556971073150635,
"learning_rate": 9.999927716151747e-06,
"loss": 1.1214,
"step": 889
},
{
"epoch": 0.2583829293075918,
"grad_norm": 4.025891304016113,
"learning_rate": 9.999922460834525e-06,
"loss": 1.2022,
"step": 890
},
{
"epoch": 0.25867324720569024,
"grad_norm": 4.0044379234313965,
"learning_rate": 9.99991702112215e-06,
"loss": 1.1408,
"step": 891
},
{
"epoch": 0.25896356510378865,
"grad_norm": 3.8944759368896484,
"learning_rate": 9.999911397014816e-06,
"loss": 1.2388,
"step": 892
},
{
"epoch": 0.25925388300188706,
"grad_norm": 3.943559169769287,
"learning_rate": 9.999905588512735e-06,
"loss": 1.0437,
"step": 893
},
{
"epoch": 0.2595442008999855,
"grad_norm": 3.794334888458252,
"learning_rate": 9.99989959561612e-06,
"loss": 1.1493,
"step": 894
},
{
"epoch": 0.2598345187980839,
"grad_norm": 3.97279691696167,
"learning_rate": 9.999893418325193e-06,
"loss": 1.2069,
"step": 895
},
{
"epoch": 0.2601248366961823,
"grad_norm": 4.2030534744262695,
"learning_rate": 9.999887056640178e-06,
"loss": 1.3481,
"step": 896
},
{
"epoch": 0.2604151545942807,
"grad_norm": 3.7260630130767822,
"learning_rate": 9.999880510561316e-06,
"loss": 1.185,
"step": 897
},
{
"epoch": 0.26070547249237913,
"grad_norm": 4.051196575164795,
"learning_rate": 9.999873780088842e-06,
"loss": 1.2857,
"step": 898
},
{
"epoch": 0.2609957903904776,
"grad_norm": 3.895303964614868,
"learning_rate": 9.99986686522301e-06,
"loss": 1.1956,
"step": 899
},
{
"epoch": 0.261286108288576,
"grad_norm": 3.712827682495117,
"learning_rate": 9.999859765964071e-06,
"loss": 1.255,
"step": 900
},
{
"epoch": 0.2615764261866744,
"grad_norm": 4.21458101272583,
"learning_rate": 9.999852482312287e-06,
"loss": 1.2748,
"step": 901
},
{
"epoch": 0.26186674408477284,
"grad_norm": 4.291463375091553,
"learning_rate": 9.999845014267928e-06,
"loss": 1.3972,
"step": 902
},
{
"epoch": 0.26215706198287125,
"grad_norm": 3.866318464279175,
"learning_rate": 9.999837361831269e-06,
"loss": 1.1126,
"step": 903
},
{
"epoch": 0.26244737988096967,
"grad_norm": 3.7740962505340576,
"learning_rate": 9.999829525002593e-06,
"loss": 1.1077,
"step": 904
},
{
"epoch": 0.2627376977790681,
"grad_norm": 3.9418838024139404,
"learning_rate": 9.999821503782188e-06,
"loss": 1.1723,
"step": 905
},
{
"epoch": 0.2630280156771665,
"grad_norm": 4.411069869995117,
"learning_rate": 9.999813298170349e-06,
"loss": 1.2593,
"step": 906
},
{
"epoch": 0.2633183335752649,
"grad_norm": 4.006514549255371,
"learning_rate": 9.99980490816738e-06,
"loss": 1.2224,
"step": 907
},
{
"epoch": 0.2636086514733633,
"grad_norm": 4.01617956161499,
"learning_rate": 9.999796333773591e-06,
"loss": 1.3176,
"step": 908
},
{
"epoch": 0.26389896937146173,
"grad_norm": 3.717695951461792,
"learning_rate": 9.999787574989297e-06,
"loss": 1.1465,
"step": 909
},
{
"epoch": 0.26418928726956015,
"grad_norm": 4.200732231140137,
"learning_rate": 9.999778631814822e-06,
"loss": 1.2268,
"step": 910
},
{
"epoch": 0.26447960516765856,
"grad_norm": 4.170313358306885,
"learning_rate": 9.999769504250495e-06,
"loss": 1.1818,
"step": 911
},
{
"epoch": 0.26476992306575703,
"grad_norm": 4.117874622344971,
"learning_rate": 9.999760192296651e-06,
"loss": 1.2266,
"step": 912
},
{
"epoch": 0.26506024096385544,
"grad_norm": 4.023068428039551,
"learning_rate": 9.999750695953635e-06,
"loss": 1.2564,
"step": 913
},
{
"epoch": 0.26535055886195386,
"grad_norm": 3.9565770626068115,
"learning_rate": 9.9997410152218e-06,
"loss": 1.2719,
"step": 914
},
{
"epoch": 0.26564087676005227,
"grad_norm": 4.1268510818481445,
"learning_rate": 9.999731150101499e-06,
"loss": 1.1941,
"step": 915
},
{
"epoch": 0.2659311946581507,
"grad_norm": 4.024060249328613,
"learning_rate": 9.999721100593098e-06,
"loss": 1.2576,
"step": 916
},
{
"epoch": 0.2662215125562491,
"grad_norm": 4.292674541473389,
"learning_rate": 9.999710866696967e-06,
"loss": 1.3313,
"step": 917
},
{
"epoch": 0.2665118304543475,
"grad_norm": 3.7949039936065674,
"learning_rate": 9.999700448413483e-06,
"loss": 1.2748,
"step": 918
},
{
"epoch": 0.2668021483524459,
"grad_norm": 3.83724308013916,
"learning_rate": 9.99968984574303e-06,
"loss": 1.2568,
"step": 919
},
{
"epoch": 0.26709246625054434,
"grad_norm": 3.7601423263549805,
"learning_rate": 9.999679058686e-06,
"loss": 1.1709,
"step": 920
},
{
"epoch": 0.26738278414864275,
"grad_norm": 3.65810227394104,
"learning_rate": 9.999668087242789e-06,
"loss": 1.1861,
"step": 921
},
{
"epoch": 0.26767310204674116,
"grad_norm": 3.8424625396728516,
"learning_rate": 9.999656931413805e-06,
"loss": 1.2347,
"step": 922
},
{
"epoch": 0.2679634199448396,
"grad_norm": 3.8711178302764893,
"learning_rate": 9.999645591199456e-06,
"loss": 1.1713,
"step": 923
},
{
"epoch": 0.26825373784293804,
"grad_norm": 3.7193312644958496,
"learning_rate": 9.999634066600162e-06,
"loss": 1.1272,
"step": 924
},
{
"epoch": 0.26854405574103646,
"grad_norm": 3.983853578567505,
"learning_rate": 9.999622357616348e-06,
"loss": 1.2762,
"step": 925
},
{
"epoch": 0.26883437363913487,
"grad_norm": 4.00912618637085,
"learning_rate": 9.999610464248446e-06,
"loss": 1.1777,
"step": 926
},
{
"epoch": 0.2691246915372333,
"grad_norm": 4.1947126388549805,
"learning_rate": 9.999598386496893e-06,
"loss": 1.389,
"step": 927
},
{
"epoch": 0.2694150094353317,
"grad_norm": 3.9506235122680664,
"learning_rate": 9.999586124362136e-06,
"loss": 1.3365,
"step": 928
},
{
"epoch": 0.2697053273334301,
"grad_norm": 3.9439916610717773,
"learning_rate": 9.999573677844627e-06,
"loss": 1.2287,
"step": 929
},
{
"epoch": 0.2699956452315285,
"grad_norm": 4.163543224334717,
"learning_rate": 9.999561046944824e-06,
"loss": 1.2869,
"step": 930
},
{
"epoch": 0.27028596312962694,
"grad_norm": 3.9208672046661377,
"learning_rate": 9.999548231663194e-06,
"loss": 1.2985,
"step": 931
},
{
"epoch": 0.27057628102772535,
"grad_norm": 4.060229778289795,
"learning_rate": 9.99953523200021e-06,
"loss": 1.2768,
"step": 932
},
{
"epoch": 0.27086659892582376,
"grad_norm": 3.6714141368865967,
"learning_rate": 9.99952204795635e-06,
"loss": 1.1783,
"step": 933
},
{
"epoch": 0.2711569168239222,
"grad_norm": 3.772534132003784,
"learning_rate": 9.999508679532102e-06,
"loss": 1.1146,
"step": 934
},
{
"epoch": 0.2714472347220206,
"grad_norm": 4.284186840057373,
"learning_rate": 9.999495126727956e-06,
"loss": 1.3329,
"step": 935
},
{
"epoch": 0.271737552620119,
"grad_norm": 3.7998135089874268,
"learning_rate": 9.999481389544414e-06,
"loss": 1.2101,
"step": 936
},
{
"epoch": 0.2720278705182175,
"grad_norm": 4.04706335067749,
"learning_rate": 9.999467467981984e-06,
"loss": 1.307,
"step": 937
},
{
"epoch": 0.2723181884163159,
"grad_norm": 3.911973237991333,
"learning_rate": 9.999453362041177e-06,
"loss": 1.1824,
"step": 938
},
{
"epoch": 0.2726085063144143,
"grad_norm": 4.05914831161499,
"learning_rate": 9.999439071722513e-06,
"loss": 1.2237,
"step": 939
},
{
"epoch": 0.2728988242125127,
"grad_norm": 4.172504901885986,
"learning_rate": 9.999424597026521e-06,
"loss": 1.2877,
"step": 940
},
{
"epoch": 0.2731891421106111,
"grad_norm": 3.855518341064453,
"learning_rate": 9.999409937953732e-06,
"loss": 1.1341,
"step": 941
},
{
"epoch": 0.27347946000870954,
"grad_norm": 4.338953018188477,
"learning_rate": 9.999395094504692e-06,
"loss": 1.2654,
"step": 942
},
{
"epoch": 0.27376977790680795,
"grad_norm": 3.9418210983276367,
"learning_rate": 9.999380066679943e-06,
"loss": 1.2278,
"step": 943
},
{
"epoch": 0.27406009580490637,
"grad_norm": 3.866417646408081,
"learning_rate": 9.99936485448004e-06,
"loss": 1.3366,
"step": 944
},
{
"epoch": 0.2743504137030048,
"grad_norm": 3.783524513244629,
"learning_rate": 9.999349457905545e-06,
"loss": 1.1555,
"step": 945
},
{
"epoch": 0.2746407316011032,
"grad_norm": 3.9190661907196045,
"learning_rate": 9.999333876957027e-06,
"loss": 1.2089,
"step": 946
},
{
"epoch": 0.2749310494992016,
"grad_norm": 3.7447915077209473,
"learning_rate": 9.99931811163506e-06,
"loss": 1.2385,
"step": 947
},
{
"epoch": 0.2752213673973,
"grad_norm": 4.181678295135498,
"learning_rate": 9.999302161940224e-06,
"loss": 1.2333,
"step": 948
},
{
"epoch": 0.27551168529539843,
"grad_norm": 3.853498697280884,
"learning_rate": 9.99928602787311e-06,
"loss": 1.1547,
"step": 949
},
{
"epoch": 0.2758020031934969,
"grad_norm": 3.614431619644165,
"learning_rate": 9.999269709434308e-06,
"loss": 1.1117,
"step": 950
},
{
"epoch": 0.2760923210915953,
"grad_norm": 4.468873977661133,
"learning_rate": 9.999253206624425e-06,
"loss": 1.3627,
"step": 951
},
{
"epoch": 0.27638263898969373,
"grad_norm": 4.207579135894775,
"learning_rate": 9.999236519444067e-06,
"loss": 1.2428,
"step": 952
},
{
"epoch": 0.27667295688779214,
"grad_norm": 3.9187076091766357,
"learning_rate": 9.999219647893852e-06,
"loss": 1.1798,
"step": 953
},
{
"epoch": 0.27696327478589056,
"grad_norm": 3.7778027057647705,
"learning_rate": 9.999202591974398e-06,
"loss": 1.1975,
"step": 954
},
{
"epoch": 0.27725359268398897,
"grad_norm": 3.8436973094940186,
"learning_rate": 9.999185351686336e-06,
"loss": 1.1884,
"step": 955
},
{
"epoch": 0.2775439105820874,
"grad_norm": 4.115079402923584,
"learning_rate": 9.999167927030304e-06,
"loss": 1.2735,
"step": 956
},
{
"epoch": 0.2778342284801858,
"grad_norm": 3.7705702781677246,
"learning_rate": 9.999150318006942e-06,
"loss": 1.1011,
"step": 957
},
{
"epoch": 0.2781245463782842,
"grad_norm": 4.015285491943359,
"learning_rate": 9.9991325246169e-06,
"loss": 1.2667,
"step": 958
},
{
"epoch": 0.2784148642763826,
"grad_norm": 3.9331655502319336,
"learning_rate": 9.999114546860834e-06,
"loss": 1.2667,
"step": 959
},
{
"epoch": 0.27870518217448104,
"grad_norm": 4.180220603942871,
"learning_rate": 9.999096384739407e-06,
"loss": 1.2929,
"step": 960
},
{
"epoch": 0.27899550007257945,
"grad_norm": 4.194953918457031,
"learning_rate": 9.99907803825329e-06,
"loss": 1.451,
"step": 961
},
{
"epoch": 0.2792858179706779,
"grad_norm": 3.872340679168701,
"learning_rate": 9.99905950740316e-06,
"loss": 1.1172,
"step": 962
},
{
"epoch": 0.27957613586877633,
"grad_norm": 3.8990437984466553,
"learning_rate": 9.999040792189696e-06,
"loss": 1.2839,
"step": 963
},
{
"epoch": 0.27986645376687475,
"grad_norm": 4.102906703948975,
"learning_rate": 9.999021892613594e-06,
"loss": 1.1807,
"step": 964
},
{
"epoch": 0.28015677166497316,
"grad_norm": 3.698540210723877,
"learning_rate": 9.999002808675547e-06,
"loss": 1.3311,
"step": 965
},
{
"epoch": 0.28044708956307157,
"grad_norm": 4.117794990539551,
"learning_rate": 9.998983540376262e-06,
"loss": 1.2954,
"step": 966
},
{
"epoch": 0.28073740746117,
"grad_norm": 4.094895362854004,
"learning_rate": 9.998964087716445e-06,
"loss": 1.2965,
"step": 967
},
{
"epoch": 0.2810277253592684,
"grad_norm": 3.921121120452881,
"learning_rate": 9.998944450696818e-06,
"loss": 1.3762,
"step": 968
},
{
"epoch": 0.2813180432573668,
"grad_norm": 3.5735599994659424,
"learning_rate": 9.998924629318103e-06,
"loss": 1.227,
"step": 969
},
{
"epoch": 0.2816083611554652,
"grad_norm": 3.7150392532348633,
"learning_rate": 9.998904623581032e-06,
"loss": 1.2873,
"step": 970
},
{
"epoch": 0.28189867905356364,
"grad_norm": 4.215477466583252,
"learning_rate": 9.998884433486342e-06,
"loss": 1.4844,
"step": 971
},
{
"epoch": 0.28218899695166205,
"grad_norm": 3.861442804336548,
"learning_rate": 9.998864059034778e-06,
"loss": 1.1615,
"step": 972
},
{
"epoch": 0.28247931484976047,
"grad_norm": 3.7807931900024414,
"learning_rate": 9.998843500227092e-06,
"loss": 1.3308,
"step": 973
},
{
"epoch": 0.2827696327478589,
"grad_norm": 4.654616832733154,
"learning_rate": 9.99882275706404e-06,
"loss": 1.3845,
"step": 974
},
{
"epoch": 0.28305995064595735,
"grad_norm": 3.788461685180664,
"learning_rate": 9.998801829546387e-06,
"loss": 1.2098,
"step": 975
},
{
"epoch": 0.28335026854405576,
"grad_norm": 3.7853169441223145,
"learning_rate": 9.99878071767491e-06,
"loss": 1.1913,
"step": 976
},
{
"epoch": 0.2836405864421542,
"grad_norm": 3.6798760890960693,
"learning_rate": 9.998759421450382e-06,
"loss": 1.0833,
"step": 977
},
{
"epoch": 0.2839309043402526,
"grad_norm": 3.5938055515289307,
"learning_rate": 9.998737940873589e-06,
"loss": 1.2577,
"step": 978
},
{
"epoch": 0.284221222238351,
"grad_norm": 3.609879970550537,
"learning_rate": 9.998716275945326e-06,
"loss": 1.2261,
"step": 979
},
{
"epoch": 0.2845115401364494,
"grad_norm": 4.083144187927246,
"learning_rate": 9.99869442666639e-06,
"loss": 1.37,
"step": 980
},
{
"epoch": 0.28480185803454783,
"grad_norm": 3.6036617755889893,
"learning_rate": 9.998672393037587e-06,
"loss": 1.1282,
"step": 981
},
{
"epoch": 0.28509217593264624,
"grad_norm": 3.648822784423828,
"learning_rate": 9.99865017505973e-06,
"loss": 1.1288,
"step": 982
},
{
"epoch": 0.28538249383074465,
"grad_norm": 3.8245482444763184,
"learning_rate": 9.998627772733638e-06,
"loss": 1.1163,
"step": 983
},
{
"epoch": 0.28567281172884307,
"grad_norm": 3.836742877960205,
"learning_rate": 9.998605186060138e-06,
"loss": 1.1848,
"step": 984
},
{
"epoch": 0.2859631296269415,
"grad_norm": 3.5548558235168457,
"learning_rate": 9.998582415040061e-06,
"loss": 1.1864,
"step": 985
},
{
"epoch": 0.2862534475250399,
"grad_norm": 4.147696018218994,
"learning_rate": 9.99855945967425e-06,
"loss": 1.4001,
"step": 986
},
{
"epoch": 0.28654376542313836,
"grad_norm": 3.7722232341766357,
"learning_rate": 9.99853631996355e-06,
"loss": 1.2789,
"step": 987
},
{
"epoch": 0.2868340833212368,
"grad_norm": 4.302724838256836,
"learning_rate": 9.998512995908812e-06,
"loss": 1.2114,
"step": 988
},
{
"epoch": 0.2871244012193352,
"grad_norm": 4.2343621253967285,
"learning_rate": 9.9984894875109e-06,
"loss": 1.2674,
"step": 989
},
{
"epoch": 0.2874147191174336,
"grad_norm": 3.9608490467071533,
"learning_rate": 9.998465794770677e-06,
"loss": 1.0819,
"step": 990
},
{
"epoch": 0.287705037015532,
"grad_norm": 3.951963424682617,
"learning_rate": 9.998441917689022e-06,
"loss": 1.2561,
"step": 991
},
{
"epoch": 0.28799535491363043,
"grad_norm": 3.7183871269226074,
"learning_rate": 9.998417856266811e-06,
"loss": 1.1932,
"step": 992
},
{
"epoch": 0.28828567281172884,
"grad_norm": 3.7486894130706787,
"learning_rate": 9.998393610504933e-06,
"loss": 1.1478,
"step": 993
},
{
"epoch": 0.28857599070982726,
"grad_norm": 3.986708402633667,
"learning_rate": 9.998369180404283e-06,
"loss": 1.1849,
"step": 994
},
{
"epoch": 0.28886630860792567,
"grad_norm": 3.6684303283691406,
"learning_rate": 9.998344565965761e-06,
"loss": 1.2896,
"step": 995
},
{
"epoch": 0.2891566265060241,
"grad_norm": 3.8808441162109375,
"learning_rate": 9.998319767190274e-06,
"loss": 1.3013,
"step": 996
},
{
"epoch": 0.2894469444041225,
"grad_norm": 3.917853832244873,
"learning_rate": 9.998294784078739e-06,
"loss": 1.3916,
"step": 997
},
{
"epoch": 0.2897372623022209,
"grad_norm": 3.955862045288086,
"learning_rate": 9.998269616632075e-06,
"loss": 1.1784,
"step": 998
},
{
"epoch": 0.2900275802003193,
"grad_norm": 3.538889169692993,
"learning_rate": 9.998244264851211e-06,
"loss": 0.9783,
"step": 999
},
{
"epoch": 0.2903178980984178,
"grad_norm": 3.675344228744507,
"learning_rate": 9.998218728737081e-06,
"loss": 1.3406,
"step": 1000
},
{
"epoch": 0.2903178980984178,
"eval_loss": 1.252946376800537,
"eval_runtime": 11.2256,
"eval_samples_per_second": 35.633,
"eval_steps_per_second": 4.454,
"step": 1000
},
{
"epoch": 0.2906082159965162,
"grad_norm": 3.7118828296661377,
"learning_rate": 9.99819300829063e-06,
"loss": 1.0587,
"step": 1001
},
{
"epoch": 0.2908985338946146,
"grad_norm": 3.9424095153808594,
"learning_rate": 9.998167103512803e-06,
"loss": 1.1582,
"step": 1002
},
{
"epoch": 0.29118885179271303,
"grad_norm": 3.7735092639923096,
"learning_rate": 9.998141014404556e-06,
"loss": 1.3521,
"step": 1003
},
{
"epoch": 0.29147916969081145,
"grad_norm": 3.752547264099121,
"learning_rate": 9.998114740966853e-06,
"loss": 1.1414,
"step": 1004
},
{
"epoch": 0.29176948758890986,
"grad_norm": 3.8838298320770264,
"learning_rate": 9.998088283200662e-06,
"loss": 1.1848,
"step": 1005
},
{
"epoch": 0.2920598054870083,
"grad_norm": 4.132805824279785,
"learning_rate": 9.998061641106958e-06,
"loss": 1.288,
"step": 1006
},
{
"epoch": 0.2923501233851067,
"grad_norm": 3.8610050678253174,
"learning_rate": 9.998034814686724e-06,
"loss": 1.209,
"step": 1007
},
{
"epoch": 0.2926404412832051,
"grad_norm": 3.819197416305542,
"learning_rate": 9.99800780394095e-06,
"loss": 1.1585,
"step": 1008
},
{
"epoch": 0.2929307591813035,
"grad_norm": 3.5778913497924805,
"learning_rate": 9.99798060887063e-06,
"loss": 1.0819,
"step": 1009
},
{
"epoch": 0.2932210770794019,
"grad_norm": 3.7328646183013916,
"learning_rate": 9.997953229476771e-06,
"loss": 1.1686,
"step": 1010
},
{
"epoch": 0.29351139497750034,
"grad_norm": 3.9370815753936768,
"learning_rate": 9.997925665760378e-06,
"loss": 1.1981,
"step": 1011
},
{
"epoch": 0.2938017128755988,
"grad_norm": 3.5711724758148193,
"learning_rate": 9.997897917722473e-06,
"loss": 1.162,
"step": 1012
},
{
"epoch": 0.2940920307736972,
"grad_norm": 3.807966709136963,
"learning_rate": 9.997869985364073e-06,
"loss": 1.0689,
"step": 1013
},
{
"epoch": 0.29438234867179564,
"grad_norm": 3.5610194206237793,
"learning_rate": 9.997841868686211e-06,
"loss": 1.1146,
"step": 1014
},
{
"epoch": 0.29467266656989405,
"grad_norm": 3.8267099857330322,
"learning_rate": 9.997813567689926e-06,
"loss": 1.228,
"step": 1015
},
{
"epoch": 0.29496298446799246,
"grad_norm": 4.01648473739624,
"learning_rate": 9.99778508237626e-06,
"loss": 1.1465,
"step": 1016
},
{
"epoch": 0.2952533023660909,
"grad_norm": 3.702500820159912,
"learning_rate": 9.997756412746262e-06,
"loss": 1.1933,
"step": 1017
},
{
"epoch": 0.2955436202641893,
"grad_norm": 3.886366605758667,
"learning_rate": 9.997727558800991e-06,
"loss": 1.2683,
"step": 1018
},
{
"epoch": 0.2958339381622877,
"grad_norm": 4.139401912689209,
"learning_rate": 9.997698520541513e-06,
"loss": 1.2807,
"step": 1019
},
{
"epoch": 0.2961242560603861,
"grad_norm": 4.107751846313477,
"learning_rate": 9.997669297968895e-06,
"loss": 1.3347,
"step": 1020
},
{
"epoch": 0.29641457395848453,
"grad_norm": 3.888638734817505,
"learning_rate": 9.997639891084216e-06,
"loss": 1.2342,
"step": 1021
},
{
"epoch": 0.29670489185658294,
"grad_norm": 3.8988595008850098,
"learning_rate": 9.997610299888562e-06,
"loss": 1.2046,
"step": 1022
},
{
"epoch": 0.29699520975468136,
"grad_norm": 3.6805219650268555,
"learning_rate": 9.997580524383025e-06,
"loss": 1.1419,
"step": 1023
},
{
"epoch": 0.29728552765277977,
"grad_norm": 3.717468500137329,
"learning_rate": 9.997550564568698e-06,
"loss": 1.2272,
"step": 1024
},
{
"epoch": 0.29757584555087824,
"grad_norm": 3.684636116027832,
"learning_rate": 9.997520420446694e-06,
"loss": 1.2279,
"step": 1025
},
{
"epoch": 0.29786616344897665,
"grad_norm": 3.6968002319335938,
"learning_rate": 9.997490092018117e-06,
"loss": 1.1613,
"step": 1026
},
{
"epoch": 0.29815648134707506,
"grad_norm": 4.012862682342529,
"learning_rate": 9.997459579284088e-06,
"loss": 1.1938,
"step": 1027
},
{
"epoch": 0.2984467992451735,
"grad_norm": 4.252531051635742,
"learning_rate": 9.997428882245735e-06,
"loss": 1.149,
"step": 1028
},
{
"epoch": 0.2987371171432719,
"grad_norm": 3.787094831466675,
"learning_rate": 9.997398000904185e-06,
"loss": 1.2608,
"step": 1029
},
{
"epoch": 0.2990274350413703,
"grad_norm": 4.0114970207214355,
"learning_rate": 9.997366935260582e-06,
"loss": 1.185,
"step": 1030
},
{
"epoch": 0.2993177529394687,
"grad_norm": 3.625157356262207,
"learning_rate": 9.99733568531607e-06,
"loss": 1.1818,
"step": 1031
},
{
"epoch": 0.29960807083756713,
"grad_norm": 3.3687214851379395,
"learning_rate": 9.997304251071802e-06,
"loss": 1.0876,
"step": 1032
},
{
"epoch": 0.29989838873566554,
"grad_norm": 3.9616904258728027,
"learning_rate": 9.997272632528933e-06,
"loss": 1.1674,
"step": 1033
},
{
"epoch": 0.30018870663376396,
"grad_norm": 4.397826194763184,
"learning_rate": 9.997240829688634e-06,
"loss": 1.3382,
"step": 1034
},
{
"epoch": 0.30047902453186237,
"grad_norm": 3.7658543586730957,
"learning_rate": 9.997208842552077e-06,
"loss": 1.1838,
"step": 1035
},
{
"epoch": 0.3007693424299608,
"grad_norm": 3.806561231613159,
"learning_rate": 9.99717667112044e-06,
"loss": 1.1805,
"step": 1036
},
{
"epoch": 0.3010596603280592,
"grad_norm": 3.5808584690093994,
"learning_rate": 9.997144315394912e-06,
"loss": 1.2062,
"step": 1037
},
{
"epoch": 0.30134997822615767,
"grad_norm": 3.2824292182922363,
"learning_rate": 9.997111775376684e-06,
"loss": 1.0395,
"step": 1038
},
{
"epoch": 0.3016402961242561,
"grad_norm": 3.9872941970825195,
"learning_rate": 9.997079051066956e-06,
"loss": 1.2192,
"step": 1039
},
{
"epoch": 0.3019306140223545,
"grad_norm": 4.112649440765381,
"learning_rate": 9.997046142466935e-06,
"loss": 1.4281,
"step": 1040
},
{
"epoch": 0.3022209319204529,
"grad_norm": 3.963346481323242,
"learning_rate": 9.997013049577838e-06,
"loss": 1.2096,
"step": 1041
},
{
"epoch": 0.3025112498185513,
"grad_norm": 3.9230425357818604,
"learning_rate": 9.99697977240088e-06,
"loss": 1.2325,
"step": 1042
},
{
"epoch": 0.30280156771664973,
"grad_norm": 4.026306629180908,
"learning_rate": 9.996946310937292e-06,
"loss": 1.2818,
"step": 1043
},
{
"epoch": 0.30309188561474815,
"grad_norm": 4.02335786819458,
"learning_rate": 9.996912665188308e-06,
"loss": 1.3765,
"step": 1044
},
{
"epoch": 0.30338220351284656,
"grad_norm": 3.77268123626709,
"learning_rate": 9.996878835155166e-06,
"loss": 1.3176,
"step": 1045
},
{
"epoch": 0.303672521410945,
"grad_norm": 4.2044525146484375,
"learning_rate": 9.996844820839115e-06,
"loss": 1.3502,
"step": 1046
},
{
"epoch": 0.3039628393090434,
"grad_norm": 3.5329604148864746,
"learning_rate": 9.996810622241412e-06,
"loss": 1.1506,
"step": 1047
},
{
"epoch": 0.3042531572071418,
"grad_norm": 3.349825620651245,
"learning_rate": 9.996776239363317e-06,
"loss": 1.0941,
"step": 1048
},
{
"epoch": 0.3045434751052402,
"grad_norm": 3.884256362915039,
"learning_rate": 9.996741672206095e-06,
"loss": 1.308,
"step": 1049
},
{
"epoch": 0.3048337930033387,
"grad_norm": 3.6708192825317383,
"learning_rate": 9.996706920771024e-06,
"loss": 1.06,
"step": 1050
},
{
"epoch": 0.3051241109014371,
"grad_norm": 3.7969107627868652,
"learning_rate": 9.996671985059384e-06,
"loss": 1.2821,
"step": 1051
},
{
"epoch": 0.3054144287995355,
"grad_norm": 4.150816917419434,
"learning_rate": 9.996636865072464e-06,
"loss": 1.3209,
"step": 1052
},
{
"epoch": 0.3057047466976339,
"grad_norm": 3.5923068523406982,
"learning_rate": 9.99660156081156e-06,
"loss": 1.1685,
"step": 1053
},
{
"epoch": 0.30599506459573234,
"grad_norm": 4.074513912200928,
"learning_rate": 9.996566072277974e-06,
"loss": 1.1066,
"step": 1054
},
{
"epoch": 0.30628538249383075,
"grad_norm": 3.7009284496307373,
"learning_rate": 9.996530399473012e-06,
"loss": 1.1065,
"step": 1055
},
{
"epoch": 0.30657570039192916,
"grad_norm": 3.790034055709839,
"learning_rate": 9.996494542397993e-06,
"loss": 1.2058,
"step": 1056
},
{
"epoch": 0.3068660182900276,
"grad_norm": 4.157486915588379,
"learning_rate": 9.996458501054237e-06,
"loss": 1.3369,
"step": 1057
},
{
"epoch": 0.307156336188126,
"grad_norm": 4.008849143981934,
"learning_rate": 9.996422275443076e-06,
"loss": 1.3844,
"step": 1058
},
{
"epoch": 0.3074466540862244,
"grad_norm": 4.041140556335449,
"learning_rate": 9.996385865565844e-06,
"loss": 1.2306,
"step": 1059
},
{
"epoch": 0.3077369719843228,
"grad_norm": 4.257492542266846,
"learning_rate": 9.996349271423883e-06,
"loss": 1.248,
"step": 1060
},
{
"epoch": 0.30802728988242123,
"grad_norm": 4.013744354248047,
"learning_rate": 9.996312493018545e-06,
"loss": 1.2645,
"step": 1061
},
{
"epoch": 0.30831760778051964,
"grad_norm": 3.783053398132324,
"learning_rate": 9.996275530351184e-06,
"loss": 1.2519,
"step": 1062
},
{
"epoch": 0.3086079256786181,
"grad_norm": 4.049034118652344,
"learning_rate": 9.996238383423162e-06,
"loss": 1.2987,
"step": 1063
},
{
"epoch": 0.3088982435767165,
"grad_norm": 4.0037078857421875,
"learning_rate": 9.996201052235855e-06,
"loss": 1.3219,
"step": 1064
},
{
"epoch": 0.30918856147481494,
"grad_norm": 3.8853280544281006,
"learning_rate": 9.996163536790633e-06,
"loss": 1.3642,
"step": 1065
},
{
"epoch": 0.30947887937291335,
"grad_norm": 3.756002902984619,
"learning_rate": 9.996125837088883e-06,
"loss": 1.2355,
"step": 1066
},
{
"epoch": 0.30976919727101176,
"grad_norm": 3.9041924476623535,
"learning_rate": 9.996087953131996e-06,
"loss": 1.2097,
"step": 1067
},
{
"epoch": 0.3100595151691102,
"grad_norm": 3.773911952972412,
"learning_rate": 9.996049884921367e-06,
"loss": 1.1904,
"step": 1068
},
{
"epoch": 0.3103498330672086,
"grad_norm": 3.802534341812134,
"learning_rate": 9.996011632458403e-06,
"loss": 1.1983,
"step": 1069
},
{
"epoch": 0.310640150965307,
"grad_norm": 3.91593861579895,
"learning_rate": 9.99597319574451e-06,
"loss": 1.3075,
"step": 1070
},
{
"epoch": 0.3109304688634054,
"grad_norm": 3.9573280811309814,
"learning_rate": 9.995934574781108e-06,
"loss": 1.3832,
"step": 1071
},
{
"epoch": 0.31122078676150383,
"grad_norm": 3.5446033477783203,
"learning_rate": 9.995895769569623e-06,
"loss": 1.1472,
"step": 1072
},
{
"epoch": 0.31151110465960224,
"grad_norm": 3.6855850219726562,
"learning_rate": 9.995856780111483e-06,
"loss": 1.1494,
"step": 1073
},
{
"epoch": 0.31180142255770066,
"grad_norm": 4.052492618560791,
"learning_rate": 9.995817606408129e-06,
"loss": 1.3019,
"step": 1074
},
{
"epoch": 0.3120917404557991,
"grad_norm": 3.6750905513763428,
"learning_rate": 9.995778248461003e-06,
"loss": 1.1294,
"step": 1075
},
{
"epoch": 0.31238205835389754,
"grad_norm": 3.975306510925293,
"learning_rate": 9.995738706271559e-06,
"loss": 1.1529,
"step": 1076
},
{
"epoch": 0.31267237625199595,
"grad_norm": 3.8198189735412598,
"learning_rate": 9.995698979841253e-06,
"loss": 1.1464,
"step": 1077
},
{
"epoch": 0.31296269415009437,
"grad_norm": 3.8802731037139893,
"learning_rate": 9.99565906917155e-06,
"loss": 1.296,
"step": 1078
},
{
"epoch": 0.3132530120481928,
"grad_norm": 3.874182939529419,
"learning_rate": 9.995618974263925e-06,
"loss": 1.2741,
"step": 1079
},
{
"epoch": 0.3135433299462912,
"grad_norm": 4.022329807281494,
"learning_rate": 9.995578695119856e-06,
"loss": 1.2235,
"step": 1080
},
{
"epoch": 0.3138336478443896,
"grad_norm": 3.432136058807373,
"learning_rate": 9.995538231740825e-06,
"loss": 1.1024,
"step": 1081
},
{
"epoch": 0.314123965742488,
"grad_norm": 3.90201735496521,
"learning_rate": 9.995497584128326e-06,
"loss": 1.17,
"step": 1082
},
{
"epoch": 0.31441428364058643,
"grad_norm": 3.2675185203552246,
"learning_rate": 9.995456752283858e-06,
"loss": 1.0976,
"step": 1083
},
{
"epoch": 0.31470460153868485,
"grad_norm": 3.555330991744995,
"learning_rate": 9.99541573620893e-06,
"loss": 1.2697,
"step": 1084
},
{
"epoch": 0.31499491943678326,
"grad_norm": 3.853966236114502,
"learning_rate": 9.99537453590505e-06,
"loss": 1.4011,
"step": 1085
},
{
"epoch": 0.3152852373348817,
"grad_norm": 3.650466203689575,
"learning_rate": 9.99533315137374e-06,
"loss": 1.1749,
"step": 1086
},
{
"epoch": 0.3155755552329801,
"grad_norm": 3.698735475540161,
"learning_rate": 9.995291582616526e-06,
"loss": 1.3977,
"step": 1087
},
{
"epoch": 0.31586587313107856,
"grad_norm": 3.5275065898895264,
"learning_rate": 9.99524982963494e-06,
"loss": 1.3007,
"step": 1088
},
{
"epoch": 0.31615619102917697,
"grad_norm": 3.885864019393921,
"learning_rate": 9.995207892430525e-06,
"loss": 1.3067,
"step": 1089
},
{
"epoch": 0.3164465089272754,
"grad_norm": 3.5765745639801025,
"learning_rate": 9.995165771004821e-06,
"loss": 1.2831,
"step": 1090
},
{
"epoch": 0.3167368268253738,
"grad_norm": 4.13949728012085,
"learning_rate": 9.99512346535939e-06,
"loss": 1.3137,
"step": 1091
},
{
"epoch": 0.3170271447234722,
"grad_norm": 3.839385747909546,
"learning_rate": 9.995080975495786e-06,
"loss": 1.1197,
"step": 1092
},
{
"epoch": 0.3173174626215706,
"grad_norm": 3.585883617401123,
"learning_rate": 9.995038301415575e-06,
"loss": 1.043,
"step": 1093
},
{
"epoch": 0.31760778051966904,
"grad_norm": 3.585265636444092,
"learning_rate": 9.994995443120338e-06,
"loss": 1.2184,
"step": 1094
},
{
"epoch": 0.31789809841776745,
"grad_norm": 3.765455722808838,
"learning_rate": 9.99495240061165e-06,
"loss": 1.1284,
"step": 1095
},
{
"epoch": 0.31818841631586586,
"grad_norm": 3.9608914852142334,
"learning_rate": 9.994909173891098e-06,
"loss": 1.2844,
"step": 1096
},
{
"epoch": 0.3184787342139643,
"grad_norm": 4.155348777770996,
"learning_rate": 9.99486576296028e-06,
"loss": 1.2291,
"step": 1097
},
{
"epoch": 0.3187690521120627,
"grad_norm": 4.106432914733887,
"learning_rate": 9.994822167820794e-06,
"loss": 1.3016,
"step": 1098
},
{
"epoch": 0.3190593700101611,
"grad_norm": 3.668353319168091,
"learning_rate": 9.994778388474249e-06,
"loss": 1.1079,
"step": 1099
},
{
"epoch": 0.31934968790825957,
"grad_norm": 4.098554611206055,
"learning_rate": 9.994734424922258e-06,
"loss": 1.2308,
"step": 1100
},
{
"epoch": 0.319640005806358,
"grad_norm": 3.569974660873413,
"learning_rate": 9.994690277166443e-06,
"loss": 1.144,
"step": 1101
},
{
"epoch": 0.3199303237044564,
"grad_norm": 3.9479312896728516,
"learning_rate": 9.994645945208434e-06,
"loss": 1.147,
"step": 1102
},
{
"epoch": 0.3202206416025548,
"grad_norm": 3.754945755004883,
"learning_rate": 9.994601429049866e-06,
"loss": 1.2279,
"step": 1103
},
{
"epoch": 0.3205109595006532,
"grad_norm": 3.6482317447662354,
"learning_rate": 9.994556728692377e-06,
"loss": 1.1124,
"step": 1104
},
{
"epoch": 0.32080127739875164,
"grad_norm": 3.5694377422332764,
"learning_rate": 9.994511844137618e-06,
"loss": 1.1965,
"step": 1105
},
{
"epoch": 0.32109159529685005,
"grad_norm": 3.633552312850952,
"learning_rate": 9.994466775387246e-06,
"loss": 1.1248,
"step": 1106
},
{
"epoch": 0.32138191319494847,
"grad_norm": 4.080570220947266,
"learning_rate": 9.99442152244292e-06,
"loss": 1.4583,
"step": 1107
},
{
"epoch": 0.3216722310930469,
"grad_norm": 3.8583877086639404,
"learning_rate": 9.994376085306309e-06,
"loss": 1.314,
"step": 1108
},
{
"epoch": 0.3219625489911453,
"grad_norm": 4.030450820922852,
"learning_rate": 9.994330463979092e-06,
"loss": 1.1375,
"step": 1109
},
{
"epoch": 0.3222528668892437,
"grad_norm": 3.8722689151763916,
"learning_rate": 9.994284658462949e-06,
"loss": 1.3931,
"step": 1110
},
{
"epoch": 0.3225431847873421,
"grad_norm": 3.761976957321167,
"learning_rate": 9.99423866875957e-06,
"loss": 1.1999,
"step": 1111
},
{
"epoch": 0.32283350268544053,
"grad_norm": 3.489006519317627,
"learning_rate": 9.994192494870649e-06,
"loss": 1.1845,
"step": 1112
},
{
"epoch": 0.323123820583539,
"grad_norm": 4.012115001678467,
"learning_rate": 9.994146136797893e-06,
"loss": 1.1846,
"step": 1113
},
{
"epoch": 0.3234141384816374,
"grad_norm": 4.048895359039307,
"learning_rate": 9.994099594543007e-06,
"loss": 1.2829,
"step": 1114
},
{
"epoch": 0.3237044563797358,
"grad_norm": 3.85603666305542,
"learning_rate": 9.994052868107712e-06,
"loss": 1.1342,
"step": 1115
},
{
"epoch": 0.32399477427783424,
"grad_norm": 3.687089681625366,
"learning_rate": 9.99400595749373e-06,
"loss": 1.1298,
"step": 1116
},
{
"epoch": 0.32428509217593265,
"grad_norm": 3.7886598110198975,
"learning_rate": 9.993958862702785e-06,
"loss": 1.4015,
"step": 1117
},
{
"epoch": 0.32457541007403107,
"grad_norm": 3.9265501499176025,
"learning_rate": 9.993911583736624e-06,
"loss": 1.2466,
"step": 1118
},
{
"epoch": 0.3248657279721295,
"grad_norm": 3.571340560913086,
"learning_rate": 9.993864120596982e-06,
"loss": 1.1224,
"step": 1119
},
{
"epoch": 0.3251560458702279,
"grad_norm": 3.711078643798828,
"learning_rate": 9.993816473285615e-06,
"loss": 1.1134,
"step": 1120
},
{
"epoch": 0.3254463637683263,
"grad_norm": 3.8613884449005127,
"learning_rate": 9.993768641804279e-06,
"loss": 1.249,
"step": 1121
},
{
"epoch": 0.3257366816664247,
"grad_norm": 3.556450605392456,
"learning_rate": 9.993720626154736e-06,
"loss": 1.1877,
"step": 1122
},
{
"epoch": 0.32602699956452313,
"grad_norm": 4.229327201843262,
"learning_rate": 9.99367242633876e-06,
"loss": 1.3764,
"step": 1123
},
{
"epoch": 0.32631731746262155,
"grad_norm": 3.5248398780822754,
"learning_rate": 9.993624042358123e-06,
"loss": 1.1134,
"step": 1124
},
{
"epoch": 0.32660763536071996,
"grad_norm": 3.608933210372925,
"learning_rate": 9.993575474214615e-06,
"loss": 1.1646,
"step": 1125
},
{
"epoch": 0.32689795325881843,
"grad_norm": 3.668365001678467,
"learning_rate": 9.993526721910026e-06,
"loss": 1.2625,
"step": 1126
},
{
"epoch": 0.32718827115691684,
"grad_norm": 3.6710710525512695,
"learning_rate": 9.993477785446151e-06,
"loss": 1.2838,
"step": 1127
},
{
"epoch": 0.32747858905501526,
"grad_norm": 3.607513904571533,
"learning_rate": 9.993428664824798e-06,
"loss": 1.1953,
"step": 1128
},
{
"epoch": 0.32776890695311367,
"grad_norm": 4.071550369262695,
"learning_rate": 9.993379360047777e-06,
"loss": 1.1125,
"step": 1129
},
{
"epoch": 0.3280592248512121,
"grad_norm": 3.6153531074523926,
"learning_rate": 9.993329871116907e-06,
"loss": 1.0884,
"step": 1130
},
{
"epoch": 0.3283495427493105,
"grad_norm": 3.3417906761169434,
"learning_rate": 9.993280198034013e-06,
"loss": 1.046,
"step": 1131
},
{
"epoch": 0.3286398606474089,
"grad_norm": 4.090729236602783,
"learning_rate": 9.993230340800926e-06,
"loss": 1.3781,
"step": 1132
},
{
"epoch": 0.3289301785455073,
"grad_norm": 3.5112178325653076,
"learning_rate": 9.993180299419487e-06,
"loss": 1.1914,
"step": 1133
},
{
"epoch": 0.32922049644360574,
"grad_norm": 4.069597244262695,
"learning_rate": 9.993130073891539e-06,
"loss": 1.2936,
"step": 1134
},
{
"epoch": 0.32951081434170415,
"grad_norm": 3.7383646965026855,
"learning_rate": 9.993079664218936e-06,
"loss": 1.1317,
"step": 1135
},
{
"epoch": 0.32980113223980256,
"grad_norm": 3.911933422088623,
"learning_rate": 9.993029070403535e-06,
"loss": 1.17,
"step": 1136
},
{
"epoch": 0.330091450137901,
"grad_norm": 3.8537962436676025,
"learning_rate": 9.992978292447206e-06,
"loss": 1.2672,
"step": 1137
},
{
"epoch": 0.33038176803599945,
"grad_norm": 3.6948013305664062,
"learning_rate": 9.992927330351815e-06,
"loss": 1.2145,
"step": 1138
},
{
"epoch": 0.33067208593409786,
"grad_norm": 4.0727362632751465,
"learning_rate": 9.992876184119248e-06,
"loss": 1.2109,
"step": 1139
},
{
"epoch": 0.3309624038321963,
"grad_norm": 3.8704004287719727,
"learning_rate": 9.99282485375139e-06,
"loss": 1.2516,
"step": 1140
},
{
"epoch": 0.3312527217302947,
"grad_norm": 3.7747249603271484,
"learning_rate": 9.99277333925013e-06,
"loss": 1.2104,
"step": 1141
},
{
"epoch": 0.3315430396283931,
"grad_norm": 3.8810410499572754,
"learning_rate": 9.992721640617373e-06,
"loss": 1.2335,
"step": 1142
},
{
"epoch": 0.3318333575264915,
"grad_norm": 3.924704074859619,
"learning_rate": 9.992669757855022e-06,
"loss": 1.3601,
"step": 1143
},
{
"epoch": 0.3321236754245899,
"grad_norm": 3.7031071186065674,
"learning_rate": 9.992617690964992e-06,
"loss": 1.2986,
"step": 1144
},
{
"epoch": 0.33241399332268834,
"grad_norm": 3.5863468647003174,
"learning_rate": 9.992565439949202e-06,
"loss": 1.0064,
"step": 1145
},
{
"epoch": 0.33270431122078675,
"grad_norm": 3.349553346633911,
"learning_rate": 9.99251300480958e-06,
"loss": 1.0804,
"step": 1146
},
{
"epoch": 0.33299462911888517,
"grad_norm": 3.7625350952148438,
"learning_rate": 9.99246038554806e-06,
"loss": 1.2891,
"step": 1147
},
{
"epoch": 0.3332849470169836,
"grad_norm": 3.663235664367676,
"learning_rate": 9.992407582166582e-06,
"loss": 1.0882,
"step": 1148
},
{
"epoch": 0.333575264915082,
"grad_norm": 4.091626167297363,
"learning_rate": 9.992354594667092e-06,
"loss": 1.3082,
"step": 1149
},
{
"epoch": 0.3338655828131804,
"grad_norm": 4.003473281860352,
"learning_rate": 9.99230142305155e-06,
"loss": 1.3269,
"step": 1150
},
{
"epoch": 0.3341559007112789,
"grad_norm": 4.316757678985596,
"learning_rate": 9.992248067321908e-06,
"loss": 1.3672,
"step": 1151
},
{
"epoch": 0.3344462186093773,
"grad_norm": 3.5924463272094727,
"learning_rate": 9.992194527480141e-06,
"loss": 1.1899,
"step": 1152
},
{
"epoch": 0.3347365365074757,
"grad_norm": 3.5745058059692383,
"learning_rate": 9.99214080352822e-06,
"loss": 1.2595,
"step": 1153
},
{
"epoch": 0.3350268544055741,
"grad_norm": 3.8298416137695312,
"learning_rate": 9.992086895468126e-06,
"loss": 1.3461,
"step": 1154
},
{
"epoch": 0.33531717230367253,
"grad_norm": 3.9122047424316406,
"learning_rate": 9.992032803301852e-06,
"loss": 1.2159,
"step": 1155
},
{
"epoch": 0.33560749020177094,
"grad_norm": 3.804358959197998,
"learning_rate": 9.991978527031388e-06,
"loss": 1.2398,
"step": 1156
},
{
"epoch": 0.33589780809986935,
"grad_norm": 3.9901576042175293,
"learning_rate": 9.991924066658734e-06,
"loss": 1.3343,
"step": 1157
},
{
"epoch": 0.33618812599796777,
"grad_norm": 4.042963027954102,
"learning_rate": 9.991869422185905e-06,
"loss": 1.266,
"step": 1158
},
{
"epoch": 0.3364784438960662,
"grad_norm": 3.808166742324829,
"learning_rate": 9.991814593614911e-06,
"loss": 1.3053,
"step": 1159
},
{
"epoch": 0.3367687617941646,
"grad_norm": 3.918839931488037,
"learning_rate": 9.991759580947775e-06,
"loss": 1.2586,
"step": 1160
},
{
"epoch": 0.337059079692263,
"grad_norm": 4.197708606719971,
"learning_rate": 9.991704384186527e-06,
"loss": 1.4134,
"step": 1161
},
{
"epoch": 0.3373493975903614,
"grad_norm": 4.288426876068115,
"learning_rate": 9.991649003333202e-06,
"loss": 1.182,
"step": 1162
},
{
"epoch": 0.3376397154884599,
"grad_norm": 3.746020555496216,
"learning_rate": 9.991593438389844e-06,
"loss": 1.1078,
"step": 1163
},
{
"epoch": 0.3379300333865583,
"grad_norm": 4.072814464569092,
"learning_rate": 9.9915376893585e-06,
"loss": 1.1521,
"step": 1164
},
{
"epoch": 0.3382203512846567,
"grad_norm": 3.3874738216400146,
"learning_rate": 9.991481756241228e-06,
"loss": 1.0637,
"step": 1165
},
{
"epoch": 0.33851066918275513,
"grad_norm": 3.7892661094665527,
"learning_rate": 9.991425639040088e-06,
"loss": 1.1503,
"step": 1166
},
{
"epoch": 0.33880098708085354,
"grad_norm": 3.8184001445770264,
"learning_rate": 9.991369337757152e-06,
"loss": 1.2691,
"step": 1167
},
{
"epoch": 0.33909130497895196,
"grad_norm": 3.9826607704162598,
"learning_rate": 9.991312852394495e-06,
"loss": 1.2423,
"step": 1168
},
{
"epoch": 0.33938162287705037,
"grad_norm": 3.558635711669922,
"learning_rate": 9.9912561829542e-06,
"loss": 1.1773,
"step": 1169
},
{
"epoch": 0.3396719407751488,
"grad_norm": 4.2123847007751465,
"learning_rate": 9.99119932943836e-06,
"loss": 1.2101,
"step": 1170
},
{
"epoch": 0.3399622586732472,
"grad_norm": 3.4792020320892334,
"learning_rate": 9.991142291849068e-06,
"loss": 1.172,
"step": 1171
},
{
"epoch": 0.3402525765713456,
"grad_norm": 3.54262113571167,
"learning_rate": 9.991085070188429e-06,
"loss": 1.0937,
"step": 1172
},
{
"epoch": 0.340542894469444,
"grad_norm": 4.025277614593506,
"learning_rate": 9.991027664458553e-06,
"loss": 1.2719,
"step": 1173
},
{
"epoch": 0.34083321236754244,
"grad_norm": 3.762990713119507,
"learning_rate": 9.990970074661558e-06,
"loss": 1.1391,
"step": 1174
},
{
"epoch": 0.34112353026564085,
"grad_norm": 3.8915021419525146,
"learning_rate": 9.990912300799567e-06,
"loss": 1.3049,
"step": 1175
},
{
"epoch": 0.3414138481637393,
"grad_norm": 4.053305149078369,
"learning_rate": 9.990854342874712e-06,
"loss": 1.302,
"step": 1176
},
{
"epoch": 0.34170416606183773,
"grad_norm": 4.007221221923828,
"learning_rate": 9.990796200889129e-06,
"loss": 1.2686,
"step": 1177
},
{
"epoch": 0.34199448395993615,
"grad_norm": 3.757418632507324,
"learning_rate": 9.990737874844961e-06,
"loss": 1.1974,
"step": 1178
},
{
"epoch": 0.34228480185803456,
"grad_norm": 3.871196746826172,
"learning_rate": 9.99067936474436e-06,
"loss": 1.2349,
"step": 1179
},
{
"epoch": 0.342575119756133,
"grad_norm": 4.200139045715332,
"learning_rate": 9.990620670589488e-06,
"loss": 1.2872,
"step": 1180
},
{
"epoch": 0.3428654376542314,
"grad_norm": 3.794616460800171,
"learning_rate": 9.990561792382504e-06,
"loss": 1.1205,
"step": 1181
},
{
"epoch": 0.3431557555523298,
"grad_norm": 4.073183536529541,
"learning_rate": 9.990502730125583e-06,
"loss": 1.2517,
"step": 1182
},
{
"epoch": 0.3434460734504282,
"grad_norm": 3.3805885314941406,
"learning_rate": 9.990443483820899e-06,
"loss": 1.0957,
"step": 1183
},
{
"epoch": 0.3437363913485266,
"grad_norm": 3.706669807434082,
"learning_rate": 9.99038405347064e-06,
"loss": 1.1709,
"step": 1184
},
{
"epoch": 0.34402670924662504,
"grad_norm": 3.701693296432495,
"learning_rate": 9.990324439077e-06,
"loss": 1.3542,
"step": 1185
},
{
"epoch": 0.34431702714472345,
"grad_norm": 3.3958957195281982,
"learning_rate": 9.990264640642175e-06,
"loss": 1.0783,
"step": 1186
},
{
"epoch": 0.34460734504282187,
"grad_norm": 3.568415641784668,
"learning_rate": 9.990204658168368e-06,
"loss": 1.1532,
"step": 1187
},
{
"epoch": 0.34489766294092034,
"grad_norm": 3.5190603733062744,
"learning_rate": 9.990144491657796e-06,
"loss": 1.1625,
"step": 1188
},
{
"epoch": 0.34518798083901875,
"grad_norm": 3.578280210494995,
"learning_rate": 9.990084141112674e-06,
"loss": 1.1424,
"step": 1189
},
{
"epoch": 0.34547829873711716,
"grad_norm": 3.530015468597412,
"learning_rate": 9.990023606535229e-06,
"loss": 1.2192,
"step": 1190
},
{
"epoch": 0.3457686166352156,
"grad_norm": 3.9412999153137207,
"learning_rate": 9.989962887927693e-06,
"loss": 1.2546,
"step": 1191
},
{
"epoch": 0.346058934533314,
"grad_norm": 3.7730867862701416,
"learning_rate": 9.989901985292307e-06,
"loss": 1.3085,
"step": 1192
},
{
"epoch": 0.3463492524314124,
"grad_norm": 3.756413698196411,
"learning_rate": 9.989840898631316e-06,
"loss": 1.2506,
"step": 1193
},
{
"epoch": 0.3466395703295108,
"grad_norm": 3.6548380851745605,
"learning_rate": 9.989779627946974e-06,
"loss": 1.1645,
"step": 1194
},
{
"epoch": 0.34692988822760923,
"grad_norm": 3.9941673278808594,
"learning_rate": 9.989718173241537e-06,
"loss": 1.2806,
"step": 1195
},
{
"epoch": 0.34722020612570764,
"grad_norm": 4.010866641998291,
"learning_rate": 9.989656534517277e-06,
"loss": 1.283,
"step": 1196
},
{
"epoch": 0.34751052402380606,
"grad_norm": 3.7354612350463867,
"learning_rate": 9.98959471177646e-06,
"loss": 1.1878,
"step": 1197
},
{
"epoch": 0.34780084192190447,
"grad_norm": 3.2911434173583984,
"learning_rate": 9.989532705021373e-06,
"loss": 1.0222,
"step": 1198
},
{
"epoch": 0.3480911598200029,
"grad_norm": 3.4110004901885986,
"learning_rate": 9.989470514254298e-06,
"loss": 1.1306,
"step": 1199
},
{
"epoch": 0.3483814777181013,
"grad_norm": 3.56748366355896,
"learning_rate": 9.989408139477532e-06,
"loss": 1.0896,
"step": 1200
},
{
"epoch": 0.34867179561619976,
"grad_norm": 3.6176071166992188,
"learning_rate": 9.989345580693372e-06,
"loss": 1.1132,
"step": 1201
},
{
"epoch": 0.3489621135142982,
"grad_norm": 4.058175563812256,
"learning_rate": 9.989282837904128e-06,
"loss": 1.2873,
"step": 1202
},
{
"epoch": 0.3492524314123966,
"grad_norm": 3.6613640785217285,
"learning_rate": 9.989219911112114e-06,
"loss": 1.1239,
"step": 1203
},
{
"epoch": 0.349542749310495,
"grad_norm": 3.6973867416381836,
"learning_rate": 9.989156800319648e-06,
"loss": 1.2085,
"step": 1204
},
{
"epoch": 0.3498330672085934,
"grad_norm": 4.278224468231201,
"learning_rate": 9.989093505529061e-06,
"loss": 1.3686,
"step": 1205
},
{
"epoch": 0.35012338510669183,
"grad_norm": 3.5927252769470215,
"learning_rate": 9.989030026742683e-06,
"loss": 1.1315,
"step": 1206
},
{
"epoch": 0.35041370300479024,
"grad_norm": 3.239856004714966,
"learning_rate": 9.98896636396286e-06,
"loss": 1.1717,
"step": 1207
},
{
"epoch": 0.35070402090288866,
"grad_norm": 3.571183204650879,
"learning_rate": 9.988902517191935e-06,
"loss": 1.1082,
"step": 1208
},
{
"epoch": 0.35099433880098707,
"grad_norm": 3.4660732746124268,
"learning_rate": 9.988838486432266e-06,
"loss": 1.1124,
"step": 1209
},
{
"epoch": 0.3512846566990855,
"grad_norm": 3.6221065521240234,
"learning_rate": 9.988774271686213e-06,
"loss": 1.3044,
"step": 1210
},
{
"epoch": 0.3515749745971839,
"grad_norm": 3.52908992767334,
"learning_rate": 9.988709872956146e-06,
"loss": 1.1691,
"step": 1211
},
{
"epoch": 0.3518652924952823,
"grad_norm": 3.7822394371032715,
"learning_rate": 9.988645290244436e-06,
"loss": 1.3203,
"step": 1212
},
{
"epoch": 0.3521556103933807,
"grad_norm": 3.8475475311279297,
"learning_rate": 9.98858052355347e-06,
"loss": 1.1906,
"step": 1213
},
{
"epoch": 0.3524459282914792,
"grad_norm": 4.064851760864258,
"learning_rate": 9.988515572885632e-06,
"loss": 1.2655,
"step": 1214
},
{
"epoch": 0.3527362461895776,
"grad_norm": 4.1176018714904785,
"learning_rate": 9.98845043824332e-06,
"loss": 1.3391,
"step": 1215
},
{
"epoch": 0.353026564087676,
"grad_norm": 3.622924327850342,
"learning_rate": 9.988385119628936e-06,
"loss": 1.1187,
"step": 1216
},
{
"epoch": 0.35331688198577443,
"grad_norm": 3.7255032062530518,
"learning_rate": 9.988319617044889e-06,
"loss": 1.25,
"step": 1217
},
{
"epoch": 0.35360719988387285,
"grad_norm": 3.67846941947937,
"learning_rate": 9.988253930493592e-06,
"loss": 1.1302,
"step": 1218
},
{
"epoch": 0.35389751778197126,
"grad_norm": 3.972423791885376,
"learning_rate": 9.98818805997747e-06,
"loss": 1.2769,
"step": 1219
},
{
"epoch": 0.3541878356800697,
"grad_norm": 3.8370683193206787,
"learning_rate": 9.988122005498952e-06,
"loss": 1.3485,
"step": 1220
},
{
"epoch": 0.3544781535781681,
"grad_norm": 3.591844320297241,
"learning_rate": 9.988055767060474e-06,
"loss": 1.2438,
"step": 1221
},
{
"epoch": 0.3547684714762665,
"grad_norm": 3.620933771133423,
"learning_rate": 9.987989344664479e-06,
"loss": 1.2388,
"step": 1222
},
{
"epoch": 0.3550587893743649,
"grad_norm": 3.5270962715148926,
"learning_rate": 9.987922738313417e-06,
"loss": 1.157,
"step": 1223
},
{
"epoch": 0.3553491072724633,
"grad_norm": 3.6065704822540283,
"learning_rate": 9.987855948009744e-06,
"loss": 1.1126,
"step": 1224
},
{
"epoch": 0.35563942517056174,
"grad_norm": 3.9604432582855225,
"learning_rate": 9.98778897375592e-06,
"loss": 1.3218,
"step": 1225
},
{
"epoch": 0.3559297430686602,
"grad_norm": 3.827787160873413,
"learning_rate": 9.987721815554421e-06,
"loss": 1.2084,
"step": 1226
},
{
"epoch": 0.3562200609667586,
"grad_norm": 3.869262456893921,
"learning_rate": 9.98765447340772e-06,
"loss": 1.2332,
"step": 1227
},
{
"epoch": 0.35651037886485704,
"grad_norm": 3.5749378204345703,
"learning_rate": 9.987586947318302e-06,
"loss": 1.1676,
"step": 1228
},
{
"epoch": 0.35680069676295545,
"grad_norm": 3.531912088394165,
"learning_rate": 9.987519237288656e-06,
"loss": 1.2284,
"step": 1229
},
{
"epoch": 0.35709101466105386,
"grad_norm": 3.333885431289673,
"learning_rate": 9.98745134332128e-06,
"loss": 1.068,
"step": 1230
},
{
"epoch": 0.3573813325591523,
"grad_norm": 3.75718355178833,
"learning_rate": 9.987383265418677e-06,
"loss": 1.1405,
"step": 1231
},
{
"epoch": 0.3576716504572507,
"grad_norm": 3.853196859359741,
"learning_rate": 9.987315003583359e-06,
"loss": 1.2619,
"step": 1232
},
{
"epoch": 0.3579619683553491,
"grad_norm": 3.7360024452209473,
"learning_rate": 9.987246557817843e-06,
"loss": 1.241,
"step": 1233
},
{
"epoch": 0.3582522862534475,
"grad_norm": 3.7324812412261963,
"learning_rate": 9.987177928124651e-06,
"loss": 1.053,
"step": 1234
},
{
"epoch": 0.35854260415154593,
"grad_norm": 3.9284653663635254,
"learning_rate": 9.98710911450632e-06,
"loss": 1.3436,
"step": 1235
},
{
"epoch": 0.35883292204964434,
"grad_norm": 3.787597179412842,
"learning_rate": 9.987040116965381e-06,
"loss": 1.1066,
"step": 1236
},
{
"epoch": 0.35912323994774276,
"grad_norm": 3.7411112785339355,
"learning_rate": 9.98697093550438e-06,
"loss": 1.0653,
"step": 1237
},
{
"epoch": 0.35941355784584117,
"grad_norm": 3.5020062923431396,
"learning_rate": 9.986901570125873e-06,
"loss": 1.1956,
"step": 1238
},
{
"epoch": 0.35970387574393964,
"grad_norm": 3.475775718688965,
"learning_rate": 9.986832020832416e-06,
"loss": 1.1577,
"step": 1239
},
{
"epoch": 0.35999419364203805,
"grad_norm": 3.781212568283081,
"learning_rate": 9.98676228762657e-06,
"loss": 1.2847,
"step": 1240
},
{
"epoch": 0.36028451154013647,
"grad_norm": 3.5571868419647217,
"learning_rate": 9.98669237051091e-06,
"loss": 1.0893,
"step": 1241
},
{
"epoch": 0.3605748294382349,
"grad_norm": 3.7990763187408447,
"learning_rate": 9.986622269488017e-06,
"loss": 1.3096,
"step": 1242
},
{
"epoch": 0.3608651473363333,
"grad_norm": 3.936373710632324,
"learning_rate": 9.98655198456047e-06,
"loss": 1.2876,
"step": 1243
},
{
"epoch": 0.3611554652344317,
"grad_norm": 3.4436564445495605,
"learning_rate": 9.986481515730868e-06,
"loss": 1.1857,
"step": 1244
},
{
"epoch": 0.3614457831325301,
"grad_norm": 3.6510026454925537,
"learning_rate": 9.986410863001806e-06,
"loss": 1.277,
"step": 1245
},
{
"epoch": 0.36173610103062853,
"grad_norm": 4.282403469085693,
"learning_rate": 9.986340026375888e-06,
"loss": 1.2899,
"step": 1246
},
{
"epoch": 0.36202641892872695,
"grad_norm": 3.948631763458252,
"learning_rate": 9.98626900585573e-06,
"loss": 1.3668,
"step": 1247
},
{
"epoch": 0.36231673682682536,
"grad_norm": 3.5207550525665283,
"learning_rate": 9.98619780144395e-06,
"loss": 1.1732,
"step": 1248
},
{
"epoch": 0.36260705472492377,
"grad_norm": 3.9342057704925537,
"learning_rate": 9.986126413143173e-06,
"loss": 1.2864,
"step": 1249
},
{
"epoch": 0.3628973726230222,
"grad_norm": 4.076601982116699,
"learning_rate": 9.986054840956033e-06,
"loss": 1.3249,
"step": 1250
},
{
"epoch": 0.36318769052112065,
"grad_norm": 3.6744585037231445,
"learning_rate": 9.985983084885169e-06,
"loss": 1.1245,
"step": 1251
},
{
"epoch": 0.36347800841921907,
"grad_norm": 3.6365158557891846,
"learning_rate": 9.985911144933228e-06,
"loss": 1.1338,
"step": 1252
},
{
"epoch": 0.3637683263173175,
"grad_norm": 4.260592937469482,
"learning_rate": 9.985839021102862e-06,
"loss": 1.3485,
"step": 1253
},
{
"epoch": 0.3640586442154159,
"grad_norm": 3.7015466690063477,
"learning_rate": 9.985766713396732e-06,
"loss": 1.275,
"step": 1254
},
{
"epoch": 0.3643489621135143,
"grad_norm": 3.6575965881347656,
"learning_rate": 9.985694221817504e-06,
"loss": 1.1995,
"step": 1255
},
{
"epoch": 0.3646392800116127,
"grad_norm": 3.805546283721924,
"learning_rate": 9.985621546367851e-06,
"loss": 1.2516,
"step": 1256
},
{
"epoch": 0.36492959790971113,
"grad_norm": 3.6391587257385254,
"learning_rate": 9.985548687050454e-06,
"loss": 1.1948,
"step": 1257
},
{
"epoch": 0.36521991580780955,
"grad_norm": 3.510903835296631,
"learning_rate": 9.985475643868e-06,
"loss": 1.1434,
"step": 1258
},
{
"epoch": 0.36551023370590796,
"grad_norm": 3.690833806991577,
"learning_rate": 9.985402416823183e-06,
"loss": 1.3163,
"step": 1259
},
{
"epoch": 0.3658005516040064,
"grad_norm": 3.6341683864593506,
"learning_rate": 9.985329005918702e-06,
"loss": 1.2531,
"step": 1260
},
{
"epoch": 0.3660908695021048,
"grad_norm": 3.9021074771881104,
"learning_rate": 9.985255411157268e-06,
"loss": 1.3222,
"step": 1261
},
{
"epoch": 0.3663811874002032,
"grad_norm": 3.5397932529449463,
"learning_rate": 9.985181632541591e-06,
"loss": 1.1676,
"step": 1262
},
{
"epoch": 0.3666715052983016,
"grad_norm": 3.973975896835327,
"learning_rate": 9.985107670074394e-06,
"loss": 1.2106,
"step": 1263
},
{
"epoch": 0.3669618231964001,
"grad_norm": 3.945737600326538,
"learning_rate": 9.985033523758405e-06,
"loss": 1.2573,
"step": 1264
},
{
"epoch": 0.3672521410944985,
"grad_norm": 3.5193498134613037,
"learning_rate": 9.984959193596358e-06,
"loss": 1.1568,
"step": 1265
},
{
"epoch": 0.3675424589925969,
"grad_norm": 4.018974781036377,
"learning_rate": 9.984884679590994e-06,
"loss": 1.2194,
"step": 1266
},
{
"epoch": 0.3678327768906953,
"grad_norm": 3.666628122329712,
"learning_rate": 9.984809981745061e-06,
"loss": 1.3031,
"step": 1267
},
{
"epoch": 0.36812309478879374,
"grad_norm": 3.4612388610839844,
"learning_rate": 9.984735100061313e-06,
"loss": 1.1842,
"step": 1268
},
{
"epoch": 0.36841341268689215,
"grad_norm": 4.13927698135376,
"learning_rate": 9.984660034542512e-06,
"loss": 1.3674,
"step": 1269
},
{
"epoch": 0.36870373058499056,
"grad_norm": 3.5382606983184814,
"learning_rate": 9.98458478519143e-06,
"loss": 1.1857,
"step": 1270
},
{
"epoch": 0.368994048483089,
"grad_norm": 3.827183246612549,
"learning_rate": 9.984509352010839e-06,
"loss": 1.1914,
"step": 1271
},
{
"epoch": 0.3692843663811874,
"grad_norm": 3.528890609741211,
"learning_rate": 9.984433735003518e-06,
"loss": 1.1497,
"step": 1272
},
{
"epoch": 0.3695746842792858,
"grad_norm": 3.6063666343688965,
"learning_rate": 9.984357934172263e-06,
"loss": 1.2329,
"step": 1273
},
{
"epoch": 0.3698650021773842,
"grad_norm": 3.64660382270813,
"learning_rate": 9.984281949519861e-06,
"loss": 1.1589,
"step": 1274
},
{
"epoch": 0.37015532007548263,
"grad_norm": 3.4852254390716553,
"learning_rate": 9.984205781049122e-06,
"loss": 1.2945,
"step": 1275
},
{
"epoch": 0.3704456379735811,
"grad_norm": 4.028648376464844,
"learning_rate": 9.98412942876285e-06,
"loss": 1.1381,
"step": 1276
},
{
"epoch": 0.3707359558716795,
"grad_norm": 3.437859296798706,
"learning_rate": 9.984052892663863e-06,
"loss": 1.099,
"step": 1277
},
{
"epoch": 0.3710262737697779,
"grad_norm": 3.5467662811279297,
"learning_rate": 9.983976172754982e-06,
"loss": 1.1857,
"step": 1278
},
{
"epoch": 0.37131659166787634,
"grad_norm": 3.897996425628662,
"learning_rate": 9.98389926903904e-06,
"loss": 1.3047,
"step": 1279
},
{
"epoch": 0.37160690956597475,
"grad_norm": 3.553786516189575,
"learning_rate": 9.98382218151887e-06,
"loss": 1.1506,
"step": 1280
},
{
"epoch": 0.37189722746407317,
"grad_norm": 3.5104734897613525,
"learning_rate": 9.983744910197315e-06,
"loss": 1.2489,
"step": 1281
},
{
"epoch": 0.3721875453621716,
"grad_norm": 3.6049647331237793,
"learning_rate": 9.983667455077225e-06,
"loss": 1.2921,
"step": 1282
},
{
"epoch": 0.37247786326027,
"grad_norm": 3.746884822845459,
"learning_rate": 9.983589816161458e-06,
"loss": 1.0715,
"step": 1283
},
{
"epoch": 0.3727681811583684,
"grad_norm": 3.4639060497283936,
"learning_rate": 9.983511993452875e-06,
"loss": 1.2717,
"step": 1284
},
{
"epoch": 0.3730584990564668,
"grad_norm": 4.013452529907227,
"learning_rate": 9.983433986954349e-06,
"loss": 1.3516,
"step": 1285
},
{
"epoch": 0.37334881695456523,
"grad_norm": 3.8270010948181152,
"learning_rate": 9.983355796668755e-06,
"loss": 1.4126,
"step": 1286
},
{
"epoch": 0.37363913485266365,
"grad_norm": 3.6755404472351074,
"learning_rate": 9.983277422598976e-06,
"loss": 1.1109,
"step": 1287
},
{
"epoch": 0.37392945275076206,
"grad_norm": 3.8300483226776123,
"learning_rate": 9.983198864747904e-06,
"loss": 1.0732,
"step": 1288
},
{
"epoch": 0.37421977064886053,
"grad_norm": 3.9538397789001465,
"learning_rate": 9.983120123118435e-06,
"loss": 1.3122,
"step": 1289
},
{
"epoch": 0.37451008854695894,
"grad_norm": 3.865281343460083,
"learning_rate": 9.983041197713473e-06,
"loss": 1.3144,
"step": 1290
},
{
"epoch": 0.37480040644505735,
"grad_norm": 3.875990152359009,
"learning_rate": 9.982962088535928e-06,
"loss": 1.1896,
"step": 1291
},
{
"epoch": 0.37509072434315577,
"grad_norm": 3.8319966793060303,
"learning_rate": 9.98288279558872e-06,
"loss": 1.1693,
"step": 1292
},
{
"epoch": 0.3753810422412542,
"grad_norm": 3.9637584686279297,
"learning_rate": 9.982803318874772e-06,
"loss": 1.3056,
"step": 1293
},
{
"epoch": 0.3756713601393526,
"grad_norm": 3.718834400177002,
"learning_rate": 9.982723658397016e-06,
"loss": 1.3783,
"step": 1294
},
{
"epoch": 0.375961678037451,
"grad_norm": 3.859952688217163,
"learning_rate": 9.982643814158387e-06,
"loss": 1.224,
"step": 1295
},
{
"epoch": 0.3762519959355494,
"grad_norm": 3.4103081226348877,
"learning_rate": 9.982563786161831e-06,
"loss": 1.1378,
"step": 1296
},
{
"epoch": 0.37654231383364783,
"grad_norm": 3.879765510559082,
"learning_rate": 9.982483574410302e-06,
"loss": 1.2272,
"step": 1297
},
{
"epoch": 0.37683263173174625,
"grad_norm": 3.8443405628204346,
"learning_rate": 9.982403178906755e-06,
"loss": 1.2383,
"step": 1298
},
{
"epoch": 0.37712294962984466,
"grad_norm": 3.5465097427368164,
"learning_rate": 9.982322599654156e-06,
"loss": 1.1018,
"step": 1299
},
{
"epoch": 0.3774132675279431,
"grad_norm": 4.120823383331299,
"learning_rate": 9.982241836655475e-06,
"loss": 1.4552,
"step": 1300
},
{
"epoch": 0.3777035854260415,
"grad_norm": 3.9285216331481934,
"learning_rate": 9.982160889913695e-06,
"loss": 1.3464,
"step": 1301
},
{
"epoch": 0.37799390332413996,
"grad_norm": 3.467785596847534,
"learning_rate": 9.982079759431797e-06,
"loss": 1.0364,
"step": 1302
},
{
"epoch": 0.37828422122223837,
"grad_norm": 3.7329118251800537,
"learning_rate": 9.981998445212775e-06,
"loss": 1.3733,
"step": 1303
},
{
"epoch": 0.3785745391203368,
"grad_norm": 3.560277223587036,
"learning_rate": 9.981916947259627e-06,
"loss": 1.2214,
"step": 1304
},
{
"epoch": 0.3788648570184352,
"grad_norm": 3.2049508094787598,
"learning_rate": 9.981835265575358e-06,
"loss": 1.1433,
"step": 1305
},
{
"epoch": 0.3791551749165336,
"grad_norm": 3.6437489986419678,
"learning_rate": 9.981753400162984e-06,
"loss": 1.1825,
"step": 1306
},
{
"epoch": 0.379445492814632,
"grad_norm": 3.253337860107422,
"learning_rate": 9.981671351025519e-06,
"loss": 1.0779,
"step": 1307
},
{
"epoch": 0.37973581071273044,
"grad_norm": 3.6426970958709717,
"learning_rate": 9.981589118165993e-06,
"loss": 1.3683,
"step": 1308
},
{
"epoch": 0.38002612861082885,
"grad_norm": 3.8423707485198975,
"learning_rate": 9.981506701587437e-06,
"loss": 1.1725,
"step": 1309
},
{
"epoch": 0.38031644650892726,
"grad_norm": 3.6762940883636475,
"learning_rate": 9.98142410129289e-06,
"loss": 1.1383,
"step": 1310
},
{
"epoch": 0.3806067644070257,
"grad_norm": 3.8239686489105225,
"learning_rate": 9.9813413172854e-06,
"loss": 1.2646,
"step": 1311
},
{
"epoch": 0.3808970823051241,
"grad_norm": 3.683504581451416,
"learning_rate": 9.981258349568018e-06,
"loss": 1.2585,
"step": 1312
},
{
"epoch": 0.3811874002032225,
"grad_norm": 3.893596649169922,
"learning_rate": 9.981175198143805e-06,
"loss": 1.231,
"step": 1313
},
{
"epoch": 0.381477718101321,
"grad_norm": 3.4069478511810303,
"learning_rate": 9.981091863015828e-06,
"loss": 1.0599,
"step": 1314
},
{
"epoch": 0.3817680359994194,
"grad_norm": 3.19846248626709,
"learning_rate": 9.981008344187159e-06,
"loss": 1.0661,
"step": 1315
},
{
"epoch": 0.3820583538975178,
"grad_norm": 3.7466282844543457,
"learning_rate": 9.98092464166088e-06,
"loss": 1.1942,
"step": 1316
},
{
"epoch": 0.3823486717956162,
"grad_norm": 3.7203147411346436,
"learning_rate": 9.980840755440075e-06,
"loss": 1.1872,
"step": 1317
},
{
"epoch": 0.3826389896937146,
"grad_norm": 3.3040809631347656,
"learning_rate": 9.980756685527841e-06,
"loss": 1.0091,
"step": 1318
},
{
"epoch": 0.38292930759181304,
"grad_norm": 3.2888503074645996,
"learning_rate": 9.980672431927278e-06,
"loss": 1.148,
"step": 1319
},
{
"epoch": 0.38321962548991145,
"grad_norm": 3.654926061630249,
"learning_rate": 9.980587994641491e-06,
"loss": 1.3017,
"step": 1320
},
{
"epoch": 0.38350994338800987,
"grad_norm": 3.980696439743042,
"learning_rate": 9.980503373673594e-06,
"loss": 1.3312,
"step": 1321
},
{
"epoch": 0.3838002612861083,
"grad_norm": 3.6352922916412354,
"learning_rate": 9.980418569026711e-06,
"loss": 1.3227,
"step": 1322
},
{
"epoch": 0.3840905791842067,
"grad_norm": 3.5730032920837402,
"learning_rate": 9.980333580703968e-06,
"loss": 1.2282,
"step": 1323
},
{
"epoch": 0.3843808970823051,
"grad_norm": 3.418905258178711,
"learning_rate": 9.980248408708497e-06,
"loss": 1.1507,
"step": 1324
},
{
"epoch": 0.3846712149804035,
"grad_norm": 3.594193696975708,
"learning_rate": 9.980163053043441e-06,
"loss": 1.2218,
"step": 1325
},
{
"epoch": 0.38496153287850193,
"grad_norm": 3.8186099529266357,
"learning_rate": 9.98007751371195e-06,
"loss": 1.1744,
"step": 1326
},
{
"epoch": 0.3852518507766004,
"grad_norm": 3.8397912979125977,
"learning_rate": 9.979991790717174e-06,
"loss": 1.2721,
"step": 1327
},
{
"epoch": 0.3855421686746988,
"grad_norm": 3.193303346633911,
"learning_rate": 9.97990588406228e-06,
"loss": 1.1092,
"step": 1328
},
{
"epoch": 0.38583248657279723,
"grad_norm": 3.7081987857818604,
"learning_rate": 9.97981979375043e-06,
"loss": 1.2054,
"step": 1329
},
{
"epoch": 0.38612280447089564,
"grad_norm": 3.6489391326904297,
"learning_rate": 9.979733519784804e-06,
"loss": 1.2679,
"step": 1330
},
{
"epoch": 0.38641312236899406,
"grad_norm": 3.412721633911133,
"learning_rate": 9.979647062168582e-06,
"loss": 1.049,
"step": 1331
},
{
"epoch": 0.38670344026709247,
"grad_norm": 3.916553258895874,
"learning_rate": 9.979560420904953e-06,
"loss": 1.4672,
"step": 1332
},
{
"epoch": 0.3869937581651909,
"grad_norm": 3.6796796321868896,
"learning_rate": 9.97947359599711e-06,
"loss": 1.2632,
"step": 1333
},
{
"epoch": 0.3872840760632893,
"grad_norm": 3.4813990592956543,
"learning_rate": 9.979386587448257e-06,
"loss": 1.1071,
"step": 1334
},
{
"epoch": 0.3875743939613877,
"grad_norm": 3.768031120300293,
"learning_rate": 9.979299395261604e-06,
"loss": 1.3182,
"step": 1335
},
{
"epoch": 0.3878647118594861,
"grad_norm": 3.838653087615967,
"learning_rate": 9.979212019440364e-06,
"loss": 1.3277,
"step": 1336
},
{
"epoch": 0.38815502975758454,
"grad_norm": 3.5848910808563232,
"learning_rate": 9.97912445998776e-06,
"loss": 1.1353,
"step": 1337
},
{
"epoch": 0.38844534765568295,
"grad_norm": 3.538034439086914,
"learning_rate": 9.979036716907025e-06,
"loss": 1.3567,
"step": 1338
},
{
"epoch": 0.3887356655537814,
"grad_norm": 3.8515238761901855,
"learning_rate": 9.978948790201388e-06,
"loss": 1.1621,
"step": 1339
},
{
"epoch": 0.38902598345187983,
"grad_norm": 3.3468730449676514,
"learning_rate": 9.978860679874098e-06,
"loss": 1.1637,
"step": 1340
},
{
"epoch": 0.38931630134997824,
"grad_norm": 3.7249915599823,
"learning_rate": 9.9787723859284e-06,
"loss": 1.1381,
"step": 1341
},
{
"epoch": 0.38960661924807666,
"grad_norm": 3.5593464374542236,
"learning_rate": 9.978683908367555e-06,
"loss": 1.2549,
"step": 1342
},
{
"epoch": 0.38989693714617507,
"grad_norm": 3.818927526473999,
"learning_rate": 9.978595247194822e-06,
"loss": 1.3647,
"step": 1343
},
{
"epoch": 0.3901872550442735,
"grad_norm": 3.786468744277954,
"learning_rate": 9.978506402413472e-06,
"loss": 1.1994,
"step": 1344
},
{
"epoch": 0.3904775729423719,
"grad_norm": 3.9170660972595215,
"learning_rate": 9.97841737402678e-06,
"loss": 1.1363,
"step": 1345
},
{
"epoch": 0.3907678908404703,
"grad_norm": 3.4517476558685303,
"learning_rate": 9.978328162038032e-06,
"loss": 1.1165,
"step": 1346
},
{
"epoch": 0.3910582087385687,
"grad_norm": 3.631568670272827,
"learning_rate": 9.978238766450518e-06,
"loss": 1.199,
"step": 1347
},
{
"epoch": 0.39134852663666714,
"grad_norm": 3.3780012130737305,
"learning_rate": 9.978149187267532e-06,
"loss": 1.0625,
"step": 1348
},
{
"epoch": 0.39163884453476555,
"grad_norm": 3.4305973052978516,
"learning_rate": 9.97805942449238e-06,
"loss": 1.1939,
"step": 1349
},
{
"epoch": 0.39192916243286396,
"grad_norm": 3.3205480575561523,
"learning_rate": 9.977969478128373e-06,
"loss": 1.2248,
"step": 1350
},
{
"epoch": 0.3922194803309624,
"grad_norm": 3.6359150409698486,
"learning_rate": 9.977879348178826e-06,
"loss": 1.3019,
"step": 1351
},
{
"epoch": 0.39250979822906085,
"grad_norm": 3.7038495540618896,
"learning_rate": 9.977789034647066e-06,
"loss": 1.2069,
"step": 1352
},
{
"epoch": 0.39280011612715926,
"grad_norm": 3.569873094558716,
"learning_rate": 9.97769853753642e-06,
"loss": 1.2185,
"step": 1353
},
{
"epoch": 0.3930904340252577,
"grad_norm": 4.010556221008301,
"learning_rate": 9.977607856850227e-06,
"loss": 1.4308,
"step": 1354
},
{
"epoch": 0.3933807519233561,
"grad_norm": 3.732271432876587,
"learning_rate": 9.977516992591832e-06,
"loss": 1.3511,
"step": 1355
},
{
"epoch": 0.3936710698214545,
"grad_norm": 3.649620771408081,
"learning_rate": 9.977425944764585e-06,
"loss": 1.2222,
"step": 1356
},
{
"epoch": 0.3939613877195529,
"grad_norm": 3.5589444637298584,
"learning_rate": 9.977334713371844e-06,
"loss": 1.1794,
"step": 1357
},
{
"epoch": 0.3942517056176513,
"grad_norm": 3.443727970123291,
"learning_rate": 9.977243298416976e-06,
"loss": 1.2031,
"step": 1358
},
{
"epoch": 0.39454202351574974,
"grad_norm": 3.4052302837371826,
"learning_rate": 9.977151699903349e-06,
"loss": 1.3753,
"step": 1359
},
{
"epoch": 0.39483234141384815,
"grad_norm": 3.364332675933838,
"learning_rate": 9.977059917834342e-06,
"loss": 1.1101,
"step": 1360
},
{
"epoch": 0.39512265931194657,
"grad_norm": 3.46517276763916,
"learning_rate": 9.97696795221334e-06,
"loss": 1.1746,
"step": 1361
},
{
"epoch": 0.395412977210045,
"grad_norm": 3.6271650791168213,
"learning_rate": 9.976875803043737e-06,
"loss": 1.1741,
"step": 1362
},
{
"epoch": 0.3957032951081434,
"grad_norm": 3.873410224914551,
"learning_rate": 9.976783470328928e-06,
"loss": 1.2825,
"step": 1363
},
{
"epoch": 0.39599361300624186,
"grad_norm": 3.7868969440460205,
"learning_rate": 9.97669095407232e-06,
"loss": 1.3593,
"step": 1364
},
{
"epoch": 0.3962839309043403,
"grad_norm": 3.300156354904175,
"learning_rate": 9.976598254277324e-06,
"loss": 1.106,
"step": 1365
},
{
"epoch": 0.3965742488024387,
"grad_norm": 4.34855318069458,
"learning_rate": 9.97650537094736e-06,
"loss": 1.1779,
"step": 1366
},
{
"epoch": 0.3968645667005371,
"grad_norm": 3.3535711765289307,
"learning_rate": 9.976412304085852e-06,
"loss": 1.091,
"step": 1367
},
{
"epoch": 0.3971548845986355,
"grad_norm": 3.616659641265869,
"learning_rate": 9.976319053696236e-06,
"loss": 1.1698,
"step": 1368
},
{
"epoch": 0.39744520249673393,
"grad_norm": 3.9007325172424316,
"learning_rate": 9.976225619781944e-06,
"loss": 1.3209,
"step": 1369
},
{
"epoch": 0.39773552039483234,
"grad_norm": 3.554885149002075,
"learning_rate": 9.976132002346429e-06,
"loss": 1.0978,
"step": 1370
},
{
"epoch": 0.39802583829293076,
"grad_norm": 3.662487506866455,
"learning_rate": 9.976038201393138e-06,
"loss": 1.3094,
"step": 1371
},
{
"epoch": 0.39831615619102917,
"grad_norm": 3.5315754413604736,
"learning_rate": 9.975944216925533e-06,
"loss": 1.1677,
"step": 1372
},
{
"epoch": 0.3986064740891276,
"grad_norm": 3.787691831588745,
"learning_rate": 9.975850048947082e-06,
"loss": 1.294,
"step": 1373
},
{
"epoch": 0.398896791987226,
"grad_norm": 3.4021782875061035,
"learning_rate": 9.975755697461254e-06,
"loss": 1.1671,
"step": 1374
},
{
"epoch": 0.3991871098853244,
"grad_norm": 3.5344481468200684,
"learning_rate": 9.975661162471531e-06,
"loss": 1.061,
"step": 1375
},
{
"epoch": 0.3994774277834228,
"grad_norm": 3.530378580093384,
"learning_rate": 9.9755664439814e-06,
"loss": 1.1311,
"step": 1376
},
{
"epoch": 0.3997677456815213,
"grad_norm": 3.5945799350738525,
"learning_rate": 9.97547154199435e-06,
"loss": 1.0421,
"step": 1377
},
{
"epoch": 0.4000580635796197,
"grad_norm": 3.523029327392578,
"learning_rate": 9.975376456513886e-06,
"loss": 1.1865,
"step": 1378
},
{
"epoch": 0.4003483814777181,
"grad_norm": 3.855416774749756,
"learning_rate": 9.975281187543514e-06,
"loss": 1.3703,
"step": 1379
},
{
"epoch": 0.40063869937581653,
"grad_norm": 4.034465789794922,
"learning_rate": 9.975185735086745e-06,
"loss": 1.309,
"step": 1380
},
{
"epoch": 0.40092901727391494,
"grad_norm": 4.100909233093262,
"learning_rate": 9.9750900991471e-06,
"loss": 1.3027,
"step": 1381
},
{
"epoch": 0.40121933517201336,
"grad_norm": 3.6835947036743164,
"learning_rate": 9.974994279728105e-06,
"loss": 1.1245,
"step": 1382
},
{
"epoch": 0.40150965307011177,
"grad_norm": 3.456866979598999,
"learning_rate": 9.974898276833298e-06,
"loss": 1.1117,
"step": 1383
},
{
"epoch": 0.4017999709682102,
"grad_norm": 3.656215190887451,
"learning_rate": 9.974802090466216e-06,
"loss": 1.2049,
"step": 1384
},
{
"epoch": 0.4020902888663086,
"grad_norm": 4.105678081512451,
"learning_rate": 9.974705720630407e-06,
"loss": 1.4034,
"step": 1385
},
{
"epoch": 0.402380606764407,
"grad_norm": 3.769406795501709,
"learning_rate": 9.974609167329425e-06,
"loss": 1.365,
"step": 1386
},
{
"epoch": 0.4026709246625054,
"grad_norm": 3.7818362712860107,
"learning_rate": 9.974512430566829e-06,
"loss": 1.1959,
"step": 1387
},
{
"epoch": 0.40296124256060384,
"grad_norm": 3.7046732902526855,
"learning_rate": 9.974415510346192e-06,
"loss": 1.276,
"step": 1388
},
{
"epoch": 0.40325156045870225,
"grad_norm": 4.240913391113281,
"learning_rate": 9.974318406671083e-06,
"loss": 1.3754,
"step": 1389
},
{
"epoch": 0.4035418783568007,
"grad_norm": 3.827770948410034,
"learning_rate": 9.974221119545086e-06,
"loss": 1.1494,
"step": 1390
},
{
"epoch": 0.40383219625489913,
"grad_norm": 3.8236684799194336,
"learning_rate": 9.974123648971787e-06,
"loss": 1.3407,
"step": 1391
},
{
"epoch": 0.40412251415299755,
"grad_norm": 3.5897345542907715,
"learning_rate": 9.974025994954783e-06,
"loss": 1.1962,
"step": 1392
},
{
"epoch": 0.40441283205109596,
"grad_norm": 3.6147966384887695,
"learning_rate": 9.973928157497675e-06,
"loss": 1.2777,
"step": 1393
},
{
"epoch": 0.4047031499491944,
"grad_norm": 3.617846727371216,
"learning_rate": 9.973830136604068e-06,
"loss": 1.2909,
"step": 1394
},
{
"epoch": 0.4049934678472928,
"grad_norm": 3.4171886444091797,
"learning_rate": 9.973731932277581e-06,
"loss": 1.0739,
"step": 1395
},
{
"epoch": 0.4052837857453912,
"grad_norm": 3.370614767074585,
"learning_rate": 9.973633544521834e-06,
"loss": 1.1842,
"step": 1396
},
{
"epoch": 0.4055741036434896,
"grad_norm": 3.4126060009002686,
"learning_rate": 9.973534973340456e-06,
"loss": 1.1144,
"step": 1397
},
{
"epoch": 0.405864421541588,
"grad_norm": 3.8534622192382812,
"learning_rate": 9.97343621873708e-06,
"loss": 1.1885,
"step": 1398
},
{
"epoch": 0.40615473943968644,
"grad_norm": 3.420496940612793,
"learning_rate": 9.973337280715351e-06,
"loss": 1.1136,
"step": 1399
},
{
"epoch": 0.40644505733778485,
"grad_norm": 3.775999069213867,
"learning_rate": 9.973238159278917e-06,
"loss": 1.2418,
"step": 1400
},
{
"epoch": 0.40673537523588327,
"grad_norm": 3.9710724353790283,
"learning_rate": 9.973138854431433e-06,
"loss": 1.2584,
"step": 1401
},
{
"epoch": 0.40702569313398174,
"grad_norm": 3.2783279418945312,
"learning_rate": 9.97303936617656e-06,
"loss": 1.1942,
"step": 1402
},
{
"epoch": 0.40731601103208015,
"grad_norm": 3.7478137016296387,
"learning_rate": 9.972939694517971e-06,
"loss": 1.1562,
"step": 1403
},
{
"epoch": 0.40760632893017856,
"grad_norm": 3.628674030303955,
"learning_rate": 9.97283983945934e-06,
"loss": 1.2307,
"step": 1404
},
{
"epoch": 0.407896646828277,
"grad_norm": 3.313133716583252,
"learning_rate": 9.972739801004347e-06,
"loss": 1.0223,
"step": 1405
},
{
"epoch": 0.4081869647263754,
"grad_norm": 3.7657248973846436,
"learning_rate": 9.972639579156684e-06,
"loss": 1.2811,
"step": 1406
},
{
"epoch": 0.4084772826244738,
"grad_norm": 3.6290464401245117,
"learning_rate": 9.972539173920048e-06,
"loss": 1.1364,
"step": 1407
},
{
"epoch": 0.4087676005225722,
"grad_norm": 3.805755376815796,
"learning_rate": 9.972438585298139e-06,
"loss": 1.3117,
"step": 1408
},
{
"epoch": 0.40905791842067063,
"grad_norm": 3.6081717014312744,
"learning_rate": 9.972337813294668e-06,
"loss": 1.308,
"step": 1409
},
{
"epoch": 0.40934823631876904,
"grad_norm": 3.5983402729034424,
"learning_rate": 9.972236857913354e-06,
"loss": 1.1535,
"step": 1410
},
{
"epoch": 0.40963855421686746,
"grad_norm": 3.7803027629852295,
"learning_rate": 9.972135719157916e-06,
"loss": 1.3223,
"step": 1411
},
{
"epoch": 0.40992887211496587,
"grad_norm": 3.356072425842285,
"learning_rate": 9.972034397032086e-06,
"loss": 1.103,
"step": 1412
},
{
"epoch": 0.4102191900130643,
"grad_norm": 3.7912418842315674,
"learning_rate": 9.9719328915396e-06,
"loss": 1.1965,
"step": 1413
},
{
"epoch": 0.4105095079111627,
"grad_norm": 3.382089138031006,
"learning_rate": 9.971831202684203e-06,
"loss": 1.1991,
"step": 1414
},
{
"epoch": 0.41079982580926117,
"grad_norm": 3.6623477935791016,
"learning_rate": 9.971729330469644e-06,
"loss": 1.1763,
"step": 1415
},
{
"epoch": 0.4110901437073596,
"grad_norm": 3.4154701232910156,
"learning_rate": 9.97162727489968e-06,
"loss": 1.1038,
"step": 1416
},
{
"epoch": 0.411380461605458,
"grad_norm": 3.7780191898345947,
"learning_rate": 9.971525035978076e-06,
"loss": 1.315,
"step": 1417
},
{
"epoch": 0.4116707795035564,
"grad_norm": 3.626234292984009,
"learning_rate": 9.971422613708602e-06,
"loss": 1.2964,
"step": 1418
},
{
"epoch": 0.4119610974016548,
"grad_norm": 3.3718817234039307,
"learning_rate": 9.971320008095031e-06,
"loss": 1.2485,
"step": 1419
},
{
"epoch": 0.41225141529975323,
"grad_norm": 3.4189116954803467,
"learning_rate": 9.971217219141156e-06,
"loss": 1.1006,
"step": 1420
},
{
"epoch": 0.41254173319785165,
"grad_norm": 3.846132516860962,
"learning_rate": 9.97111424685076e-06,
"loss": 1.283,
"step": 1421
},
{
"epoch": 0.41283205109595006,
"grad_norm": 3.672684669494629,
"learning_rate": 9.971011091227642e-06,
"loss": 1.3357,
"step": 1422
},
{
"epoch": 0.41312236899404847,
"grad_norm": 3.523810863494873,
"learning_rate": 9.970907752275609e-06,
"loss": 1.2956,
"step": 1423
},
{
"epoch": 0.4134126868921469,
"grad_norm": 3.600360155105591,
"learning_rate": 9.970804229998472e-06,
"loss": 1.2537,
"step": 1424
},
{
"epoch": 0.4137030047902453,
"grad_norm": 3.5895142555236816,
"learning_rate": 9.970700524400047e-06,
"loss": 1.1542,
"step": 1425
},
{
"epoch": 0.4139933226883437,
"grad_norm": 3.9078710079193115,
"learning_rate": 9.970596635484158e-06,
"loss": 1.1888,
"step": 1426
},
{
"epoch": 0.4142836405864422,
"grad_norm": 3.8377363681793213,
"learning_rate": 9.970492563254638e-06,
"loss": 1.2513,
"step": 1427
},
{
"epoch": 0.4145739584845406,
"grad_norm": 3.7490737438201904,
"learning_rate": 9.970388307715326e-06,
"loss": 1.25,
"step": 1428
},
{
"epoch": 0.414864276382639,
"grad_norm": 3.806488275527954,
"learning_rate": 9.970283868870065e-06,
"loss": 1.1911,
"step": 1429
},
{
"epoch": 0.4151545942807374,
"grad_norm": 3.4695956707000732,
"learning_rate": 9.970179246722707e-06,
"loss": 1.1784,
"step": 1430
},
{
"epoch": 0.41544491217883583,
"grad_norm": 3.5068411827087402,
"learning_rate": 9.970074441277111e-06,
"loss": 1.2052,
"step": 1431
},
{
"epoch": 0.41573523007693425,
"grad_norm": 3.612985134124756,
"learning_rate": 9.96996945253714e-06,
"loss": 1.2254,
"step": 1432
},
{
"epoch": 0.41602554797503266,
"grad_norm": 3.5536611080169678,
"learning_rate": 9.96986428050667e-06,
"loss": 1.2219,
"step": 1433
},
{
"epoch": 0.4163158658731311,
"grad_norm": 3.725837469100952,
"learning_rate": 9.96975892518958e-06,
"loss": 1.174,
"step": 1434
},
{
"epoch": 0.4166061837712295,
"grad_norm": 3.201591968536377,
"learning_rate": 9.969653386589749e-06,
"loss": 0.9781,
"step": 1435
},
{
"epoch": 0.4168965016693279,
"grad_norm": 3.9703338146209717,
"learning_rate": 9.969547664711074e-06,
"loss": 1.1812,
"step": 1436
},
{
"epoch": 0.4171868195674263,
"grad_norm": 3.7230799198150635,
"learning_rate": 9.969441759557453e-06,
"loss": 1.102,
"step": 1437
},
{
"epoch": 0.41747713746552473,
"grad_norm": 3.4397854804992676,
"learning_rate": 9.969335671132793e-06,
"loss": 1.1384,
"step": 1438
},
{
"epoch": 0.41776745536362314,
"grad_norm": 3.201946258544922,
"learning_rate": 9.969229399441006e-06,
"loss": 1.0366,
"step": 1439
},
{
"epoch": 0.4180577732617216,
"grad_norm": 3.333623170852661,
"learning_rate": 9.96912294448601e-06,
"loss": 1.1551,
"step": 1440
},
{
"epoch": 0.41834809115982,
"grad_norm": 3.6181843280792236,
"learning_rate": 9.969016306271731e-06,
"loss": 1.2059,
"step": 1441
},
{
"epoch": 0.41863840905791844,
"grad_norm": 3.383269786834717,
"learning_rate": 9.968909484802103e-06,
"loss": 1.2181,
"step": 1442
},
{
"epoch": 0.41892872695601685,
"grad_norm": 3.3849494457244873,
"learning_rate": 9.968802480081065e-06,
"loss": 1.1045,
"step": 1443
},
{
"epoch": 0.41921904485411526,
"grad_norm": 3.6936628818511963,
"learning_rate": 9.968695292112564e-06,
"loss": 1.4005,
"step": 1444
},
{
"epoch": 0.4195093627522137,
"grad_norm": 3.769911050796509,
"learning_rate": 9.968587920900552e-06,
"loss": 1.3328,
"step": 1445
},
{
"epoch": 0.4197996806503121,
"grad_norm": 3.6452932357788086,
"learning_rate": 9.968480366448989e-06,
"loss": 1.2832,
"step": 1446
},
{
"epoch": 0.4200899985484105,
"grad_norm": 3.6828529834747314,
"learning_rate": 9.968372628761841e-06,
"loss": 1.2306,
"step": 1447
},
{
"epoch": 0.4203803164465089,
"grad_norm": 3.583516836166382,
"learning_rate": 9.968264707843083e-06,
"loss": 1.2331,
"step": 1448
},
{
"epoch": 0.42067063434460733,
"grad_norm": 4.031094074249268,
"learning_rate": 9.968156603696696e-06,
"loss": 1.2641,
"step": 1449
},
{
"epoch": 0.42096095224270574,
"grad_norm": 3.9242236614227295,
"learning_rate": 9.968048316326661e-06,
"loss": 1.2058,
"step": 1450
},
{
"epoch": 0.42125127014080416,
"grad_norm": 3.463057041168213,
"learning_rate": 9.967939845736978e-06,
"loss": 1.1357,
"step": 1451
},
{
"epoch": 0.4215415880389026,
"grad_norm": 3.4815406799316406,
"learning_rate": 9.967831191931645e-06,
"loss": 1.372,
"step": 1452
},
{
"epoch": 0.42183190593700104,
"grad_norm": 3.369882583618164,
"learning_rate": 9.967722354914668e-06,
"loss": 1.0852,
"step": 1453
},
{
"epoch": 0.42212222383509945,
"grad_norm": 3.3886513710021973,
"learning_rate": 9.967613334690065e-06,
"loss": 1.2646,
"step": 1454
},
{
"epoch": 0.42241254173319787,
"grad_norm": 3.631355047225952,
"learning_rate": 9.96750413126185e-06,
"loss": 1.1813,
"step": 1455
},
{
"epoch": 0.4227028596312963,
"grad_norm": 3.5558574199676514,
"learning_rate": 9.967394744634056e-06,
"loss": 1.2245,
"step": 1456
},
{
"epoch": 0.4229931775293947,
"grad_norm": 3.1339149475097656,
"learning_rate": 9.967285174810713e-06,
"loss": 1.0773,
"step": 1457
},
{
"epoch": 0.4232834954274931,
"grad_norm": 3.7277801036834717,
"learning_rate": 9.967175421795865e-06,
"loss": 1.3972,
"step": 1458
},
{
"epoch": 0.4235738133255915,
"grad_norm": 3.4788103103637695,
"learning_rate": 9.967065485593559e-06,
"loss": 1.2236,
"step": 1459
},
{
"epoch": 0.42386413122368993,
"grad_norm": 3.0842342376708984,
"learning_rate": 9.966955366207849e-06,
"loss": 1.0713,
"step": 1460
},
{
"epoch": 0.42415444912178835,
"grad_norm": 3.700028657913208,
"learning_rate": 9.966845063642795e-06,
"loss": 1.2501,
"step": 1461
},
{
"epoch": 0.42444476701988676,
"grad_norm": 3.3011817932128906,
"learning_rate": 9.966734577902469e-06,
"loss": 1.0213,
"step": 1462
},
{
"epoch": 0.4247350849179852,
"grad_norm": 3.7596285343170166,
"learning_rate": 9.96662390899094e-06,
"loss": 1.2344,
"step": 1463
},
{
"epoch": 0.4250254028160836,
"grad_norm": 3.251818895339966,
"learning_rate": 9.966513056912292e-06,
"loss": 1.1105,
"step": 1464
},
{
"epoch": 0.42531572071418206,
"grad_norm": 3.8628876209259033,
"learning_rate": 9.966402021670615e-06,
"loss": 1.2871,
"step": 1465
},
{
"epoch": 0.42560603861228047,
"grad_norm": 3.814058542251587,
"learning_rate": 9.966290803270003e-06,
"loss": 1.1547,
"step": 1466
},
{
"epoch": 0.4258963565103789,
"grad_norm": 3.737708330154419,
"learning_rate": 9.966179401714556e-06,
"loss": 1.2086,
"step": 1467
},
{
"epoch": 0.4261866744084773,
"grad_norm": 3.685622453689575,
"learning_rate": 9.966067817008383e-06,
"loss": 1.209,
"step": 1468
},
{
"epoch": 0.4264769923065757,
"grad_norm": 3.5678586959838867,
"learning_rate": 9.9659560491556e-06,
"loss": 1.3042,
"step": 1469
},
{
"epoch": 0.4267673102046741,
"grad_norm": 3.4052236080169678,
"learning_rate": 9.965844098160326e-06,
"loss": 1.084,
"step": 1470
},
{
"epoch": 0.42705762810277254,
"grad_norm": 3.542491912841797,
"learning_rate": 9.965731964026696e-06,
"loss": 1.2259,
"step": 1471
},
{
"epoch": 0.42734794600087095,
"grad_norm": 3.580087184906006,
"learning_rate": 9.96561964675884e-06,
"loss": 1.1961,
"step": 1472
},
{
"epoch": 0.42763826389896936,
"grad_norm": 3.7177071571350098,
"learning_rate": 9.965507146360902e-06,
"loss": 1.2361,
"step": 1473
},
{
"epoch": 0.4279285817970678,
"grad_norm": 3.361457586288452,
"learning_rate": 9.965394462837032e-06,
"loss": 1.1595,
"step": 1474
},
{
"epoch": 0.4282188996951662,
"grad_norm": 3.8086483478546143,
"learning_rate": 9.965281596191384e-06,
"loss": 1.4176,
"step": 1475
},
{
"epoch": 0.4285092175932646,
"grad_norm": 3.709951639175415,
"learning_rate": 9.965168546428122e-06,
"loss": 1.2644,
"step": 1476
},
{
"epoch": 0.428799535491363,
"grad_norm": 3.452254295349121,
"learning_rate": 9.965055313551413e-06,
"loss": 1.1387,
"step": 1477
},
{
"epoch": 0.4290898533894615,
"grad_norm": 3.2605044841766357,
"learning_rate": 9.964941897565434e-06,
"loss": 1.1387,
"step": 1478
},
{
"epoch": 0.4293801712875599,
"grad_norm": 3.717010498046875,
"learning_rate": 9.96482829847437e-06,
"loss": 1.2172,
"step": 1479
},
{
"epoch": 0.4296704891856583,
"grad_norm": 3.5657219886779785,
"learning_rate": 9.964714516282407e-06,
"loss": 1.2444,
"step": 1480
},
{
"epoch": 0.4299608070837567,
"grad_norm": 3.469438314437866,
"learning_rate": 9.964600550993744e-06,
"loss": 1.1068,
"step": 1481
},
{
"epoch": 0.43025112498185514,
"grad_norm": 3.4567294120788574,
"learning_rate": 9.96448640261258e-06,
"loss": 1.0813,
"step": 1482
},
{
"epoch": 0.43054144287995355,
"grad_norm": 3.3223202228546143,
"learning_rate": 9.964372071143131e-06,
"loss": 1.143,
"step": 1483
},
{
"epoch": 0.43083176077805196,
"grad_norm": 3.2226054668426514,
"learning_rate": 9.96425755658961e-06,
"loss": 1.189,
"step": 1484
},
{
"epoch": 0.4311220786761504,
"grad_norm": 3.6389126777648926,
"learning_rate": 9.964142858956239e-06,
"loss": 1.3073,
"step": 1485
},
{
"epoch": 0.4314123965742488,
"grad_norm": 3.3728039264678955,
"learning_rate": 9.964027978247248e-06,
"loss": 1.0786,
"step": 1486
},
{
"epoch": 0.4317027144723472,
"grad_norm": 3.0883610248565674,
"learning_rate": 9.963912914466877e-06,
"loss": 1.0915,
"step": 1487
},
{
"epoch": 0.4319930323704456,
"grad_norm": 3.4856998920440674,
"learning_rate": 9.963797667619368e-06,
"loss": 1.2368,
"step": 1488
},
{
"epoch": 0.43228335026854403,
"grad_norm": 3.4481701850891113,
"learning_rate": 9.96368223770897e-06,
"loss": 1.1896,
"step": 1489
},
{
"epoch": 0.4325736681666425,
"grad_norm": 3.5037729740142822,
"learning_rate": 9.963566624739939e-06,
"loss": 1.1162,
"step": 1490
},
{
"epoch": 0.4328639860647409,
"grad_norm": 3.3900668621063232,
"learning_rate": 9.963450828716543e-06,
"loss": 1.1042,
"step": 1491
},
{
"epoch": 0.4331543039628393,
"grad_norm": 3.7949774265289307,
"learning_rate": 9.96333484964305e-06,
"loss": 1.1798,
"step": 1492
},
{
"epoch": 0.43344462186093774,
"grad_norm": 3.6395134925842285,
"learning_rate": 9.963218687523737e-06,
"loss": 1.1875,
"step": 1493
},
{
"epoch": 0.43373493975903615,
"grad_norm": 3.550593376159668,
"learning_rate": 9.963102342362887e-06,
"loss": 1.2833,
"step": 1494
},
{
"epoch": 0.43402525765713457,
"grad_norm": 3.293470859527588,
"learning_rate": 9.962985814164794e-06,
"loss": 1.1546,
"step": 1495
},
{
"epoch": 0.434315575555233,
"grad_norm": 3.5365588665008545,
"learning_rate": 9.962869102933754e-06,
"loss": 1.1175,
"step": 1496
},
{
"epoch": 0.4346058934533314,
"grad_norm": 3.716935157775879,
"learning_rate": 9.962752208674069e-06,
"loss": 1.1918,
"step": 1497
},
{
"epoch": 0.4348962113514298,
"grad_norm": 3.8409154415130615,
"learning_rate": 9.962635131390054e-06,
"loss": 1.4506,
"step": 1498
},
{
"epoch": 0.4351865292495282,
"grad_norm": 3.3539156913757324,
"learning_rate": 9.962517871086023e-06,
"loss": 1.0693,
"step": 1499
},
{
"epoch": 0.43547684714762663,
"grad_norm": 3.8182966709136963,
"learning_rate": 9.962400427766304e-06,
"loss": 1.2104,
"step": 1500
},
{
"epoch": 0.43547684714762663,
"eval_loss": 1.2276885509490967,
"eval_runtime": 11.6259,
"eval_samples_per_second": 34.406,
"eval_steps_per_second": 4.301,
"step": 1500
},
{
"epoch": 0.43576716504572505,
"grad_norm": 3.362107992172241,
"learning_rate": 9.962282801435226e-06,
"loss": 1.1556,
"step": 1501
},
{
"epoch": 0.43605748294382346,
"grad_norm": 3.7278659343719482,
"learning_rate": 9.962164992097125e-06,
"loss": 1.2846,
"step": 1502
},
{
"epoch": 0.43634780084192193,
"grad_norm": 3.296018362045288,
"learning_rate": 9.962046999756352e-06,
"loss": 1.1573,
"step": 1503
},
{
"epoch": 0.43663811874002034,
"grad_norm": 3.632516860961914,
"learning_rate": 9.961928824417252e-06,
"loss": 1.3175,
"step": 1504
},
{
"epoch": 0.43692843663811876,
"grad_norm": 4.042605876922607,
"learning_rate": 9.961810466084188e-06,
"loss": 1.2586,
"step": 1505
},
{
"epoch": 0.43721875453621717,
"grad_norm": 3.322206735610962,
"learning_rate": 9.961691924761522e-06,
"loss": 1.0772,
"step": 1506
},
{
"epoch": 0.4375090724343156,
"grad_norm": 3.485081672668457,
"learning_rate": 9.961573200453627e-06,
"loss": 1.0764,
"step": 1507
},
{
"epoch": 0.437799390332414,
"grad_norm": 3.5794100761413574,
"learning_rate": 9.961454293164881e-06,
"loss": 1.1919,
"step": 1508
},
{
"epoch": 0.4380897082305124,
"grad_norm": 3.594174861907959,
"learning_rate": 9.96133520289967e-06,
"loss": 1.0832,
"step": 1509
},
{
"epoch": 0.4383800261286108,
"grad_norm": 3.7435574531555176,
"learning_rate": 9.961215929662385e-06,
"loss": 1.2706,
"step": 1510
},
{
"epoch": 0.43867034402670924,
"grad_norm": 3.7980704307556152,
"learning_rate": 9.961096473457425e-06,
"loss": 1.0843,
"step": 1511
},
{
"epoch": 0.43896066192480765,
"grad_norm": 3.568105459213257,
"learning_rate": 9.960976834289197e-06,
"loss": 1.1144,
"step": 1512
},
{
"epoch": 0.43925097982290606,
"grad_norm": 3.6921393871307373,
"learning_rate": 9.960857012162111e-06,
"loss": 1.2484,
"step": 1513
},
{
"epoch": 0.4395412977210045,
"grad_norm": 3.7946364879608154,
"learning_rate": 9.960737007080588e-06,
"loss": 1.262,
"step": 1514
},
{
"epoch": 0.43983161561910294,
"grad_norm": 3.6504087448120117,
"learning_rate": 9.960616819049053e-06,
"loss": 1.3897,
"step": 1515
},
{
"epoch": 0.44012193351720136,
"grad_norm": 3.5300283432006836,
"learning_rate": 9.960496448071936e-06,
"loss": 1.0847,
"step": 1516
},
{
"epoch": 0.44041225141529977,
"grad_norm": 3.1674747467041016,
"learning_rate": 9.960375894153682e-06,
"loss": 1.1272,
"step": 1517
},
{
"epoch": 0.4407025693133982,
"grad_norm": 3.515995502471924,
"learning_rate": 9.96025515729873e-06,
"loss": 1.1502,
"step": 1518
},
{
"epoch": 0.4409928872114966,
"grad_norm": 3.667440176010132,
"learning_rate": 9.960134237511538e-06,
"loss": 1.263,
"step": 1519
},
{
"epoch": 0.441283205109595,
"grad_norm": 3.8216376304626465,
"learning_rate": 9.960013134796564e-06,
"loss": 1.3115,
"step": 1520
},
{
"epoch": 0.4415735230076934,
"grad_norm": 3.4460253715515137,
"learning_rate": 9.959891849158275e-06,
"loss": 1.1301,
"step": 1521
},
{
"epoch": 0.44186384090579184,
"grad_norm": 3.636212110519409,
"learning_rate": 9.95977038060114e-06,
"loss": 1.3331,
"step": 1522
},
{
"epoch": 0.44215415880389025,
"grad_norm": 3.424614191055298,
"learning_rate": 9.959648729129642e-06,
"loss": 1.1076,
"step": 1523
},
{
"epoch": 0.44244447670198866,
"grad_norm": 3.6137311458587646,
"learning_rate": 9.959526894748268e-06,
"loss": 1.3869,
"step": 1524
},
{
"epoch": 0.4427347946000871,
"grad_norm": 3.550391912460327,
"learning_rate": 9.959404877461512e-06,
"loss": 1.2157,
"step": 1525
},
{
"epoch": 0.4430251124981855,
"grad_norm": 3.9449851512908936,
"learning_rate": 9.959282677273869e-06,
"loss": 1.1935,
"step": 1526
},
{
"epoch": 0.4433154303962839,
"grad_norm": 3.6746020317077637,
"learning_rate": 9.959160294189852e-06,
"loss": 1.3009,
"step": 1527
},
{
"epoch": 0.4436057482943824,
"grad_norm": 3.3976306915283203,
"learning_rate": 9.959037728213968e-06,
"loss": 1.3389,
"step": 1528
},
{
"epoch": 0.4438960661924808,
"grad_norm": 3.695160150527954,
"learning_rate": 9.958914979350743e-06,
"loss": 1.1807,
"step": 1529
},
{
"epoch": 0.4441863840905792,
"grad_norm": 3.731966257095337,
"learning_rate": 9.9587920476047e-06,
"loss": 1.2079,
"step": 1530
},
{
"epoch": 0.4444767019886776,
"grad_norm": 3.5896048545837402,
"learning_rate": 9.958668932980375e-06,
"loss": 1.1836,
"step": 1531
},
{
"epoch": 0.444767019886776,
"grad_norm": 3.400681972503662,
"learning_rate": 9.958545635482307e-06,
"loss": 1.1317,
"step": 1532
},
{
"epoch": 0.44505733778487444,
"grad_norm": 3.247178077697754,
"learning_rate": 9.958422155115044e-06,
"loss": 1.2038,
"step": 1533
},
{
"epoch": 0.44534765568297285,
"grad_norm": 3.610156536102295,
"learning_rate": 9.95829849188314e-06,
"loss": 1.1852,
"step": 1534
},
{
"epoch": 0.44563797358107127,
"grad_norm": 3.8021605014801025,
"learning_rate": 9.958174645791154e-06,
"loss": 1.4697,
"step": 1535
},
{
"epoch": 0.4459282914791697,
"grad_norm": 3.3716843128204346,
"learning_rate": 9.958050616843655e-06,
"loss": 1.1266,
"step": 1536
},
{
"epoch": 0.4462186093772681,
"grad_norm": 3.840357780456543,
"learning_rate": 9.957926405045219e-06,
"loss": 1.2474,
"step": 1537
},
{
"epoch": 0.4465089272753665,
"grad_norm": 3.4997823238372803,
"learning_rate": 9.957802010400423e-06,
"loss": 1.1936,
"step": 1538
},
{
"epoch": 0.4467992451734649,
"grad_norm": 3.3240110874176025,
"learning_rate": 9.957677432913855e-06,
"loss": 1.1124,
"step": 1539
},
{
"epoch": 0.4470895630715634,
"grad_norm": 3.7043850421905518,
"learning_rate": 9.957552672590111e-06,
"loss": 1.1571,
"step": 1540
},
{
"epoch": 0.4473798809696618,
"grad_norm": 3.405775308609009,
"learning_rate": 9.957427729433794e-06,
"loss": 1.2005,
"step": 1541
},
{
"epoch": 0.4476701988677602,
"grad_norm": 3.6422696113586426,
"learning_rate": 9.957302603449508e-06,
"loss": 1.3203,
"step": 1542
},
{
"epoch": 0.44796051676585863,
"grad_norm": 3.397426128387451,
"learning_rate": 9.95717729464187e-06,
"loss": 1.2018,
"step": 1543
},
{
"epoch": 0.44825083466395704,
"grad_norm": 3.974717617034912,
"learning_rate": 9.9570518030155e-06,
"loss": 1.3058,
"step": 1544
},
{
"epoch": 0.44854115256205546,
"grad_norm": 3.8308608531951904,
"learning_rate": 9.956926128575026e-06,
"loss": 1.2463,
"step": 1545
},
{
"epoch": 0.44883147046015387,
"grad_norm": 3.5619077682495117,
"learning_rate": 9.956800271325084e-06,
"loss": 1.2587,
"step": 1546
},
{
"epoch": 0.4491217883582523,
"grad_norm": 3.4124200344085693,
"learning_rate": 9.956674231270316e-06,
"loss": 1.1719,
"step": 1547
},
{
"epoch": 0.4494121062563507,
"grad_norm": 3.5342917442321777,
"learning_rate": 9.95654800841537e-06,
"loss": 1.1438,
"step": 1548
},
{
"epoch": 0.4497024241544491,
"grad_norm": 3.613375663757324,
"learning_rate": 9.956421602764899e-06,
"loss": 1.0305,
"step": 1549
},
{
"epoch": 0.4499927420525475,
"grad_norm": 3.55999493598938,
"learning_rate": 9.956295014323566e-06,
"loss": 1.122,
"step": 1550
},
{
"epoch": 0.45028305995064594,
"grad_norm": 3.425326347351074,
"learning_rate": 9.956168243096039e-06,
"loss": 0.9979,
"step": 1551
},
{
"epoch": 0.45057337784874435,
"grad_norm": 3.199810028076172,
"learning_rate": 9.956041289086995e-06,
"loss": 1.1511,
"step": 1552
},
{
"epoch": 0.4508636957468428,
"grad_norm": 3.714824914932251,
"learning_rate": 9.955914152301115e-06,
"loss": 1.2827,
"step": 1553
},
{
"epoch": 0.45115401364494123,
"grad_norm": 3.588531732559204,
"learning_rate": 9.955786832743089e-06,
"loss": 1.2596,
"step": 1554
},
{
"epoch": 0.45144433154303965,
"grad_norm": 3.7227511405944824,
"learning_rate": 9.955659330417608e-06,
"loss": 1.2919,
"step": 1555
},
{
"epoch": 0.45173464944113806,
"grad_norm": 3.487367868423462,
"learning_rate": 9.95553164532938e-06,
"loss": 1.118,
"step": 1556
},
{
"epoch": 0.45202496733923647,
"grad_norm": 3.4509451389312744,
"learning_rate": 9.955403777483112e-06,
"loss": 1.1279,
"step": 1557
},
{
"epoch": 0.4523152852373349,
"grad_norm": 3.383143663406372,
"learning_rate": 9.955275726883517e-06,
"loss": 1.0833,
"step": 1558
},
{
"epoch": 0.4526056031354333,
"grad_norm": 3.47957444190979,
"learning_rate": 9.955147493535321e-06,
"loss": 1.0278,
"step": 1559
},
{
"epoch": 0.4528959210335317,
"grad_norm": 3.340008497238159,
"learning_rate": 9.95501907744325e-06,
"loss": 1.2062,
"step": 1560
},
{
"epoch": 0.4531862389316301,
"grad_norm": 3.7595670223236084,
"learning_rate": 9.954890478612045e-06,
"loss": 1.142,
"step": 1561
},
{
"epoch": 0.45347655682972854,
"grad_norm": 3.9946539402008057,
"learning_rate": 9.954761697046445e-06,
"loss": 1.326,
"step": 1562
},
{
"epoch": 0.45376687472782695,
"grad_norm": 3.490159273147583,
"learning_rate": 9.954632732751196e-06,
"loss": 1.2648,
"step": 1563
},
{
"epoch": 0.45405719262592537,
"grad_norm": 3.5875537395477295,
"learning_rate": 9.954503585731061e-06,
"loss": 1.3082,
"step": 1564
},
{
"epoch": 0.4543475105240238,
"grad_norm": 3.562396764755249,
"learning_rate": 9.9543742559908e-06,
"loss": 1.1663,
"step": 1565
},
{
"epoch": 0.45463782842212225,
"grad_norm": 3.3653926849365234,
"learning_rate": 9.954244743535181e-06,
"loss": 1.1193,
"step": 1566
},
{
"epoch": 0.45492814632022066,
"grad_norm": 3.2246313095092773,
"learning_rate": 9.954115048368984e-06,
"loss": 1.1123,
"step": 1567
},
{
"epoch": 0.4552184642183191,
"grad_norm": 3.596186876296997,
"learning_rate": 9.953985170496989e-06,
"loss": 1.1279,
"step": 1568
},
{
"epoch": 0.4555087821164175,
"grad_norm": 3.476844072341919,
"learning_rate": 9.953855109923987e-06,
"loss": 1.1921,
"step": 1569
},
{
"epoch": 0.4557991000145159,
"grad_norm": 3.237635612487793,
"learning_rate": 9.953724866654775e-06,
"loss": 1.1454,
"step": 1570
},
{
"epoch": 0.4560894179126143,
"grad_norm": 3.8363256454467773,
"learning_rate": 9.953594440694154e-06,
"loss": 1.3695,
"step": 1571
},
{
"epoch": 0.45637973581071273,
"grad_norm": 3.6838483810424805,
"learning_rate": 9.953463832046936e-06,
"loss": 1.1117,
"step": 1572
},
{
"epoch": 0.45667005370881114,
"grad_norm": 3.7038700580596924,
"learning_rate": 9.953333040717938e-06,
"loss": 1.1542,
"step": 1573
},
{
"epoch": 0.45696037160690955,
"grad_norm": 3.6134519577026367,
"learning_rate": 9.953202066711985e-06,
"loss": 1.0318,
"step": 1574
},
{
"epoch": 0.45725068950500797,
"grad_norm": 3.8328986167907715,
"learning_rate": 9.953070910033904e-06,
"loss": 1.3801,
"step": 1575
},
{
"epoch": 0.4575410074031064,
"grad_norm": 3.912644624710083,
"learning_rate": 9.952939570688532e-06,
"loss": 1.1623,
"step": 1576
},
{
"epoch": 0.4578313253012048,
"grad_norm": 3.586677074432373,
"learning_rate": 9.952808048680716e-06,
"loss": 1.1486,
"step": 1577
},
{
"epoch": 0.45812164319930326,
"grad_norm": 3.4135282039642334,
"learning_rate": 9.952676344015304e-06,
"loss": 1.1422,
"step": 1578
},
{
"epoch": 0.4584119610974017,
"grad_norm": 3.606527090072632,
"learning_rate": 9.952544456697153e-06,
"loss": 1.1445,
"step": 1579
},
{
"epoch": 0.4587022789955001,
"grad_norm": 3.4661526679992676,
"learning_rate": 9.95241238673113e-06,
"loss": 1.2884,
"step": 1580
},
{
"epoch": 0.4589925968935985,
"grad_norm": 3.521548271179199,
"learning_rate": 9.9522801341221e-06,
"loss": 1.1434,
"step": 1581
},
{
"epoch": 0.4592829147916969,
"grad_norm": 3.239595890045166,
"learning_rate": 9.952147698874948e-06,
"loss": 1.0461,
"step": 1582
},
{
"epoch": 0.45957323268979533,
"grad_norm": 3.476299524307251,
"learning_rate": 9.95201508099455e-06,
"loss": 1.2104,
"step": 1583
},
{
"epoch": 0.45986355058789374,
"grad_norm": 3.398822784423828,
"learning_rate": 9.951882280485805e-06,
"loss": 1.2335,
"step": 1584
},
{
"epoch": 0.46015386848599216,
"grad_norm": 3.5042576789855957,
"learning_rate": 9.951749297353605e-06,
"loss": 1.1804,
"step": 1585
},
{
"epoch": 0.46044418638409057,
"grad_norm": 3.163114547729492,
"learning_rate": 9.951616131602855e-06,
"loss": 1.0034,
"step": 1586
},
{
"epoch": 0.460734504282189,
"grad_norm": 3.456465244293213,
"learning_rate": 9.951482783238468e-06,
"loss": 1.1458,
"step": 1587
},
{
"epoch": 0.4610248221802874,
"grad_norm": 3.5666391849517822,
"learning_rate": 9.95134925226536e-06,
"loss": 1.2278,
"step": 1588
},
{
"epoch": 0.4613151400783858,
"grad_norm": 3.3443286418914795,
"learning_rate": 9.951215538688456e-06,
"loss": 1.1107,
"step": 1589
},
{
"epoch": 0.4616054579764842,
"grad_norm": 3.3506739139556885,
"learning_rate": 9.95108164251269e-06,
"loss": 1.0595,
"step": 1590
},
{
"epoch": 0.4618957758745827,
"grad_norm": 3.423740863800049,
"learning_rate": 9.950947563742997e-06,
"loss": 1.0907,
"step": 1591
},
{
"epoch": 0.4621860937726811,
"grad_norm": 3.432969808578491,
"learning_rate": 9.950813302384322e-06,
"loss": 1.13,
"step": 1592
},
{
"epoch": 0.4624764116707795,
"grad_norm": 3.5011508464813232,
"learning_rate": 9.950678858441616e-06,
"loss": 1.2519,
"step": 1593
},
{
"epoch": 0.46276672956887793,
"grad_norm": 3.8555173873901367,
"learning_rate": 9.950544231919841e-06,
"loss": 1.2269,
"step": 1594
},
{
"epoch": 0.46305704746697635,
"grad_norm": 3.934401750564575,
"learning_rate": 9.950409422823957e-06,
"loss": 1.4339,
"step": 1595
},
{
"epoch": 0.46334736536507476,
"grad_norm": 3.346092700958252,
"learning_rate": 9.95027443115894e-06,
"loss": 1.0944,
"step": 1596
},
{
"epoch": 0.4636376832631732,
"grad_norm": 3.4434292316436768,
"learning_rate": 9.950139256929765e-06,
"loss": 1.2216,
"step": 1597
},
{
"epoch": 0.4639280011612716,
"grad_norm": 3.2562990188598633,
"learning_rate": 9.950003900141418e-06,
"loss": 1.139,
"step": 1598
},
{
"epoch": 0.46421831905937,
"grad_norm": 3.5434086322784424,
"learning_rate": 9.949868360798893e-06,
"loss": 1.1897,
"step": 1599
},
{
"epoch": 0.4645086369574684,
"grad_norm": 3.396911382675171,
"learning_rate": 9.949732638907186e-06,
"loss": 1.1597,
"step": 1600
},
{
"epoch": 0.4647989548555668,
"grad_norm": 3.415681838989258,
"learning_rate": 9.949596734471304e-06,
"loss": 1.0674,
"step": 1601
},
{
"epoch": 0.46508927275366524,
"grad_norm": 3.7457592487335205,
"learning_rate": 9.949460647496258e-06,
"loss": 1.4005,
"step": 1602
},
{
"epoch": 0.4653795906517637,
"grad_norm": 3.612797737121582,
"learning_rate": 9.949324377987069e-06,
"loss": 1.226,
"step": 1603
},
{
"epoch": 0.4656699085498621,
"grad_norm": 3.3432776927948,
"learning_rate": 9.94918792594876e-06,
"loss": 1.1843,
"step": 1604
},
{
"epoch": 0.46596022644796053,
"grad_norm": 3.31148624420166,
"learning_rate": 9.949051291386365e-06,
"loss": 1.0242,
"step": 1605
},
{
"epoch": 0.46625054434605895,
"grad_norm": 3.4100899696350098,
"learning_rate": 9.948914474304922e-06,
"loss": 1.2697,
"step": 1606
},
{
"epoch": 0.46654086224415736,
"grad_norm": 3.507978916168213,
"learning_rate": 9.948777474709477e-06,
"loss": 1.0508,
"step": 1607
},
{
"epoch": 0.4668311801422558,
"grad_norm": 3.3306009769439697,
"learning_rate": 9.948640292605081e-06,
"loss": 1.1063,
"step": 1608
},
{
"epoch": 0.4671214980403542,
"grad_norm": 3.5736498832702637,
"learning_rate": 9.948502927996797e-06,
"loss": 1.1513,
"step": 1609
},
{
"epoch": 0.4674118159384526,
"grad_norm": 3.465364933013916,
"learning_rate": 9.948365380889688e-06,
"loss": 1.1332,
"step": 1610
},
{
"epoch": 0.467702133836551,
"grad_norm": 3.7221927642822266,
"learning_rate": 9.948227651288828e-06,
"loss": 1.1749,
"step": 1611
},
{
"epoch": 0.46799245173464943,
"grad_norm": 3.762308359146118,
"learning_rate": 9.948089739199296e-06,
"loss": 1.1774,
"step": 1612
},
{
"epoch": 0.46828276963274784,
"grad_norm": 3.715789794921875,
"learning_rate": 9.947951644626177e-06,
"loss": 1.2722,
"step": 1613
},
{
"epoch": 0.46857308753084626,
"grad_norm": 3.3541386127471924,
"learning_rate": 9.947813367574564e-06,
"loss": 1.1911,
"step": 1614
},
{
"epoch": 0.46886340542894467,
"grad_norm": 3.2428276538848877,
"learning_rate": 9.94767490804956e-06,
"loss": 1.0802,
"step": 1615
},
{
"epoch": 0.46915372332704314,
"grad_norm": 3.3992302417755127,
"learning_rate": 9.947536266056269e-06,
"loss": 1.1518,
"step": 1616
},
{
"epoch": 0.46944404122514155,
"grad_norm": 3.9083375930786133,
"learning_rate": 9.947397441599801e-06,
"loss": 1.3027,
"step": 1617
},
{
"epoch": 0.46973435912323996,
"grad_norm": 4.152743816375732,
"learning_rate": 9.947258434685281e-06,
"loss": 1.2554,
"step": 1618
},
{
"epoch": 0.4700246770213384,
"grad_norm": 4.119356632232666,
"learning_rate": 9.947119245317832e-06,
"loss": 1.2819,
"step": 1619
},
{
"epoch": 0.4703149949194368,
"grad_norm": 3.8427681922912598,
"learning_rate": 9.946979873502589e-06,
"loss": 1.2107,
"step": 1620
},
{
"epoch": 0.4706053128175352,
"grad_norm": 3.865187883377075,
"learning_rate": 9.94684031924469e-06,
"loss": 1.2721,
"step": 1621
},
{
"epoch": 0.4708956307156336,
"grad_norm": 3.146252155303955,
"learning_rate": 9.946700582549285e-06,
"loss": 1.0884,
"step": 1622
},
{
"epoch": 0.47118594861373203,
"grad_norm": 3.6837799549102783,
"learning_rate": 9.946560663421525e-06,
"loss": 1.1676,
"step": 1623
},
{
"epoch": 0.47147626651183044,
"grad_norm": 3.769131898880005,
"learning_rate": 9.94642056186657e-06,
"loss": 1.3335,
"step": 1624
},
{
"epoch": 0.47176658440992886,
"grad_norm": 3.6001875400543213,
"learning_rate": 9.946280277889589e-06,
"loss": 1.1265,
"step": 1625
},
{
"epoch": 0.47205690230802727,
"grad_norm": 4.254703044891357,
"learning_rate": 9.946139811495752e-06,
"loss": 1.3297,
"step": 1626
},
{
"epoch": 0.4723472202061257,
"grad_norm": 3.61510968208313,
"learning_rate": 9.945999162690243e-06,
"loss": 1.1887,
"step": 1627
},
{
"epoch": 0.47263753810422415,
"grad_norm": 3.536651611328125,
"learning_rate": 9.945858331478249e-06,
"loss": 1.098,
"step": 1628
},
{
"epoch": 0.47292785600232257,
"grad_norm": 3.742727041244507,
"learning_rate": 9.94571731786496e-06,
"loss": 1.3315,
"step": 1629
},
{
"epoch": 0.473218173900421,
"grad_norm": 3.31262469291687,
"learning_rate": 9.94557612185558e-06,
"loss": 1.1736,
"step": 1630
},
{
"epoch": 0.4735084917985194,
"grad_norm": 3.649885892868042,
"learning_rate": 9.945434743455315e-06,
"loss": 1.1563,
"step": 1631
},
{
"epoch": 0.4737988096966178,
"grad_norm": 3.665729284286499,
"learning_rate": 9.945293182669379e-06,
"loss": 1.1454,
"step": 1632
},
{
"epoch": 0.4740891275947162,
"grad_norm": 3.2671260833740234,
"learning_rate": 9.945151439502994e-06,
"loss": 1.1382,
"step": 1633
},
{
"epoch": 0.47437944549281463,
"grad_norm": 3.785245180130005,
"learning_rate": 9.945009513961386e-06,
"loss": 1.1418,
"step": 1634
},
{
"epoch": 0.47466976339091305,
"grad_norm": 3.435044527053833,
"learning_rate": 9.94486740604979e-06,
"loss": 1.1039,
"step": 1635
},
{
"epoch": 0.47496008128901146,
"grad_norm": 3.3379416465759277,
"learning_rate": 9.944725115773444e-06,
"loss": 1.1867,
"step": 1636
},
{
"epoch": 0.4752503991871099,
"grad_norm": 3.381946563720703,
"learning_rate": 9.9445826431376e-06,
"loss": 1.1049,
"step": 1637
},
{
"epoch": 0.4755407170852083,
"grad_norm": 3.501094341278076,
"learning_rate": 9.944439988147509e-06,
"loss": 1.2041,
"step": 1638
},
{
"epoch": 0.4758310349833067,
"grad_norm": 3.4139304161071777,
"learning_rate": 9.944297150808435e-06,
"loss": 1.1924,
"step": 1639
},
{
"epoch": 0.4761213528814051,
"grad_norm": 3.5083329677581787,
"learning_rate": 9.944154131125643e-06,
"loss": 1.094,
"step": 1640
},
{
"epoch": 0.4764116707795036,
"grad_norm": 3.6780874729156494,
"learning_rate": 9.94401092910441e-06,
"loss": 1.3628,
"step": 1641
},
{
"epoch": 0.476701988677602,
"grad_norm": 3.515752077102661,
"learning_rate": 9.943867544750014e-06,
"loss": 1.2409,
"step": 1642
},
{
"epoch": 0.4769923065757004,
"grad_norm": 3.191023349761963,
"learning_rate": 9.943723978067747e-06,
"loss": 0.9894,
"step": 1643
},
{
"epoch": 0.4772826244737988,
"grad_norm": 3.679292678833008,
"learning_rate": 9.943580229062899e-06,
"loss": 1.2552,
"step": 1644
},
{
"epoch": 0.47757294237189724,
"grad_norm": 3.752819299697876,
"learning_rate": 9.943436297740775e-06,
"loss": 1.3449,
"step": 1645
},
{
"epoch": 0.47786326026999565,
"grad_norm": 3.826658248901367,
"learning_rate": 9.943292184106684e-06,
"loss": 1.239,
"step": 1646
},
{
"epoch": 0.47815357816809406,
"grad_norm": 3.6658759117126465,
"learning_rate": 9.943147888165936e-06,
"loss": 1.2737,
"step": 1647
},
{
"epoch": 0.4784438960661925,
"grad_norm": 3.1992828845977783,
"learning_rate": 9.943003409923857e-06,
"loss": 1.1231,
"step": 1648
},
{
"epoch": 0.4787342139642909,
"grad_norm": 4.053700923919678,
"learning_rate": 9.942858749385774e-06,
"loss": 1.1836,
"step": 1649
},
{
"epoch": 0.4790245318623893,
"grad_norm": 3.2630503177642822,
"learning_rate": 9.942713906557022e-06,
"loss": 1.2698,
"step": 1650
},
{
"epoch": 0.4793148497604877,
"grad_norm": 3.746953010559082,
"learning_rate": 9.942568881442942e-06,
"loss": 1.302,
"step": 1651
},
{
"epoch": 0.47960516765858613,
"grad_norm": 3.554513692855835,
"learning_rate": 9.942423674048883e-06,
"loss": 1.233,
"step": 1652
},
{
"epoch": 0.47989548555668454,
"grad_norm": 3.5243356227874756,
"learning_rate": 9.9422782843802e-06,
"loss": 1.2497,
"step": 1653
},
{
"epoch": 0.480185803454783,
"grad_norm": 3.6694653034210205,
"learning_rate": 9.942132712442256e-06,
"loss": 1.1968,
"step": 1654
},
{
"epoch": 0.4804761213528814,
"grad_norm": 3.7765867710113525,
"learning_rate": 9.941986958240419e-06,
"loss": 1.3024,
"step": 1655
},
{
"epoch": 0.48076643925097984,
"grad_norm": 3.853088855743408,
"learning_rate": 9.941841021780064e-06,
"loss": 1.2627,
"step": 1656
},
{
"epoch": 0.48105675714907825,
"grad_norm": 3.233306646347046,
"learning_rate": 9.941694903066572e-06,
"loss": 1.1378,
"step": 1657
},
{
"epoch": 0.48134707504717666,
"grad_norm": 3.6022415161132812,
"learning_rate": 9.941548602105333e-06,
"loss": 1.1581,
"step": 1658
},
{
"epoch": 0.4816373929452751,
"grad_norm": 3.3151590824127197,
"learning_rate": 9.941402118901743e-06,
"loss": 1.0879,
"step": 1659
},
{
"epoch": 0.4819277108433735,
"grad_norm": 3.559082508087158,
"learning_rate": 9.941255453461205e-06,
"loss": 1.2952,
"step": 1660
},
{
"epoch": 0.4822180287414719,
"grad_norm": 3.499293565750122,
"learning_rate": 9.941108605789125e-06,
"loss": 1.1496,
"step": 1661
},
{
"epoch": 0.4825083466395703,
"grad_norm": 3.5328094959259033,
"learning_rate": 9.940961575890921e-06,
"loss": 1.1707,
"step": 1662
},
{
"epoch": 0.48279866453766873,
"grad_norm": 3.5672430992126465,
"learning_rate": 9.940814363772016e-06,
"loss": 1.1496,
"step": 1663
},
{
"epoch": 0.48308898243576714,
"grad_norm": 3.3060715198516846,
"learning_rate": 9.940666969437836e-06,
"loss": 1.1478,
"step": 1664
},
{
"epoch": 0.48337930033386556,
"grad_norm": 3.711249828338623,
"learning_rate": 9.94051939289382e-06,
"loss": 1.2939,
"step": 1665
},
{
"epoch": 0.483669618231964,
"grad_norm": 3.299621343612671,
"learning_rate": 9.94037163414541e-06,
"loss": 1.1671,
"step": 1666
},
{
"epoch": 0.48395993613006244,
"grad_norm": 3.329033851623535,
"learning_rate": 9.940223693198054e-06,
"loss": 1.1649,
"step": 1667
},
{
"epoch": 0.48425025402816085,
"grad_norm": 3.5311896800994873,
"learning_rate": 9.940075570057209e-06,
"loss": 1.2479,
"step": 1668
},
{
"epoch": 0.48454057192625927,
"grad_norm": 3.478177785873413,
"learning_rate": 9.939927264728337e-06,
"loss": 1.0782,
"step": 1669
},
{
"epoch": 0.4848308898243577,
"grad_norm": 3.5076146125793457,
"learning_rate": 9.939778777216906e-06,
"loss": 1.1456,
"step": 1670
},
{
"epoch": 0.4851212077224561,
"grad_norm": 3.6281466484069824,
"learning_rate": 9.939630107528398e-06,
"loss": 1.1161,
"step": 1671
},
{
"epoch": 0.4854115256205545,
"grad_norm": 3.4649016857147217,
"learning_rate": 9.93948125566829e-06,
"loss": 1.1357,
"step": 1672
},
{
"epoch": 0.4857018435186529,
"grad_norm": 3.5469138622283936,
"learning_rate": 9.939332221642072e-06,
"loss": 1.1384,
"step": 1673
},
{
"epoch": 0.48599216141675133,
"grad_norm": 3.2848334312438965,
"learning_rate": 9.939183005455243e-06,
"loss": 1.1347,
"step": 1674
},
{
"epoch": 0.48628247931484975,
"grad_norm": 3.8708393573760986,
"learning_rate": 9.939033607113304e-06,
"loss": 1.2536,
"step": 1675
},
{
"epoch": 0.48657279721294816,
"grad_norm": 3.4363129138946533,
"learning_rate": 9.938884026621766e-06,
"loss": 1.2946,
"step": 1676
},
{
"epoch": 0.4868631151110466,
"grad_norm": 3.6415820121765137,
"learning_rate": 9.938734263986144e-06,
"loss": 1.2418,
"step": 1677
},
{
"epoch": 0.487153433009145,
"grad_norm": 3.5188496112823486,
"learning_rate": 9.938584319211965e-06,
"loss": 1.2058,
"step": 1678
},
{
"epoch": 0.48744375090724346,
"grad_norm": 3.30953049659729,
"learning_rate": 9.938434192304756e-06,
"loss": 1.1317,
"step": 1679
},
{
"epoch": 0.48773406880534187,
"grad_norm": 3.760052442550659,
"learning_rate": 9.938283883270051e-06,
"loss": 1.1917,
"step": 1680
},
{
"epoch": 0.4880243867034403,
"grad_norm": 3.384671688079834,
"learning_rate": 9.938133392113399e-06,
"loss": 1.1273,
"step": 1681
},
{
"epoch": 0.4883147046015387,
"grad_norm": 3.7452921867370605,
"learning_rate": 9.937982718840345e-06,
"loss": 1.2016,
"step": 1682
},
{
"epoch": 0.4886050224996371,
"grad_norm": 3.7120046615600586,
"learning_rate": 9.937831863456448e-06,
"loss": 1.3403,
"step": 1683
},
{
"epoch": 0.4888953403977355,
"grad_norm": 3.808293581008911,
"learning_rate": 9.937680825967272e-06,
"loss": 1.165,
"step": 1684
},
{
"epoch": 0.48918565829583394,
"grad_norm": 3.2630043029785156,
"learning_rate": 9.937529606378387e-06,
"loss": 1.125,
"step": 1685
},
{
"epoch": 0.48947597619393235,
"grad_norm": 3.6727232933044434,
"learning_rate": 9.937378204695368e-06,
"loss": 1.0798,
"step": 1686
},
{
"epoch": 0.48976629409203076,
"grad_norm": 3.460695505142212,
"learning_rate": 9.9372266209238e-06,
"loss": 1.1424,
"step": 1687
},
{
"epoch": 0.4900566119901292,
"grad_norm": 3.477473258972168,
"learning_rate": 9.937074855069276e-06,
"loss": 1.2076,
"step": 1688
},
{
"epoch": 0.4903469298882276,
"grad_norm": 3.641740322113037,
"learning_rate": 9.93692290713739e-06,
"loss": 1.3142,
"step": 1689
},
{
"epoch": 0.490637247786326,
"grad_norm": 3.400716543197632,
"learning_rate": 9.936770777133744e-06,
"loss": 1.2266,
"step": 1690
},
{
"epoch": 0.49092756568442447,
"grad_norm": 3.436521053314209,
"learning_rate": 9.936618465063955e-06,
"loss": 1.1197,
"step": 1691
},
{
"epoch": 0.4912178835825229,
"grad_norm": 3.466358184814453,
"learning_rate": 9.936465970933632e-06,
"loss": 1.2037,
"step": 1692
},
{
"epoch": 0.4915082014806213,
"grad_norm": 4.054111480712891,
"learning_rate": 9.936313294748405e-06,
"loss": 1.5063,
"step": 1693
},
{
"epoch": 0.4917985193787197,
"grad_norm": 3.775129556655884,
"learning_rate": 9.936160436513902e-06,
"loss": 1.2823,
"step": 1694
},
{
"epoch": 0.4920888372768181,
"grad_norm": 3.5445947647094727,
"learning_rate": 9.93600739623576e-06,
"loss": 1.2434,
"step": 1695
},
{
"epoch": 0.49237915517491654,
"grad_norm": 3.2320921421051025,
"learning_rate": 9.935854173919625e-06,
"loss": 1.1279,
"step": 1696
},
{
"epoch": 0.49266947307301495,
"grad_norm": 3.1317856311798096,
"learning_rate": 9.935700769571148e-06,
"loss": 1.1443,
"step": 1697
},
{
"epoch": 0.49295979097111337,
"grad_norm": 3.772987127304077,
"learning_rate": 9.935547183195985e-06,
"loss": 1.2283,
"step": 1698
},
{
"epoch": 0.4932501088692118,
"grad_norm": 3.737846851348877,
"learning_rate": 9.935393414799797e-06,
"loss": 1.1608,
"step": 1699
},
{
"epoch": 0.4935404267673102,
"grad_norm": 4.081494331359863,
"learning_rate": 9.935239464388262e-06,
"loss": 1.2129,
"step": 1700
},
{
"epoch": 0.4938307446654086,
"grad_norm": 3.5556063652038574,
"learning_rate": 9.935085331967054e-06,
"loss": 1.2782,
"step": 1701
},
{
"epoch": 0.494121062563507,
"grad_norm": 3.9093804359436035,
"learning_rate": 9.934931017541856e-06,
"loss": 1.2373,
"step": 1702
},
{
"epoch": 0.49441138046160543,
"grad_norm": 3.6765968799591064,
"learning_rate": 9.934776521118362e-06,
"loss": 1.2736,
"step": 1703
},
{
"epoch": 0.4947016983597039,
"grad_norm": 3.605074644088745,
"learning_rate": 9.934621842702265e-06,
"loss": 1.1006,
"step": 1704
},
{
"epoch": 0.4949920162578023,
"grad_norm": 3.1863555908203125,
"learning_rate": 9.934466982299276e-06,
"loss": 1.0095,
"step": 1705
},
{
"epoch": 0.4952823341559007,
"grad_norm": 3.9221925735473633,
"learning_rate": 9.934311939915101e-06,
"loss": 1.2584,
"step": 1706
},
{
"epoch": 0.49557265205399914,
"grad_norm": 3.368342161178589,
"learning_rate": 9.93415671555546e-06,
"loss": 1.2051,
"step": 1707
},
{
"epoch": 0.49586296995209755,
"grad_norm": 3.4629364013671875,
"learning_rate": 9.934001309226079e-06,
"loss": 1.0938,
"step": 1708
},
{
"epoch": 0.49615328785019597,
"grad_norm": 3.376192331314087,
"learning_rate": 9.933845720932685e-06,
"loss": 1.1602,
"step": 1709
},
{
"epoch": 0.4964436057482944,
"grad_norm": 3.689114809036255,
"learning_rate": 9.933689950681021e-06,
"loss": 1.1903,
"step": 1710
},
{
"epoch": 0.4967339236463928,
"grad_norm": 3.5950229167938232,
"learning_rate": 9.933533998476828e-06,
"loss": 1.1267,
"step": 1711
},
{
"epoch": 0.4970242415444912,
"grad_norm": 3.553500175476074,
"learning_rate": 9.933377864325861e-06,
"loss": 1.1726,
"step": 1712
},
{
"epoch": 0.4973145594425896,
"grad_norm": 3.4887280464172363,
"learning_rate": 9.933221548233875e-06,
"loss": 1.1724,
"step": 1713
},
{
"epoch": 0.49760487734068803,
"grad_norm": 3.257399082183838,
"learning_rate": 9.933065050206635e-06,
"loss": 1.1709,
"step": 1714
},
{
"epoch": 0.49789519523878645,
"grad_norm": 3.813685655593872,
"learning_rate": 9.932908370249914e-06,
"loss": 1.3864,
"step": 1715
},
{
"epoch": 0.4981855131368849,
"grad_norm": 3.354031562805176,
"learning_rate": 9.932751508369492e-06,
"loss": 1.2201,
"step": 1716
},
{
"epoch": 0.49847583103498333,
"grad_norm": 3.2486491203308105,
"learning_rate": 9.93259446457115e-06,
"loss": 1.2024,
"step": 1717
},
{
"epoch": 0.49876614893308174,
"grad_norm": 3.415264368057251,
"learning_rate": 9.932437238860682e-06,
"loss": 1.1056,
"step": 1718
},
{
"epoch": 0.49905646683118016,
"grad_norm": 3.367347478866577,
"learning_rate": 9.932279831243884e-06,
"loss": 1.1409,
"step": 1719
},
{
"epoch": 0.49934678472927857,
"grad_norm": 3.6677513122558594,
"learning_rate": 9.932122241726565e-06,
"loss": 1.1554,
"step": 1720
},
{
"epoch": 0.499637102627377,
"grad_norm": 3.5150060653686523,
"learning_rate": 9.931964470314535e-06,
"loss": 1.2135,
"step": 1721
},
{
"epoch": 0.4999274205254754,
"grad_norm": 3.3909170627593994,
"learning_rate": 9.931806517013612e-06,
"loss": 1.0787,
"step": 1722
},
{
"epoch": 0.5002177384235739,
"grad_norm": 3.4581210613250732,
"learning_rate": 9.931648381829623e-06,
"loss": 1.0847,
"step": 1723
},
{
"epoch": 0.5005080563216723,
"grad_norm": 3.3497819900512695,
"learning_rate": 9.931490064768397e-06,
"loss": 1.1567,
"step": 1724
},
{
"epoch": 0.5007983742197707,
"grad_norm": 3.56885027885437,
"learning_rate": 9.931331565835775e-06,
"loss": 1.2172,
"step": 1725
},
{
"epoch": 0.5010886921178691,
"grad_norm": 3.825061321258545,
"learning_rate": 9.931172885037604e-06,
"loss": 1.3385,
"step": 1726
},
{
"epoch": 0.5013790100159675,
"grad_norm": 3.5020551681518555,
"learning_rate": 9.93101402237973e-06,
"loss": 1.17,
"step": 1727
},
{
"epoch": 0.5016693279140659,
"grad_norm": 3.285560369491577,
"learning_rate": 9.930854977868019e-06,
"loss": 1.0894,
"step": 1728
},
{
"epoch": 0.5019596458121643,
"grad_norm": 3.811467409133911,
"learning_rate": 9.930695751508333e-06,
"loss": 1.3049,
"step": 1729
},
{
"epoch": 0.5022499637102628,
"grad_norm": 3.4026193618774414,
"learning_rate": 9.930536343306542e-06,
"loss": 1.1131,
"step": 1730
},
{
"epoch": 0.5025402816083612,
"grad_norm": 3.4770872592926025,
"learning_rate": 9.93037675326853e-06,
"loss": 1.2083,
"step": 1731
},
{
"epoch": 0.5028305995064596,
"grad_norm": 3.191282272338867,
"learning_rate": 9.930216981400176e-06,
"loss": 1.0672,
"step": 1732
},
{
"epoch": 0.503120917404558,
"grad_norm": 3.5323238372802734,
"learning_rate": 9.93005702770738e-06,
"loss": 1.2014,
"step": 1733
},
{
"epoch": 0.5034112353026564,
"grad_norm": 3.5278778076171875,
"learning_rate": 9.929896892196036e-06,
"loss": 1.2671,
"step": 1734
},
{
"epoch": 0.5037015532007548,
"grad_norm": 4.011770248413086,
"learning_rate": 9.929736574872052e-06,
"loss": 1.1926,
"step": 1735
},
{
"epoch": 0.5039918710988532,
"grad_norm": 3.5172500610351562,
"learning_rate": 9.929576075741335e-06,
"loss": 1.2072,
"step": 1736
},
{
"epoch": 0.5042821889969517,
"grad_norm": 3.119262218475342,
"learning_rate": 9.929415394809813e-06,
"loss": 1.2116,
"step": 1737
},
{
"epoch": 0.5045725068950501,
"grad_norm": 3.0061795711517334,
"learning_rate": 9.929254532083406e-06,
"loss": 0.9696,
"step": 1738
},
{
"epoch": 0.5048628247931485,
"grad_norm": 3.344226598739624,
"learning_rate": 9.929093487568048e-06,
"loss": 1.2049,
"step": 1739
},
{
"epoch": 0.5051531426912469,
"grad_norm": 3.819347620010376,
"learning_rate": 9.928932261269679e-06,
"loss": 1.4237,
"step": 1740
},
{
"epoch": 0.5054434605893453,
"grad_norm": 3.798185348510742,
"learning_rate": 9.928770853194245e-06,
"loss": 1.2619,
"step": 1741
},
{
"epoch": 0.5057337784874437,
"grad_norm": 3.4737367630004883,
"learning_rate": 9.928609263347695e-06,
"loss": 1.168,
"step": 1742
},
{
"epoch": 0.5060240963855421,
"grad_norm": 3.425579786300659,
"learning_rate": 9.928447491735994e-06,
"loss": 1.1395,
"step": 1743
},
{
"epoch": 0.5063144142836405,
"grad_norm": 3.61008882522583,
"learning_rate": 9.928285538365104e-06,
"loss": 1.2144,
"step": 1744
},
{
"epoch": 0.506604732181739,
"grad_norm": 3.7203760147094727,
"learning_rate": 9.928123403240999e-06,
"loss": 1.2222,
"step": 1745
},
{
"epoch": 0.5068950500798374,
"grad_norm": 3.9801478385925293,
"learning_rate": 9.927961086369658e-06,
"loss": 1.3081,
"step": 1746
},
{
"epoch": 0.5071853679779358,
"grad_norm": 3.47685170173645,
"learning_rate": 9.927798587757068e-06,
"loss": 1.2011,
"step": 1747
},
{
"epoch": 0.5074756858760343,
"grad_norm": 3.4289331436157227,
"learning_rate": 9.927635907409224e-06,
"loss": 1.0605,
"step": 1748
},
{
"epoch": 0.5077660037741327,
"grad_norm": 3.3467659950256348,
"learning_rate": 9.92747304533212e-06,
"loss": 1.166,
"step": 1749
},
{
"epoch": 0.5080563216722311,
"grad_norm": 3.1214489936828613,
"learning_rate": 9.927310001531767e-06,
"loss": 1.122,
"step": 1750
},
{
"epoch": 0.5083466395703296,
"grad_norm": 3.7944412231445312,
"learning_rate": 9.927146776014176e-06,
"loss": 1.1537,
"step": 1751
},
{
"epoch": 0.508636957468428,
"grad_norm": 3.5604870319366455,
"learning_rate": 9.926983368785367e-06,
"loss": 1.2378,
"step": 1752
},
{
"epoch": 0.5089272753665264,
"grad_norm": 3.4572842121124268,
"learning_rate": 9.926819779851366e-06,
"loss": 1.1053,
"step": 1753
},
{
"epoch": 0.5092175932646248,
"grad_norm": 3.5131027698516846,
"learning_rate": 9.926656009218208e-06,
"loss": 1.1586,
"step": 1754
},
{
"epoch": 0.5095079111627232,
"grad_norm": 3.2035908699035645,
"learning_rate": 9.926492056891932e-06,
"loss": 1.0894,
"step": 1755
},
{
"epoch": 0.5097982290608216,
"grad_norm": 3.468350887298584,
"learning_rate": 9.926327922878582e-06,
"loss": 1.1021,
"step": 1756
},
{
"epoch": 0.51008854695892,
"grad_norm": 3.570665121078491,
"learning_rate": 9.926163607184215e-06,
"loss": 1.2883,
"step": 1757
},
{
"epoch": 0.5103788648570184,
"grad_norm": 3.7645089626312256,
"learning_rate": 9.925999109814888e-06,
"loss": 1.4159,
"step": 1758
},
{
"epoch": 0.5106691827551169,
"grad_norm": 3.5040338039398193,
"learning_rate": 9.925834430776668e-06,
"loss": 1.1979,
"step": 1759
},
{
"epoch": 0.5109595006532153,
"grad_norm": 3.4286630153656006,
"learning_rate": 9.92566957007563e-06,
"loss": 1.1681,
"step": 1760
},
{
"epoch": 0.5112498185513137,
"grad_norm": 3.727626085281372,
"learning_rate": 9.925504527717855e-06,
"loss": 1.2216,
"step": 1761
},
{
"epoch": 0.5115401364494121,
"grad_norm": 4.011921405792236,
"learning_rate": 9.925339303709424e-06,
"loss": 1.3667,
"step": 1762
},
{
"epoch": 0.5118304543475105,
"grad_norm": 3.5719776153564453,
"learning_rate": 9.925173898056436e-06,
"loss": 1.3837,
"step": 1763
},
{
"epoch": 0.5121207722456089,
"grad_norm": 3.6549808979034424,
"learning_rate": 9.925008310764988e-06,
"loss": 1.21,
"step": 1764
},
{
"epoch": 0.5124110901437073,
"grad_norm": 3.3320508003234863,
"learning_rate": 9.924842541841188e-06,
"loss": 1.1101,
"step": 1765
},
{
"epoch": 0.5127014080418058,
"grad_norm": 3.2522830963134766,
"learning_rate": 9.924676591291152e-06,
"loss": 1.1344,
"step": 1766
},
{
"epoch": 0.5129917259399042,
"grad_norm": 3.3519628047943115,
"learning_rate": 9.924510459120996e-06,
"loss": 1.1014,
"step": 1767
},
{
"epoch": 0.5132820438380026,
"grad_norm": 3.706505537033081,
"learning_rate": 9.924344145336847e-06,
"loss": 1.0986,
"step": 1768
},
{
"epoch": 0.513572361736101,
"grad_norm": 4.049661636352539,
"learning_rate": 9.924177649944841e-06,
"loss": 1.2321,
"step": 1769
},
{
"epoch": 0.5138626796341994,
"grad_norm": 3.567394495010376,
"learning_rate": 9.924010972951116e-06,
"loss": 1.2176,
"step": 1770
},
{
"epoch": 0.5141529975322978,
"grad_norm": 3.6421961784362793,
"learning_rate": 9.923844114361823e-06,
"loss": 1.1742,
"step": 1771
},
{
"epoch": 0.5144433154303962,
"grad_norm": 3.635004997253418,
"learning_rate": 9.923677074183112e-06,
"loss": 1.3064,
"step": 1772
},
{
"epoch": 0.5147336333284948,
"grad_norm": 3.6937615871429443,
"learning_rate": 9.923509852421144e-06,
"loss": 1.1709,
"step": 1773
},
{
"epoch": 0.5150239512265932,
"grad_norm": 3.8052608966827393,
"learning_rate": 9.923342449082088e-06,
"loss": 1.3354,
"step": 1774
},
{
"epoch": 0.5153142691246916,
"grad_norm": 3.4751036167144775,
"learning_rate": 9.923174864172114e-06,
"loss": 1.19,
"step": 1775
},
{
"epoch": 0.51560458702279,
"grad_norm": 3.4713563919067383,
"learning_rate": 9.923007097697406e-06,
"loss": 1.2449,
"step": 1776
},
{
"epoch": 0.5158949049208884,
"grad_norm": 3.4838809967041016,
"learning_rate": 9.92283914966415e-06,
"loss": 1.3047,
"step": 1777
},
{
"epoch": 0.5161852228189868,
"grad_norm": 3.8674657344818115,
"learning_rate": 9.92267102007854e-06,
"loss": 1.3504,
"step": 1778
},
{
"epoch": 0.5164755407170852,
"grad_norm": 3.565331220626831,
"learning_rate": 9.922502708946776e-06,
"loss": 1.1211,
"step": 1779
},
{
"epoch": 0.5167658586151836,
"grad_norm": 3.551572561264038,
"learning_rate": 9.922334216275065e-06,
"loss": 1.0492,
"step": 1780
},
{
"epoch": 0.5170561765132821,
"grad_norm": 3.5140163898468018,
"learning_rate": 9.922165542069621e-06,
"loss": 1.2428,
"step": 1781
},
{
"epoch": 0.5173464944113805,
"grad_norm": 3.665693759918213,
"learning_rate": 9.921996686336665e-06,
"loss": 1.3825,
"step": 1782
},
{
"epoch": 0.5176368123094789,
"grad_norm": 3.2127161026000977,
"learning_rate": 9.921827649082426e-06,
"loss": 1.0095,
"step": 1783
},
{
"epoch": 0.5179271302075773,
"grad_norm": 3.358623504638672,
"learning_rate": 9.921658430313136e-06,
"loss": 1.1453,
"step": 1784
},
{
"epoch": 0.5182174481056757,
"grad_norm": 3.582932472229004,
"learning_rate": 9.921489030035036e-06,
"loss": 1.2084,
"step": 1785
},
{
"epoch": 0.5185077660037741,
"grad_norm": 3.42759108543396,
"learning_rate": 9.921319448254374e-06,
"loss": 1.2317,
"step": 1786
},
{
"epoch": 0.5187980839018725,
"grad_norm": 3.603374481201172,
"learning_rate": 9.921149684977402e-06,
"loss": 1.261,
"step": 1787
},
{
"epoch": 0.519088401799971,
"grad_norm": 3.466707944869995,
"learning_rate": 9.920979740210383e-06,
"loss": 1.151,
"step": 1788
},
{
"epoch": 0.5193787196980694,
"grad_norm": 3.541694164276123,
"learning_rate": 9.920809613959585e-06,
"loss": 1.2843,
"step": 1789
},
{
"epoch": 0.5196690375961678,
"grad_norm": 3.0089690685272217,
"learning_rate": 9.920639306231282e-06,
"loss": 1.0789,
"step": 1790
},
{
"epoch": 0.5199593554942662,
"grad_norm": 3.8396410942077637,
"learning_rate": 9.920468817031754e-06,
"loss": 1.1977,
"step": 1791
},
{
"epoch": 0.5202496733923646,
"grad_norm": 3.1440768241882324,
"learning_rate": 9.920298146367287e-06,
"loss": 0.9836,
"step": 1792
},
{
"epoch": 0.520539991290463,
"grad_norm": 3.4042434692382812,
"learning_rate": 9.920127294244178e-06,
"loss": 1.0643,
"step": 1793
},
{
"epoch": 0.5208303091885614,
"grad_norm": 3.349679946899414,
"learning_rate": 9.919956260668726e-06,
"loss": 1.2135,
"step": 1794
},
{
"epoch": 0.5211206270866598,
"grad_norm": 3.703922986984253,
"learning_rate": 9.91978504564724e-06,
"loss": 1.275,
"step": 1795
},
{
"epoch": 0.5214109449847583,
"grad_norm": 3.6810669898986816,
"learning_rate": 9.919613649186034e-06,
"loss": 1.2586,
"step": 1796
},
{
"epoch": 0.5217012628828567,
"grad_norm": 3.357159376144409,
"learning_rate": 9.919442071291428e-06,
"loss": 1.0915,
"step": 1797
},
{
"epoch": 0.5219915807809552,
"grad_norm": 3.6635754108428955,
"learning_rate": 9.919270311969752e-06,
"loss": 1.2885,
"step": 1798
},
{
"epoch": 0.5222818986790536,
"grad_norm": 3.329967975616455,
"learning_rate": 9.919098371227338e-06,
"loss": 1.2306,
"step": 1799
},
{
"epoch": 0.522572216577152,
"grad_norm": 3.19476056098938,
"learning_rate": 9.918926249070528e-06,
"loss": 1.1987,
"step": 1800
},
{
"epoch": 0.5228625344752504,
"grad_norm": 3.237572431564331,
"learning_rate": 9.918753945505671e-06,
"loss": 1.2641,
"step": 1801
},
{
"epoch": 0.5231528523733489,
"grad_norm": 3.1060678958892822,
"learning_rate": 9.91858146053912e-06,
"loss": 1.1233,
"step": 1802
},
{
"epoch": 0.5234431702714473,
"grad_norm": 3.3326449394226074,
"learning_rate": 9.918408794177236e-06,
"loss": 1.0348,
"step": 1803
},
{
"epoch": 0.5237334881695457,
"grad_norm": 3.1791276931762695,
"learning_rate": 9.918235946426389e-06,
"loss": 1.0184,
"step": 1804
},
{
"epoch": 0.5240238060676441,
"grad_norm": 3.0655264854431152,
"learning_rate": 9.918062917292951e-06,
"loss": 1.0412,
"step": 1805
},
{
"epoch": 0.5243141239657425,
"grad_norm": 3.459871768951416,
"learning_rate": 9.917889706783304e-06,
"loss": 1.1433,
"step": 1806
},
{
"epoch": 0.5246044418638409,
"grad_norm": 3.5047693252563477,
"learning_rate": 9.917716314903838e-06,
"loss": 1.2992,
"step": 1807
},
{
"epoch": 0.5248947597619393,
"grad_norm": 3.4301116466522217,
"learning_rate": 9.917542741660943e-06,
"loss": 1.2329,
"step": 1808
},
{
"epoch": 0.5251850776600377,
"grad_norm": 3.4077882766723633,
"learning_rate": 9.917368987061026e-06,
"loss": 1.1486,
"step": 1809
},
{
"epoch": 0.5254753955581362,
"grad_norm": 3.492203950881958,
"learning_rate": 9.917195051110492e-06,
"loss": 1.1808,
"step": 1810
},
{
"epoch": 0.5257657134562346,
"grad_norm": 3.3428704738616943,
"learning_rate": 9.917020933815753e-06,
"loss": 1.1278,
"step": 1811
},
{
"epoch": 0.526056031354333,
"grad_norm": 3.4210922718048096,
"learning_rate": 9.916846635183235e-06,
"loss": 1.1373,
"step": 1812
},
{
"epoch": 0.5263463492524314,
"grad_norm": 3.6874492168426514,
"learning_rate": 9.916672155219365e-06,
"loss": 1.4229,
"step": 1813
},
{
"epoch": 0.5266366671505298,
"grad_norm": 3.638094902038574,
"learning_rate": 9.916497493930574e-06,
"loss": 1.1038,
"step": 1814
},
{
"epoch": 0.5269269850486282,
"grad_norm": 3.680783987045288,
"learning_rate": 9.91632265132331e-06,
"loss": 1.3286,
"step": 1815
},
{
"epoch": 0.5272173029467266,
"grad_norm": 3.4472410678863525,
"learning_rate": 9.916147627404016e-06,
"loss": 1.2083,
"step": 1816
},
{
"epoch": 0.527507620844825,
"grad_norm": 3.3821465969085693,
"learning_rate": 9.91597242217915e-06,
"loss": 1.1939,
"step": 1817
},
{
"epoch": 0.5277979387429235,
"grad_norm": 3.5459840297698975,
"learning_rate": 9.91579703565517e-06,
"loss": 1.162,
"step": 1818
},
{
"epoch": 0.5280882566410219,
"grad_norm": 3.6665494441986084,
"learning_rate": 9.915621467838546e-06,
"loss": 1.202,
"step": 1819
},
{
"epoch": 0.5283785745391203,
"grad_norm": 3.051856756210327,
"learning_rate": 9.915445718735755e-06,
"loss": 1.0528,
"step": 1820
},
{
"epoch": 0.5286688924372187,
"grad_norm": 3.6828055381774902,
"learning_rate": 9.915269788353274e-06,
"loss": 1.1336,
"step": 1821
},
{
"epoch": 0.5289592103353171,
"grad_norm": 3.4396843910217285,
"learning_rate": 9.915093676697597e-06,
"loss": 1.2528,
"step": 1822
},
{
"epoch": 0.5292495282334156,
"grad_norm": 3.3097896575927734,
"learning_rate": 9.914917383775211e-06,
"loss": 1.1547,
"step": 1823
},
{
"epoch": 0.5295398461315141,
"grad_norm": 3.2964537143707275,
"learning_rate": 9.914740909592627e-06,
"loss": 1.1173,
"step": 1824
},
{
"epoch": 0.5298301640296125,
"grad_norm": 3.3613085746765137,
"learning_rate": 9.914564254156345e-06,
"loss": 1.0037,
"step": 1825
},
{
"epoch": 0.5301204819277109,
"grad_norm": 3.1005563735961914,
"learning_rate": 9.914387417472886e-06,
"loss": 1.1261,
"step": 1826
},
{
"epoch": 0.5304107998258093,
"grad_norm": 3.254185914993286,
"learning_rate": 9.914210399548768e-06,
"loss": 1.2195,
"step": 1827
},
{
"epoch": 0.5307011177239077,
"grad_norm": 3.0074527263641357,
"learning_rate": 9.91403320039052e-06,
"loss": 1.0624,
"step": 1828
},
{
"epoch": 0.5309914356220061,
"grad_norm": 3.3639132976531982,
"learning_rate": 9.91385582000468e-06,
"loss": 1.1939,
"step": 1829
},
{
"epoch": 0.5312817535201045,
"grad_norm": 3.1890807151794434,
"learning_rate": 9.913678258397785e-06,
"loss": 1.1671,
"step": 1830
},
{
"epoch": 0.531572071418203,
"grad_norm": 3.707369327545166,
"learning_rate": 9.913500515576388e-06,
"loss": 1.1766,
"step": 1831
},
{
"epoch": 0.5318623893163014,
"grad_norm": 3.2515759468078613,
"learning_rate": 9.913322591547042e-06,
"loss": 1.15,
"step": 1832
},
{
"epoch": 0.5321527072143998,
"grad_norm": 3.618812322616577,
"learning_rate": 9.913144486316306e-06,
"loss": 1.2448,
"step": 1833
},
{
"epoch": 0.5324430251124982,
"grad_norm": 3.4694342613220215,
"learning_rate": 9.912966199890753e-06,
"loss": 1.3931,
"step": 1834
},
{
"epoch": 0.5327333430105966,
"grad_norm": 3.811699628829956,
"learning_rate": 9.912787732276955e-06,
"loss": 1.2158,
"step": 1835
},
{
"epoch": 0.533023660908695,
"grad_norm": 3.5045254230499268,
"learning_rate": 9.912609083481494e-06,
"loss": 1.1664,
"step": 1836
},
{
"epoch": 0.5333139788067934,
"grad_norm": 3.1756935119628906,
"learning_rate": 9.912430253510963e-06,
"loss": 1.1034,
"step": 1837
},
{
"epoch": 0.5336042967048918,
"grad_norm": 3.141693115234375,
"learning_rate": 9.912251242371952e-06,
"loss": 0.9284,
"step": 1838
},
{
"epoch": 0.5338946146029903,
"grad_norm": 3.484868288040161,
"learning_rate": 9.912072050071063e-06,
"loss": 1.2705,
"step": 1839
},
{
"epoch": 0.5341849325010887,
"grad_norm": 3.564931631088257,
"learning_rate": 9.911892676614908e-06,
"loss": 1.1495,
"step": 1840
},
{
"epoch": 0.5344752503991871,
"grad_norm": 3.510122060775757,
"learning_rate": 9.9117131220101e-06,
"loss": 1.3716,
"step": 1841
},
{
"epoch": 0.5347655682972855,
"grad_norm": 3.416837453842163,
"learning_rate": 9.911533386263262e-06,
"loss": 1.3552,
"step": 1842
},
{
"epoch": 0.5350558861953839,
"grad_norm": 3.3100061416625977,
"learning_rate": 9.91135346938102e-06,
"loss": 1.2652,
"step": 1843
},
{
"epoch": 0.5353462040934823,
"grad_norm": 3.4213778972625732,
"learning_rate": 9.91117337137001e-06,
"loss": 1.0756,
"step": 1844
},
{
"epoch": 0.5356365219915807,
"grad_norm": 3.4177422523498535,
"learning_rate": 9.910993092236878e-06,
"loss": 1.1127,
"step": 1845
},
{
"epoch": 0.5359268398896792,
"grad_norm": 3.432579278945923,
"learning_rate": 9.910812631988268e-06,
"loss": 1.117,
"step": 1846
},
{
"epoch": 0.5362171577877776,
"grad_norm": 3.2651829719543457,
"learning_rate": 9.910631990630837e-06,
"loss": 1.1663,
"step": 1847
},
{
"epoch": 0.5365074756858761,
"grad_norm": 3.6530210971832275,
"learning_rate": 9.910451168171248e-06,
"loss": 1.0423,
"step": 1848
},
{
"epoch": 0.5367977935839745,
"grad_norm": 3.6912314891815186,
"learning_rate": 9.910270164616168e-06,
"loss": 1.1442,
"step": 1849
},
{
"epoch": 0.5370881114820729,
"grad_norm": 3.458739757537842,
"learning_rate": 9.910088979972272e-06,
"loss": 1.1812,
"step": 1850
},
{
"epoch": 0.5373784293801713,
"grad_norm": 3.281719923019409,
"learning_rate": 9.909907614246244e-06,
"loss": 1.101,
"step": 1851
},
{
"epoch": 0.5376687472782697,
"grad_norm": 3.4149019718170166,
"learning_rate": 9.909726067444772e-06,
"loss": 1.1371,
"step": 1852
},
{
"epoch": 0.5379590651763682,
"grad_norm": 3.3794870376586914,
"learning_rate": 9.909544339574549e-06,
"loss": 1.1995,
"step": 1853
},
{
"epoch": 0.5382493830744666,
"grad_norm": 3.4699738025665283,
"learning_rate": 9.90936243064228e-06,
"loss": 1.2481,
"step": 1854
},
{
"epoch": 0.538539700972565,
"grad_norm": 3.468823194503784,
"learning_rate": 9.909180340654674e-06,
"loss": 1.2427,
"step": 1855
},
{
"epoch": 0.5388300188706634,
"grad_norm": 3.8242857456207275,
"learning_rate": 9.908998069618445e-06,
"loss": 1.1741,
"step": 1856
},
{
"epoch": 0.5391203367687618,
"grad_norm": 3.8072662353515625,
"learning_rate": 9.908815617540314e-06,
"loss": 1.238,
"step": 1857
},
{
"epoch": 0.5394106546668602,
"grad_norm": 3.4818027019500732,
"learning_rate": 9.908632984427012e-06,
"loss": 1.3667,
"step": 1858
},
{
"epoch": 0.5397009725649586,
"grad_norm": 3.2435076236724854,
"learning_rate": 9.908450170285273e-06,
"loss": 1.133,
"step": 1859
},
{
"epoch": 0.539991290463057,
"grad_norm": 3.8168723583221436,
"learning_rate": 9.90826717512184e-06,
"loss": 1.232,
"step": 1860
},
{
"epoch": 0.5402816083611555,
"grad_norm": 3.5808327198028564,
"learning_rate": 9.90808399894346e-06,
"loss": 1.2569,
"step": 1861
},
{
"epoch": 0.5405719262592539,
"grad_norm": 3.1764636039733887,
"learning_rate": 9.907900641756891e-06,
"loss": 1.0774,
"step": 1862
},
{
"epoch": 0.5408622441573523,
"grad_norm": 3.4908952713012695,
"learning_rate": 9.907717103568895e-06,
"loss": 1.239,
"step": 1863
},
{
"epoch": 0.5411525620554507,
"grad_norm": 3.6539740562438965,
"learning_rate": 9.907533384386238e-06,
"loss": 1.2073,
"step": 1864
},
{
"epoch": 0.5414428799535491,
"grad_norm": 3.764848470687866,
"learning_rate": 9.907349484215698e-06,
"loss": 1.3799,
"step": 1865
},
{
"epoch": 0.5417331978516475,
"grad_norm": 3.1396989822387695,
"learning_rate": 9.907165403064057e-06,
"loss": 1.1709,
"step": 1866
},
{
"epoch": 0.5420235157497459,
"grad_norm": 3.9477617740631104,
"learning_rate": 9.906981140938102e-06,
"loss": 1.2874,
"step": 1867
},
{
"epoch": 0.5423138336478444,
"grad_norm": 3.45196795463562,
"learning_rate": 9.90679669784463e-06,
"loss": 1.2163,
"step": 1868
},
{
"epoch": 0.5426041515459428,
"grad_norm": 3.3559353351593018,
"learning_rate": 9.906612073790443e-06,
"loss": 1.1945,
"step": 1869
},
{
"epoch": 0.5428944694440412,
"grad_norm": 3.3837227821350098,
"learning_rate": 9.906427268782351e-06,
"loss": 1.1423,
"step": 1870
},
{
"epoch": 0.5431847873421396,
"grad_norm": 3.3866822719573975,
"learning_rate": 9.906242282827167e-06,
"loss": 1.1683,
"step": 1871
},
{
"epoch": 0.543475105240238,
"grad_norm": 3.538224220275879,
"learning_rate": 9.906057115931716e-06,
"loss": 1.0664,
"step": 1872
},
{
"epoch": 0.5437654231383364,
"grad_norm": 3.6277942657470703,
"learning_rate": 9.905871768102824e-06,
"loss": 1.2454,
"step": 1873
},
{
"epoch": 0.544055741036435,
"grad_norm": 3.9439074993133545,
"learning_rate": 9.905686239347329e-06,
"loss": 1.105,
"step": 1874
},
{
"epoch": 0.5443460589345334,
"grad_norm": 3.3200228214263916,
"learning_rate": 9.905500529672072e-06,
"loss": 1.0594,
"step": 1875
},
{
"epoch": 0.5446363768326318,
"grad_norm": 3.45715594291687,
"learning_rate": 9.905314639083902e-06,
"loss": 1.1502,
"step": 1876
},
{
"epoch": 0.5449266947307302,
"grad_norm": 3.3772661685943604,
"learning_rate": 9.905128567589674e-06,
"loss": 1.2959,
"step": 1877
},
{
"epoch": 0.5452170126288286,
"grad_norm": 3.353414297103882,
"learning_rate": 9.904942315196253e-06,
"loss": 1.337,
"step": 1878
},
{
"epoch": 0.545507330526927,
"grad_norm": 3.0528392791748047,
"learning_rate": 9.904755881910504e-06,
"loss": 0.9722,
"step": 1879
},
{
"epoch": 0.5457976484250254,
"grad_norm": 3.1231632232666016,
"learning_rate": 9.904569267739305e-06,
"loss": 1.0102,
"step": 1880
},
{
"epoch": 0.5460879663231238,
"grad_norm": 3.1487677097320557,
"learning_rate": 9.904382472689539e-06,
"loss": 1.1653,
"step": 1881
},
{
"epoch": 0.5463782842212223,
"grad_norm": 3.181234359741211,
"learning_rate": 9.904195496768092e-06,
"loss": 1.0459,
"step": 1882
},
{
"epoch": 0.5466686021193207,
"grad_norm": 3.1367695331573486,
"learning_rate": 9.904008339981861e-06,
"loss": 1.2362,
"step": 1883
},
{
"epoch": 0.5469589200174191,
"grad_norm": 3.5613062381744385,
"learning_rate": 9.90382100233775e-06,
"loss": 1.1144,
"step": 1884
},
{
"epoch": 0.5472492379155175,
"grad_norm": 3.170631170272827,
"learning_rate": 9.903633483842666e-06,
"loss": 1.0733,
"step": 1885
},
{
"epoch": 0.5475395558136159,
"grad_norm": 3.4632740020751953,
"learning_rate": 9.903445784503525e-06,
"loss": 1.1683,
"step": 1886
},
{
"epoch": 0.5478298737117143,
"grad_norm": 3.335059642791748,
"learning_rate": 9.90325790432725e-06,
"loss": 1.1356,
"step": 1887
},
{
"epoch": 0.5481201916098127,
"grad_norm": 3.508770704269409,
"learning_rate": 9.903069843320768e-06,
"loss": 1.1451,
"step": 1888
},
{
"epoch": 0.5484105095079111,
"grad_norm": 3.5093181133270264,
"learning_rate": 9.902881601491018e-06,
"loss": 1.1466,
"step": 1889
},
{
"epoch": 0.5487008274060096,
"grad_norm": 3.2411201000213623,
"learning_rate": 9.902693178844937e-06,
"loss": 1.1672,
"step": 1890
},
{
"epoch": 0.548991145304108,
"grad_norm": 3.413616418838501,
"learning_rate": 9.902504575389477e-06,
"loss": 1.1278,
"step": 1891
},
{
"epoch": 0.5492814632022064,
"grad_norm": 3.36161732673645,
"learning_rate": 9.902315791131596e-06,
"loss": 1.144,
"step": 1892
},
{
"epoch": 0.5495717811003048,
"grad_norm": 3.3496720790863037,
"learning_rate": 9.902126826078254e-06,
"loss": 1.2088,
"step": 1893
},
{
"epoch": 0.5498620989984032,
"grad_norm": 3.206235647201538,
"learning_rate": 9.901937680236419e-06,
"loss": 1.0498,
"step": 1894
},
{
"epoch": 0.5501524168965016,
"grad_norm": 3.4635236263275146,
"learning_rate": 9.901748353613069e-06,
"loss": 1.2035,
"step": 1895
},
{
"epoch": 0.5504427347946,
"grad_norm": 3.303619623184204,
"learning_rate": 9.901558846215185e-06,
"loss": 1.2198,
"step": 1896
},
{
"epoch": 0.5507330526926985,
"grad_norm": 3.401362419128418,
"learning_rate": 9.901369158049755e-06,
"loss": 1.1518,
"step": 1897
},
{
"epoch": 0.5510233705907969,
"grad_norm": 3.1420388221740723,
"learning_rate": 9.901179289123775e-06,
"loss": 1.1403,
"step": 1898
},
{
"epoch": 0.5513136884888954,
"grad_norm": 3.4058547019958496,
"learning_rate": 9.900989239444248e-06,
"loss": 1.1038,
"step": 1899
},
{
"epoch": 0.5516040063869938,
"grad_norm": 3.373687982559204,
"learning_rate": 9.900799009018183e-06,
"loss": 1.1644,
"step": 1900
},
{
"epoch": 0.5518943242850922,
"grad_norm": 3.383594512939453,
"learning_rate": 9.900608597852595e-06,
"loss": 1.2449,
"step": 1901
},
{
"epoch": 0.5521846421831906,
"grad_norm": 3.35893177986145,
"learning_rate": 9.900418005954506e-06,
"loss": 1.223,
"step": 1902
},
{
"epoch": 0.552474960081289,
"grad_norm": 3.3038265705108643,
"learning_rate": 9.900227233330947e-06,
"loss": 1.1816,
"step": 1903
},
{
"epoch": 0.5527652779793875,
"grad_norm": 3.53434157371521,
"learning_rate": 9.900036279988953e-06,
"loss": 1.191,
"step": 1904
},
{
"epoch": 0.5530555958774859,
"grad_norm": 3.4917852878570557,
"learning_rate": 9.899845145935563e-06,
"loss": 1.279,
"step": 1905
},
{
"epoch": 0.5533459137755843,
"grad_norm": 3.4064924716949463,
"learning_rate": 9.899653831177831e-06,
"loss": 1.1646,
"step": 1906
},
{
"epoch": 0.5536362316736827,
"grad_norm": 3.37669038772583,
"learning_rate": 9.89946233572281e-06,
"loss": 1.1901,
"step": 1907
},
{
"epoch": 0.5539265495717811,
"grad_norm": 3.174514055252075,
"learning_rate": 9.89927065957756e-06,
"loss": 0.989,
"step": 1908
},
{
"epoch": 0.5542168674698795,
"grad_norm": 4.024920463562012,
"learning_rate": 9.899078802749153e-06,
"loss": 1.3463,
"step": 1909
},
{
"epoch": 0.5545071853679779,
"grad_norm": 3.733576774597168,
"learning_rate": 9.898886765244663e-06,
"loss": 1.1643,
"step": 1910
},
{
"epoch": 0.5547975032660764,
"grad_norm": 3.5115084648132324,
"learning_rate": 9.898694547071177e-06,
"loss": 1.2633,
"step": 1911
},
{
"epoch": 0.5550878211641748,
"grad_norm": 3.4509117603302,
"learning_rate": 9.898502148235777e-06,
"loss": 1.1849,
"step": 1912
},
{
"epoch": 0.5553781390622732,
"grad_norm": 3.595416784286499,
"learning_rate": 9.898309568745562e-06,
"loss": 1.196,
"step": 1913
},
{
"epoch": 0.5556684569603716,
"grad_norm": 3.3942644596099854,
"learning_rate": 9.898116808607634e-06,
"loss": 1.1612,
"step": 1914
},
{
"epoch": 0.55595877485847,
"grad_norm": 3.5363807678222656,
"learning_rate": 9.897923867829102e-06,
"loss": 1.2277,
"step": 1915
},
{
"epoch": 0.5562490927565684,
"grad_norm": 3.9670045375823975,
"learning_rate": 9.897730746417082e-06,
"loss": 1.2816,
"step": 1916
},
{
"epoch": 0.5565394106546668,
"grad_norm": 3.706681251525879,
"learning_rate": 9.897537444378696e-06,
"loss": 1.1865,
"step": 1917
},
{
"epoch": 0.5568297285527652,
"grad_norm": 3.671945571899414,
"learning_rate": 9.897343961721071e-06,
"loss": 1.284,
"step": 1918
},
{
"epoch": 0.5571200464508637,
"grad_norm": 3.5591561794281006,
"learning_rate": 9.897150298451346e-06,
"loss": 1.1434,
"step": 1919
},
{
"epoch": 0.5574103643489621,
"grad_norm": 3.2542104721069336,
"learning_rate": 9.89695645457666e-06,
"loss": 1.0362,
"step": 1920
},
{
"epoch": 0.5577006822470605,
"grad_norm": 3.382683753967285,
"learning_rate": 9.896762430104163e-06,
"loss": 1.1443,
"step": 1921
},
{
"epoch": 0.5579910001451589,
"grad_norm": 3.6778106689453125,
"learning_rate": 9.896568225041013e-06,
"loss": 1.2972,
"step": 1922
},
{
"epoch": 0.5582813180432573,
"grad_norm": 3.39172101020813,
"learning_rate": 9.896373839394367e-06,
"loss": 1.0591,
"step": 1923
},
{
"epoch": 0.5585716359413558,
"grad_norm": 3.2635951042175293,
"learning_rate": 9.8961792731714e-06,
"loss": 1.1169,
"step": 1924
},
{
"epoch": 0.5588619538394543,
"grad_norm": 3.4910495281219482,
"learning_rate": 9.895984526379282e-06,
"loss": 1.1934,
"step": 1925
},
{
"epoch": 0.5591522717375527,
"grad_norm": 3.5301356315612793,
"learning_rate": 9.895789599025198e-06,
"loss": 1.1124,
"step": 1926
},
{
"epoch": 0.5594425896356511,
"grad_norm": 3.9778127670288086,
"learning_rate": 9.895594491116336e-06,
"loss": 1.184,
"step": 1927
},
{
"epoch": 0.5597329075337495,
"grad_norm": 3.4717602729797363,
"learning_rate": 9.895399202659892e-06,
"loss": 1.145,
"step": 1928
},
{
"epoch": 0.5600232254318479,
"grad_norm": 3.5621719360351562,
"learning_rate": 9.89520373366307e-06,
"loss": 1.1355,
"step": 1929
},
{
"epoch": 0.5603135433299463,
"grad_norm": 3.188401460647583,
"learning_rate": 9.895008084133075e-06,
"loss": 1.1089,
"step": 1930
},
{
"epoch": 0.5606038612280447,
"grad_norm": 3.5980637073516846,
"learning_rate": 9.894812254077126e-06,
"loss": 1.1874,
"step": 1931
},
{
"epoch": 0.5608941791261431,
"grad_norm": 3.370637893676758,
"learning_rate": 9.894616243502442e-06,
"loss": 1.2691,
"step": 1932
},
{
"epoch": 0.5611844970242416,
"grad_norm": 3.3705739974975586,
"learning_rate": 9.894420052416253e-06,
"loss": 1.2136,
"step": 1933
},
{
"epoch": 0.56147481492234,
"grad_norm": 3.4017226696014404,
"learning_rate": 9.894223680825797e-06,
"loss": 1.1104,
"step": 1934
},
{
"epoch": 0.5617651328204384,
"grad_norm": 3.2392518520355225,
"learning_rate": 9.894027128738311e-06,
"loss": 1.2475,
"step": 1935
},
{
"epoch": 0.5620554507185368,
"grad_norm": 3.236485004425049,
"learning_rate": 9.893830396161049e-06,
"loss": 1.011,
"step": 1936
},
{
"epoch": 0.5623457686166352,
"grad_norm": 3.775726795196533,
"learning_rate": 9.893633483101264e-06,
"loss": 1.3489,
"step": 1937
},
{
"epoch": 0.5626360865147336,
"grad_norm": 3.4214670658111572,
"learning_rate": 9.893436389566215e-06,
"loss": 1.1987,
"step": 1938
},
{
"epoch": 0.562926404412832,
"grad_norm": 3.5680062770843506,
"learning_rate": 9.893239115563179e-06,
"loss": 1.2214,
"step": 1939
},
{
"epoch": 0.5632167223109305,
"grad_norm": 3.623807191848755,
"learning_rate": 9.893041661099422e-06,
"loss": 1.2361,
"step": 1940
},
{
"epoch": 0.5635070402090289,
"grad_norm": 3.6621768474578857,
"learning_rate": 9.89284402618223e-06,
"loss": 1.2852,
"step": 1941
},
{
"epoch": 0.5637973581071273,
"grad_norm": 3.4510340690612793,
"learning_rate": 9.892646210818894e-06,
"loss": 1.2343,
"step": 1942
},
{
"epoch": 0.5640876760052257,
"grad_norm": 3.459193468093872,
"learning_rate": 9.892448215016708e-06,
"loss": 1.115,
"step": 1943
},
{
"epoch": 0.5643779939033241,
"grad_norm": 3.3555784225463867,
"learning_rate": 9.892250038782972e-06,
"loss": 1.1979,
"step": 1944
},
{
"epoch": 0.5646683118014225,
"grad_norm": 3.4835281372070312,
"learning_rate": 9.892051682124996e-06,
"loss": 1.1841,
"step": 1945
},
{
"epoch": 0.5649586296995209,
"grad_norm": 3.4608845710754395,
"learning_rate": 9.891853145050097e-06,
"loss": 1.1358,
"step": 1946
},
{
"epoch": 0.5652489475976193,
"grad_norm": 3.647038698196411,
"learning_rate": 9.891654427565594e-06,
"loss": 1.3349,
"step": 1947
},
{
"epoch": 0.5655392654957178,
"grad_norm": 3.701260805130005,
"learning_rate": 9.891455529678815e-06,
"loss": 1.2177,
"step": 1948
},
{
"epoch": 0.5658295833938163,
"grad_norm": 3.342308759689331,
"learning_rate": 9.8912564513971e-06,
"loss": 1.2119,
"step": 1949
},
{
"epoch": 0.5661199012919147,
"grad_norm": 3.529751777648926,
"learning_rate": 9.891057192727787e-06,
"loss": 1.1177,
"step": 1950
},
{
"epoch": 0.5664102191900131,
"grad_norm": 3.2894372940063477,
"learning_rate": 9.890857753678225e-06,
"loss": 1.3006,
"step": 1951
},
{
"epoch": 0.5667005370881115,
"grad_norm": 3.307856798171997,
"learning_rate": 9.890658134255771e-06,
"loss": 1.2037,
"step": 1952
},
{
"epoch": 0.5669908549862099,
"grad_norm": 3.4086251258850098,
"learning_rate": 9.890458334467784e-06,
"loss": 1.1736,
"step": 1953
},
{
"epoch": 0.5672811728843083,
"grad_norm": 3.872767925262451,
"learning_rate": 9.890258354321638e-06,
"loss": 1.309,
"step": 1954
},
{
"epoch": 0.5675714907824068,
"grad_norm": 3.3691158294677734,
"learning_rate": 9.890058193824702e-06,
"loss": 1.1146,
"step": 1955
},
{
"epoch": 0.5678618086805052,
"grad_norm": 3.3088929653167725,
"learning_rate": 9.88985785298436e-06,
"loss": 1.1527,
"step": 1956
},
{
"epoch": 0.5681521265786036,
"grad_norm": 3.4965968132019043,
"learning_rate": 9.889657331808003e-06,
"loss": 1.2041,
"step": 1957
},
{
"epoch": 0.568442444476702,
"grad_norm": 3.3518784046173096,
"learning_rate": 9.889456630303022e-06,
"loss": 1.2014,
"step": 1958
},
{
"epoch": 0.5687327623748004,
"grad_norm": 3.304481267929077,
"learning_rate": 9.88925574847682e-06,
"loss": 1.1083,
"step": 1959
},
{
"epoch": 0.5690230802728988,
"grad_norm": 3.6226377487182617,
"learning_rate": 9.889054686336808e-06,
"loss": 1.2176,
"step": 1960
},
{
"epoch": 0.5693133981709972,
"grad_norm": 3.2320313453674316,
"learning_rate": 9.8888534438904e-06,
"loss": 1.1255,
"step": 1961
},
{
"epoch": 0.5696037160690957,
"grad_norm": 3.6871187686920166,
"learning_rate": 9.888652021145015e-06,
"loss": 1.1531,
"step": 1962
},
{
"epoch": 0.5698940339671941,
"grad_norm": 3.502007007598877,
"learning_rate": 9.888450418108085e-06,
"loss": 1.226,
"step": 1963
},
{
"epoch": 0.5701843518652925,
"grad_norm": 3.3673317432403564,
"learning_rate": 9.888248634787044e-06,
"loss": 1.1027,
"step": 1964
},
{
"epoch": 0.5704746697633909,
"grad_norm": 3.250483751296997,
"learning_rate": 9.888046671189331e-06,
"loss": 1.0451,
"step": 1965
},
{
"epoch": 0.5707649876614893,
"grad_norm": 3.357563018798828,
"learning_rate": 9.887844527322398e-06,
"loss": 1.0807,
"step": 1966
},
{
"epoch": 0.5710553055595877,
"grad_norm": 3.171480655670166,
"learning_rate": 9.887642203193699e-06,
"loss": 1.0291,
"step": 1967
},
{
"epoch": 0.5713456234576861,
"grad_norm": 3.627028703689575,
"learning_rate": 9.887439698810694e-06,
"loss": 1.2565,
"step": 1968
},
{
"epoch": 0.5716359413557845,
"grad_norm": 3.0720813274383545,
"learning_rate": 9.887237014180853e-06,
"loss": 1.0151,
"step": 1969
},
{
"epoch": 0.571926259253883,
"grad_norm": 3.1045854091644287,
"learning_rate": 9.88703414931165e-06,
"loss": 1.0729,
"step": 1970
},
{
"epoch": 0.5722165771519814,
"grad_norm": 3.7090137004852295,
"learning_rate": 9.886831104210567e-06,
"loss": 1.2588,
"step": 1971
},
{
"epoch": 0.5725068950500798,
"grad_norm": 3.418719530105591,
"learning_rate": 9.886627878885093e-06,
"loss": 1.0532,
"step": 1972
},
{
"epoch": 0.5727972129481782,
"grad_norm": 4.114670276641846,
"learning_rate": 9.88642447334272e-06,
"loss": 1.2155,
"step": 1973
},
{
"epoch": 0.5730875308462767,
"grad_norm": 3.3780999183654785,
"learning_rate": 9.886220887590953e-06,
"loss": 1.0976,
"step": 1974
},
{
"epoch": 0.5733778487443751,
"grad_norm": 3.3988523483276367,
"learning_rate": 9.886017121637299e-06,
"loss": 1.1996,
"step": 1975
},
{
"epoch": 0.5736681666424736,
"grad_norm": 3.189674139022827,
"learning_rate": 9.885813175489272e-06,
"loss": 1.0011,
"step": 1976
},
{
"epoch": 0.573958484540572,
"grad_norm": 3.2904489040374756,
"learning_rate": 9.885609049154395e-06,
"loss": 1.0865,
"step": 1977
},
{
"epoch": 0.5742488024386704,
"grad_norm": 3.6734063625335693,
"learning_rate": 9.885404742640192e-06,
"loss": 1.2685,
"step": 1978
},
{
"epoch": 0.5745391203367688,
"grad_norm": 3.972599983215332,
"learning_rate": 9.885200255954203e-06,
"loss": 1.4054,
"step": 1979
},
{
"epoch": 0.5748294382348672,
"grad_norm": 3.402681589126587,
"learning_rate": 9.884995589103967e-06,
"loss": 1.1284,
"step": 1980
},
{
"epoch": 0.5751197561329656,
"grad_norm": 3.822906970977783,
"learning_rate": 9.884790742097032e-06,
"loss": 1.3255,
"step": 1981
},
{
"epoch": 0.575410074031064,
"grad_norm": 3.3857011795043945,
"learning_rate": 9.884585714940953e-06,
"loss": 1.1057,
"step": 1982
},
{
"epoch": 0.5757003919291624,
"grad_norm": 3.3248820304870605,
"learning_rate": 9.884380507643293e-06,
"loss": 1.1137,
"step": 1983
},
{
"epoch": 0.5759907098272609,
"grad_norm": 2.989927053451538,
"learning_rate": 9.884175120211616e-06,
"loss": 0.9767,
"step": 1984
},
{
"epoch": 0.5762810277253593,
"grad_norm": 3.6067261695861816,
"learning_rate": 9.8839695526535e-06,
"loss": 1.2875,
"step": 1985
},
{
"epoch": 0.5765713456234577,
"grad_norm": 3.1623098850250244,
"learning_rate": 9.883763804976525e-06,
"loss": 1.1397,
"step": 1986
},
{
"epoch": 0.5768616635215561,
"grad_norm": 3.215427875518799,
"learning_rate": 9.883557877188276e-06,
"loss": 1.0948,
"step": 1987
},
{
"epoch": 0.5771519814196545,
"grad_norm": 3.4448931217193604,
"learning_rate": 9.883351769296355e-06,
"loss": 1.1696,
"step": 1988
},
{
"epoch": 0.5774422993177529,
"grad_norm": 3.0168240070343018,
"learning_rate": 9.883145481308356e-06,
"loss": 0.9926,
"step": 1989
},
{
"epoch": 0.5777326172158513,
"grad_norm": 3.3162906169891357,
"learning_rate": 9.88293901323189e-06,
"loss": 1.161,
"step": 1990
},
{
"epoch": 0.5780229351139498,
"grad_norm": 3.2119832038879395,
"learning_rate": 9.882732365074572e-06,
"loss": 1.0616,
"step": 1991
},
{
"epoch": 0.5783132530120482,
"grad_norm": 3.3098132610321045,
"learning_rate": 9.88252553684402e-06,
"loss": 1.1829,
"step": 1992
},
{
"epoch": 0.5786035709101466,
"grad_norm": 4.403481960296631,
"learning_rate": 9.882318528547866e-06,
"loss": 1.205,
"step": 1993
},
{
"epoch": 0.578893888808245,
"grad_norm": 3.6638176441192627,
"learning_rate": 9.88211134019374e-06,
"loss": 1.1866,
"step": 1994
},
{
"epoch": 0.5791842067063434,
"grad_norm": 4.162400245666504,
"learning_rate": 9.881903971789285e-06,
"loss": 1.2875,
"step": 1995
},
{
"epoch": 0.5794745246044418,
"grad_norm": 3.9328815937042236,
"learning_rate": 9.88169642334215e-06,
"loss": 1.3008,
"step": 1996
},
{
"epoch": 0.5797648425025402,
"grad_norm": 3.249154567718506,
"learning_rate": 9.88148869485999e-06,
"loss": 1.1718,
"step": 1997
},
{
"epoch": 0.5800551604006386,
"grad_norm": 3.618673324584961,
"learning_rate": 9.88128078635046e-06,
"loss": 1.228,
"step": 1998
},
{
"epoch": 0.5803454782987372,
"grad_norm": 3.3982481956481934,
"learning_rate": 9.881072697821235e-06,
"loss": 1.3055,
"step": 1999
},
{
"epoch": 0.5806357961968356,
"grad_norm": 3.4904940128326416,
"learning_rate": 9.880864429279984e-06,
"loss": 1.2941,
"step": 2000
},
{
"epoch": 0.5806357961968356,
"eval_loss": 1.2087479829788208,
"eval_runtime": 11.634,
"eval_samples_per_second": 34.382,
"eval_steps_per_second": 4.298,
"step": 2000
},
{
"epoch": 0.580926114094934,
"grad_norm": 3.0078070163726807,
"learning_rate": 9.880655980734391e-06,
"loss": 1.0619,
"step": 2001
},
{
"epoch": 0.5812164319930324,
"grad_norm": 3.5126662254333496,
"learning_rate": 9.88044735219214e-06,
"loss": 1.1839,
"step": 2002
},
{
"epoch": 0.5815067498911308,
"grad_norm": 3.569251537322998,
"learning_rate": 9.88023854366093e-06,
"loss": 1.2774,
"step": 2003
},
{
"epoch": 0.5817970677892292,
"grad_norm": 3.7420945167541504,
"learning_rate": 9.880029555148458e-06,
"loss": 1.2724,
"step": 2004
},
{
"epoch": 0.5820873856873277,
"grad_norm": 3.3116486072540283,
"learning_rate": 9.879820386662434e-06,
"loss": 1.1711,
"step": 2005
},
{
"epoch": 0.5823777035854261,
"grad_norm": 3.6330201625823975,
"learning_rate": 9.879611038210569e-06,
"loss": 1.3515,
"step": 2006
},
{
"epoch": 0.5826680214835245,
"grad_norm": 3.2152249813079834,
"learning_rate": 9.879401509800586e-06,
"loss": 1.1697,
"step": 2007
},
{
"epoch": 0.5829583393816229,
"grad_norm": 3.21633243560791,
"learning_rate": 9.87919180144021e-06,
"loss": 1.1338,
"step": 2008
},
{
"epoch": 0.5832486572797213,
"grad_norm": 3.2678074836730957,
"learning_rate": 9.878981913137178e-06,
"loss": 1.0825,
"step": 2009
},
{
"epoch": 0.5835389751778197,
"grad_norm": 3.4714841842651367,
"learning_rate": 9.87877184489923e-06,
"loss": 1.2087,
"step": 2010
},
{
"epoch": 0.5838292930759181,
"grad_norm": 3.3108625411987305,
"learning_rate": 9.878561596734112e-06,
"loss": 1.237,
"step": 2011
},
{
"epoch": 0.5841196109740165,
"grad_norm": 3.4311110973358154,
"learning_rate": 9.878351168649579e-06,
"loss": 1.1453,
"step": 2012
},
{
"epoch": 0.584409928872115,
"grad_norm": 3.5887632369995117,
"learning_rate": 9.878140560653389e-06,
"loss": 1.2367,
"step": 2013
},
{
"epoch": 0.5847002467702134,
"grad_norm": 3.0961368083953857,
"learning_rate": 9.877929772753311e-06,
"loss": 1.1024,
"step": 2014
},
{
"epoch": 0.5849905646683118,
"grad_norm": 3.4218029975891113,
"learning_rate": 9.87771880495712e-06,
"loss": 1.1504,
"step": 2015
},
{
"epoch": 0.5852808825664102,
"grad_norm": 3.509666919708252,
"learning_rate": 9.877507657272596e-06,
"loss": 1.2652,
"step": 2016
},
{
"epoch": 0.5855712004645086,
"grad_norm": 3.555070161819458,
"learning_rate": 9.877296329707522e-06,
"loss": 1.3375,
"step": 2017
},
{
"epoch": 0.585861518362607,
"grad_norm": 3.184847116470337,
"learning_rate": 9.877084822269699e-06,
"loss": 1.1544,
"step": 2018
},
{
"epoch": 0.5861518362607054,
"grad_norm": 3.6347262859344482,
"learning_rate": 9.87687313496692e-06,
"loss": 1.2741,
"step": 2019
},
{
"epoch": 0.5864421541588039,
"grad_norm": 3.189941883087158,
"learning_rate": 9.876661267806995e-06,
"loss": 1.099,
"step": 2020
},
{
"epoch": 0.5867324720569023,
"grad_norm": 3.7185311317443848,
"learning_rate": 9.876449220797738e-06,
"loss": 1.3849,
"step": 2021
},
{
"epoch": 0.5870227899550007,
"grad_norm": 3.4867780208587646,
"learning_rate": 9.87623699394697e-06,
"loss": 1.2745,
"step": 2022
},
{
"epoch": 0.5873131078530991,
"grad_norm": 3.8721914291381836,
"learning_rate": 9.876024587262517e-06,
"loss": 1.2656,
"step": 2023
},
{
"epoch": 0.5876034257511976,
"grad_norm": 3.4358508586883545,
"learning_rate": 9.875812000752212e-06,
"loss": 1.1847,
"step": 2024
},
{
"epoch": 0.587893743649296,
"grad_norm": 3.7810873985290527,
"learning_rate": 9.875599234423895e-06,
"loss": 1.3291,
"step": 2025
},
{
"epoch": 0.5881840615473944,
"grad_norm": 3.518967628479004,
"learning_rate": 9.875386288285413e-06,
"loss": 1.1975,
"step": 2026
},
{
"epoch": 0.5884743794454929,
"grad_norm": 3.171295642852783,
"learning_rate": 9.875173162344618e-06,
"loss": 1.2229,
"step": 2027
},
{
"epoch": 0.5887646973435913,
"grad_norm": 3.1784987449645996,
"learning_rate": 9.874959856609374e-06,
"loss": 1.1273,
"step": 2028
},
{
"epoch": 0.5890550152416897,
"grad_norm": 3.9516916275024414,
"learning_rate": 9.874746371087543e-06,
"loss": 1.1746,
"step": 2029
},
{
"epoch": 0.5893453331397881,
"grad_norm": 3.0694680213928223,
"learning_rate": 9.874532705787e-06,
"loss": 1.0642,
"step": 2030
},
{
"epoch": 0.5896356510378865,
"grad_norm": 3.7301106452941895,
"learning_rate": 9.874318860715628e-06,
"loss": 1.2201,
"step": 2031
},
{
"epoch": 0.5899259689359849,
"grad_norm": 3.441990852355957,
"learning_rate": 9.874104835881308e-06,
"loss": 1.1172,
"step": 2032
},
{
"epoch": 0.5902162868340833,
"grad_norm": 3.696392059326172,
"learning_rate": 9.873890631291938e-06,
"loss": 1.3655,
"step": 2033
},
{
"epoch": 0.5905066047321818,
"grad_norm": 3.153104066848755,
"learning_rate": 9.873676246955415e-06,
"loss": 1.2507,
"step": 2034
},
{
"epoch": 0.5907969226302802,
"grad_norm": 3.5448312759399414,
"learning_rate": 9.873461682879646e-06,
"loss": 1.2732,
"step": 2035
},
{
"epoch": 0.5910872405283786,
"grad_norm": 3.10785174369812,
"learning_rate": 9.873246939072543e-06,
"loss": 1.1011,
"step": 2036
},
{
"epoch": 0.591377558426477,
"grad_norm": 3.3473784923553467,
"learning_rate": 9.873032015542027e-06,
"loss": 1.2466,
"step": 2037
},
{
"epoch": 0.5916678763245754,
"grad_norm": 3.331484794616699,
"learning_rate": 9.872816912296025e-06,
"loss": 1.1508,
"step": 2038
},
{
"epoch": 0.5919581942226738,
"grad_norm": 3.114262342453003,
"learning_rate": 9.872601629342468e-06,
"loss": 1.0387,
"step": 2039
},
{
"epoch": 0.5922485121207722,
"grad_norm": 3.09680438041687,
"learning_rate": 9.872386166689298e-06,
"loss": 1.271,
"step": 2040
},
{
"epoch": 0.5925388300188706,
"grad_norm": 3.1893184185028076,
"learning_rate": 9.872170524344458e-06,
"loss": 1.2041,
"step": 2041
},
{
"epoch": 0.5928291479169691,
"grad_norm": 3.161381959915161,
"learning_rate": 9.871954702315905e-06,
"loss": 0.9993,
"step": 2042
},
{
"epoch": 0.5931194658150675,
"grad_norm": 3.595938205718994,
"learning_rate": 9.871738700611593e-06,
"loss": 1.2812,
"step": 2043
},
{
"epoch": 0.5934097837131659,
"grad_norm": 3.2868971824645996,
"learning_rate": 9.871522519239493e-06,
"loss": 1.1479,
"step": 2044
},
{
"epoch": 0.5937001016112643,
"grad_norm": 3.5087060928344727,
"learning_rate": 9.871306158207575e-06,
"loss": 1.1772,
"step": 2045
},
{
"epoch": 0.5939904195093627,
"grad_norm": 3.3445980548858643,
"learning_rate": 9.87108961752382e-06,
"loss": 1.1756,
"step": 2046
},
{
"epoch": 0.5942807374074611,
"grad_norm": 3.3986401557922363,
"learning_rate": 9.870872897196211e-06,
"loss": 1.0586,
"step": 2047
},
{
"epoch": 0.5945710553055595,
"grad_norm": 3.7029547691345215,
"learning_rate": 9.870655997232743e-06,
"loss": 1.1999,
"step": 2048
},
{
"epoch": 0.594861373203658,
"grad_norm": 3.178635597229004,
"learning_rate": 9.870438917641416e-06,
"loss": 1.1305,
"step": 2049
},
{
"epoch": 0.5951516911017565,
"grad_norm": 3.1712300777435303,
"learning_rate": 9.870221658430233e-06,
"loss": 1.0952,
"step": 2050
},
{
"epoch": 0.5954420089998549,
"grad_norm": 3.49641489982605,
"learning_rate": 9.87000421960721e-06,
"loss": 1.1492,
"step": 2051
},
{
"epoch": 0.5957323268979533,
"grad_norm": 2.970425605773926,
"learning_rate": 9.869786601180362e-06,
"loss": 1.1189,
"step": 2052
},
{
"epoch": 0.5960226447960517,
"grad_norm": 3.2928876876831055,
"learning_rate": 9.869568803157717e-06,
"loss": 1.1222,
"step": 2053
},
{
"epoch": 0.5963129626941501,
"grad_norm": 3.36665678024292,
"learning_rate": 9.869350825547308e-06,
"loss": 1.2153,
"step": 2054
},
{
"epoch": 0.5966032805922485,
"grad_norm": 3.5911707878112793,
"learning_rate": 9.86913266835717e-06,
"loss": 1.2397,
"step": 2055
},
{
"epoch": 0.596893598490347,
"grad_norm": 3.268590211868286,
"learning_rate": 9.868914331595355e-06,
"loss": 1.1961,
"step": 2056
},
{
"epoch": 0.5971839163884454,
"grad_norm": 3.1666085720062256,
"learning_rate": 9.86869581526991e-06,
"loss": 1.0769,
"step": 2057
},
{
"epoch": 0.5974742342865438,
"grad_norm": 3.4120047092437744,
"learning_rate": 9.868477119388897e-06,
"loss": 1.1425,
"step": 2058
},
{
"epoch": 0.5977645521846422,
"grad_norm": 3.238154888153076,
"learning_rate": 9.868258243960378e-06,
"loss": 1.215,
"step": 2059
},
{
"epoch": 0.5980548700827406,
"grad_norm": 3.396493434906006,
"learning_rate": 9.868039188992427e-06,
"loss": 1.1295,
"step": 2060
},
{
"epoch": 0.598345187980839,
"grad_norm": 3.3043999671936035,
"learning_rate": 9.867819954493123e-06,
"loss": 1.0419,
"step": 2061
},
{
"epoch": 0.5986355058789374,
"grad_norm": 3.630920886993408,
"learning_rate": 9.86760054047055e-06,
"loss": 1.303,
"step": 2062
},
{
"epoch": 0.5989258237770358,
"grad_norm": 3.177386522293091,
"learning_rate": 9.867380946932803e-06,
"loss": 1.0805,
"step": 2063
},
{
"epoch": 0.5992161416751343,
"grad_norm": 3.366000175476074,
"learning_rate": 9.867161173887976e-06,
"loss": 1.1559,
"step": 2064
},
{
"epoch": 0.5995064595732327,
"grad_norm": 3.76708984375,
"learning_rate": 9.866941221344176e-06,
"loss": 1.4349,
"step": 2065
},
{
"epoch": 0.5997967774713311,
"grad_norm": 3.7043986320495605,
"learning_rate": 9.866721089309516e-06,
"loss": 1.1992,
"step": 2066
},
{
"epoch": 0.6000870953694295,
"grad_norm": 3.3993911743164062,
"learning_rate": 9.866500777792115e-06,
"loss": 1.1641,
"step": 2067
},
{
"epoch": 0.6003774132675279,
"grad_norm": 4.189173221588135,
"learning_rate": 9.866280286800093e-06,
"loss": 1.3878,
"step": 2068
},
{
"epoch": 0.6006677311656263,
"grad_norm": 3.4979851245880127,
"learning_rate": 9.86605961634159e-06,
"loss": 1.2999,
"step": 2069
},
{
"epoch": 0.6009580490637247,
"grad_norm": 3.66668963432312,
"learning_rate": 9.865838766424735e-06,
"loss": 1.0979,
"step": 2070
},
{
"epoch": 0.6012483669618232,
"grad_norm": 3.5483312606811523,
"learning_rate": 9.86561773705768e-06,
"loss": 1.1104,
"step": 2071
},
{
"epoch": 0.6015386848599216,
"grad_norm": 3.277080774307251,
"learning_rate": 9.865396528248572e-06,
"loss": 1.2044,
"step": 2072
},
{
"epoch": 0.60182900275802,
"grad_norm": 3.374983549118042,
"learning_rate": 9.865175140005571e-06,
"loss": 1.1618,
"step": 2073
},
{
"epoch": 0.6021193206561184,
"grad_norm": 3.7250962257385254,
"learning_rate": 9.864953572336843e-06,
"loss": 1.1848,
"step": 2074
},
{
"epoch": 0.6024096385542169,
"grad_norm": 3.2824532985687256,
"learning_rate": 9.864731825250557e-06,
"loss": 1.1748,
"step": 2075
},
{
"epoch": 0.6026999564523153,
"grad_norm": 3.4487054347991943,
"learning_rate": 9.864509898754891e-06,
"loss": 1.2878,
"step": 2076
},
{
"epoch": 0.6029902743504137,
"grad_norm": 3.3688509464263916,
"learning_rate": 9.864287792858032e-06,
"loss": 1.0886,
"step": 2077
},
{
"epoch": 0.6032805922485122,
"grad_norm": 3.505753517150879,
"learning_rate": 9.864065507568168e-06,
"loss": 1.2099,
"step": 2078
},
{
"epoch": 0.6035709101466106,
"grad_norm": 3.142094850540161,
"learning_rate": 9.863843042893499e-06,
"loss": 1.1276,
"step": 2079
},
{
"epoch": 0.603861228044709,
"grad_norm": 3.47158145904541,
"learning_rate": 9.863620398842229e-06,
"loss": 1.4327,
"step": 2080
},
{
"epoch": 0.6041515459428074,
"grad_norm": 3.2158539295196533,
"learning_rate": 9.863397575422569e-06,
"loss": 1.1101,
"step": 2081
},
{
"epoch": 0.6044418638409058,
"grad_norm": 3.1480183601379395,
"learning_rate": 9.863174572642736e-06,
"loss": 1.1376,
"step": 2082
},
{
"epoch": 0.6047321817390042,
"grad_norm": 3.2654166221618652,
"learning_rate": 9.862951390510953e-06,
"loss": 1.0447,
"step": 2083
},
{
"epoch": 0.6050224996371026,
"grad_norm": 3.2870917320251465,
"learning_rate": 9.862728029035454e-06,
"loss": 1.0577,
"step": 2084
},
{
"epoch": 0.605312817535201,
"grad_norm": 3.607374429702759,
"learning_rate": 9.862504488224477e-06,
"loss": 1.1754,
"step": 2085
},
{
"epoch": 0.6056031354332995,
"grad_norm": 4.0213470458984375,
"learning_rate": 9.86228076808626e-06,
"loss": 1.2472,
"step": 2086
},
{
"epoch": 0.6058934533313979,
"grad_norm": 3.1948390007019043,
"learning_rate": 9.86205686862906e-06,
"loss": 1.0298,
"step": 2087
},
{
"epoch": 0.6061837712294963,
"grad_norm": 3.687624454498291,
"learning_rate": 9.861832789861132e-06,
"loss": 1.1702,
"step": 2088
},
{
"epoch": 0.6064740891275947,
"grad_norm": 3.001420259475708,
"learning_rate": 9.861608531790741e-06,
"loss": 1.0514,
"step": 2089
},
{
"epoch": 0.6067644070256931,
"grad_norm": 3.481722354888916,
"learning_rate": 9.861384094426155e-06,
"loss": 1.1585,
"step": 2090
},
{
"epoch": 0.6070547249237915,
"grad_norm": 3.38626766204834,
"learning_rate": 9.861159477775653e-06,
"loss": 1.2134,
"step": 2091
},
{
"epoch": 0.60734504282189,
"grad_norm": 3.476393699645996,
"learning_rate": 9.86093468184752e-06,
"loss": 1.1685,
"step": 2092
},
{
"epoch": 0.6076353607199884,
"grad_norm": 3.7456226348876953,
"learning_rate": 9.860709706650043e-06,
"loss": 1.3925,
"step": 2093
},
{
"epoch": 0.6079256786180868,
"grad_norm": 3.1248064041137695,
"learning_rate": 9.860484552191523e-06,
"loss": 1.3072,
"step": 2094
},
{
"epoch": 0.6082159965161852,
"grad_norm": 3.2425031661987305,
"learning_rate": 9.860259218480259e-06,
"loss": 1.1772,
"step": 2095
},
{
"epoch": 0.6085063144142836,
"grad_norm": 3.4490549564361572,
"learning_rate": 9.860033705524566e-06,
"loss": 1.149,
"step": 2096
},
{
"epoch": 0.608796632312382,
"grad_norm": 3.476717948913574,
"learning_rate": 9.859808013332758e-06,
"loss": 1.1662,
"step": 2097
},
{
"epoch": 0.6090869502104804,
"grad_norm": 3.7627527713775635,
"learning_rate": 9.859582141913159e-06,
"loss": 1.2424,
"step": 2098
},
{
"epoch": 0.6093772681085788,
"grad_norm": 3.658005952835083,
"learning_rate": 9.859356091274099e-06,
"loss": 1.3146,
"step": 2099
},
{
"epoch": 0.6096675860066774,
"grad_norm": 3.8518424034118652,
"learning_rate": 9.859129861423915e-06,
"loss": 1.3079,
"step": 2100
},
{
"epoch": 0.6099579039047758,
"grad_norm": 3.3938114643096924,
"learning_rate": 9.858903452370949e-06,
"loss": 1.1353,
"step": 2101
},
{
"epoch": 0.6102482218028742,
"grad_norm": 3.4915430545806885,
"learning_rate": 9.858676864123553e-06,
"loss": 1.2039,
"step": 2102
},
{
"epoch": 0.6105385397009726,
"grad_norm": 3.37498140335083,
"learning_rate": 9.858450096690082e-06,
"loss": 1.1422,
"step": 2103
},
{
"epoch": 0.610828857599071,
"grad_norm": 3.400315761566162,
"learning_rate": 9.858223150078898e-06,
"loss": 1.1419,
"step": 2104
},
{
"epoch": 0.6111191754971694,
"grad_norm": 3.458354949951172,
"learning_rate": 9.857996024298374e-06,
"loss": 1.2601,
"step": 2105
},
{
"epoch": 0.6114094933952678,
"grad_norm": 3.3237063884735107,
"learning_rate": 9.857768719356884e-06,
"loss": 1.1714,
"step": 2106
},
{
"epoch": 0.6116998112933663,
"grad_norm": 3.4677350521087646,
"learning_rate": 9.85754123526281e-06,
"loss": 1.2117,
"step": 2107
},
{
"epoch": 0.6119901291914647,
"grad_norm": 3.1911838054656982,
"learning_rate": 9.857313572024545e-06,
"loss": 1.2366,
"step": 2108
},
{
"epoch": 0.6122804470895631,
"grad_norm": 3.291783332824707,
"learning_rate": 9.857085729650483e-06,
"loss": 1.1905,
"step": 2109
},
{
"epoch": 0.6125707649876615,
"grad_norm": 3.323556900024414,
"learning_rate": 9.856857708149025e-06,
"loss": 1.0904,
"step": 2110
},
{
"epoch": 0.6128610828857599,
"grad_norm": 3.2824766635894775,
"learning_rate": 9.856629507528583e-06,
"loss": 1.2211,
"step": 2111
},
{
"epoch": 0.6131514007838583,
"grad_norm": 3.0334928035736084,
"learning_rate": 9.856401127797572e-06,
"loss": 1.1749,
"step": 2112
},
{
"epoch": 0.6134417186819567,
"grad_norm": 3.456289291381836,
"learning_rate": 9.856172568964415e-06,
"loss": 1.341,
"step": 2113
},
{
"epoch": 0.6137320365800552,
"grad_norm": 3.350088119506836,
"learning_rate": 9.85594383103754e-06,
"loss": 1.2313,
"step": 2114
},
{
"epoch": 0.6140223544781536,
"grad_norm": 3.1120920181274414,
"learning_rate": 9.855714914025386e-06,
"loss": 0.9967,
"step": 2115
},
{
"epoch": 0.614312672376252,
"grad_norm": 3.164459228515625,
"learning_rate": 9.85548581793639e-06,
"loss": 1.1195,
"step": 2116
},
{
"epoch": 0.6146029902743504,
"grad_norm": 3.0181405544281006,
"learning_rate": 9.855256542779006e-06,
"loss": 1.0873,
"step": 2117
},
{
"epoch": 0.6148933081724488,
"grad_norm": 3.3663463592529297,
"learning_rate": 9.855027088561686e-06,
"loss": 1.2191,
"step": 2118
},
{
"epoch": 0.6151836260705472,
"grad_norm": 3.3779163360595703,
"learning_rate": 9.854797455292892e-06,
"loss": 1.1955,
"step": 2119
},
{
"epoch": 0.6154739439686456,
"grad_norm": 3.592621326446533,
"learning_rate": 9.854567642981098e-06,
"loss": 1.1278,
"step": 2120
},
{
"epoch": 0.615764261866744,
"grad_norm": 3.7898104190826416,
"learning_rate": 9.854337651634773e-06,
"loss": 1.2219,
"step": 2121
},
{
"epoch": 0.6160545797648425,
"grad_norm": 3.6015334129333496,
"learning_rate": 9.854107481262405e-06,
"loss": 1.1104,
"step": 2122
},
{
"epoch": 0.6163448976629409,
"grad_norm": 3.665905237197876,
"learning_rate": 9.853877131872475e-06,
"loss": 1.1972,
"step": 2123
},
{
"epoch": 0.6166352155610393,
"grad_norm": 3.183523416519165,
"learning_rate": 9.853646603473486e-06,
"loss": 1.1768,
"step": 2124
},
{
"epoch": 0.6169255334591378,
"grad_norm": 3.5019726753234863,
"learning_rate": 9.853415896073935e-06,
"loss": 1.1711,
"step": 2125
},
{
"epoch": 0.6172158513572362,
"grad_norm": 3.454185962677002,
"learning_rate": 9.853185009682332e-06,
"loss": 1.3214,
"step": 2126
},
{
"epoch": 0.6175061692553346,
"grad_norm": 3.2762303352355957,
"learning_rate": 9.852953944307192e-06,
"loss": 1.1759,
"step": 2127
},
{
"epoch": 0.617796487153433,
"grad_norm": 3.4141921997070312,
"learning_rate": 9.852722699957036e-06,
"loss": 1.1992,
"step": 2128
},
{
"epoch": 0.6180868050515315,
"grad_norm": 3.2765979766845703,
"learning_rate": 9.852491276640393e-06,
"loss": 1.0911,
"step": 2129
},
{
"epoch": 0.6183771229496299,
"grad_norm": 3.3329086303710938,
"learning_rate": 9.852259674365798e-06,
"loss": 1.1718,
"step": 2130
},
{
"epoch": 0.6186674408477283,
"grad_norm": 3.2059311866760254,
"learning_rate": 9.852027893141791e-06,
"loss": 1.0346,
"step": 2131
},
{
"epoch": 0.6189577587458267,
"grad_norm": 3.5297727584838867,
"learning_rate": 9.851795932976919e-06,
"loss": 1.1456,
"step": 2132
},
{
"epoch": 0.6192480766439251,
"grad_norm": 3.6350655555725098,
"learning_rate": 9.851563793879742e-06,
"loss": 1.1363,
"step": 2133
},
{
"epoch": 0.6195383945420235,
"grad_norm": 3.7481045722961426,
"learning_rate": 9.851331475858813e-06,
"loss": 1.285,
"step": 2134
},
{
"epoch": 0.6198287124401219,
"grad_norm": 3.4366955757141113,
"learning_rate": 9.851098978922708e-06,
"loss": 1.1945,
"step": 2135
},
{
"epoch": 0.6201190303382204,
"grad_norm": 3.219010829925537,
"learning_rate": 9.850866303079997e-06,
"loss": 1.15,
"step": 2136
},
{
"epoch": 0.6204093482363188,
"grad_norm": 3.1487579345703125,
"learning_rate": 9.850633448339262e-06,
"loss": 1.1192,
"step": 2137
},
{
"epoch": 0.6206996661344172,
"grad_norm": 3.2304723262786865,
"learning_rate": 9.85040041470909e-06,
"loss": 1.1732,
"step": 2138
},
{
"epoch": 0.6209899840325156,
"grad_norm": 3.366379737854004,
"learning_rate": 9.850167202198075e-06,
"loss": 1.1433,
"step": 2139
},
{
"epoch": 0.621280301930614,
"grad_norm": 3.346491575241089,
"learning_rate": 9.849933810814819e-06,
"loss": 1.2081,
"step": 2140
},
{
"epoch": 0.6215706198287124,
"grad_norm": 3.2903072834014893,
"learning_rate": 9.849700240567928e-06,
"loss": 1.1726,
"step": 2141
},
{
"epoch": 0.6218609377268108,
"grad_norm": 3.296185255050659,
"learning_rate": 9.849466491466017e-06,
"loss": 1.1533,
"step": 2142
},
{
"epoch": 0.6221512556249092,
"grad_norm": 3.3224129676818848,
"learning_rate": 9.849232563517706e-06,
"loss": 1.1278,
"step": 2143
},
{
"epoch": 0.6224415735230077,
"grad_norm": 3.283273458480835,
"learning_rate": 9.848998456731622e-06,
"loss": 1.1298,
"step": 2144
},
{
"epoch": 0.6227318914211061,
"grad_norm": 3.311249017715454,
"learning_rate": 9.848764171116401e-06,
"loss": 1.1447,
"step": 2145
},
{
"epoch": 0.6230222093192045,
"grad_norm": 2.9450314044952393,
"learning_rate": 9.84852970668068e-06,
"loss": 1.0555,
"step": 2146
},
{
"epoch": 0.6233125272173029,
"grad_norm": 3.447150707244873,
"learning_rate": 9.848295063433108e-06,
"loss": 1.2113,
"step": 2147
},
{
"epoch": 0.6236028451154013,
"grad_norm": 3.5015945434570312,
"learning_rate": 9.848060241382339e-06,
"loss": 1.1897,
"step": 2148
},
{
"epoch": 0.6238931630134997,
"grad_norm": 2.8743700981140137,
"learning_rate": 9.84782524053703e-06,
"loss": 1.0311,
"step": 2149
},
{
"epoch": 0.6241834809115983,
"grad_norm": 3.2919156551361084,
"learning_rate": 9.847590060905851e-06,
"loss": 1.2051,
"step": 2150
},
{
"epoch": 0.6244737988096967,
"grad_norm": 3.5934643745422363,
"learning_rate": 9.847354702497475e-06,
"loss": 1.165,
"step": 2151
},
{
"epoch": 0.6247641167077951,
"grad_norm": 3.7064201831817627,
"learning_rate": 9.84711916532058e-06,
"loss": 1.1528,
"step": 2152
},
{
"epoch": 0.6250544346058935,
"grad_norm": 3.3166534900665283,
"learning_rate": 9.846883449383854e-06,
"loss": 1.08,
"step": 2153
},
{
"epoch": 0.6253447525039919,
"grad_norm": 3.3062987327575684,
"learning_rate": 9.84664755469599e-06,
"loss": 1.1791,
"step": 2154
},
{
"epoch": 0.6256350704020903,
"grad_norm": 3.352381706237793,
"learning_rate": 9.846411481265687e-06,
"loss": 1.0613,
"step": 2155
},
{
"epoch": 0.6259253883001887,
"grad_norm": 3.193981409072876,
"learning_rate": 9.846175229101654e-06,
"loss": 1.1526,
"step": 2156
},
{
"epoch": 0.6262157061982871,
"grad_norm": 3.394362449645996,
"learning_rate": 9.8459387982126e-06,
"loss": 1.3341,
"step": 2157
},
{
"epoch": 0.6265060240963856,
"grad_norm": 3.4602437019348145,
"learning_rate": 9.845702188607246e-06,
"loss": 1.2346,
"step": 2158
},
{
"epoch": 0.626796341994484,
"grad_norm": 3.1883201599121094,
"learning_rate": 9.845465400294318e-06,
"loss": 1.0331,
"step": 2159
},
{
"epoch": 0.6270866598925824,
"grad_norm": 3.407731056213379,
"learning_rate": 9.84522843328255e-06,
"loss": 1.0798,
"step": 2160
},
{
"epoch": 0.6273769777906808,
"grad_norm": 3.20255184173584,
"learning_rate": 9.84499128758068e-06,
"loss": 1.0629,
"step": 2161
},
{
"epoch": 0.6276672956887792,
"grad_norm": 3.4190375804901123,
"learning_rate": 9.844753963197454e-06,
"loss": 1.3578,
"step": 2162
},
{
"epoch": 0.6279576135868776,
"grad_norm": 3.4097230434417725,
"learning_rate": 9.844516460141622e-06,
"loss": 1.1523,
"step": 2163
},
{
"epoch": 0.628247931484976,
"grad_norm": 3.4188151359558105,
"learning_rate": 9.844278778421947e-06,
"loss": 1.2496,
"step": 2164
},
{
"epoch": 0.6285382493830745,
"grad_norm": 3.40053653717041,
"learning_rate": 9.844040918047194e-06,
"loss": 1.2374,
"step": 2165
},
{
"epoch": 0.6288285672811729,
"grad_norm": 3.3556363582611084,
"learning_rate": 9.843802879026135e-06,
"loss": 1.1735,
"step": 2166
},
{
"epoch": 0.6291188851792713,
"grad_norm": 3.5495386123657227,
"learning_rate": 9.843564661367547e-06,
"loss": 1.2863,
"step": 2167
},
{
"epoch": 0.6294092030773697,
"grad_norm": 3.5947186946868896,
"learning_rate": 9.843326265080215e-06,
"loss": 1.2074,
"step": 2168
},
{
"epoch": 0.6296995209754681,
"grad_norm": 3.3430442810058594,
"learning_rate": 9.843087690172933e-06,
"loss": 1.151,
"step": 2169
},
{
"epoch": 0.6299898388735665,
"grad_norm": 3.2726352214813232,
"learning_rate": 9.8428489366545e-06,
"loss": 1.1052,
"step": 2170
},
{
"epoch": 0.6302801567716649,
"grad_norm": 3.2520737648010254,
"learning_rate": 9.842610004533719e-06,
"loss": 1.189,
"step": 2171
},
{
"epoch": 0.6305704746697633,
"grad_norm": 3.6799371242523193,
"learning_rate": 9.842370893819404e-06,
"loss": 1.2571,
"step": 2172
},
{
"epoch": 0.6308607925678618,
"grad_norm": 3.68361759185791,
"learning_rate": 9.84213160452037e-06,
"loss": 1.3269,
"step": 2173
},
{
"epoch": 0.6311511104659602,
"grad_norm": 3.2377898693084717,
"learning_rate": 9.841892136645445e-06,
"loss": 1.0609,
"step": 2174
},
{
"epoch": 0.6314414283640587,
"grad_norm": 3.5017290115356445,
"learning_rate": 9.84165249020346e-06,
"loss": 1.2455,
"step": 2175
},
{
"epoch": 0.6317317462621571,
"grad_norm": 3.285425901412964,
"learning_rate": 9.841412665203252e-06,
"loss": 1.1918,
"step": 2176
},
{
"epoch": 0.6320220641602555,
"grad_norm": 3.036376476287842,
"learning_rate": 9.841172661653666e-06,
"loss": 0.9972,
"step": 2177
},
{
"epoch": 0.6323123820583539,
"grad_norm": 3.130056858062744,
"learning_rate": 9.840932479563555e-06,
"loss": 1.2004,
"step": 2178
},
{
"epoch": 0.6326026999564524,
"grad_norm": 3.232766628265381,
"learning_rate": 9.840692118941774e-06,
"loss": 1.3199,
"step": 2179
},
{
"epoch": 0.6328930178545508,
"grad_norm": 3.6254005432128906,
"learning_rate": 9.840451579797187e-06,
"loss": 1.2094,
"step": 2180
},
{
"epoch": 0.6331833357526492,
"grad_norm": 3.1795482635498047,
"learning_rate": 9.840210862138669e-06,
"loss": 1.1589,
"step": 2181
},
{
"epoch": 0.6334736536507476,
"grad_norm": 3.2265725135803223,
"learning_rate": 9.839969965975095e-06,
"loss": 1.1383,
"step": 2182
},
{
"epoch": 0.633763971548846,
"grad_norm": 3.373206615447998,
"learning_rate": 9.839728891315347e-06,
"loss": 1.171,
"step": 2183
},
{
"epoch": 0.6340542894469444,
"grad_norm": 4.074607849121094,
"learning_rate": 9.839487638168321e-06,
"loss": 1.2394,
"step": 2184
},
{
"epoch": 0.6343446073450428,
"grad_norm": 3.1658074855804443,
"learning_rate": 9.839246206542909e-06,
"loss": 1.0554,
"step": 2185
},
{
"epoch": 0.6346349252431412,
"grad_norm": 3.2978014945983887,
"learning_rate": 9.839004596448019e-06,
"loss": 1.1405,
"step": 2186
},
{
"epoch": 0.6349252431412397,
"grad_norm": 3.3122334480285645,
"learning_rate": 9.83876280789256e-06,
"loss": 1.3262,
"step": 2187
},
{
"epoch": 0.6352155610393381,
"grad_norm": 3.4572861194610596,
"learning_rate": 9.838520840885449e-06,
"loss": 1.2122,
"step": 2188
},
{
"epoch": 0.6355058789374365,
"grad_norm": 3.5060789585113525,
"learning_rate": 9.838278695435609e-06,
"loss": 1.1584,
"step": 2189
},
{
"epoch": 0.6357961968355349,
"grad_norm": 3.2355239391326904,
"learning_rate": 9.83803637155197e-06,
"loss": 1.0703,
"step": 2190
},
{
"epoch": 0.6360865147336333,
"grad_norm": 3.302013635635376,
"learning_rate": 9.837793869243468e-06,
"loss": 1.2737,
"step": 2191
},
{
"epoch": 0.6363768326317317,
"grad_norm": 3.2123663425445557,
"learning_rate": 9.83755118851905e-06,
"loss": 1.237,
"step": 2192
},
{
"epoch": 0.6366671505298301,
"grad_norm": 3.7422244548797607,
"learning_rate": 9.837308329387664e-06,
"loss": 1.2597,
"step": 2193
},
{
"epoch": 0.6369574684279286,
"grad_norm": 3.2257628440856934,
"learning_rate": 9.837065291858267e-06,
"loss": 1.1498,
"step": 2194
},
{
"epoch": 0.637247786326027,
"grad_norm": 3.217024087905884,
"learning_rate": 9.83682207593982e-06,
"loss": 1.1622,
"step": 2195
},
{
"epoch": 0.6375381042241254,
"grad_norm": 3.39605450630188,
"learning_rate": 9.836578681641295e-06,
"loss": 1.1444,
"step": 2196
},
{
"epoch": 0.6378284221222238,
"grad_norm": 3.1654269695281982,
"learning_rate": 9.836335108971668e-06,
"loss": 1.2435,
"step": 2197
},
{
"epoch": 0.6381187400203222,
"grad_norm": 3.087963104248047,
"learning_rate": 9.83609135793992e-06,
"loss": 1.0901,
"step": 2198
},
{
"epoch": 0.6384090579184206,
"grad_norm": 3.3197085857391357,
"learning_rate": 9.835847428555042e-06,
"loss": 1.152,
"step": 2199
},
{
"epoch": 0.6386993758165191,
"grad_norm": 3.3169407844543457,
"learning_rate": 9.835603320826032e-06,
"loss": 1.1586,
"step": 2200
},
{
"epoch": 0.6389896937146176,
"grad_norm": 3.6764190196990967,
"learning_rate": 9.835359034761888e-06,
"loss": 1.359,
"step": 2201
},
{
"epoch": 0.639280011612716,
"grad_norm": 3.44268798828125,
"learning_rate": 9.835114570371624e-06,
"loss": 1.3031,
"step": 2202
},
{
"epoch": 0.6395703295108144,
"grad_norm": 3.2723872661590576,
"learning_rate": 9.834869927664253e-06,
"loss": 1.2116,
"step": 2203
},
{
"epoch": 0.6398606474089128,
"grad_norm": 3.278549909591675,
"learning_rate": 9.834625106648796e-06,
"loss": 1.2105,
"step": 2204
},
{
"epoch": 0.6401509653070112,
"grad_norm": 3.3444881439208984,
"learning_rate": 9.834380107334284e-06,
"loss": 1.2564,
"step": 2205
},
{
"epoch": 0.6404412832051096,
"grad_norm": 3.176098585128784,
"learning_rate": 9.834134929729752e-06,
"loss": 1.3411,
"step": 2206
},
{
"epoch": 0.640731601103208,
"grad_norm": 3.3489229679107666,
"learning_rate": 9.833889573844245e-06,
"loss": 1.3759,
"step": 2207
},
{
"epoch": 0.6410219190013065,
"grad_norm": 3.012814521789551,
"learning_rate": 9.833644039686806e-06,
"loss": 1.0518,
"step": 2208
},
{
"epoch": 0.6413122368994049,
"grad_norm": 3.1896815299987793,
"learning_rate": 9.833398327266494e-06,
"loss": 1.317,
"step": 2209
},
{
"epoch": 0.6416025547975033,
"grad_norm": 3.2311453819274902,
"learning_rate": 9.83315243659237e-06,
"loss": 1.1722,
"step": 2210
},
{
"epoch": 0.6418928726956017,
"grad_norm": 3.300663471221924,
"learning_rate": 9.8329063676735e-06,
"loss": 1.1816,
"step": 2211
},
{
"epoch": 0.6421831905937001,
"grad_norm": 3.508462429046631,
"learning_rate": 9.832660120518964e-06,
"loss": 1.1476,
"step": 2212
},
{
"epoch": 0.6424735084917985,
"grad_norm": 3.417879581451416,
"learning_rate": 9.832413695137839e-06,
"loss": 1.1925,
"step": 2213
},
{
"epoch": 0.6427638263898969,
"grad_norm": 3.324315071105957,
"learning_rate": 9.832167091539215e-06,
"loss": 1.2362,
"step": 2214
},
{
"epoch": 0.6430541442879953,
"grad_norm": 3.466980457305908,
"learning_rate": 9.831920309732184e-06,
"loss": 1.0621,
"step": 2215
},
{
"epoch": 0.6433444621860938,
"grad_norm": 3.5176475048065186,
"learning_rate": 9.831673349725852e-06,
"loss": 1.1971,
"step": 2216
},
{
"epoch": 0.6436347800841922,
"grad_norm": 3.5018646717071533,
"learning_rate": 9.831426211529324e-06,
"loss": 1.1557,
"step": 2217
},
{
"epoch": 0.6439250979822906,
"grad_norm": 3.705435276031494,
"learning_rate": 9.831178895151715e-06,
"loss": 1.2571,
"step": 2218
},
{
"epoch": 0.644215415880389,
"grad_norm": 4.033883571624756,
"learning_rate": 9.830931400602144e-06,
"loss": 1.3683,
"step": 2219
},
{
"epoch": 0.6445057337784874,
"grad_norm": 3.2899346351623535,
"learning_rate": 9.830683727889741e-06,
"loss": 1.1005,
"step": 2220
},
{
"epoch": 0.6447960516765858,
"grad_norm": 3.1492760181427,
"learning_rate": 9.830435877023639e-06,
"loss": 1.1345,
"step": 2221
},
{
"epoch": 0.6450863695746842,
"grad_norm": 3.2546796798706055,
"learning_rate": 9.830187848012979e-06,
"loss": 1.1064,
"step": 2222
},
{
"epoch": 0.6453766874727827,
"grad_norm": 3.29607892036438,
"learning_rate": 9.829939640866907e-06,
"loss": 1.2367,
"step": 2223
},
{
"epoch": 0.6456670053708811,
"grad_norm": 3.307436466217041,
"learning_rate": 9.82969125559458e-06,
"loss": 1.1053,
"step": 2224
},
{
"epoch": 0.6459573232689795,
"grad_norm": 3.5715291500091553,
"learning_rate": 9.829442692205153e-06,
"loss": 1.2292,
"step": 2225
},
{
"epoch": 0.646247641167078,
"grad_norm": 3.4303536415100098,
"learning_rate": 9.829193950707798e-06,
"loss": 1.1351,
"step": 2226
},
{
"epoch": 0.6465379590651764,
"grad_norm": 2.975395441055298,
"learning_rate": 9.828945031111686e-06,
"loss": 1.0084,
"step": 2227
},
{
"epoch": 0.6468282769632748,
"grad_norm": 3.295159101486206,
"learning_rate": 9.828695933425997e-06,
"loss": 1.1417,
"step": 2228
},
{
"epoch": 0.6471185948613732,
"grad_norm": 3.2531330585479736,
"learning_rate": 9.828446657659919e-06,
"loss": 1.1857,
"step": 2229
},
{
"epoch": 0.6474089127594717,
"grad_norm": 3.3126182556152344,
"learning_rate": 9.828197203822645e-06,
"loss": 1.2185,
"step": 2230
},
{
"epoch": 0.6476992306575701,
"grad_norm": 3.2954418659210205,
"learning_rate": 9.827947571923373e-06,
"loss": 1.1762,
"step": 2231
},
{
"epoch": 0.6479895485556685,
"grad_norm": 3.3297324180603027,
"learning_rate": 9.827697761971311e-06,
"loss": 1.2222,
"step": 2232
},
{
"epoch": 0.6482798664537669,
"grad_norm": 3.3421590328216553,
"learning_rate": 9.827447773975672e-06,
"loss": 1.1304,
"step": 2233
},
{
"epoch": 0.6485701843518653,
"grad_norm": 3.5584068298339844,
"learning_rate": 9.827197607945673e-06,
"loss": 1.2349,
"step": 2234
},
{
"epoch": 0.6488605022499637,
"grad_norm": 3.217658519744873,
"learning_rate": 9.826947263890542e-06,
"loss": 1.1348,
"step": 2235
},
{
"epoch": 0.6491508201480621,
"grad_norm": 3.6436023712158203,
"learning_rate": 9.826696741819513e-06,
"loss": 1.1754,
"step": 2236
},
{
"epoch": 0.6494411380461605,
"grad_norm": 3.1794240474700928,
"learning_rate": 9.826446041741821e-06,
"loss": 1.1274,
"step": 2237
},
{
"epoch": 0.649731455944259,
"grad_norm": 3.486071825027466,
"learning_rate": 9.826195163666717e-06,
"loss": 1.2021,
"step": 2238
},
{
"epoch": 0.6500217738423574,
"grad_norm": 3.734785795211792,
"learning_rate": 9.82594410760345e-06,
"loss": 1.2959,
"step": 2239
},
{
"epoch": 0.6503120917404558,
"grad_norm": 3.603210926055908,
"learning_rate": 9.825692873561278e-06,
"loss": 1.2613,
"step": 2240
},
{
"epoch": 0.6506024096385542,
"grad_norm": 3.3361124992370605,
"learning_rate": 9.825441461549469e-06,
"loss": 1.1428,
"step": 2241
},
{
"epoch": 0.6508927275366526,
"grad_norm": 3.122087240219116,
"learning_rate": 9.825189871577294e-06,
"loss": 1.0691,
"step": 2242
},
{
"epoch": 0.651183045434751,
"grad_norm": 3.1546952724456787,
"learning_rate": 9.824938103654031e-06,
"loss": 1.1187,
"step": 2243
},
{
"epoch": 0.6514733633328494,
"grad_norm": 3.2291035652160645,
"learning_rate": 9.824686157788968e-06,
"loss": 1.0736,
"step": 2244
},
{
"epoch": 0.6517636812309479,
"grad_norm": 3.363553762435913,
"learning_rate": 9.82443403399139e-06,
"loss": 1.182,
"step": 2245
},
{
"epoch": 0.6520539991290463,
"grad_norm": 3.5415096282958984,
"learning_rate": 9.824181732270601e-06,
"loss": 1.2854,
"step": 2246
},
{
"epoch": 0.6523443170271447,
"grad_norm": 3.141082525253296,
"learning_rate": 9.823929252635905e-06,
"loss": 1.155,
"step": 2247
},
{
"epoch": 0.6526346349252431,
"grad_norm": 3.1211352348327637,
"learning_rate": 9.823676595096612e-06,
"loss": 1.0612,
"step": 2248
},
{
"epoch": 0.6529249528233415,
"grad_norm": 3.169532060623169,
"learning_rate": 9.823423759662039e-06,
"loss": 1.1733,
"step": 2249
},
{
"epoch": 0.6532152707214399,
"grad_norm": 3.2215521335601807,
"learning_rate": 9.823170746341513e-06,
"loss": 1.2333,
"step": 2250
},
{
"epoch": 0.6535055886195384,
"grad_norm": 3.0309600830078125,
"learning_rate": 9.822917555144364e-06,
"loss": 1.1244,
"step": 2251
},
{
"epoch": 0.6537959065176369,
"grad_norm": 3.429142475128174,
"learning_rate": 9.822664186079928e-06,
"loss": 1.1219,
"step": 2252
},
{
"epoch": 0.6540862244157353,
"grad_norm": 3.5349714756011963,
"learning_rate": 9.822410639157554e-06,
"loss": 1.1846,
"step": 2253
},
{
"epoch": 0.6543765423138337,
"grad_norm": 3.37827205657959,
"learning_rate": 9.822156914386587e-06,
"loss": 1.083,
"step": 2254
},
{
"epoch": 0.6546668602119321,
"grad_norm": 3.172299861907959,
"learning_rate": 9.821903011776385e-06,
"loss": 1.0561,
"step": 2255
},
{
"epoch": 0.6549571781100305,
"grad_norm": 3.613541841506958,
"learning_rate": 9.821648931336316e-06,
"loss": 1.2298,
"step": 2256
},
{
"epoch": 0.6552474960081289,
"grad_norm": 3.3095669746398926,
"learning_rate": 9.821394673075749e-06,
"loss": 1.1434,
"step": 2257
},
{
"epoch": 0.6555378139062273,
"grad_norm": 3.3738560676574707,
"learning_rate": 9.821140237004056e-06,
"loss": 1.0829,
"step": 2258
},
{
"epoch": 0.6558281318043258,
"grad_norm": 3.2556138038635254,
"learning_rate": 9.820885623130626e-06,
"loss": 1.2057,
"step": 2259
},
{
"epoch": 0.6561184497024242,
"grad_norm": 3.1285338401794434,
"learning_rate": 9.820630831464848e-06,
"loss": 1.0995,
"step": 2260
},
{
"epoch": 0.6564087676005226,
"grad_norm": 3.290846109390259,
"learning_rate": 9.820375862016116e-06,
"loss": 1.1008,
"step": 2261
},
{
"epoch": 0.656699085498621,
"grad_norm": 3.7028110027313232,
"learning_rate": 9.820120714793837e-06,
"loss": 1.296,
"step": 2262
},
{
"epoch": 0.6569894033967194,
"grad_norm": 3.056378126144409,
"learning_rate": 9.819865389807418e-06,
"loss": 1.1055,
"step": 2263
},
{
"epoch": 0.6572797212948178,
"grad_norm": 3.3602118492126465,
"learning_rate": 9.819609887066277e-06,
"loss": 1.2804,
"step": 2264
},
{
"epoch": 0.6575700391929162,
"grad_norm": 3.4260177612304688,
"learning_rate": 9.819354206579837e-06,
"loss": 1.1645,
"step": 2265
},
{
"epoch": 0.6578603570910146,
"grad_norm": 3.3738510608673096,
"learning_rate": 9.819098348357524e-06,
"loss": 1.2217,
"step": 2266
},
{
"epoch": 0.6581506749891131,
"grad_norm": 3.576476573944092,
"learning_rate": 9.818842312408776e-06,
"loss": 1.1926,
"step": 2267
},
{
"epoch": 0.6584409928872115,
"grad_norm": 3.448089838027954,
"learning_rate": 9.818586098743038e-06,
"loss": 1.3726,
"step": 2268
},
{
"epoch": 0.6587313107853099,
"grad_norm": 3.3965907096862793,
"learning_rate": 9.818329707369755e-06,
"loss": 1.2387,
"step": 2269
},
{
"epoch": 0.6590216286834083,
"grad_norm": 3.6523730754852295,
"learning_rate": 9.818073138298386e-06,
"loss": 1.1913,
"step": 2270
},
{
"epoch": 0.6593119465815067,
"grad_norm": 3.646683931350708,
"learning_rate": 9.817816391538391e-06,
"loss": 1.231,
"step": 2271
},
{
"epoch": 0.6596022644796051,
"grad_norm": 2.9595463275909424,
"learning_rate": 9.81755946709924e-06,
"loss": 1.165,
"step": 2272
},
{
"epoch": 0.6598925823777035,
"grad_norm": 3.1737749576568604,
"learning_rate": 9.817302364990406e-06,
"loss": 1.0447,
"step": 2273
},
{
"epoch": 0.660182900275802,
"grad_norm": 3.2275867462158203,
"learning_rate": 9.817045085221373e-06,
"loss": 1.1765,
"step": 2274
},
{
"epoch": 0.6604732181739004,
"grad_norm": 3.4508190155029297,
"learning_rate": 9.81678762780163e-06,
"loss": 1.2429,
"step": 2275
},
{
"epoch": 0.6607635360719989,
"grad_norm": 3.456575632095337,
"learning_rate": 9.81652999274067e-06,
"loss": 1.2266,
"step": 2276
},
{
"epoch": 0.6610538539700973,
"grad_norm": 3.2471117973327637,
"learning_rate": 9.816272180047996e-06,
"loss": 1.0078,
"step": 2277
},
{
"epoch": 0.6613441718681957,
"grad_norm": 3.268442153930664,
"learning_rate": 9.816014189733114e-06,
"loss": 1.1238,
"step": 2278
},
{
"epoch": 0.6616344897662941,
"grad_norm": 3.4898526668548584,
"learning_rate": 9.81575602180554e-06,
"loss": 1.1437,
"step": 2279
},
{
"epoch": 0.6619248076643925,
"grad_norm": 3.3566908836364746,
"learning_rate": 9.815497676274796e-06,
"loss": 1.0441,
"step": 2280
},
{
"epoch": 0.662215125562491,
"grad_norm": 3.3789467811584473,
"learning_rate": 9.815239153150408e-06,
"loss": 1.1994,
"step": 2281
},
{
"epoch": 0.6625054434605894,
"grad_norm": 3.390451669692993,
"learning_rate": 9.81498045244191e-06,
"loss": 1.3149,
"step": 2282
},
{
"epoch": 0.6627957613586878,
"grad_norm": 3.3824403285980225,
"learning_rate": 9.814721574158846e-06,
"loss": 1.076,
"step": 2283
},
{
"epoch": 0.6630860792567862,
"grad_norm": 3.420539379119873,
"learning_rate": 9.81446251831076e-06,
"loss": 1.193,
"step": 2284
},
{
"epoch": 0.6633763971548846,
"grad_norm": 3.389395236968994,
"learning_rate": 9.814203284907207e-06,
"loss": 1.1161,
"step": 2285
},
{
"epoch": 0.663666715052983,
"grad_norm": 3.054683208465576,
"learning_rate": 9.813943873957748e-06,
"loss": 1.055,
"step": 2286
},
{
"epoch": 0.6639570329510814,
"grad_norm": 2.9350805282592773,
"learning_rate": 9.813684285471947e-06,
"loss": 1.0195,
"step": 2287
},
{
"epoch": 0.6642473508491799,
"grad_norm": 3.091355800628662,
"learning_rate": 9.81342451945938e-06,
"loss": 1.0988,
"step": 2288
},
{
"epoch": 0.6645376687472783,
"grad_norm": 3.1102099418640137,
"learning_rate": 9.813164575929628e-06,
"loss": 1.0639,
"step": 2289
},
{
"epoch": 0.6648279866453767,
"grad_norm": 3.5209128856658936,
"learning_rate": 9.812904454892276e-06,
"loss": 1.2014,
"step": 2290
},
{
"epoch": 0.6651183045434751,
"grad_norm": 3.12597393989563,
"learning_rate": 9.812644156356919e-06,
"loss": 1.0899,
"step": 2291
},
{
"epoch": 0.6654086224415735,
"grad_norm": 2.8330626487731934,
"learning_rate": 9.812383680333155e-06,
"loss": 1.1208,
"step": 2292
},
{
"epoch": 0.6656989403396719,
"grad_norm": 3.543325185775757,
"learning_rate": 9.812123026830589e-06,
"loss": 1.1893,
"step": 2293
},
{
"epoch": 0.6659892582377703,
"grad_norm": 3.1367380619049072,
"learning_rate": 9.811862195858837e-06,
"loss": 1.1395,
"step": 2294
},
{
"epoch": 0.6662795761358687,
"grad_norm": 3.0807571411132812,
"learning_rate": 9.811601187427516e-06,
"loss": 1.1274,
"step": 2295
},
{
"epoch": 0.6665698940339672,
"grad_norm": 3.28458309173584,
"learning_rate": 9.811340001546252e-06,
"loss": 1.0711,
"step": 2296
},
{
"epoch": 0.6668602119320656,
"grad_norm": 3.28643798828125,
"learning_rate": 9.81107863822468e-06,
"loss": 1.2233,
"step": 2297
},
{
"epoch": 0.667150529830164,
"grad_norm": 3.4898693561553955,
"learning_rate": 9.810817097472436e-06,
"loss": 1.2142,
"step": 2298
},
{
"epoch": 0.6674408477282624,
"grad_norm": 3.2157557010650635,
"learning_rate": 9.810555379299166e-06,
"loss": 1.2659,
"step": 2299
},
{
"epoch": 0.6677311656263608,
"grad_norm": 3.494442939758301,
"learning_rate": 9.810293483714523e-06,
"loss": 1.2787,
"step": 2300
},
{
"epoch": 0.6680214835244593,
"grad_norm": 3.61946702003479,
"learning_rate": 9.810031410728164e-06,
"loss": 1.1851,
"step": 2301
},
{
"epoch": 0.6683118014225577,
"grad_norm": 3.2607109546661377,
"learning_rate": 9.809769160349758e-06,
"loss": 1.1155,
"step": 2302
},
{
"epoch": 0.6686021193206562,
"grad_norm": 3.383884906768799,
"learning_rate": 9.809506732588972e-06,
"loss": 1.2479,
"step": 2303
},
{
"epoch": 0.6688924372187546,
"grad_norm": 3.2273740768432617,
"learning_rate": 9.809244127455488e-06,
"loss": 1.1941,
"step": 2304
},
{
"epoch": 0.669182755116853,
"grad_norm": 3.4954328536987305,
"learning_rate": 9.808981344958988e-06,
"loss": 1.1645,
"step": 2305
},
{
"epoch": 0.6694730730149514,
"grad_norm": 3.2053277492523193,
"learning_rate": 9.808718385109165e-06,
"loss": 1.2592,
"step": 2306
},
{
"epoch": 0.6697633909130498,
"grad_norm": 3.0955846309661865,
"learning_rate": 9.808455247915715e-06,
"loss": 1.2793,
"step": 2307
},
{
"epoch": 0.6700537088111482,
"grad_norm": 3.197502374649048,
"learning_rate": 9.808191933388345e-06,
"loss": 1.0838,
"step": 2308
},
{
"epoch": 0.6703440267092466,
"grad_norm": 3.3631088733673096,
"learning_rate": 9.807928441536762e-06,
"loss": 1.1083,
"step": 2309
},
{
"epoch": 0.6706343446073451,
"grad_norm": 2.953148126602173,
"learning_rate": 9.807664772370689e-06,
"loss": 1.0448,
"step": 2310
},
{
"epoch": 0.6709246625054435,
"grad_norm": 3.3612277507781982,
"learning_rate": 9.807400925899846e-06,
"loss": 1.0393,
"step": 2311
},
{
"epoch": 0.6712149804035419,
"grad_norm": 3.6656582355499268,
"learning_rate": 9.807136902133965e-06,
"loss": 1.2362,
"step": 2312
},
{
"epoch": 0.6715052983016403,
"grad_norm": 3.5118401050567627,
"learning_rate": 9.806872701082781e-06,
"loss": 1.2117,
"step": 2313
},
{
"epoch": 0.6717956161997387,
"grad_norm": 3.3114728927612305,
"learning_rate": 9.806608322756042e-06,
"loss": 1.1594,
"step": 2314
},
{
"epoch": 0.6720859340978371,
"grad_norm": 3.28566837310791,
"learning_rate": 9.806343767163494e-06,
"loss": 1.1699,
"step": 2315
},
{
"epoch": 0.6723762519959355,
"grad_norm": 3.1415863037109375,
"learning_rate": 9.806079034314895e-06,
"loss": 1.0319,
"step": 2316
},
{
"epoch": 0.672666569894034,
"grad_norm": 3.3450355529785156,
"learning_rate": 9.80581412422001e-06,
"loss": 1.1448,
"step": 2317
},
{
"epoch": 0.6729568877921324,
"grad_norm": 3.2889275550842285,
"learning_rate": 9.805549036888605e-06,
"loss": 1.1007,
"step": 2318
},
{
"epoch": 0.6732472056902308,
"grad_norm": 3.367488384246826,
"learning_rate": 9.80528377233046e-06,
"loss": 1.1438,
"step": 2319
},
{
"epoch": 0.6735375235883292,
"grad_norm": 3.3112919330596924,
"learning_rate": 9.805018330555356e-06,
"loss": 1.3459,
"step": 2320
},
{
"epoch": 0.6738278414864276,
"grad_norm": 3.415867567062378,
"learning_rate": 9.804752711573082e-06,
"loss": 1.1417,
"step": 2321
},
{
"epoch": 0.674118159384526,
"grad_norm": 3.7435660362243652,
"learning_rate": 9.804486915393437e-06,
"loss": 1.3839,
"step": 2322
},
{
"epoch": 0.6744084772826244,
"grad_norm": 3.293759822845459,
"learning_rate": 9.80422094202622e-06,
"loss": 1.103,
"step": 2323
},
{
"epoch": 0.6746987951807228,
"grad_norm": 3.387779474258423,
"learning_rate": 9.803954791481239e-06,
"loss": 1.2076,
"step": 2324
},
{
"epoch": 0.6749891130788213,
"grad_norm": 3.345348358154297,
"learning_rate": 9.803688463768314e-06,
"loss": 1.1311,
"step": 2325
},
{
"epoch": 0.6752794309769198,
"grad_norm": 3.251539707183838,
"learning_rate": 9.803421958897264e-06,
"loss": 1.1487,
"step": 2326
},
{
"epoch": 0.6755697488750182,
"grad_norm": 3.229526996612549,
"learning_rate": 9.803155276877918e-06,
"loss": 1.1344,
"step": 2327
},
{
"epoch": 0.6758600667731166,
"grad_norm": 3.530510187149048,
"learning_rate": 9.802888417720113e-06,
"loss": 1.2112,
"step": 2328
},
{
"epoch": 0.676150384671215,
"grad_norm": 3.2944540977478027,
"learning_rate": 9.802621381433687e-06,
"loss": 1.2135,
"step": 2329
},
{
"epoch": 0.6764407025693134,
"grad_norm": 3.1269474029541016,
"learning_rate": 9.802354168028491e-06,
"loss": 1.1785,
"step": 2330
},
{
"epoch": 0.6767310204674118,
"grad_norm": 3.0783286094665527,
"learning_rate": 9.80208677751438e-06,
"loss": 1.1425,
"step": 2331
},
{
"epoch": 0.6770213383655103,
"grad_norm": 3.5151352882385254,
"learning_rate": 9.801819209901214e-06,
"loss": 1.2729,
"step": 2332
},
{
"epoch": 0.6773116562636087,
"grad_norm": 3.083354949951172,
"learning_rate": 9.801551465198862e-06,
"loss": 1.0144,
"step": 2333
},
{
"epoch": 0.6776019741617071,
"grad_norm": 3.382624387741089,
"learning_rate": 9.801283543417195e-06,
"loss": 1.1739,
"step": 2334
},
{
"epoch": 0.6778922920598055,
"grad_norm": 3.231215000152588,
"learning_rate": 9.801015444566097e-06,
"loss": 1.2779,
"step": 2335
},
{
"epoch": 0.6781826099579039,
"grad_norm": 3.257922887802124,
"learning_rate": 9.800747168655455e-06,
"loss": 1.2151,
"step": 2336
},
{
"epoch": 0.6784729278560023,
"grad_norm": 3.3422892093658447,
"learning_rate": 9.800478715695165e-06,
"loss": 1.1516,
"step": 2337
},
{
"epoch": 0.6787632457541007,
"grad_norm": 3.452329158782959,
"learning_rate": 9.800210085695122e-06,
"loss": 1.1959,
"step": 2338
},
{
"epoch": 0.6790535636521992,
"grad_norm": 3.49959397315979,
"learning_rate": 9.799941278665237e-06,
"loss": 1.1562,
"step": 2339
},
{
"epoch": 0.6793438815502976,
"grad_norm": 3.652210235595703,
"learning_rate": 9.79967229461542e-06,
"loss": 1.1846,
"step": 2340
},
{
"epoch": 0.679634199448396,
"grad_norm": 2.9146311283111572,
"learning_rate": 9.799403133555596e-06,
"loss": 1.1545,
"step": 2341
},
{
"epoch": 0.6799245173464944,
"grad_norm": 3.4553141593933105,
"learning_rate": 9.79913379549569e-06,
"loss": 1.1622,
"step": 2342
},
{
"epoch": 0.6802148352445928,
"grad_norm": 3.6774072647094727,
"learning_rate": 9.798864280445633e-06,
"loss": 1.3584,
"step": 2343
},
{
"epoch": 0.6805051531426912,
"grad_norm": 3.1811299324035645,
"learning_rate": 9.798594588415364e-06,
"loss": 1.1414,
"step": 2344
},
{
"epoch": 0.6807954710407896,
"grad_norm": 3.348858594894409,
"learning_rate": 9.798324719414833e-06,
"loss": 1.1112,
"step": 2345
},
{
"epoch": 0.681085788938888,
"grad_norm": 3.5631508827209473,
"learning_rate": 9.79805467345399e-06,
"loss": 1.2831,
"step": 2346
},
{
"epoch": 0.6813761068369865,
"grad_norm": 3.5303027629852295,
"learning_rate": 9.797784450542794e-06,
"loss": 1.1016,
"step": 2347
},
{
"epoch": 0.6816664247350849,
"grad_norm": 3.4458773136138916,
"learning_rate": 9.79751405069121e-06,
"loss": 1.2462,
"step": 2348
},
{
"epoch": 0.6819567426331833,
"grad_norm": 3.3334274291992188,
"learning_rate": 9.797243473909214e-06,
"loss": 1.1773,
"step": 2349
},
{
"epoch": 0.6822470605312817,
"grad_norm": 3.3247268199920654,
"learning_rate": 9.796972720206783e-06,
"loss": 1.1246,
"step": 2350
},
{
"epoch": 0.6825373784293802,
"grad_norm": 3.354071617126465,
"learning_rate": 9.796701789593902e-06,
"loss": 1.1596,
"step": 2351
},
{
"epoch": 0.6828276963274786,
"grad_norm": 3.145782709121704,
"learning_rate": 9.79643068208056e-06,
"loss": 1.1527,
"step": 2352
},
{
"epoch": 0.683118014225577,
"grad_norm": 3.3376376628875732,
"learning_rate": 9.796159397676758e-06,
"loss": 1.1915,
"step": 2353
},
{
"epoch": 0.6834083321236755,
"grad_norm": 3.3845038414001465,
"learning_rate": 9.795887936392502e-06,
"loss": 1.1748,
"step": 2354
},
{
"epoch": 0.6836986500217739,
"grad_norm": 3.6921133995056152,
"learning_rate": 9.795616298237802e-06,
"loss": 1.1177,
"step": 2355
},
{
"epoch": 0.6839889679198723,
"grad_norm": 4.1208600997924805,
"learning_rate": 9.795344483222675e-06,
"loss": 1.183,
"step": 2356
},
{
"epoch": 0.6842792858179707,
"grad_norm": 3.442371368408203,
"learning_rate": 9.795072491357147e-06,
"loss": 1.2422,
"step": 2357
},
{
"epoch": 0.6845696037160691,
"grad_norm": 3.38021183013916,
"learning_rate": 9.79480032265125e-06,
"loss": 1.2806,
"step": 2358
},
{
"epoch": 0.6848599216141675,
"grad_norm": 3.3694331645965576,
"learning_rate": 9.794527977115019e-06,
"loss": 1.168,
"step": 2359
},
{
"epoch": 0.685150239512266,
"grad_norm": 3.2959866523742676,
"learning_rate": 9.794255454758497e-06,
"loss": 1.0299,
"step": 2360
},
{
"epoch": 0.6854405574103644,
"grad_norm": 3.3888444900512695,
"learning_rate": 9.793982755591738e-06,
"loss": 1.3449,
"step": 2361
},
{
"epoch": 0.6857308753084628,
"grad_norm": 3.2652950286865234,
"learning_rate": 9.793709879624797e-06,
"loss": 1.1281,
"step": 2362
},
{
"epoch": 0.6860211932065612,
"grad_norm": 3.525996208190918,
"learning_rate": 9.793436826867737e-06,
"loss": 1.2652,
"step": 2363
},
{
"epoch": 0.6863115111046596,
"grad_norm": 3.430039405822754,
"learning_rate": 9.79316359733063e-06,
"loss": 1.2594,
"step": 2364
},
{
"epoch": 0.686601829002758,
"grad_norm": 3.4313323497772217,
"learning_rate": 9.792890191023551e-06,
"loss": 1.1357,
"step": 2365
},
{
"epoch": 0.6868921469008564,
"grad_norm": 3.3758277893066406,
"learning_rate": 9.792616607956585e-06,
"loss": 1.2663,
"step": 2366
},
{
"epoch": 0.6871824647989548,
"grad_norm": 3.622230052947998,
"learning_rate": 9.79234284813982e-06,
"loss": 1.2205,
"step": 2367
},
{
"epoch": 0.6874727826970533,
"grad_norm": 3.0984694957733154,
"learning_rate": 9.792068911583353e-06,
"loss": 1.0823,
"step": 2368
},
{
"epoch": 0.6877631005951517,
"grad_norm": 3.3490564823150635,
"learning_rate": 9.791794798297286e-06,
"loss": 1.2032,
"step": 2369
},
{
"epoch": 0.6880534184932501,
"grad_norm": 3.1726980209350586,
"learning_rate": 9.791520508291728e-06,
"loss": 1.11,
"step": 2370
},
{
"epoch": 0.6883437363913485,
"grad_norm": 3.6225693225860596,
"learning_rate": 9.791246041576795e-06,
"loss": 1.3124,
"step": 2371
},
{
"epoch": 0.6886340542894469,
"grad_norm": 3.639941692352295,
"learning_rate": 9.790971398162608e-06,
"loss": 1.1873,
"step": 2372
},
{
"epoch": 0.6889243721875453,
"grad_norm": 3.2535839080810547,
"learning_rate": 9.7906965780593e-06,
"loss": 1.1934,
"step": 2373
},
{
"epoch": 0.6892146900856437,
"grad_norm": 3.317662000656128,
"learning_rate": 9.790421581277002e-06,
"loss": 1.167,
"step": 2374
},
{
"epoch": 0.6895050079837421,
"grad_norm": 3.376481533050537,
"learning_rate": 9.790146407825856e-06,
"loss": 1.1746,
"step": 2375
},
{
"epoch": 0.6897953258818407,
"grad_norm": 3.3618693351745605,
"learning_rate": 9.789871057716012e-06,
"loss": 1.2363,
"step": 2376
},
{
"epoch": 0.6900856437799391,
"grad_norm": 3.43084979057312,
"learning_rate": 9.789595530957626e-06,
"loss": 1.1278,
"step": 2377
},
{
"epoch": 0.6903759616780375,
"grad_norm": 3.321505546569824,
"learning_rate": 9.789319827560854e-06,
"loss": 1.2212,
"step": 2378
},
{
"epoch": 0.6906662795761359,
"grad_norm": 3.113330364227295,
"learning_rate": 9.78904394753587e-06,
"loss": 1.0621,
"step": 2379
},
{
"epoch": 0.6909565974742343,
"grad_norm": 3.3849680423736572,
"learning_rate": 9.788767890892845e-06,
"loss": 1.2761,
"step": 2380
},
{
"epoch": 0.6912469153723327,
"grad_norm": 3.285853147506714,
"learning_rate": 9.78849165764196e-06,
"loss": 1.0482,
"step": 2381
},
{
"epoch": 0.6915372332704312,
"grad_norm": 3.0740060806274414,
"learning_rate": 9.788215247793405e-06,
"loss": 1.1211,
"step": 2382
},
{
"epoch": 0.6918275511685296,
"grad_norm": 3.0753612518310547,
"learning_rate": 9.78793866135737e-06,
"loss": 1.0918,
"step": 2383
},
{
"epoch": 0.692117869066628,
"grad_norm": 3.350917100906372,
"learning_rate": 9.787661898344058e-06,
"loss": 1.348,
"step": 2384
},
{
"epoch": 0.6924081869647264,
"grad_norm": 3.713820219039917,
"learning_rate": 9.787384958763674e-06,
"loss": 1.2728,
"step": 2385
},
{
"epoch": 0.6926985048628248,
"grad_norm": 3.2374231815338135,
"learning_rate": 9.787107842626434e-06,
"loss": 1.1106,
"step": 2386
},
{
"epoch": 0.6929888227609232,
"grad_norm": 3.0998446941375732,
"learning_rate": 9.786830549942556e-06,
"loss": 1.147,
"step": 2387
},
{
"epoch": 0.6932791406590216,
"grad_norm": 3.490924835205078,
"learning_rate": 9.786553080722266e-06,
"loss": 1.3013,
"step": 2388
},
{
"epoch": 0.69356945855712,
"grad_norm": 3.3626949787139893,
"learning_rate": 9.786275434975797e-06,
"loss": 1.2637,
"step": 2389
},
{
"epoch": 0.6938597764552185,
"grad_norm": 3.2617788314819336,
"learning_rate": 9.785997612713391e-06,
"loss": 1.0639,
"step": 2390
},
{
"epoch": 0.6941500943533169,
"grad_norm": 3.3937413692474365,
"learning_rate": 9.785719613945293e-06,
"loss": 1.1385,
"step": 2391
},
{
"epoch": 0.6944404122514153,
"grad_norm": 3.2378339767456055,
"learning_rate": 9.785441438681755e-06,
"loss": 1.1471,
"step": 2392
},
{
"epoch": 0.6947307301495137,
"grad_norm": 3.2014105319976807,
"learning_rate": 9.785163086933034e-06,
"loss": 1.1106,
"step": 2393
},
{
"epoch": 0.6950210480476121,
"grad_norm": 3.524437665939331,
"learning_rate": 9.784884558709398e-06,
"loss": 1.1607,
"step": 2394
},
{
"epoch": 0.6953113659457105,
"grad_norm": 3.2841367721557617,
"learning_rate": 9.784605854021118e-06,
"loss": 0.9346,
"step": 2395
},
{
"epoch": 0.6956016838438089,
"grad_norm": 3.702146291732788,
"learning_rate": 9.784326972878474e-06,
"loss": 1.266,
"step": 2396
},
{
"epoch": 0.6958920017419074,
"grad_norm": 3.6109771728515625,
"learning_rate": 9.784047915291748e-06,
"loss": 1.1987,
"step": 2397
},
{
"epoch": 0.6961823196400058,
"grad_norm": 3.68677020072937,
"learning_rate": 9.783768681271234e-06,
"loss": 1.3537,
"step": 2398
},
{
"epoch": 0.6964726375381042,
"grad_norm": 2.9631056785583496,
"learning_rate": 9.78348927082723e-06,
"loss": 1.0584,
"step": 2399
},
{
"epoch": 0.6967629554362026,
"grad_norm": 3.4369635581970215,
"learning_rate": 9.78320968397004e-06,
"loss": 1.1968,
"step": 2400
},
{
"epoch": 0.697053273334301,
"grad_norm": 3.149402379989624,
"learning_rate": 9.782929920709974e-06,
"loss": 1.1627,
"step": 2401
},
{
"epoch": 0.6973435912323995,
"grad_norm": 3.3772337436676025,
"learning_rate": 9.782649981057352e-06,
"loss": 1.1989,
"step": 2402
},
{
"epoch": 0.6976339091304979,
"grad_norm": 3.39142107963562,
"learning_rate": 9.782369865022495e-06,
"loss": 1.2028,
"step": 2403
},
{
"epoch": 0.6979242270285964,
"grad_norm": 3.2515244483947754,
"learning_rate": 9.782089572615737e-06,
"loss": 1.1666,
"step": 2404
},
{
"epoch": 0.6982145449266948,
"grad_norm": 2.9869136810302734,
"learning_rate": 9.781809103847411e-06,
"loss": 1.0236,
"step": 2405
},
{
"epoch": 0.6985048628247932,
"grad_norm": 3.331195592880249,
"learning_rate": 9.781528458727865e-06,
"loss": 1.1569,
"step": 2406
},
{
"epoch": 0.6987951807228916,
"grad_norm": 3.2006444931030273,
"learning_rate": 9.781247637267446e-06,
"loss": 1.0676,
"step": 2407
},
{
"epoch": 0.69908549862099,
"grad_norm": 3.203761577606201,
"learning_rate": 9.780966639476513e-06,
"loss": 1.2282,
"step": 2408
},
{
"epoch": 0.6993758165190884,
"grad_norm": 3.381657600402832,
"learning_rate": 9.780685465365426e-06,
"loss": 1.1954,
"step": 2409
},
{
"epoch": 0.6996661344171868,
"grad_norm": 3.2319588661193848,
"learning_rate": 9.780404114944556e-06,
"loss": 1.1636,
"step": 2410
},
{
"epoch": 0.6999564523152852,
"grad_norm": 3.4879820346832275,
"learning_rate": 9.780122588224278e-06,
"loss": 1.2639,
"step": 2411
},
{
"epoch": 0.7002467702133837,
"grad_norm": 3.1994943618774414,
"learning_rate": 9.77984088521498e-06,
"loss": 1.1396,
"step": 2412
},
{
"epoch": 0.7005370881114821,
"grad_norm": 3.4960827827453613,
"learning_rate": 9.779559005927043e-06,
"loss": 1.1809,
"step": 2413
},
{
"epoch": 0.7008274060095805,
"grad_norm": 3.188183307647705,
"learning_rate": 9.779276950370868e-06,
"loss": 1.1677,
"step": 2414
},
{
"epoch": 0.7011177239076789,
"grad_norm": 3.095752000808716,
"learning_rate": 9.778994718556856e-06,
"loss": 1.0553,
"step": 2415
},
{
"epoch": 0.7014080418057773,
"grad_norm": 3.390242099761963,
"learning_rate": 9.778712310495415e-06,
"loss": 1.2226,
"step": 2416
},
{
"epoch": 0.7016983597038757,
"grad_norm": 2.846047878265381,
"learning_rate": 9.77842972619696e-06,
"loss": 1.0552,
"step": 2417
},
{
"epoch": 0.7019886776019741,
"grad_norm": 3.244255304336548,
"learning_rate": 9.778146965671915e-06,
"loss": 1.2517,
"step": 2418
},
{
"epoch": 0.7022789955000726,
"grad_norm": 3.267493724822998,
"learning_rate": 9.777864028930705e-06,
"loss": 1.1721,
"step": 2419
},
{
"epoch": 0.702569313398171,
"grad_norm": 3.073822259902954,
"learning_rate": 9.777580915983765e-06,
"loss": 1.129,
"step": 2420
},
{
"epoch": 0.7028596312962694,
"grad_norm": 3.1357955932617188,
"learning_rate": 9.777297626841536e-06,
"loss": 1.2401,
"step": 2421
},
{
"epoch": 0.7031499491943678,
"grad_norm": 3.211599349975586,
"learning_rate": 9.777014161514468e-06,
"loss": 1.203,
"step": 2422
},
{
"epoch": 0.7034402670924662,
"grad_norm": 3.394411325454712,
"learning_rate": 9.776730520013013e-06,
"loss": 1.2225,
"step": 2423
},
{
"epoch": 0.7037305849905646,
"grad_norm": 3.4315035343170166,
"learning_rate": 9.77644670234763e-06,
"loss": 1.1715,
"step": 2424
},
{
"epoch": 0.704020902888663,
"grad_norm": 3.435701847076416,
"learning_rate": 9.776162708528792e-06,
"loss": 1.2022,
"step": 2425
},
{
"epoch": 0.7043112207867614,
"grad_norm": 3.5279853343963623,
"learning_rate": 9.775878538566965e-06,
"loss": 1.1028,
"step": 2426
},
{
"epoch": 0.70460153868486,
"grad_norm": 3.295423984527588,
"learning_rate": 9.775594192472635e-06,
"loss": 1.2768,
"step": 2427
},
{
"epoch": 0.7048918565829584,
"grad_norm": 3.0675647258758545,
"learning_rate": 9.775309670256286e-06,
"loss": 1.2386,
"step": 2428
},
{
"epoch": 0.7051821744810568,
"grad_norm": 3.320549726486206,
"learning_rate": 9.77502497192841e-06,
"loss": 1.1444,
"step": 2429
},
{
"epoch": 0.7054724923791552,
"grad_norm": 3.095872402191162,
"learning_rate": 9.774740097499509e-06,
"loss": 1.0612,
"step": 2430
},
{
"epoch": 0.7057628102772536,
"grad_norm": 3.0651066303253174,
"learning_rate": 9.774455046980087e-06,
"loss": 0.9936,
"step": 2431
},
{
"epoch": 0.706053128175352,
"grad_norm": 3.40466570854187,
"learning_rate": 9.77416982038066e-06,
"loss": 1.113,
"step": 2432
},
{
"epoch": 0.7063434460734505,
"grad_norm": 3.6496083736419678,
"learning_rate": 9.773884417711743e-06,
"loss": 1.2631,
"step": 2433
},
{
"epoch": 0.7066337639715489,
"grad_norm": 3.3464813232421875,
"learning_rate": 9.773598838983863e-06,
"loss": 1.3191,
"step": 2434
},
{
"epoch": 0.7069240818696473,
"grad_norm": 3.3084921836853027,
"learning_rate": 9.773313084207552e-06,
"loss": 1.2405,
"step": 2435
},
{
"epoch": 0.7072143997677457,
"grad_norm": 3.0100600719451904,
"learning_rate": 9.773027153393349e-06,
"loss": 1.0613,
"step": 2436
},
{
"epoch": 0.7075047176658441,
"grad_norm": 3.3531084060668945,
"learning_rate": 9.772741046551798e-06,
"loss": 1.1767,
"step": 2437
},
{
"epoch": 0.7077950355639425,
"grad_norm": 3.3284599781036377,
"learning_rate": 9.772454763693453e-06,
"loss": 1.1301,
"step": 2438
},
{
"epoch": 0.7080853534620409,
"grad_norm": 3.4888689517974854,
"learning_rate": 9.772168304828869e-06,
"loss": 1.1039,
"step": 2439
},
{
"epoch": 0.7083756713601393,
"grad_norm": 3.0899245738983154,
"learning_rate": 9.771881669968611e-06,
"loss": 1.0399,
"step": 2440
},
{
"epoch": 0.7086659892582378,
"grad_norm": 3.2881476879119873,
"learning_rate": 9.771594859123252e-06,
"loss": 1.2318,
"step": 2441
},
{
"epoch": 0.7089563071563362,
"grad_norm": 4.053572654724121,
"learning_rate": 9.771307872303365e-06,
"loss": 1.2404,
"step": 2442
},
{
"epoch": 0.7092466250544346,
"grad_norm": 3.781298875808716,
"learning_rate": 9.77102070951954e-06,
"loss": 1.3447,
"step": 2443
},
{
"epoch": 0.709536942952533,
"grad_norm": 3.022076368331909,
"learning_rate": 9.770733370782365e-06,
"loss": 1.1249,
"step": 2444
},
{
"epoch": 0.7098272608506314,
"grad_norm": 3.1669278144836426,
"learning_rate": 9.770445856102438e-06,
"loss": 0.9911,
"step": 2445
},
{
"epoch": 0.7101175787487298,
"grad_norm": 3.3084747791290283,
"learning_rate": 9.770158165490358e-06,
"loss": 1.0994,
"step": 2446
},
{
"epoch": 0.7104078966468282,
"grad_norm": 3.027456760406494,
"learning_rate": 9.769870298956739e-06,
"loss": 1.0671,
"step": 2447
},
{
"epoch": 0.7106982145449267,
"grad_norm": 3.577392816543579,
"learning_rate": 9.769582256512195e-06,
"loss": 1.2498,
"step": 2448
},
{
"epoch": 0.7109885324430251,
"grad_norm": 3.087620735168457,
"learning_rate": 9.76929403816735e-06,
"loss": 1.2372,
"step": 2449
},
{
"epoch": 0.7112788503411235,
"grad_norm": 3.3493881225585938,
"learning_rate": 9.769005643932833e-06,
"loss": 1.1223,
"step": 2450
},
{
"epoch": 0.7115691682392219,
"grad_norm": 3.309208631515503,
"learning_rate": 9.768717073819282e-06,
"loss": 1.2156,
"step": 2451
},
{
"epoch": 0.7118594861373204,
"grad_norm": 3.5544214248657227,
"learning_rate": 9.768428327837339e-06,
"loss": 1.2821,
"step": 2452
},
{
"epoch": 0.7121498040354188,
"grad_norm": 3.2072324752807617,
"learning_rate": 9.76813940599765e-06,
"loss": 1.0891,
"step": 2453
},
{
"epoch": 0.7124401219335172,
"grad_norm": 3.3209030628204346,
"learning_rate": 9.767850308310872e-06,
"loss": 1.1572,
"step": 2454
},
{
"epoch": 0.7127304398316157,
"grad_norm": 3.294210910797119,
"learning_rate": 9.767561034787666e-06,
"loss": 1.0957,
"step": 2455
},
{
"epoch": 0.7130207577297141,
"grad_norm": 3.353680372238159,
"learning_rate": 9.767271585438703e-06,
"loss": 1.1803,
"step": 2456
},
{
"epoch": 0.7133110756278125,
"grad_norm": 2.933467149734497,
"learning_rate": 9.766981960274653e-06,
"loss": 1.0839,
"step": 2457
},
{
"epoch": 0.7136013935259109,
"grad_norm": 3.1124205589294434,
"learning_rate": 9.766692159306202e-06,
"loss": 1.0837,
"step": 2458
},
{
"epoch": 0.7138917114240093,
"grad_norm": 3.372271776199341,
"learning_rate": 9.766402182544034e-06,
"loss": 1.1596,
"step": 2459
},
{
"epoch": 0.7141820293221077,
"grad_norm": 3.386247396469116,
"learning_rate": 9.766112029998847e-06,
"loss": 1.1766,
"step": 2460
},
{
"epoch": 0.7144723472202061,
"grad_norm": 3.4302918910980225,
"learning_rate": 9.76582170168134e-06,
"loss": 1.1653,
"step": 2461
},
{
"epoch": 0.7147626651183046,
"grad_norm": 3.3646481037139893,
"learning_rate": 9.765531197602219e-06,
"loss": 1.2086,
"step": 2462
},
{
"epoch": 0.715052983016403,
"grad_norm": 3.197026491165161,
"learning_rate": 9.765240517772196e-06,
"loss": 1.1854,
"step": 2463
},
{
"epoch": 0.7153433009145014,
"grad_norm": 3.009091377258301,
"learning_rate": 9.764949662201997e-06,
"loss": 1.0761,
"step": 2464
},
{
"epoch": 0.7156336188125998,
"grad_norm": 3.1493172645568848,
"learning_rate": 9.764658630902345e-06,
"loss": 1.0669,
"step": 2465
},
{
"epoch": 0.7159239367106982,
"grad_norm": 3.1372087001800537,
"learning_rate": 9.764367423883973e-06,
"loss": 1.1141,
"step": 2466
},
{
"epoch": 0.7162142546087966,
"grad_norm": 3.358511209487915,
"learning_rate": 9.76407604115762e-06,
"loss": 1.1396,
"step": 2467
},
{
"epoch": 0.716504572506895,
"grad_norm": 3.5119621753692627,
"learning_rate": 9.763784482734035e-06,
"loss": 1.2956,
"step": 2468
},
{
"epoch": 0.7167948904049934,
"grad_norm": 3.1730403900146484,
"learning_rate": 9.763492748623969e-06,
"loss": 1.0829,
"step": 2469
},
{
"epoch": 0.7170852083030919,
"grad_norm": 3.2893500328063965,
"learning_rate": 9.763200838838178e-06,
"loss": 1.1184,
"step": 2470
},
{
"epoch": 0.7173755262011903,
"grad_norm": 2.979743480682373,
"learning_rate": 9.762908753387432e-06,
"loss": 1.0347,
"step": 2471
},
{
"epoch": 0.7176658440992887,
"grad_norm": 3.22346568107605,
"learning_rate": 9.762616492282502e-06,
"loss": 1.0688,
"step": 2472
},
{
"epoch": 0.7179561619973871,
"grad_norm": 3.191016912460327,
"learning_rate": 9.762324055534165e-06,
"loss": 1.1585,
"step": 2473
},
{
"epoch": 0.7182464798954855,
"grad_norm": 2.974458932876587,
"learning_rate": 9.762031443153207e-06,
"loss": 0.9389,
"step": 2474
},
{
"epoch": 0.7185367977935839,
"grad_norm": 3.3603460788726807,
"learning_rate": 9.761738655150419e-06,
"loss": 1.1379,
"step": 2475
},
{
"epoch": 0.7188271156916823,
"grad_norm": 3.3447885513305664,
"learning_rate": 9.761445691536598e-06,
"loss": 1.1837,
"step": 2476
},
{
"epoch": 0.7191174335897809,
"grad_norm": 3.482642412185669,
"learning_rate": 9.76115255232255e-06,
"loss": 1.1967,
"step": 2477
},
{
"epoch": 0.7194077514878793,
"grad_norm": 3.208934783935547,
"learning_rate": 9.760859237519087e-06,
"loss": 1.1285,
"step": 2478
},
{
"epoch": 0.7196980693859777,
"grad_norm": 3.199887990951538,
"learning_rate": 9.760565747137023e-06,
"loss": 1.0891,
"step": 2479
},
{
"epoch": 0.7199883872840761,
"grad_norm": 3.1284048557281494,
"learning_rate": 9.760272081187183e-06,
"loss": 1.1122,
"step": 2480
},
{
"epoch": 0.7202787051821745,
"grad_norm": 3.603379726409912,
"learning_rate": 9.7599782396804e-06,
"loss": 1.2686,
"step": 2481
},
{
"epoch": 0.7205690230802729,
"grad_norm": 3.496004581451416,
"learning_rate": 9.759684222627506e-06,
"loss": 1.2055,
"step": 2482
},
{
"epoch": 0.7208593409783713,
"grad_norm": 3.3529865741729736,
"learning_rate": 9.759390030039347e-06,
"loss": 1.154,
"step": 2483
},
{
"epoch": 0.7211496588764698,
"grad_norm": 3.08897066116333,
"learning_rate": 9.759095661926772e-06,
"loss": 1.0814,
"step": 2484
},
{
"epoch": 0.7214399767745682,
"grad_norm": 3.2618985176086426,
"learning_rate": 9.758801118300638e-06,
"loss": 1.1316,
"step": 2485
},
{
"epoch": 0.7217302946726666,
"grad_norm": 3.4715993404388428,
"learning_rate": 9.758506399171808e-06,
"loss": 1.2883,
"step": 2486
},
{
"epoch": 0.722020612570765,
"grad_norm": 3.0561084747314453,
"learning_rate": 9.758211504551151e-06,
"loss": 1.0894,
"step": 2487
},
{
"epoch": 0.7223109304688634,
"grad_norm": 3.1737711429595947,
"learning_rate": 9.75791643444954e-06,
"loss": 1.238,
"step": 2488
},
{
"epoch": 0.7226012483669618,
"grad_norm": 3.498148202896118,
"learning_rate": 9.757621188877861e-06,
"loss": 1.3628,
"step": 2489
},
{
"epoch": 0.7228915662650602,
"grad_norm": 2.9819672107696533,
"learning_rate": 9.757325767846999e-06,
"loss": 0.9908,
"step": 2490
},
{
"epoch": 0.7231818841631587,
"grad_norm": 2.9681432247161865,
"learning_rate": 9.757030171367852e-06,
"loss": 1.111,
"step": 2491
},
{
"epoch": 0.7234722020612571,
"grad_norm": 3.207848072052002,
"learning_rate": 9.756734399451318e-06,
"loss": 1.0846,
"step": 2492
},
{
"epoch": 0.7237625199593555,
"grad_norm": 3.4582133293151855,
"learning_rate": 9.756438452108307e-06,
"loss": 1.1117,
"step": 2493
},
{
"epoch": 0.7240528378574539,
"grad_norm": 3.1228976249694824,
"learning_rate": 9.756142329349737e-06,
"loss": 1.1891,
"step": 2494
},
{
"epoch": 0.7243431557555523,
"grad_norm": 3.249508857727051,
"learning_rate": 9.755846031186521e-06,
"loss": 1.0953,
"step": 2495
},
{
"epoch": 0.7246334736536507,
"grad_norm": 3.248222589492798,
"learning_rate": 9.755549557629593e-06,
"loss": 1.1658,
"step": 2496
},
{
"epoch": 0.7249237915517491,
"grad_norm": 3.254011869430542,
"learning_rate": 9.755252908689885e-06,
"loss": 1.117,
"step": 2497
},
{
"epoch": 0.7252141094498475,
"grad_norm": 3.4545297622680664,
"learning_rate": 9.754956084378336e-06,
"loss": 1.1358,
"step": 2498
},
{
"epoch": 0.725504427347946,
"grad_norm": 3.3574445247650146,
"learning_rate": 9.754659084705893e-06,
"loss": 1.1986,
"step": 2499
},
{
"epoch": 0.7257947452460444,
"grad_norm": 3.6412932872772217,
"learning_rate": 9.75436190968351e-06,
"loss": 1.2042,
"step": 2500
},
{
"epoch": 0.7257947452460444,
"eval_loss": 1.1983624696731567,
"eval_runtime": 11.2813,
"eval_samples_per_second": 35.457,
"eval_steps_per_second": 4.432,
"step": 2500
},
{
"epoch": 0.7260850631441428,
"grad_norm": 3.041032314300537,
"learning_rate": 9.754064559322147e-06,
"loss": 1.0758,
"step": 2501
},
{
"epoch": 0.7263753810422413,
"grad_norm": 3.4390034675598145,
"learning_rate": 9.753767033632769e-06,
"loss": 1.2908,
"step": 2502
},
{
"epoch": 0.7266656989403397,
"grad_norm": 3.178821563720703,
"learning_rate": 9.75346933262635e-06,
"loss": 1.0938,
"step": 2503
},
{
"epoch": 0.7269560168384381,
"grad_norm": 3.250523567199707,
"learning_rate": 9.753171456313868e-06,
"loss": 1.143,
"step": 2504
},
{
"epoch": 0.7272463347365365,
"grad_norm": 3.777912139892578,
"learning_rate": 9.752873404706309e-06,
"loss": 1.2468,
"step": 2505
},
{
"epoch": 0.727536652634635,
"grad_norm": 3.3552846908569336,
"learning_rate": 9.752575177814664e-06,
"loss": 1.0887,
"step": 2506
},
{
"epoch": 0.7278269705327334,
"grad_norm": 3.36442232131958,
"learning_rate": 9.752276775649934e-06,
"loss": 1.1639,
"step": 2507
},
{
"epoch": 0.7281172884308318,
"grad_norm": 3.309434175491333,
"learning_rate": 9.75197819822312e-06,
"loss": 1.2119,
"step": 2508
},
{
"epoch": 0.7284076063289302,
"grad_norm": 3.211569309234619,
"learning_rate": 9.751679445545239e-06,
"loss": 1.1335,
"step": 2509
},
{
"epoch": 0.7286979242270286,
"grad_norm": 3.2672746181488037,
"learning_rate": 9.751380517627304e-06,
"loss": 1.0993,
"step": 2510
},
{
"epoch": 0.728988242125127,
"grad_norm": 3.273798704147339,
"learning_rate": 9.751081414480342e-06,
"loss": 1.2028,
"step": 2511
},
{
"epoch": 0.7292785600232254,
"grad_norm": 3.2062716484069824,
"learning_rate": 9.750782136115381e-06,
"loss": 1.0892,
"step": 2512
},
{
"epoch": 0.7295688779213239,
"grad_norm": 3.3710551261901855,
"learning_rate": 9.75048268254346e-06,
"loss": 1.1791,
"step": 2513
},
{
"epoch": 0.7298591958194223,
"grad_norm": 3.117218255996704,
"learning_rate": 9.750183053775625e-06,
"loss": 1.0583,
"step": 2514
},
{
"epoch": 0.7301495137175207,
"grad_norm": 2.7797436714172363,
"learning_rate": 9.749883249822923e-06,
"loss": 0.9885,
"step": 2515
},
{
"epoch": 0.7304398316156191,
"grad_norm": 3.564326524734497,
"learning_rate": 9.749583270696413e-06,
"loss": 1.3298,
"step": 2516
},
{
"epoch": 0.7307301495137175,
"grad_norm": 3.287993907928467,
"learning_rate": 9.749283116407155e-06,
"loss": 1.2807,
"step": 2517
},
{
"epoch": 0.7310204674118159,
"grad_norm": 3.1724064350128174,
"learning_rate": 9.74898278696622e-06,
"loss": 1.1995,
"step": 2518
},
{
"epoch": 0.7313107853099143,
"grad_norm": 3.066631555557251,
"learning_rate": 9.748682282384685e-06,
"loss": 1.2402,
"step": 2519
},
{
"epoch": 0.7316011032080127,
"grad_norm": 3.3963117599487305,
"learning_rate": 9.748381602673633e-06,
"loss": 1.2954,
"step": 2520
},
{
"epoch": 0.7318914211061112,
"grad_norm": 3.1889572143554688,
"learning_rate": 9.74808074784415e-06,
"loss": 1.0468,
"step": 2521
},
{
"epoch": 0.7321817390042096,
"grad_norm": 3.008392810821533,
"learning_rate": 9.747779717907336e-06,
"loss": 1.0372,
"step": 2522
},
{
"epoch": 0.732472056902308,
"grad_norm": 3.1272335052490234,
"learning_rate": 9.747478512874288e-06,
"loss": 1.2067,
"step": 2523
},
{
"epoch": 0.7327623748004064,
"grad_norm": 3.072211503982544,
"learning_rate": 9.747177132756117e-06,
"loss": 0.9834,
"step": 2524
},
{
"epoch": 0.7330526926985048,
"grad_norm": 3.123993158340454,
"learning_rate": 9.746875577563936e-06,
"loss": 1.1079,
"step": 2525
},
{
"epoch": 0.7333430105966032,
"grad_norm": 3.211639404296875,
"learning_rate": 9.746573847308869e-06,
"loss": 1.1979,
"step": 2526
},
{
"epoch": 0.7336333284947018,
"grad_norm": 3.380052328109741,
"learning_rate": 9.746271942002042e-06,
"loss": 1.1854,
"step": 2527
},
{
"epoch": 0.7339236463928002,
"grad_norm": 3.1952614784240723,
"learning_rate": 9.745969861654589e-06,
"loss": 1.0955,
"step": 2528
},
{
"epoch": 0.7342139642908986,
"grad_norm": 3.376279592514038,
"learning_rate": 9.74566760627765e-06,
"loss": 1.3143,
"step": 2529
},
{
"epoch": 0.734504282188997,
"grad_norm": 3.431368589401245,
"learning_rate": 9.745365175882372e-06,
"loss": 1.2247,
"step": 2530
},
{
"epoch": 0.7347946000870954,
"grad_norm": 3.4958410263061523,
"learning_rate": 9.745062570479912e-06,
"loss": 1.1536,
"step": 2531
},
{
"epoch": 0.7350849179851938,
"grad_norm": 3.3066039085388184,
"learning_rate": 9.744759790081426e-06,
"loss": 1.1474,
"step": 2532
},
{
"epoch": 0.7353752358832922,
"grad_norm": 3.381757974624634,
"learning_rate": 9.744456834698083e-06,
"loss": 1.2692,
"step": 2533
},
{
"epoch": 0.7356655537813906,
"grad_norm": 3.070390224456787,
"learning_rate": 9.744153704341056e-06,
"loss": 1.1146,
"step": 2534
},
{
"epoch": 0.7359558716794891,
"grad_norm": 3.0699477195739746,
"learning_rate": 9.743850399021519e-06,
"loss": 1.2264,
"step": 2535
},
{
"epoch": 0.7362461895775875,
"grad_norm": 3.2143630981445312,
"learning_rate": 9.743546918750664e-06,
"loss": 1.2258,
"step": 2536
},
{
"epoch": 0.7365365074756859,
"grad_norm": 3.471107244491577,
"learning_rate": 9.743243263539681e-06,
"loss": 1.2183,
"step": 2537
},
{
"epoch": 0.7368268253737843,
"grad_norm": 3.6511921882629395,
"learning_rate": 9.742939433399769e-06,
"loss": 1.332,
"step": 2538
},
{
"epoch": 0.7371171432718827,
"grad_norm": 2.9969394207000732,
"learning_rate": 9.742635428342133e-06,
"loss": 1.1155,
"step": 2539
},
{
"epoch": 0.7374074611699811,
"grad_norm": 3.1637327671051025,
"learning_rate": 9.742331248377985e-06,
"loss": 1.2107,
"step": 2540
},
{
"epoch": 0.7376977790680795,
"grad_norm": 3.3259994983673096,
"learning_rate": 9.742026893518541e-06,
"loss": 1.1766,
"step": 2541
},
{
"epoch": 0.737988096966178,
"grad_norm": 3.2825498580932617,
"learning_rate": 9.741722363775029e-06,
"loss": 1.1946,
"step": 2542
},
{
"epoch": 0.7382784148642764,
"grad_norm": 3.317887783050537,
"learning_rate": 9.741417659158674e-06,
"loss": 1.0025,
"step": 2543
},
{
"epoch": 0.7385687327623748,
"grad_norm": 3.05649471282959,
"learning_rate": 9.741112779680721e-06,
"loss": 1.0689,
"step": 2544
},
{
"epoch": 0.7388590506604732,
"grad_norm": 3.0476882457733154,
"learning_rate": 9.740807725352408e-06,
"loss": 1.0704,
"step": 2545
},
{
"epoch": 0.7391493685585716,
"grad_norm": 2.8864781856536865,
"learning_rate": 9.740502496184989e-06,
"loss": 1.0802,
"step": 2546
},
{
"epoch": 0.73943968645667,
"grad_norm": 3.207580089569092,
"learning_rate": 9.740197092189718e-06,
"loss": 1.0071,
"step": 2547
},
{
"epoch": 0.7397300043547684,
"grad_norm": 2.972710371017456,
"learning_rate": 9.739891513377859e-06,
"loss": 1.0015,
"step": 2548
},
{
"epoch": 0.7400203222528668,
"grad_norm": 3.0222017765045166,
"learning_rate": 9.739585759760684e-06,
"loss": 1.1943,
"step": 2549
},
{
"epoch": 0.7403106401509653,
"grad_norm": 3.6331703662872314,
"learning_rate": 9.739279831349466e-06,
"loss": 1.0644,
"step": 2550
},
{
"epoch": 0.7406009580490637,
"grad_norm": 3.1713831424713135,
"learning_rate": 9.738973728155487e-06,
"loss": 1.1909,
"step": 2551
},
{
"epoch": 0.7408912759471622,
"grad_norm": 3.3440420627593994,
"learning_rate": 9.738667450190041e-06,
"loss": 1.1456,
"step": 2552
},
{
"epoch": 0.7411815938452606,
"grad_norm": 3.2886013984680176,
"learning_rate": 9.738360997464417e-06,
"loss": 1.1896,
"step": 2553
},
{
"epoch": 0.741471911743359,
"grad_norm": 3.303163528442383,
"learning_rate": 9.73805436998992e-06,
"loss": 1.174,
"step": 2554
},
{
"epoch": 0.7417622296414574,
"grad_norm": 3.4284379482269287,
"learning_rate": 9.737747567777859e-06,
"loss": 1.0949,
"step": 2555
},
{
"epoch": 0.7420525475395559,
"grad_norm": 3.026108980178833,
"learning_rate": 9.737440590839547e-06,
"loss": 1.2386,
"step": 2556
},
{
"epoch": 0.7423428654376543,
"grad_norm": 3.3348286151885986,
"learning_rate": 9.737133439186306e-06,
"loss": 1.1645,
"step": 2557
},
{
"epoch": 0.7426331833357527,
"grad_norm": 3.4476053714752197,
"learning_rate": 9.736826112829465e-06,
"loss": 1.2243,
"step": 2558
},
{
"epoch": 0.7429235012338511,
"grad_norm": 3.123429298400879,
"learning_rate": 9.736518611780356e-06,
"loss": 1.1967,
"step": 2559
},
{
"epoch": 0.7432138191319495,
"grad_norm": 3.2243711948394775,
"learning_rate": 9.73621093605032e-06,
"loss": 1.2283,
"step": 2560
},
{
"epoch": 0.7435041370300479,
"grad_norm": 3.192667245864868,
"learning_rate": 9.735903085650704e-06,
"loss": 1.1169,
"step": 2561
},
{
"epoch": 0.7437944549281463,
"grad_norm": 3.227220296859741,
"learning_rate": 9.735595060592861e-06,
"loss": 1.1867,
"step": 2562
},
{
"epoch": 0.7440847728262447,
"grad_norm": 3.1448750495910645,
"learning_rate": 9.735286860888153e-06,
"loss": 1.0588,
"step": 2563
},
{
"epoch": 0.7443750907243432,
"grad_norm": 3.9255151748657227,
"learning_rate": 9.734978486547943e-06,
"loss": 1.1771,
"step": 2564
},
{
"epoch": 0.7446654086224416,
"grad_norm": 3.173152208328247,
"learning_rate": 9.734669937583607e-06,
"loss": 1.0428,
"step": 2565
},
{
"epoch": 0.74495572652054,
"grad_norm": 2.9990289211273193,
"learning_rate": 9.734361214006523e-06,
"loss": 1.1064,
"step": 2566
},
{
"epoch": 0.7452460444186384,
"grad_norm": 3.705312490463257,
"learning_rate": 9.734052315828073e-06,
"loss": 1.2724,
"step": 2567
},
{
"epoch": 0.7455363623167368,
"grad_norm": 3.1329221725463867,
"learning_rate": 9.733743243059656e-06,
"loss": 1.0587,
"step": 2568
},
{
"epoch": 0.7458266802148352,
"grad_norm": 3.6346309185028076,
"learning_rate": 9.733433995712665e-06,
"loss": 1.2955,
"step": 2569
},
{
"epoch": 0.7461169981129336,
"grad_norm": 3.671525239944458,
"learning_rate": 9.733124573798507e-06,
"loss": 1.3279,
"step": 2570
},
{
"epoch": 0.746407316011032,
"grad_norm": 3.5882644653320312,
"learning_rate": 9.732814977328593e-06,
"loss": 1.3109,
"step": 2571
},
{
"epoch": 0.7466976339091305,
"grad_norm": 3.4163684844970703,
"learning_rate": 9.73250520631434e-06,
"loss": 1.2869,
"step": 2572
},
{
"epoch": 0.7469879518072289,
"grad_norm": 3.318476915359497,
"learning_rate": 9.732195260767175e-06,
"loss": 1.1014,
"step": 2573
},
{
"epoch": 0.7472782697053273,
"grad_norm": 3.565654993057251,
"learning_rate": 9.731885140698523e-06,
"loss": 1.3466,
"step": 2574
},
{
"epoch": 0.7475685876034257,
"grad_norm": 3.701667308807373,
"learning_rate": 9.73157484611983e-06,
"loss": 1.3208,
"step": 2575
},
{
"epoch": 0.7478589055015241,
"grad_norm": 3.6942193508148193,
"learning_rate": 9.73126437704253e-06,
"loss": 1.2147,
"step": 2576
},
{
"epoch": 0.7481492233996225,
"grad_norm": 3.2307727336883545,
"learning_rate": 9.73095373347808e-06,
"loss": 1.0473,
"step": 2577
},
{
"epoch": 0.7484395412977211,
"grad_norm": 3.1755237579345703,
"learning_rate": 9.730642915437932e-06,
"loss": 1.1311,
"step": 2578
},
{
"epoch": 0.7487298591958195,
"grad_norm": 2.977376937866211,
"learning_rate": 9.73033192293355e-06,
"loss": 1.0612,
"step": 2579
},
{
"epoch": 0.7490201770939179,
"grad_norm": 3.5205020904541016,
"learning_rate": 9.730020755976405e-06,
"loss": 1.2816,
"step": 2580
},
{
"epoch": 0.7493104949920163,
"grad_norm": 3.407058000564575,
"learning_rate": 9.729709414577971e-06,
"loss": 1.3124,
"step": 2581
},
{
"epoch": 0.7496008128901147,
"grad_norm": 3.4231269359588623,
"learning_rate": 9.729397898749732e-06,
"loss": 1.3177,
"step": 2582
},
{
"epoch": 0.7498911307882131,
"grad_norm": 3.3981311321258545,
"learning_rate": 9.729086208503174e-06,
"loss": 1.3057,
"step": 2583
},
{
"epoch": 0.7501814486863115,
"grad_norm": 3.3072404861450195,
"learning_rate": 9.728774343849794e-06,
"loss": 1.1111,
"step": 2584
},
{
"epoch": 0.75047176658441,
"grad_norm": 3.3770785331726074,
"learning_rate": 9.728462304801092e-06,
"loss": 1.0387,
"step": 2585
},
{
"epoch": 0.7507620844825084,
"grad_norm": 3.214796304702759,
"learning_rate": 9.728150091368578e-06,
"loss": 1.1361,
"step": 2586
},
{
"epoch": 0.7510524023806068,
"grad_norm": 3.14668345451355,
"learning_rate": 9.727837703563763e-06,
"loss": 1.1013,
"step": 2587
},
{
"epoch": 0.7513427202787052,
"grad_norm": 3.61557674407959,
"learning_rate": 9.727525141398172e-06,
"loss": 1.1335,
"step": 2588
},
{
"epoch": 0.7516330381768036,
"grad_norm": 3.3926947116851807,
"learning_rate": 9.727212404883328e-06,
"loss": 1.2092,
"step": 2589
},
{
"epoch": 0.751923356074902,
"grad_norm": 3.5248970985412598,
"learning_rate": 9.726899494030768e-06,
"loss": 1.2138,
"step": 2590
},
{
"epoch": 0.7522136739730004,
"grad_norm": 2.885737180709839,
"learning_rate": 9.72658640885203e-06,
"loss": 1.0495,
"step": 2591
},
{
"epoch": 0.7525039918710988,
"grad_norm": 3.0727686882019043,
"learning_rate": 9.726273149358661e-06,
"loss": 1.0749,
"step": 2592
},
{
"epoch": 0.7527943097691973,
"grad_norm": 3.084850549697876,
"learning_rate": 9.725959715562212e-06,
"loss": 1.2351,
"step": 2593
},
{
"epoch": 0.7530846276672957,
"grad_norm": 3.28760027885437,
"learning_rate": 9.725646107474245e-06,
"loss": 1.2275,
"step": 2594
},
{
"epoch": 0.7533749455653941,
"grad_norm": 3.085083246231079,
"learning_rate": 9.725332325106326e-06,
"loss": 1.1941,
"step": 2595
},
{
"epoch": 0.7536652634634925,
"grad_norm": 3.4755539894104004,
"learning_rate": 9.725018368470025e-06,
"loss": 1.324,
"step": 2596
},
{
"epoch": 0.7539555813615909,
"grad_norm": 3.1657776832580566,
"learning_rate": 9.724704237576924e-06,
"loss": 1.0582,
"step": 2597
},
{
"epoch": 0.7542458992596893,
"grad_norm": 3.143900156021118,
"learning_rate": 9.724389932438603e-06,
"loss": 1.1709,
"step": 2598
},
{
"epoch": 0.7545362171577877,
"grad_norm": 3.3038413524627686,
"learning_rate": 9.724075453066655e-06,
"loss": 1.1156,
"step": 2599
},
{
"epoch": 0.7548265350558861,
"grad_norm": 3.384906530380249,
"learning_rate": 9.723760799472681e-06,
"loss": 1.2913,
"step": 2600
},
{
"epoch": 0.7551168529539846,
"grad_norm": 3.3545148372650146,
"learning_rate": 9.723445971668284e-06,
"loss": 1.1701,
"step": 2601
},
{
"epoch": 0.755407170852083,
"grad_norm": 3.308631181716919,
"learning_rate": 9.723130969665073e-06,
"loss": 1.1446,
"step": 2602
},
{
"epoch": 0.7556974887501815,
"grad_norm": 3.1468513011932373,
"learning_rate": 9.722815793474667e-06,
"loss": 1.0866,
"step": 2603
},
{
"epoch": 0.7559878066482799,
"grad_norm": 3.327813148498535,
"learning_rate": 9.722500443108687e-06,
"loss": 1.1291,
"step": 2604
},
{
"epoch": 0.7562781245463783,
"grad_norm": 3.189318895339966,
"learning_rate": 9.722184918578765e-06,
"loss": 1.0912,
"step": 2605
},
{
"epoch": 0.7565684424444767,
"grad_norm": 3.209308385848999,
"learning_rate": 9.721869219896539e-06,
"loss": 1.2015,
"step": 2606
},
{
"epoch": 0.7568587603425752,
"grad_norm": 3.2611427307128906,
"learning_rate": 9.72155334707365e-06,
"loss": 1.1894,
"step": 2607
},
{
"epoch": 0.7571490782406736,
"grad_norm": 3.0698297023773193,
"learning_rate": 9.721237300121744e-06,
"loss": 1.1468,
"step": 2608
},
{
"epoch": 0.757439396138772,
"grad_norm": 3.030074119567871,
"learning_rate": 9.720921079052483e-06,
"loss": 1.0497,
"step": 2609
},
{
"epoch": 0.7577297140368704,
"grad_norm": 3.3314547538757324,
"learning_rate": 9.720604683877524e-06,
"loss": 1.2847,
"step": 2610
},
{
"epoch": 0.7580200319349688,
"grad_norm": 3.3319008350372314,
"learning_rate": 9.72028811460854e-06,
"loss": 1.1846,
"step": 2611
},
{
"epoch": 0.7583103498330672,
"grad_norm": 2.8318731784820557,
"learning_rate": 9.719971371257201e-06,
"loss": 1.1269,
"step": 2612
},
{
"epoch": 0.7586006677311656,
"grad_norm": 2.9825758934020996,
"learning_rate": 9.719654453835192e-06,
"loss": 1.172,
"step": 2613
},
{
"epoch": 0.758890985629264,
"grad_norm": 3.155717611312866,
"learning_rate": 9.7193373623542e-06,
"loss": 1.0468,
"step": 2614
},
{
"epoch": 0.7591813035273625,
"grad_norm": 3.3703644275665283,
"learning_rate": 9.71902009682592e-06,
"loss": 1.1021,
"step": 2615
},
{
"epoch": 0.7594716214254609,
"grad_norm": 3.448974132537842,
"learning_rate": 9.718702657262049e-06,
"loss": 1.3663,
"step": 2616
},
{
"epoch": 0.7597619393235593,
"grad_norm": 3.0262529850006104,
"learning_rate": 9.718385043674298e-06,
"loss": 1.0723,
"step": 2617
},
{
"epoch": 0.7600522572216577,
"grad_norm": 3.7767655849456787,
"learning_rate": 9.718067256074378e-06,
"loss": 1.2078,
"step": 2618
},
{
"epoch": 0.7603425751197561,
"grad_norm": 2.984757900238037,
"learning_rate": 9.71774929447401e-06,
"loss": 1.065,
"step": 2619
},
{
"epoch": 0.7606328930178545,
"grad_norm": 3.351996660232544,
"learning_rate": 9.717431158884922e-06,
"loss": 1.2249,
"step": 2620
},
{
"epoch": 0.7609232109159529,
"grad_norm": 3.374985933303833,
"learning_rate": 9.717112849318844e-06,
"loss": 1.1868,
"step": 2621
},
{
"epoch": 0.7612135288140514,
"grad_norm": 3.2836148738861084,
"learning_rate": 9.716794365787516e-06,
"loss": 1.3113,
"step": 2622
},
{
"epoch": 0.7615038467121498,
"grad_norm": 3.3848886489868164,
"learning_rate": 9.716475708302683e-06,
"loss": 1.2438,
"step": 2623
},
{
"epoch": 0.7617941646102482,
"grad_norm": 3.5439648628234863,
"learning_rate": 9.716156876876096e-06,
"loss": 1.1124,
"step": 2624
},
{
"epoch": 0.7620844825083466,
"grad_norm": 2.9537434577941895,
"learning_rate": 9.715837871519518e-06,
"loss": 1.0228,
"step": 2625
},
{
"epoch": 0.762374800406445,
"grad_norm": 3.688227891921997,
"learning_rate": 9.71551869224471e-06,
"loss": 1.1742,
"step": 2626
},
{
"epoch": 0.7626651183045434,
"grad_norm": 3.6073129177093506,
"learning_rate": 9.715199339063444e-06,
"loss": 1.1558,
"step": 2627
},
{
"epoch": 0.762955436202642,
"grad_norm": 3.2027735710144043,
"learning_rate": 9.714879811987496e-06,
"loss": 1.0795,
"step": 2628
},
{
"epoch": 0.7632457541007404,
"grad_norm": 3.0256600379943848,
"learning_rate": 9.714560111028654e-06,
"loss": 1.0514,
"step": 2629
},
{
"epoch": 0.7635360719988388,
"grad_norm": 3.2667462825775146,
"learning_rate": 9.714240236198704e-06,
"loss": 1.2406,
"step": 2630
},
{
"epoch": 0.7638263898969372,
"grad_norm": 3.4051690101623535,
"learning_rate": 9.713920187509445e-06,
"loss": 1.1812,
"step": 2631
},
{
"epoch": 0.7641167077950356,
"grad_norm": 3.3208694458007812,
"learning_rate": 9.713599964972682e-06,
"loss": 1.1577,
"step": 2632
},
{
"epoch": 0.764407025693134,
"grad_norm": 3.5661416053771973,
"learning_rate": 9.71327956860022e-06,
"loss": 1.2215,
"step": 2633
},
{
"epoch": 0.7646973435912324,
"grad_norm": 3.286116361618042,
"learning_rate": 9.712958998403881e-06,
"loss": 1.1043,
"step": 2634
},
{
"epoch": 0.7649876614893308,
"grad_norm": 3.0886998176574707,
"learning_rate": 9.712638254395481e-06,
"loss": 1.0814,
"step": 2635
},
{
"epoch": 0.7652779793874293,
"grad_norm": 3.3840620517730713,
"learning_rate": 9.712317336586854e-06,
"loss": 1.0548,
"step": 2636
},
{
"epoch": 0.7655682972855277,
"grad_norm": 3.4241580963134766,
"learning_rate": 9.711996244989835e-06,
"loss": 1.0526,
"step": 2637
},
{
"epoch": 0.7658586151836261,
"grad_norm": 3.7336814403533936,
"learning_rate": 9.711674979616263e-06,
"loss": 1.3548,
"step": 2638
},
{
"epoch": 0.7661489330817245,
"grad_norm": 3.1186118125915527,
"learning_rate": 9.711353540477988e-06,
"loss": 1.1147,
"step": 2639
},
{
"epoch": 0.7664392509798229,
"grad_norm": 3.3635342121124268,
"learning_rate": 9.711031927586864e-06,
"loss": 1.3023,
"step": 2640
},
{
"epoch": 0.7667295688779213,
"grad_norm": 3.2632579803466797,
"learning_rate": 9.710710140954752e-06,
"loss": 1.2382,
"step": 2641
},
{
"epoch": 0.7670198867760197,
"grad_norm": 3.1245193481445312,
"learning_rate": 9.710388180593518e-06,
"loss": 1.1616,
"step": 2642
},
{
"epoch": 0.7673102046741181,
"grad_norm": 3.439480781555176,
"learning_rate": 9.710066046515039e-06,
"loss": 1.24,
"step": 2643
},
{
"epoch": 0.7676005225722166,
"grad_norm": 3.172135353088379,
"learning_rate": 9.709743738731191e-06,
"loss": 0.993,
"step": 2644
},
{
"epoch": 0.767890840470315,
"grad_norm": 3.2096140384674072,
"learning_rate": 9.709421257253865e-06,
"loss": 1.2152,
"step": 2645
},
{
"epoch": 0.7681811583684134,
"grad_norm": 3.3263416290283203,
"learning_rate": 9.709098602094952e-06,
"loss": 1.1902,
"step": 2646
},
{
"epoch": 0.7684714762665118,
"grad_norm": 3.186981201171875,
"learning_rate": 9.708775773266353e-06,
"loss": 1.2518,
"step": 2647
},
{
"epoch": 0.7687617941646102,
"grad_norm": 3.4535677433013916,
"learning_rate": 9.708452770779967e-06,
"loss": 1.2558,
"step": 2648
},
{
"epoch": 0.7690521120627086,
"grad_norm": 3.2888617515563965,
"learning_rate": 9.708129594647716e-06,
"loss": 1.1457,
"step": 2649
},
{
"epoch": 0.769342429960807,
"grad_norm": 3.6258974075317383,
"learning_rate": 9.707806244881513e-06,
"loss": 1.3135,
"step": 2650
},
{
"epoch": 0.7696327478589055,
"grad_norm": 3.227768898010254,
"learning_rate": 9.707482721493282e-06,
"loss": 1.3181,
"step": 2651
},
{
"epoch": 0.7699230657570039,
"grad_norm": 3.445146322250366,
"learning_rate": 9.707159024494958e-06,
"loss": 1.0569,
"step": 2652
},
{
"epoch": 0.7702133836551024,
"grad_norm": 3.3416175842285156,
"learning_rate": 9.706835153898476e-06,
"loss": 1.0999,
"step": 2653
},
{
"epoch": 0.7705037015532008,
"grad_norm": 3.45808744430542,
"learning_rate": 9.706511109715782e-06,
"loss": 1.2106,
"step": 2654
},
{
"epoch": 0.7707940194512992,
"grad_norm": 3.3738346099853516,
"learning_rate": 9.706186891958826e-06,
"loss": 1.2624,
"step": 2655
},
{
"epoch": 0.7710843373493976,
"grad_norm": 3.630474328994751,
"learning_rate": 9.705862500639565e-06,
"loss": 1.3611,
"step": 2656
},
{
"epoch": 0.771374655247496,
"grad_norm": 3.3824191093444824,
"learning_rate": 9.705537935769962e-06,
"loss": 1.3021,
"step": 2657
},
{
"epoch": 0.7716649731455945,
"grad_norm": 3.4706802368164062,
"learning_rate": 9.705213197361989e-06,
"loss": 1.2166,
"step": 2658
},
{
"epoch": 0.7719552910436929,
"grad_norm": 3.271436929702759,
"learning_rate": 9.704888285427618e-06,
"loss": 1.232,
"step": 2659
},
{
"epoch": 0.7722456089417913,
"grad_norm": 3.3436808586120605,
"learning_rate": 9.704563199978837e-06,
"loss": 1.1864,
"step": 2660
},
{
"epoch": 0.7725359268398897,
"grad_norm": 3.2927451133728027,
"learning_rate": 9.70423794102763e-06,
"loss": 1.1901,
"step": 2661
},
{
"epoch": 0.7728262447379881,
"grad_norm": 3.3609869480133057,
"learning_rate": 9.703912508585995e-06,
"loss": 1.27,
"step": 2662
},
{
"epoch": 0.7731165626360865,
"grad_norm": 3.4284236431121826,
"learning_rate": 9.703586902665932e-06,
"loss": 1.3389,
"step": 2663
},
{
"epoch": 0.7734068805341849,
"grad_norm": 3.34993052482605,
"learning_rate": 9.703261123279453e-06,
"loss": 1.2551,
"step": 2664
},
{
"epoch": 0.7736971984322834,
"grad_norm": 3.4748470783233643,
"learning_rate": 9.70293517043857e-06,
"loss": 1.138,
"step": 2665
},
{
"epoch": 0.7739875163303818,
"grad_norm": 3.0106701850891113,
"learning_rate": 9.702609044155303e-06,
"loss": 1.1568,
"step": 2666
},
{
"epoch": 0.7742778342284802,
"grad_norm": 3.5232250690460205,
"learning_rate": 9.70228274444168e-06,
"loss": 1.1744,
"step": 2667
},
{
"epoch": 0.7745681521265786,
"grad_norm": 2.9455854892730713,
"learning_rate": 9.701956271309736e-06,
"loss": 1.0484,
"step": 2668
},
{
"epoch": 0.774858470024677,
"grad_norm": 3.023559808731079,
"learning_rate": 9.701629624771512e-06,
"loss": 1.074,
"step": 2669
},
{
"epoch": 0.7751487879227754,
"grad_norm": 3.59647798538208,
"learning_rate": 9.701302804839052e-06,
"loss": 1.4052,
"step": 2670
},
{
"epoch": 0.7754391058208738,
"grad_norm": 3.113689661026001,
"learning_rate": 9.70097581152441e-06,
"loss": 1.031,
"step": 2671
},
{
"epoch": 0.7757294237189722,
"grad_norm": 3.235813617706299,
"learning_rate": 9.700648644839647e-06,
"loss": 1.2389,
"step": 2672
},
{
"epoch": 0.7760197416170707,
"grad_norm": 3.190761089324951,
"learning_rate": 9.700321304796825e-06,
"loss": 1.1777,
"step": 2673
},
{
"epoch": 0.7763100595151691,
"grad_norm": 3.0125646591186523,
"learning_rate": 9.69999379140802e-06,
"loss": 1.1096,
"step": 2674
},
{
"epoch": 0.7766003774132675,
"grad_norm": 3.218435287475586,
"learning_rate": 9.69966610468531e-06,
"loss": 1.0491,
"step": 2675
},
{
"epoch": 0.7768906953113659,
"grad_norm": 3.141157865524292,
"learning_rate": 9.699338244640779e-06,
"loss": 1.1652,
"step": 2676
},
{
"epoch": 0.7771810132094643,
"grad_norm": 3.2786238193511963,
"learning_rate": 9.699010211286516e-06,
"loss": 1.2433,
"step": 2677
},
{
"epoch": 0.7774713311075628,
"grad_norm": 2.9467108249664307,
"learning_rate": 9.698682004634624e-06,
"loss": 1.1513,
"step": 2678
},
{
"epoch": 0.7777616490056612,
"grad_norm": 3.2171337604522705,
"learning_rate": 9.698353624697202e-06,
"loss": 1.1458,
"step": 2679
},
{
"epoch": 0.7780519669037597,
"grad_norm": 3.1761419773101807,
"learning_rate": 9.698025071486363e-06,
"loss": 1.1981,
"step": 2680
},
{
"epoch": 0.7783422848018581,
"grad_norm": 3.1694602966308594,
"learning_rate": 9.697696345014225e-06,
"loss": 1.1642,
"step": 2681
},
{
"epoch": 0.7786326026999565,
"grad_norm": 3.392407178878784,
"learning_rate": 9.69736744529291e-06,
"loss": 1.1867,
"step": 2682
},
{
"epoch": 0.7789229205980549,
"grad_norm": 3.022423028945923,
"learning_rate": 9.697038372334548e-06,
"loss": 1.0707,
"step": 2683
},
{
"epoch": 0.7792132384961533,
"grad_norm": 3.068240165710449,
"learning_rate": 9.696709126151274e-06,
"loss": 1.159,
"step": 2684
},
{
"epoch": 0.7795035563942517,
"grad_norm": 3.0357422828674316,
"learning_rate": 9.69637970675523e-06,
"loss": 1.0268,
"step": 2685
},
{
"epoch": 0.7797938742923501,
"grad_norm": 3.256622076034546,
"learning_rate": 9.696050114158569e-06,
"loss": 1.2258,
"step": 2686
},
{
"epoch": 0.7800841921904486,
"grad_norm": 3.265336275100708,
"learning_rate": 9.69572034837344e-06,
"loss": 1.1091,
"step": 2687
},
{
"epoch": 0.780374510088547,
"grad_norm": 3.419400453567505,
"learning_rate": 9.695390409412011e-06,
"loss": 1.2144,
"step": 2688
},
{
"epoch": 0.7806648279866454,
"grad_norm": 3.241852045059204,
"learning_rate": 9.695060297286445e-06,
"loss": 1.185,
"step": 2689
},
{
"epoch": 0.7809551458847438,
"grad_norm": 3.128333568572998,
"learning_rate": 9.694730012008919e-06,
"loss": 1.166,
"step": 2690
},
{
"epoch": 0.7812454637828422,
"grad_norm": 3.2814202308654785,
"learning_rate": 9.694399553591614e-06,
"loss": 1.1328,
"step": 2691
},
{
"epoch": 0.7815357816809406,
"grad_norm": 3.5707764625549316,
"learning_rate": 9.694068922046715e-06,
"loss": 1.3243,
"step": 2692
},
{
"epoch": 0.781826099579039,
"grad_norm": 3.2367355823516846,
"learning_rate": 9.693738117386419e-06,
"loss": 1.3495,
"step": 2693
},
{
"epoch": 0.7821164174771374,
"grad_norm": 3.425107479095459,
"learning_rate": 9.693407139622922e-06,
"loss": 1.1423,
"step": 2694
},
{
"epoch": 0.7824067353752359,
"grad_norm": 3.4596445560455322,
"learning_rate": 9.693075988768433e-06,
"loss": 1.2778,
"step": 2695
},
{
"epoch": 0.7826970532733343,
"grad_norm": 3.4609477519989014,
"learning_rate": 9.692744664835164e-06,
"loss": 1.159,
"step": 2696
},
{
"epoch": 0.7829873711714327,
"grad_norm": 3.192476272583008,
"learning_rate": 9.692413167835334e-06,
"loss": 1.1078,
"step": 2697
},
{
"epoch": 0.7832776890695311,
"grad_norm": 2.891274929046631,
"learning_rate": 9.692081497781168e-06,
"loss": 1.012,
"step": 2698
},
{
"epoch": 0.7835680069676295,
"grad_norm": 3.200326442718506,
"learning_rate": 9.691749654684899e-06,
"loss": 1.2797,
"step": 2699
},
{
"epoch": 0.7838583248657279,
"grad_norm": 3.1819984912872314,
"learning_rate": 9.691417638558764e-06,
"loss": 1.1548,
"step": 2700
},
{
"epoch": 0.7841486427638263,
"grad_norm": 2.8674476146698,
"learning_rate": 9.69108544941501e-06,
"loss": 0.9269,
"step": 2701
},
{
"epoch": 0.7844389606619248,
"grad_norm": 3.1889965534210205,
"learning_rate": 9.690753087265883e-06,
"loss": 1.2377,
"step": 2702
},
{
"epoch": 0.7847292785600233,
"grad_norm": 3.410156488418579,
"learning_rate": 9.690420552123645e-06,
"loss": 1.1583,
"step": 2703
},
{
"epoch": 0.7850195964581217,
"grad_norm": 2.966400146484375,
"learning_rate": 9.69008784400056e-06,
"loss": 1.0389,
"step": 2704
},
{
"epoch": 0.7853099143562201,
"grad_norm": 3.1139185428619385,
"learning_rate": 9.689754962908895e-06,
"loss": 1.0267,
"step": 2705
},
{
"epoch": 0.7856002322543185,
"grad_norm": 3.2307214736938477,
"learning_rate": 9.689421908860928e-06,
"loss": 1.0453,
"step": 2706
},
{
"epoch": 0.7858905501524169,
"grad_norm": 3.1317498683929443,
"learning_rate": 9.689088681868941e-06,
"loss": 1.193,
"step": 2707
},
{
"epoch": 0.7861808680505153,
"grad_norm": 2.7882258892059326,
"learning_rate": 9.688755281945226e-06,
"loss": 0.9895,
"step": 2708
},
{
"epoch": 0.7864711859486138,
"grad_norm": 3.129871368408203,
"learning_rate": 9.688421709102076e-06,
"loss": 1.2207,
"step": 2709
},
{
"epoch": 0.7867615038467122,
"grad_norm": 3.189854621887207,
"learning_rate": 9.688087963351795e-06,
"loss": 1.1442,
"step": 2710
},
{
"epoch": 0.7870518217448106,
"grad_norm": 3.1260828971862793,
"learning_rate": 9.68775404470669e-06,
"loss": 1.0848,
"step": 2711
},
{
"epoch": 0.787342139642909,
"grad_norm": 3.461789846420288,
"learning_rate": 9.687419953179074e-06,
"loss": 1.3078,
"step": 2712
},
{
"epoch": 0.7876324575410074,
"grad_norm": 3.009683132171631,
"learning_rate": 9.687085688781273e-06,
"loss": 0.9739,
"step": 2713
},
{
"epoch": 0.7879227754391058,
"grad_norm": 3.2000815868377686,
"learning_rate": 9.68675125152561e-06,
"loss": 1.2845,
"step": 2714
},
{
"epoch": 0.7882130933372042,
"grad_norm": 3.3149054050445557,
"learning_rate": 9.686416641424422e-06,
"loss": 1.1578,
"step": 2715
},
{
"epoch": 0.7885034112353027,
"grad_norm": 2.903021812438965,
"learning_rate": 9.686081858490047e-06,
"loss": 0.9999,
"step": 2716
},
{
"epoch": 0.7887937291334011,
"grad_norm": 3.2274374961853027,
"learning_rate": 9.685746902734834e-06,
"loss": 1.2606,
"step": 2717
},
{
"epoch": 0.7890840470314995,
"grad_norm": 3.3526039123535156,
"learning_rate": 9.685411774171133e-06,
"loss": 1.2573,
"step": 2718
},
{
"epoch": 0.7893743649295979,
"grad_norm": 3.025444269180298,
"learning_rate": 9.685076472811305e-06,
"loss": 1.12,
"step": 2719
},
{
"epoch": 0.7896646828276963,
"grad_norm": 3.1881661415100098,
"learning_rate": 9.684740998667718e-06,
"loss": 1.1475,
"step": 2720
},
{
"epoch": 0.7899550007257947,
"grad_norm": 3.1479337215423584,
"learning_rate": 9.68440535175274e-06,
"loss": 1.0881,
"step": 2721
},
{
"epoch": 0.7902453186238931,
"grad_norm": 3.6872854232788086,
"learning_rate": 9.684069532078753e-06,
"loss": 1.2607,
"step": 2722
},
{
"epoch": 0.7905356365219915,
"grad_norm": 2.9365339279174805,
"learning_rate": 9.68373353965814e-06,
"loss": 1.0904,
"step": 2723
},
{
"epoch": 0.79082595442009,
"grad_norm": 2.9232428073883057,
"learning_rate": 9.683397374503293e-06,
"loss": 1.1098,
"step": 2724
},
{
"epoch": 0.7911162723181884,
"grad_norm": 3.091132402420044,
"learning_rate": 9.683061036626608e-06,
"loss": 1.191,
"step": 2725
},
{
"epoch": 0.7914065902162868,
"grad_norm": 3.380723237991333,
"learning_rate": 9.682724526040493e-06,
"loss": 1.2003,
"step": 2726
},
{
"epoch": 0.7916969081143852,
"grad_norm": 3.4118423461914062,
"learning_rate": 9.682387842757354e-06,
"loss": 1.1715,
"step": 2727
},
{
"epoch": 0.7919872260124837,
"grad_norm": 3.095881462097168,
"learning_rate": 9.682050986789609e-06,
"loss": 1.1167,
"step": 2728
},
{
"epoch": 0.7922775439105821,
"grad_norm": 3.4140207767486572,
"learning_rate": 9.681713958149683e-06,
"loss": 1.1926,
"step": 2729
},
{
"epoch": 0.7925678618086806,
"grad_norm": 3.4278016090393066,
"learning_rate": 9.681376756850003e-06,
"loss": 1.2509,
"step": 2730
},
{
"epoch": 0.792858179706779,
"grad_norm": 3.5882339477539062,
"learning_rate": 9.681039382903007e-06,
"loss": 1.3001,
"step": 2731
},
{
"epoch": 0.7931484976048774,
"grad_norm": 3.4812803268432617,
"learning_rate": 9.680701836321135e-06,
"loss": 1.2319,
"step": 2732
},
{
"epoch": 0.7934388155029758,
"grad_norm": 3.3065333366394043,
"learning_rate": 9.680364117116838e-06,
"loss": 1.1813,
"step": 2733
},
{
"epoch": 0.7937291334010742,
"grad_norm": 3.2521045207977295,
"learning_rate": 9.680026225302568e-06,
"loss": 1.2133,
"step": 2734
},
{
"epoch": 0.7940194512991726,
"grad_norm": 2.7159008979797363,
"learning_rate": 9.67968816089079e-06,
"loss": 1.0154,
"step": 2735
},
{
"epoch": 0.794309769197271,
"grad_norm": 3.323042869567871,
"learning_rate": 9.679349923893968e-06,
"loss": 1.2234,
"step": 2736
},
{
"epoch": 0.7946000870953694,
"grad_norm": 3.2154958248138428,
"learning_rate": 9.679011514324579e-06,
"loss": 1.0341,
"step": 2737
},
{
"epoch": 0.7948904049934679,
"grad_norm": 3.100257396697998,
"learning_rate": 9.678672932195101e-06,
"loss": 1.0728,
"step": 2738
},
{
"epoch": 0.7951807228915663,
"grad_norm": 2.962118625640869,
"learning_rate": 9.678334177518022e-06,
"loss": 1.0618,
"step": 2739
},
{
"epoch": 0.7954710407896647,
"grad_norm": 3.430203914642334,
"learning_rate": 9.677995250305836e-06,
"loss": 1.3019,
"step": 2740
},
{
"epoch": 0.7957613586877631,
"grad_norm": 3.404595375061035,
"learning_rate": 9.677656150571042e-06,
"loss": 1.2069,
"step": 2741
},
{
"epoch": 0.7960516765858615,
"grad_norm": 3.271411418914795,
"learning_rate": 9.677316878326144e-06,
"loss": 1.1914,
"step": 2742
},
{
"epoch": 0.7963419944839599,
"grad_norm": 3.5595600605010986,
"learning_rate": 9.676977433583656e-06,
"loss": 1.404,
"step": 2743
},
{
"epoch": 0.7966323123820583,
"grad_norm": 3.423607587814331,
"learning_rate": 9.676637816356098e-06,
"loss": 1.2709,
"step": 2744
},
{
"epoch": 0.7969226302801568,
"grad_norm": 3.162513017654419,
"learning_rate": 9.676298026655992e-06,
"loss": 1.2843,
"step": 2745
},
{
"epoch": 0.7972129481782552,
"grad_norm": 3.023754119873047,
"learning_rate": 9.675958064495869e-06,
"loss": 1.2077,
"step": 2746
},
{
"epoch": 0.7975032660763536,
"grad_norm": 3.2960760593414307,
"learning_rate": 9.675617929888271e-06,
"loss": 1.1551,
"step": 2747
},
{
"epoch": 0.797793583974452,
"grad_norm": 3.2949986457824707,
"learning_rate": 9.675277622845736e-06,
"loss": 1.2885,
"step": 2748
},
{
"epoch": 0.7980839018725504,
"grad_norm": 3.3253605365753174,
"learning_rate": 9.67493714338082e-06,
"loss": 1.1524,
"step": 2749
},
{
"epoch": 0.7983742197706488,
"grad_norm": 3.1859922409057617,
"learning_rate": 9.674596491506077e-06,
"loss": 1.2582,
"step": 2750
},
{
"epoch": 0.7986645376687472,
"grad_norm": 3.2374212741851807,
"learning_rate": 9.67425566723407e-06,
"loss": 1.3034,
"step": 2751
},
{
"epoch": 0.7989548555668456,
"grad_norm": 3.093991279602051,
"learning_rate": 9.673914670577369e-06,
"loss": 1.1687,
"step": 2752
},
{
"epoch": 0.7992451734649441,
"grad_norm": 2.8157095909118652,
"learning_rate": 9.67357350154855e-06,
"loss": 1.0783,
"step": 2753
},
{
"epoch": 0.7995354913630426,
"grad_norm": 3.2308690547943115,
"learning_rate": 9.673232160160195e-06,
"loss": 1.1821,
"step": 2754
},
{
"epoch": 0.799825809261141,
"grad_norm": 2.980912208557129,
"learning_rate": 9.67289064642489e-06,
"loss": 1.1247,
"step": 2755
},
{
"epoch": 0.8001161271592394,
"grad_norm": 2.8929474353790283,
"learning_rate": 9.672548960355236e-06,
"loss": 1.0361,
"step": 2756
},
{
"epoch": 0.8004064450573378,
"grad_norm": 3.199467658996582,
"learning_rate": 9.672207101963828e-06,
"loss": 1.161,
"step": 2757
},
{
"epoch": 0.8006967629554362,
"grad_norm": 3.3019492626190186,
"learning_rate": 9.671865071263278e-06,
"loss": 1.0657,
"step": 2758
},
{
"epoch": 0.8009870808535346,
"grad_norm": 3.4587512016296387,
"learning_rate": 9.671522868266197e-06,
"loss": 1.1823,
"step": 2759
},
{
"epoch": 0.8012773987516331,
"grad_norm": 3.3693933486938477,
"learning_rate": 9.671180492985207e-06,
"loss": 1.1788,
"step": 2760
},
{
"epoch": 0.8015677166497315,
"grad_norm": 3.195629358291626,
"learning_rate": 9.670837945432934e-06,
"loss": 1.1368,
"step": 2761
},
{
"epoch": 0.8018580345478299,
"grad_norm": 3.206254243850708,
"learning_rate": 9.670495225622011e-06,
"loss": 1.1581,
"step": 2762
},
{
"epoch": 0.8021483524459283,
"grad_norm": 3.264477014541626,
"learning_rate": 9.670152333565078e-06,
"loss": 1.1068,
"step": 2763
},
{
"epoch": 0.8024386703440267,
"grad_norm": 3.518728256225586,
"learning_rate": 9.669809269274779e-06,
"loss": 1.1533,
"step": 2764
},
{
"epoch": 0.8027289882421251,
"grad_norm": 3.5006842613220215,
"learning_rate": 9.669466032763768e-06,
"loss": 1.2964,
"step": 2765
},
{
"epoch": 0.8030193061402235,
"grad_norm": 3.7323036193847656,
"learning_rate": 9.669122624044704e-06,
"loss": 1.2684,
"step": 2766
},
{
"epoch": 0.803309624038322,
"grad_norm": 3.5423648357391357,
"learning_rate": 9.668779043130249e-06,
"loss": 1.2638,
"step": 2767
},
{
"epoch": 0.8035999419364204,
"grad_norm": 3.037662982940674,
"learning_rate": 9.668435290033076e-06,
"loss": 0.916,
"step": 2768
},
{
"epoch": 0.8038902598345188,
"grad_norm": 3.0804009437561035,
"learning_rate": 9.668091364765862e-06,
"loss": 1.1467,
"step": 2769
},
{
"epoch": 0.8041805777326172,
"grad_norm": 3.015153169631958,
"learning_rate": 9.66774726734129e-06,
"loss": 1.0218,
"step": 2770
},
{
"epoch": 0.8044708956307156,
"grad_norm": 3.360714912414551,
"learning_rate": 9.667402997772052e-06,
"loss": 1.3612,
"step": 2771
},
{
"epoch": 0.804761213528814,
"grad_norm": 3.091615915298462,
"learning_rate": 9.667058556070846e-06,
"loss": 1.0789,
"step": 2772
},
{
"epoch": 0.8050515314269124,
"grad_norm": 3.4261224269866943,
"learning_rate": 9.66671394225037e-06,
"loss": 1.0892,
"step": 2773
},
{
"epoch": 0.8053418493250109,
"grad_norm": 3.1172802448272705,
"learning_rate": 9.666369156323335e-06,
"loss": 1.094,
"step": 2774
},
{
"epoch": 0.8056321672231093,
"grad_norm": 3.621525764465332,
"learning_rate": 9.666024198302459e-06,
"loss": 1.2377,
"step": 2775
},
{
"epoch": 0.8059224851212077,
"grad_norm": 3.2709341049194336,
"learning_rate": 9.665679068200463e-06,
"loss": 1.1966,
"step": 2776
},
{
"epoch": 0.8062128030193061,
"grad_norm": 3.9319911003112793,
"learning_rate": 9.66533376603007e-06,
"loss": 1.2563,
"step": 2777
},
{
"epoch": 0.8065031209174045,
"grad_norm": 3.317229747772217,
"learning_rate": 9.664988291804025e-06,
"loss": 1.0844,
"step": 2778
},
{
"epoch": 0.806793438815503,
"grad_norm": 3.2305257320404053,
"learning_rate": 9.664642645535058e-06,
"loss": 1.2113,
"step": 2779
},
{
"epoch": 0.8070837567136014,
"grad_norm": 2.9735424518585205,
"learning_rate": 9.664296827235924e-06,
"loss": 1.1671,
"step": 2780
},
{
"epoch": 0.8073740746116999,
"grad_norm": 3.4373373985290527,
"learning_rate": 9.663950836919373e-06,
"loss": 1.2868,
"step": 2781
},
{
"epoch": 0.8076643925097983,
"grad_norm": 3.469642400741577,
"learning_rate": 9.663604674598169e-06,
"loss": 1.1692,
"step": 2782
},
{
"epoch": 0.8079547104078967,
"grad_norm": 3.3247344493865967,
"learning_rate": 9.663258340285071e-06,
"loss": 1.1078,
"step": 2783
},
{
"epoch": 0.8082450283059951,
"grad_norm": 3.2038064002990723,
"learning_rate": 9.662911833992858e-06,
"loss": 1.2648,
"step": 2784
},
{
"epoch": 0.8085353462040935,
"grad_norm": 3.3712222576141357,
"learning_rate": 9.662565155734308e-06,
"loss": 1.1988,
"step": 2785
},
{
"epoch": 0.8088256641021919,
"grad_norm": 3.159156560897827,
"learning_rate": 9.662218305522204e-06,
"loss": 1.1781,
"step": 2786
},
{
"epoch": 0.8091159820002903,
"grad_norm": 2.919067859649658,
"learning_rate": 9.661871283369337e-06,
"loss": 1.048,
"step": 2787
},
{
"epoch": 0.8094062998983887,
"grad_norm": 3.1437933444976807,
"learning_rate": 9.66152408928851e-06,
"loss": 1.1889,
"step": 2788
},
{
"epoch": 0.8096966177964872,
"grad_norm": 3.3572521209716797,
"learning_rate": 9.661176723292524e-06,
"loss": 1.2144,
"step": 2789
},
{
"epoch": 0.8099869356945856,
"grad_norm": 3.069945812225342,
"learning_rate": 9.660829185394189e-06,
"loss": 1.0188,
"step": 2790
},
{
"epoch": 0.810277253592684,
"grad_norm": 2.9657914638519287,
"learning_rate": 9.660481475606325e-06,
"loss": 1.0332,
"step": 2791
},
{
"epoch": 0.8105675714907824,
"grad_norm": 3.1732230186462402,
"learning_rate": 9.660133593941752e-06,
"loss": 1.2119,
"step": 2792
},
{
"epoch": 0.8108578893888808,
"grad_norm": 3.295893430709839,
"learning_rate": 9.659785540413303e-06,
"loss": 1.1986,
"step": 2793
},
{
"epoch": 0.8111482072869792,
"grad_norm": 3.3230507373809814,
"learning_rate": 9.65943731503381e-06,
"loss": 1.2974,
"step": 2794
},
{
"epoch": 0.8114385251850776,
"grad_norm": 3.2661449909210205,
"learning_rate": 9.65908891781612e-06,
"loss": 1.1286,
"step": 2795
},
{
"epoch": 0.811728843083176,
"grad_norm": 3.3028149604797363,
"learning_rate": 9.658740348773079e-06,
"loss": 1.2416,
"step": 2796
},
{
"epoch": 0.8120191609812745,
"grad_norm": 3.1426446437835693,
"learning_rate": 9.658391607917543e-06,
"loss": 1.1046,
"step": 2797
},
{
"epoch": 0.8123094788793729,
"grad_norm": 3.3629467487335205,
"learning_rate": 9.658042695262373e-06,
"loss": 1.2118,
"step": 2798
},
{
"epoch": 0.8125997967774713,
"grad_norm": 3.356700897216797,
"learning_rate": 9.657693610820437e-06,
"loss": 1.0999,
"step": 2799
},
{
"epoch": 0.8128901146755697,
"grad_norm": 2.8955090045928955,
"learning_rate": 9.65734435460461e-06,
"loss": 1.1629,
"step": 2800
},
{
"epoch": 0.8131804325736681,
"grad_norm": 3.2146928310394287,
"learning_rate": 9.656994926627769e-06,
"loss": 1.1164,
"step": 2801
},
{
"epoch": 0.8134707504717665,
"grad_norm": 3.1054909229278564,
"learning_rate": 9.656645326902804e-06,
"loss": 1.0392,
"step": 2802
},
{
"epoch": 0.813761068369865,
"grad_norm": 4.134510517120361,
"learning_rate": 9.656295555442608e-06,
"loss": 1.1675,
"step": 2803
},
{
"epoch": 0.8140513862679635,
"grad_norm": 3.0019631385803223,
"learning_rate": 9.65594561226008e-06,
"loss": 0.9988,
"step": 2804
},
{
"epoch": 0.8143417041660619,
"grad_norm": 3.312530994415283,
"learning_rate": 9.655595497368123e-06,
"loss": 1.2161,
"step": 2805
},
{
"epoch": 0.8146320220641603,
"grad_norm": 3.215278387069702,
"learning_rate": 9.655245210779653e-06,
"loss": 1.0485,
"step": 2806
},
{
"epoch": 0.8149223399622587,
"grad_norm": 3.1792635917663574,
"learning_rate": 9.654894752507589e-06,
"loss": 1.1354,
"step": 2807
},
{
"epoch": 0.8152126578603571,
"grad_norm": 3.156052827835083,
"learning_rate": 9.654544122564852e-06,
"loss": 1.1189,
"step": 2808
},
{
"epoch": 0.8155029757584555,
"grad_norm": 3.3468096256256104,
"learning_rate": 9.654193320964374e-06,
"loss": 1.2148,
"step": 2809
},
{
"epoch": 0.815793293656554,
"grad_norm": 3.149667501449585,
"learning_rate": 9.653842347719094e-06,
"loss": 1.089,
"step": 2810
},
{
"epoch": 0.8160836115546524,
"grad_norm": 2.945133686065674,
"learning_rate": 9.653491202841955e-06,
"loss": 1.1251,
"step": 2811
},
{
"epoch": 0.8163739294527508,
"grad_norm": 3.5497055053710938,
"learning_rate": 9.653139886345909e-06,
"loss": 1.3452,
"step": 2812
},
{
"epoch": 0.8166642473508492,
"grad_norm": 3.0823254585266113,
"learning_rate": 9.652788398243908e-06,
"loss": 1.096,
"step": 2813
},
{
"epoch": 0.8169545652489476,
"grad_norm": 2.955162525177002,
"learning_rate": 9.652436738548917e-06,
"loss": 1.1443,
"step": 2814
},
{
"epoch": 0.817244883147046,
"grad_norm": 3.125523567199707,
"learning_rate": 9.652084907273908e-06,
"loss": 1.1199,
"step": 2815
},
{
"epoch": 0.8175352010451444,
"grad_norm": 3.3003995418548584,
"learning_rate": 9.651732904431852e-06,
"loss": 1.22,
"step": 2816
},
{
"epoch": 0.8178255189432428,
"grad_norm": 3.1056740283966064,
"learning_rate": 9.651380730035733e-06,
"loss": 1.112,
"step": 2817
},
{
"epoch": 0.8181158368413413,
"grad_norm": 3.12873911857605,
"learning_rate": 9.651028384098538e-06,
"loss": 0.9787,
"step": 2818
},
{
"epoch": 0.8184061547394397,
"grad_norm": 3.148348093032837,
"learning_rate": 9.650675866633263e-06,
"loss": 1.1535,
"step": 2819
},
{
"epoch": 0.8186964726375381,
"grad_norm": 3.1877403259277344,
"learning_rate": 9.650323177652907e-06,
"loss": 1.1669,
"step": 2820
},
{
"epoch": 0.8189867905356365,
"grad_norm": 3.172475576400757,
"learning_rate": 9.649970317170478e-06,
"loss": 1.1416,
"step": 2821
},
{
"epoch": 0.8192771084337349,
"grad_norm": 3.273035764694214,
"learning_rate": 9.649617285198988e-06,
"loss": 1.1465,
"step": 2822
},
{
"epoch": 0.8195674263318333,
"grad_norm": 3.1054487228393555,
"learning_rate": 9.649264081751457e-06,
"loss": 1.1381,
"step": 2823
},
{
"epoch": 0.8198577442299317,
"grad_norm": 3.0874011516571045,
"learning_rate": 9.648910706840913e-06,
"loss": 1.1209,
"step": 2824
},
{
"epoch": 0.8201480621280302,
"grad_norm": 3.571061611175537,
"learning_rate": 9.648557160480387e-06,
"loss": 1.3397,
"step": 2825
},
{
"epoch": 0.8204383800261286,
"grad_norm": 3.1404099464416504,
"learning_rate": 9.648203442682917e-06,
"loss": 1.3083,
"step": 2826
},
{
"epoch": 0.820728697924227,
"grad_norm": 3.4728198051452637,
"learning_rate": 9.64784955346155e-06,
"loss": 1.1533,
"step": 2827
},
{
"epoch": 0.8210190158223254,
"grad_norm": 3.192854404449463,
"learning_rate": 9.647495492829336e-06,
"loss": 1.2375,
"step": 2828
},
{
"epoch": 0.8213093337204239,
"grad_norm": 3.1363725662231445,
"learning_rate": 9.64714126079933e-06,
"loss": 0.974,
"step": 2829
},
{
"epoch": 0.8215996516185223,
"grad_norm": 3.270286798477173,
"learning_rate": 9.6467868573846e-06,
"loss": 1.1538,
"step": 2830
},
{
"epoch": 0.8218899695166207,
"grad_norm": 3.2243921756744385,
"learning_rate": 9.646432282598215e-06,
"loss": 1.1136,
"step": 2831
},
{
"epoch": 0.8221802874147192,
"grad_norm": 3.2147057056427,
"learning_rate": 9.646077536453251e-06,
"loss": 1.18,
"step": 2832
},
{
"epoch": 0.8224706053128176,
"grad_norm": 3.073420524597168,
"learning_rate": 9.64572261896279e-06,
"loss": 1.1108,
"step": 2833
},
{
"epoch": 0.822760923210916,
"grad_norm": 3.137725591659546,
"learning_rate": 9.645367530139925e-06,
"loss": 1.1624,
"step": 2834
},
{
"epoch": 0.8230512411090144,
"grad_norm": 3.2300662994384766,
"learning_rate": 9.645012269997747e-06,
"loss": 1.2579,
"step": 2835
},
{
"epoch": 0.8233415590071128,
"grad_norm": 3.178576707839966,
"learning_rate": 9.64465683854936e-06,
"loss": 1.1127,
"step": 2836
},
{
"epoch": 0.8236318769052112,
"grad_norm": 3.1000449657440186,
"learning_rate": 9.644301235807872e-06,
"loss": 1.0045,
"step": 2837
},
{
"epoch": 0.8239221948033096,
"grad_norm": 3.1290085315704346,
"learning_rate": 9.643945461786397e-06,
"loss": 1.0721,
"step": 2838
},
{
"epoch": 0.824212512701408,
"grad_norm": 3.3767518997192383,
"learning_rate": 9.643589516498057e-06,
"loss": 1.268,
"step": 2839
},
{
"epoch": 0.8245028305995065,
"grad_norm": 3.204231023788452,
"learning_rate": 9.64323339995598e-06,
"loss": 1.1592,
"step": 2840
},
{
"epoch": 0.8247931484976049,
"grad_norm": 2.6525983810424805,
"learning_rate": 9.642877112173294e-06,
"loss": 1.0086,
"step": 2841
},
{
"epoch": 0.8250834663957033,
"grad_norm": 3.5629663467407227,
"learning_rate": 9.642520653163146e-06,
"loss": 1.1653,
"step": 2842
},
{
"epoch": 0.8253737842938017,
"grad_norm": 3.5206522941589355,
"learning_rate": 9.642164022938678e-06,
"loss": 1.1618,
"step": 2843
},
{
"epoch": 0.8256641021919001,
"grad_norm": 3.1275205612182617,
"learning_rate": 9.641807221513041e-06,
"loss": 1.0722,
"step": 2844
},
{
"epoch": 0.8259544200899985,
"grad_norm": 3.354448080062866,
"learning_rate": 9.641450248899397e-06,
"loss": 1.2366,
"step": 2845
},
{
"epoch": 0.8262447379880969,
"grad_norm": 3.196295976638794,
"learning_rate": 9.64109310511091e-06,
"loss": 1.0949,
"step": 2846
},
{
"epoch": 0.8265350558861954,
"grad_norm": 3.35182785987854,
"learning_rate": 9.640735790160751e-06,
"loss": 1.3141,
"step": 2847
},
{
"epoch": 0.8268253737842938,
"grad_norm": 3.4913763999938965,
"learning_rate": 9.640378304062099e-06,
"loss": 1.3896,
"step": 2848
},
{
"epoch": 0.8271156916823922,
"grad_norm": 3.162344455718994,
"learning_rate": 9.640020646828134e-06,
"loss": 1.1087,
"step": 2849
},
{
"epoch": 0.8274060095804906,
"grad_norm": 2.9598472118377686,
"learning_rate": 9.639662818472051e-06,
"loss": 1.0635,
"step": 2850
},
{
"epoch": 0.827696327478589,
"grad_norm": 3.6094932556152344,
"learning_rate": 9.639304819007043e-06,
"loss": 1.286,
"step": 2851
},
{
"epoch": 0.8279866453766874,
"grad_norm": 2.9936227798461914,
"learning_rate": 9.638946648446314e-06,
"loss": 1.1463,
"step": 2852
},
{
"epoch": 0.8282769632747858,
"grad_norm": 3.988034725189209,
"learning_rate": 9.638588306803075e-06,
"loss": 1.2177,
"step": 2853
},
{
"epoch": 0.8285672811728844,
"grad_norm": 3.431546926498413,
"learning_rate": 9.63822979409054e-06,
"loss": 1.0094,
"step": 2854
},
{
"epoch": 0.8288575990709828,
"grad_norm": 3.446589231491089,
"learning_rate": 9.63787111032193e-06,
"loss": 1.315,
"step": 2855
},
{
"epoch": 0.8291479169690812,
"grad_norm": 3.355750322341919,
"learning_rate": 9.637512255510475e-06,
"loss": 1.1084,
"step": 2856
},
{
"epoch": 0.8294382348671796,
"grad_norm": 3.808082103729248,
"learning_rate": 9.637153229669407e-06,
"loss": 1.1741,
"step": 2857
},
{
"epoch": 0.829728552765278,
"grad_norm": 3.1000587940216064,
"learning_rate": 9.636794032811968e-06,
"loss": 1.0451,
"step": 2858
},
{
"epoch": 0.8300188706633764,
"grad_norm": 3.0135488510131836,
"learning_rate": 9.636434664951407e-06,
"loss": 1.1303,
"step": 2859
},
{
"epoch": 0.8303091885614748,
"grad_norm": 3.029987096786499,
"learning_rate": 9.636075126100974e-06,
"loss": 1.2556,
"step": 2860
},
{
"epoch": 0.8305995064595733,
"grad_norm": 3.5480244159698486,
"learning_rate": 9.63571541627393e-06,
"loss": 1.0877,
"step": 2861
},
{
"epoch": 0.8308898243576717,
"grad_norm": 3.170466423034668,
"learning_rate": 9.635355535483541e-06,
"loss": 1.1736,
"step": 2862
},
{
"epoch": 0.8311801422557701,
"grad_norm": 3.1938586235046387,
"learning_rate": 9.634995483743079e-06,
"loss": 1.2071,
"step": 2863
},
{
"epoch": 0.8314704601538685,
"grad_norm": 3.2252891063690186,
"learning_rate": 9.634635261065824e-06,
"loss": 1.1202,
"step": 2864
},
{
"epoch": 0.8317607780519669,
"grad_norm": 2.953683853149414,
"learning_rate": 9.634274867465058e-06,
"loss": 1.1123,
"step": 2865
},
{
"epoch": 0.8320510959500653,
"grad_norm": 3.30548357963562,
"learning_rate": 9.633914302954077e-06,
"loss": 1.1805,
"step": 2866
},
{
"epoch": 0.8323414138481637,
"grad_norm": 3.3781816959381104,
"learning_rate": 9.633553567546173e-06,
"loss": 1.2113,
"step": 2867
},
{
"epoch": 0.8326317317462621,
"grad_norm": 3.3362321853637695,
"learning_rate": 9.633192661254654e-06,
"loss": 1.2132,
"step": 2868
},
{
"epoch": 0.8329220496443606,
"grad_norm": 3.1321659088134766,
"learning_rate": 9.632831584092826e-06,
"loss": 1.1416,
"step": 2869
},
{
"epoch": 0.833212367542459,
"grad_norm": 3.464764356613159,
"learning_rate": 9.632470336074009e-06,
"loss": 1.2914,
"step": 2870
},
{
"epoch": 0.8335026854405574,
"grad_norm": 3.633310079574585,
"learning_rate": 9.632108917211525e-06,
"loss": 1.1349,
"step": 2871
},
{
"epoch": 0.8337930033386558,
"grad_norm": 2.9007396697998047,
"learning_rate": 9.6317473275187e-06,
"loss": 1.1294,
"step": 2872
},
{
"epoch": 0.8340833212367542,
"grad_norm": 3.544186592102051,
"learning_rate": 9.631385567008876e-06,
"loss": 1.1775,
"step": 2873
},
{
"epoch": 0.8343736391348526,
"grad_norm": 3.3772568702697754,
"learning_rate": 9.631023635695387e-06,
"loss": 1.2087,
"step": 2874
},
{
"epoch": 0.834663957032951,
"grad_norm": 5.305667877197266,
"learning_rate": 9.630661533591584e-06,
"loss": 1.0834,
"step": 2875
},
{
"epoch": 0.8349542749310495,
"grad_norm": 2.999448299407959,
"learning_rate": 9.630299260710821e-06,
"loss": 1.1121,
"step": 2876
},
{
"epoch": 0.8352445928291479,
"grad_norm": 3.4550819396972656,
"learning_rate": 9.629936817066459e-06,
"loss": 1.2967,
"step": 2877
},
{
"epoch": 0.8355349107272463,
"grad_norm": 2.9293079376220703,
"learning_rate": 9.629574202671866e-06,
"loss": 1.0916,
"step": 2878
},
{
"epoch": 0.8358252286253448,
"grad_norm": 3.328514814376831,
"learning_rate": 9.629211417540412e-06,
"loss": 1.2201,
"step": 2879
},
{
"epoch": 0.8361155465234432,
"grad_norm": 3.393035650253296,
"learning_rate": 9.628848461685479e-06,
"loss": 1.1133,
"step": 2880
},
{
"epoch": 0.8364058644215416,
"grad_norm": 3.4126694202423096,
"learning_rate": 9.62848533512045e-06,
"loss": 1.1965,
"step": 2881
},
{
"epoch": 0.83669618231964,
"grad_norm": 3.150296688079834,
"learning_rate": 9.62812203785872e-06,
"loss": 1.2777,
"step": 2882
},
{
"epoch": 0.8369865002177385,
"grad_norm": 3.2624874114990234,
"learning_rate": 9.627758569913687e-06,
"loss": 1.08,
"step": 2883
},
{
"epoch": 0.8372768181158369,
"grad_norm": 3.2924187183380127,
"learning_rate": 9.627394931298752e-06,
"loss": 1.1596,
"step": 2884
},
{
"epoch": 0.8375671360139353,
"grad_norm": 3.2016308307647705,
"learning_rate": 9.62703112202733e-06,
"loss": 1.0904,
"step": 2885
},
{
"epoch": 0.8378574539120337,
"grad_norm": 2.954402446746826,
"learning_rate": 9.626667142112835e-06,
"loss": 1.0328,
"step": 2886
},
{
"epoch": 0.8381477718101321,
"grad_norm": 3.052061080932617,
"learning_rate": 9.626302991568693e-06,
"loss": 1.0774,
"step": 2887
},
{
"epoch": 0.8384380897082305,
"grad_norm": 3.575716972351074,
"learning_rate": 9.625938670408332e-06,
"loss": 1.2461,
"step": 2888
},
{
"epoch": 0.8387284076063289,
"grad_norm": 3.2799222469329834,
"learning_rate": 9.62557417864519e-06,
"loss": 1.1622,
"step": 2889
},
{
"epoch": 0.8390187255044274,
"grad_norm": 3.241396188735962,
"learning_rate": 9.625209516292706e-06,
"loss": 1.2957,
"step": 2890
},
{
"epoch": 0.8393090434025258,
"grad_norm": 3.083571195602417,
"learning_rate": 9.62484468336433e-06,
"loss": 1.1481,
"step": 2891
},
{
"epoch": 0.8395993613006242,
"grad_norm": 2.80134654045105,
"learning_rate": 9.62447967987352e-06,
"loss": 1.0736,
"step": 2892
},
{
"epoch": 0.8398896791987226,
"grad_norm": 3.1099038124084473,
"learning_rate": 9.624114505833732e-06,
"loss": 1.2471,
"step": 2893
},
{
"epoch": 0.840179997096821,
"grad_norm": 2.9737226963043213,
"learning_rate": 9.623749161258437e-06,
"loss": 1.2019,
"step": 2894
},
{
"epoch": 0.8404703149949194,
"grad_norm": 3.2130281925201416,
"learning_rate": 9.623383646161108e-06,
"loss": 1.2244,
"step": 2895
},
{
"epoch": 0.8407606328930178,
"grad_norm": 3.3365936279296875,
"learning_rate": 9.623017960555226e-06,
"loss": 1.2363,
"step": 2896
},
{
"epoch": 0.8410509507911162,
"grad_norm": 3.4677717685699463,
"learning_rate": 9.622652104454274e-06,
"loss": 1.2702,
"step": 2897
},
{
"epoch": 0.8413412686892147,
"grad_norm": 3.4130473136901855,
"learning_rate": 9.622286077871748e-06,
"loss": 1.2962,
"step": 2898
},
{
"epoch": 0.8416315865873131,
"grad_norm": 3.2819225788116455,
"learning_rate": 9.621919880821145e-06,
"loss": 1.2152,
"step": 2899
},
{
"epoch": 0.8419219044854115,
"grad_norm": 3.008981227874756,
"learning_rate": 9.621553513315972e-06,
"loss": 1.0549,
"step": 2900
},
{
"epoch": 0.8422122223835099,
"grad_norm": 3.9223222732543945,
"learning_rate": 9.621186975369739e-06,
"loss": 1.0762,
"step": 2901
},
{
"epoch": 0.8425025402816083,
"grad_norm": 3.2732174396514893,
"learning_rate": 9.620820266995963e-06,
"loss": 1.174,
"step": 2902
},
{
"epoch": 0.8427928581797067,
"grad_norm": 3.5400829315185547,
"learning_rate": 9.620453388208171e-06,
"loss": 1.1838,
"step": 2903
},
{
"epoch": 0.8430831760778053,
"grad_norm": 3.2847681045532227,
"learning_rate": 9.620086339019892e-06,
"loss": 1.1586,
"step": 2904
},
{
"epoch": 0.8433734939759037,
"grad_norm": 3.5318374633789062,
"learning_rate": 9.619719119444662e-06,
"loss": 1.238,
"step": 2905
},
{
"epoch": 0.8436638118740021,
"grad_norm": 3.087456464767456,
"learning_rate": 9.619351729496022e-06,
"loss": 1.1586,
"step": 2906
},
{
"epoch": 0.8439541297721005,
"grad_norm": 3.138263702392578,
"learning_rate": 9.618984169187525e-06,
"loss": 1.0592,
"step": 2907
},
{
"epoch": 0.8442444476701989,
"grad_norm": 3.5749359130859375,
"learning_rate": 9.618616438532725e-06,
"loss": 1.3117,
"step": 2908
},
{
"epoch": 0.8445347655682973,
"grad_norm": 3.131622076034546,
"learning_rate": 9.618248537545182e-06,
"loss": 1.1527,
"step": 2909
},
{
"epoch": 0.8448250834663957,
"grad_norm": 3.2335894107818604,
"learning_rate": 9.617880466238468e-06,
"loss": 1.1853,
"step": 2910
},
{
"epoch": 0.8451154013644941,
"grad_norm": 3.500901222229004,
"learning_rate": 9.617512224626153e-06,
"loss": 1.2586,
"step": 2911
},
{
"epoch": 0.8454057192625926,
"grad_norm": 3.386972188949585,
"learning_rate": 9.61714381272182e-06,
"loss": 1.0863,
"step": 2912
},
{
"epoch": 0.845696037160691,
"grad_norm": 3.1666817665100098,
"learning_rate": 9.616775230539057e-06,
"loss": 1.1175,
"step": 2913
},
{
"epoch": 0.8459863550587894,
"grad_norm": 3.1548240184783936,
"learning_rate": 9.616406478091453e-06,
"loss": 1.2446,
"step": 2914
},
{
"epoch": 0.8462766729568878,
"grad_norm": 3.5241239070892334,
"learning_rate": 9.616037555392612e-06,
"loss": 1.2566,
"step": 2915
},
{
"epoch": 0.8465669908549862,
"grad_norm": 3.256432294845581,
"learning_rate": 9.615668462456138e-06,
"loss": 1.1985,
"step": 2916
},
{
"epoch": 0.8468573087530846,
"grad_norm": 3.100454807281494,
"learning_rate": 9.615299199295643e-06,
"loss": 1.0913,
"step": 2917
},
{
"epoch": 0.847147626651183,
"grad_norm": 3.2860195636749268,
"learning_rate": 9.614929765924743e-06,
"loss": 1.1788,
"step": 2918
},
{
"epoch": 0.8474379445492815,
"grad_norm": 3.776573419570923,
"learning_rate": 9.614560162357065e-06,
"loss": 1.1846,
"step": 2919
},
{
"epoch": 0.8477282624473799,
"grad_norm": 3.1679580211639404,
"learning_rate": 9.61419038860624e-06,
"loss": 1.1935,
"step": 2920
},
{
"epoch": 0.8480185803454783,
"grad_norm": 3.056650161743164,
"learning_rate": 9.613820444685905e-06,
"loss": 1.2031,
"step": 2921
},
{
"epoch": 0.8483088982435767,
"grad_norm": 3.017367362976074,
"learning_rate": 9.613450330609702e-06,
"loss": 1.0897,
"step": 2922
},
{
"epoch": 0.8485992161416751,
"grad_norm": 3.249253273010254,
"learning_rate": 9.613080046391283e-06,
"loss": 1.0954,
"step": 2923
},
{
"epoch": 0.8488895340397735,
"grad_norm": 3.4139556884765625,
"learning_rate": 9.612709592044302e-06,
"loss": 1.1066,
"step": 2924
},
{
"epoch": 0.8491798519378719,
"grad_norm": 2.860640048980713,
"learning_rate": 9.612338967582422e-06,
"loss": 1.0388,
"step": 2925
},
{
"epoch": 0.8494701698359703,
"grad_norm": 3.295013666152954,
"learning_rate": 9.61196817301931e-06,
"loss": 1.1332,
"step": 2926
},
{
"epoch": 0.8497604877340688,
"grad_norm": 3.217747449874878,
"learning_rate": 9.611597208368643e-06,
"loss": 1.1077,
"step": 2927
},
{
"epoch": 0.8500508056321672,
"grad_norm": 3.2518575191497803,
"learning_rate": 9.6112260736441e-06,
"loss": 1.1124,
"step": 2928
},
{
"epoch": 0.8503411235302656,
"grad_norm": 3.5065808296203613,
"learning_rate": 9.61085476885937e-06,
"loss": 1.4305,
"step": 2929
},
{
"epoch": 0.8506314414283641,
"grad_norm": 3.3835043907165527,
"learning_rate": 9.610483294028146e-06,
"loss": 1.1893,
"step": 2930
},
{
"epoch": 0.8509217593264625,
"grad_norm": 3.2871642112731934,
"learning_rate": 9.610111649164128e-06,
"loss": 0.9748,
"step": 2931
},
{
"epoch": 0.8512120772245609,
"grad_norm": 3.6779463291168213,
"learning_rate": 9.609739834281023e-06,
"loss": 1.1088,
"step": 2932
},
{
"epoch": 0.8515023951226594,
"grad_norm": 3.250479221343994,
"learning_rate": 9.609367849392538e-06,
"loss": 1.2176,
"step": 2933
},
{
"epoch": 0.8517927130207578,
"grad_norm": 3.5712833404541016,
"learning_rate": 9.6089956945124e-06,
"loss": 1.2416,
"step": 2934
},
{
"epoch": 0.8520830309188562,
"grad_norm": 3.58555269241333,
"learning_rate": 9.608623369654329e-06,
"loss": 1.2917,
"step": 2935
},
{
"epoch": 0.8523733488169546,
"grad_norm": 3.2484397888183594,
"learning_rate": 9.608250874832056e-06,
"loss": 1.2379,
"step": 2936
},
{
"epoch": 0.852663666715053,
"grad_norm": 3.329904556274414,
"learning_rate": 9.607878210059319e-06,
"loss": 1.1517,
"step": 2937
},
{
"epoch": 0.8529539846131514,
"grad_norm": 3.4330637454986572,
"learning_rate": 9.607505375349863e-06,
"loss": 1.1697,
"step": 2938
},
{
"epoch": 0.8532443025112498,
"grad_norm": 3.325636386871338,
"learning_rate": 9.607132370717438e-06,
"loss": 1.2163,
"step": 2939
},
{
"epoch": 0.8535346204093482,
"grad_norm": 3.112339973449707,
"learning_rate": 9.606759196175799e-06,
"loss": 1.1753,
"step": 2940
},
{
"epoch": 0.8538249383074467,
"grad_norm": 2.995211362838745,
"learning_rate": 9.606385851738709e-06,
"loss": 0.9425,
"step": 2941
},
{
"epoch": 0.8541152562055451,
"grad_norm": 3.3022618293762207,
"learning_rate": 9.606012337419935e-06,
"loss": 1.0678,
"step": 2942
},
{
"epoch": 0.8544055741036435,
"grad_norm": 3.29768967628479,
"learning_rate": 9.605638653233256e-06,
"loss": 1.0541,
"step": 2943
},
{
"epoch": 0.8546958920017419,
"grad_norm": 3.348756790161133,
"learning_rate": 9.605264799192451e-06,
"loss": 1.1323,
"step": 2944
},
{
"epoch": 0.8549862098998403,
"grad_norm": 3.145399808883667,
"learning_rate": 9.604890775311306e-06,
"loss": 1.1527,
"step": 2945
},
{
"epoch": 0.8552765277979387,
"grad_norm": 3.2217440605163574,
"learning_rate": 9.604516581603618e-06,
"loss": 1.1699,
"step": 2946
},
{
"epoch": 0.8555668456960371,
"grad_norm": 3.144026041030884,
"learning_rate": 9.604142218083186e-06,
"loss": 1.1709,
"step": 2947
},
{
"epoch": 0.8558571635941356,
"grad_norm": 3.078562021255493,
"learning_rate": 9.603767684763816e-06,
"loss": 1.0826,
"step": 2948
},
{
"epoch": 0.856147481492234,
"grad_norm": 3.4146084785461426,
"learning_rate": 9.60339298165932e-06,
"loss": 1.2019,
"step": 2949
},
{
"epoch": 0.8564377993903324,
"grad_norm": 3.4038820266723633,
"learning_rate": 9.603018108783518e-06,
"loss": 1.2677,
"step": 2950
},
{
"epoch": 0.8567281172884308,
"grad_norm": 2.9391040802001953,
"learning_rate": 9.602643066150235e-06,
"loss": 0.9942,
"step": 2951
},
{
"epoch": 0.8570184351865292,
"grad_norm": 3.679786443710327,
"learning_rate": 9.602267853773301e-06,
"loss": 1.3242,
"step": 2952
},
{
"epoch": 0.8573087530846276,
"grad_norm": 2.8453195095062256,
"learning_rate": 9.601892471666556e-06,
"loss": 0.9488,
"step": 2953
},
{
"epoch": 0.857599070982726,
"grad_norm": 3.4040963649749756,
"learning_rate": 9.601516919843843e-06,
"loss": 1.2333,
"step": 2954
},
{
"epoch": 0.8578893888808246,
"grad_norm": 2.9734036922454834,
"learning_rate": 9.601141198319013e-06,
"loss": 1.0074,
"step": 2955
},
{
"epoch": 0.858179706778923,
"grad_norm": 3.0131356716156006,
"learning_rate": 9.600765307105919e-06,
"loss": 1.1091,
"step": 2956
},
{
"epoch": 0.8584700246770214,
"grad_norm": 3.171550750732422,
"learning_rate": 9.60038924621843e-06,
"loss": 0.9531,
"step": 2957
},
{
"epoch": 0.8587603425751198,
"grad_norm": 3.324277639389038,
"learning_rate": 9.600013015670408e-06,
"loss": 1.3101,
"step": 2958
},
{
"epoch": 0.8590506604732182,
"grad_norm": 3.18428635597229,
"learning_rate": 9.599636615475731e-06,
"loss": 1.1184,
"step": 2959
},
{
"epoch": 0.8593409783713166,
"grad_norm": 3.0067083835601807,
"learning_rate": 9.599260045648281e-06,
"loss": 1.1813,
"step": 2960
},
{
"epoch": 0.859631296269415,
"grad_norm": 2.8176536560058594,
"learning_rate": 9.598883306201949e-06,
"loss": 1.062,
"step": 2961
},
{
"epoch": 0.8599216141675134,
"grad_norm": 3.1799442768096924,
"learning_rate": 9.598506397150623e-06,
"loss": 1.1755,
"step": 2962
},
{
"epoch": 0.8602119320656119,
"grad_norm": 2.9520862102508545,
"learning_rate": 9.598129318508207e-06,
"loss": 0.923,
"step": 2963
},
{
"epoch": 0.8605022499637103,
"grad_norm": 3.538482666015625,
"learning_rate": 9.597752070288607e-06,
"loss": 1.2052,
"step": 2964
},
{
"epoch": 0.8607925678618087,
"grad_norm": 3.4400877952575684,
"learning_rate": 9.597374652505733e-06,
"loss": 1.1748,
"step": 2965
},
{
"epoch": 0.8610828857599071,
"grad_norm": 3.192110300064087,
"learning_rate": 9.596997065173508e-06,
"loss": 1.1613,
"step": 2966
},
{
"epoch": 0.8613732036580055,
"grad_norm": 3.294027328491211,
"learning_rate": 9.596619308305855e-06,
"loss": 1.1743,
"step": 2967
},
{
"epoch": 0.8616635215561039,
"grad_norm": 3.0262019634246826,
"learning_rate": 9.596241381916704e-06,
"loss": 1.074,
"step": 2968
},
{
"epoch": 0.8619538394542023,
"grad_norm": 3.1539053916931152,
"learning_rate": 9.595863286019997e-06,
"loss": 1.2264,
"step": 2969
},
{
"epoch": 0.8622441573523008,
"grad_norm": 2.9892208576202393,
"learning_rate": 9.595485020629676e-06,
"loss": 1.0432,
"step": 2970
},
{
"epoch": 0.8625344752503992,
"grad_norm": 3.0038716793060303,
"learning_rate": 9.59510658575969e-06,
"loss": 1.0812,
"step": 2971
},
{
"epoch": 0.8628247931484976,
"grad_norm": 3.4315454959869385,
"learning_rate": 9.594727981423998e-06,
"loss": 1.2797,
"step": 2972
},
{
"epoch": 0.863115111046596,
"grad_norm": 3.2693030834198,
"learning_rate": 9.594349207636559e-06,
"loss": 1.1986,
"step": 2973
},
{
"epoch": 0.8634054289446944,
"grad_norm": 3.197600841522217,
"learning_rate": 9.593970264411348e-06,
"loss": 1.1726,
"step": 2974
},
{
"epoch": 0.8636957468427928,
"grad_norm": 3.848891496658325,
"learning_rate": 9.593591151762334e-06,
"loss": 1.1903,
"step": 2975
},
{
"epoch": 0.8639860647408912,
"grad_norm": 3.898817539215088,
"learning_rate": 9.593211869703503e-06,
"loss": 1.145,
"step": 2976
},
{
"epoch": 0.8642763826389896,
"grad_norm": 3.280470609664917,
"learning_rate": 9.592832418248838e-06,
"loss": 1.2771,
"step": 2977
},
{
"epoch": 0.8645667005370881,
"grad_norm": 2.8223423957824707,
"learning_rate": 9.59245279741234e-06,
"loss": 1.035,
"step": 2978
},
{
"epoch": 0.8648570184351865,
"grad_norm": 3.2701332569122314,
"learning_rate": 9.592073007208003e-06,
"loss": 1.3028,
"step": 2979
},
{
"epoch": 0.865147336333285,
"grad_norm": 3.103128671646118,
"learning_rate": 9.591693047649834e-06,
"loss": 1.1035,
"step": 2980
},
{
"epoch": 0.8654376542313834,
"grad_norm": 3.201188802719116,
"learning_rate": 9.591312918751852e-06,
"loss": 1.176,
"step": 2981
},
{
"epoch": 0.8657279721294818,
"grad_norm": 3.016108274459839,
"learning_rate": 9.590932620528068e-06,
"loss": 1.0289,
"step": 2982
},
{
"epoch": 0.8660182900275802,
"grad_norm": 3.240518093109131,
"learning_rate": 9.590552152992512e-06,
"loss": 1.1196,
"step": 2983
},
{
"epoch": 0.8663086079256787,
"grad_norm": 3.302276134490967,
"learning_rate": 9.590171516159214e-06,
"loss": 1.2784,
"step": 2984
},
{
"epoch": 0.8665989258237771,
"grad_norm": 3.3650875091552734,
"learning_rate": 9.589790710042212e-06,
"loss": 1.2402,
"step": 2985
},
{
"epoch": 0.8668892437218755,
"grad_norm": 3.414092779159546,
"learning_rate": 9.589409734655553e-06,
"loss": 1.2323,
"step": 2986
},
{
"epoch": 0.8671795616199739,
"grad_norm": 3.1558945178985596,
"learning_rate": 9.58902859001328e-06,
"loss": 1.0965,
"step": 2987
},
{
"epoch": 0.8674698795180723,
"grad_norm": 3.403278350830078,
"learning_rate": 9.588647276129456e-06,
"loss": 1.1815,
"step": 2988
},
{
"epoch": 0.8677601974161707,
"grad_norm": 2.8990426063537598,
"learning_rate": 9.588265793018141e-06,
"loss": 1.0713,
"step": 2989
},
{
"epoch": 0.8680505153142691,
"grad_norm": 3.296391248703003,
"learning_rate": 9.587884140693404e-06,
"loss": 1.146,
"step": 2990
},
{
"epoch": 0.8683408332123675,
"grad_norm": 3.0492796897888184,
"learning_rate": 9.58750231916932e-06,
"loss": 1.0286,
"step": 2991
},
{
"epoch": 0.868631151110466,
"grad_norm": 3.2753119468688965,
"learning_rate": 9.587120328459973e-06,
"loss": 1.0991,
"step": 2992
},
{
"epoch": 0.8689214690085644,
"grad_norm": 2.943715810775757,
"learning_rate": 9.586738168579446e-06,
"loss": 1.0901,
"step": 2993
},
{
"epoch": 0.8692117869066628,
"grad_norm": 3.236210584640503,
"learning_rate": 9.586355839541836e-06,
"loss": 1.3409,
"step": 2994
},
{
"epoch": 0.8695021048047612,
"grad_norm": 3.17950177192688,
"learning_rate": 9.585973341361244e-06,
"loss": 1.2406,
"step": 2995
},
{
"epoch": 0.8697924227028596,
"grad_norm": 2.9284613132476807,
"learning_rate": 9.585590674051775e-06,
"loss": 1.0142,
"step": 2996
},
{
"epoch": 0.870082740600958,
"grad_norm": 3.4473886489868164,
"learning_rate": 9.585207837627541e-06,
"loss": 1.3138,
"step": 2997
},
{
"epoch": 0.8703730584990564,
"grad_norm": 3.099240303039551,
"learning_rate": 9.58482483210266e-06,
"loss": 1.1775,
"step": 2998
},
{
"epoch": 0.8706633763971549,
"grad_norm": 3.1252505779266357,
"learning_rate": 9.584441657491263e-06,
"loss": 1.0392,
"step": 2999
},
{
"epoch": 0.8709536942952533,
"grad_norm": 3.072007417678833,
"learning_rate": 9.584058313807474e-06,
"loss": 1.0797,
"step": 3000
},
{
"epoch": 0.8709536942952533,
"eval_loss": 1.1775367259979248,
"eval_runtime": 11.589,
"eval_samples_per_second": 34.516,
"eval_steps_per_second": 4.314,
"step": 3000
},
{
"epoch": 0.8712440121933517,
"grad_norm": 3.092594861984253,
"learning_rate": 9.583674801065433e-06,
"loss": 1.1061,
"step": 3001
},
{
"epoch": 0.8715343300914501,
"grad_norm": 3.2414965629577637,
"learning_rate": 9.583291119279285e-06,
"loss": 1.0196,
"step": 3002
},
{
"epoch": 0.8718246479895485,
"grad_norm": 3.3458807468414307,
"learning_rate": 9.58290726846318e-06,
"loss": 1.269,
"step": 3003
},
{
"epoch": 0.8721149658876469,
"grad_norm": 3.083974838256836,
"learning_rate": 9.582523248631273e-06,
"loss": 1.1124,
"step": 3004
},
{
"epoch": 0.8724052837857454,
"grad_norm": 2.8129920959472656,
"learning_rate": 9.582139059797728e-06,
"loss": 1.0657,
"step": 3005
},
{
"epoch": 0.8726956016838439,
"grad_norm": 3.2248311042785645,
"learning_rate": 9.581754701976711e-06,
"loss": 1.2258,
"step": 3006
},
{
"epoch": 0.8729859195819423,
"grad_norm": 2.996952533721924,
"learning_rate": 9.581370175182401e-06,
"loss": 1.1067,
"step": 3007
},
{
"epoch": 0.8732762374800407,
"grad_norm": 3.218592643737793,
"learning_rate": 9.580985479428975e-06,
"loss": 1.0454,
"step": 3008
},
{
"epoch": 0.8735665553781391,
"grad_norm": 3.3797225952148438,
"learning_rate": 9.580600614730624e-06,
"loss": 1.1807,
"step": 3009
},
{
"epoch": 0.8738568732762375,
"grad_norm": 3.1415364742279053,
"learning_rate": 9.580215581101539e-06,
"loss": 1.1201,
"step": 3010
},
{
"epoch": 0.8741471911743359,
"grad_norm": 3.2598962783813477,
"learning_rate": 9.57983037855592e-06,
"loss": 1.1755,
"step": 3011
},
{
"epoch": 0.8744375090724343,
"grad_norm": 3.2180087566375732,
"learning_rate": 9.579445007107977e-06,
"loss": 1.2463,
"step": 3012
},
{
"epoch": 0.8747278269705328,
"grad_norm": 3.349390983581543,
"learning_rate": 9.579059466771918e-06,
"loss": 1.1918,
"step": 3013
},
{
"epoch": 0.8750181448686312,
"grad_norm": 3.22566819190979,
"learning_rate": 9.578673757561963e-06,
"loss": 1.1867,
"step": 3014
},
{
"epoch": 0.8753084627667296,
"grad_norm": 3.3200433254241943,
"learning_rate": 9.578287879492336e-06,
"loss": 1.0604,
"step": 3015
},
{
"epoch": 0.875598780664828,
"grad_norm": 2.9759771823883057,
"learning_rate": 9.577901832577269e-06,
"loss": 1.0893,
"step": 3016
},
{
"epoch": 0.8758890985629264,
"grad_norm": 3.5478708744049072,
"learning_rate": 9.577515616831e-06,
"loss": 1.231,
"step": 3017
},
{
"epoch": 0.8761794164610248,
"grad_norm": 3.2979137897491455,
"learning_rate": 9.577129232267772e-06,
"loss": 1.1449,
"step": 3018
},
{
"epoch": 0.8764697343591232,
"grad_norm": 3.123936653137207,
"learning_rate": 9.576742678901833e-06,
"loss": 1.1683,
"step": 3019
},
{
"epoch": 0.8767600522572216,
"grad_norm": 3.3888375759124756,
"learning_rate": 9.57635595674744e-06,
"loss": 1.2465,
"step": 3020
},
{
"epoch": 0.8770503701553201,
"grad_norm": 2.825896739959717,
"learning_rate": 9.575969065818856e-06,
"loss": 1.0497,
"step": 3021
},
{
"epoch": 0.8773406880534185,
"grad_norm": 3.0169923305511475,
"learning_rate": 9.57558200613035e-06,
"loss": 1.0137,
"step": 3022
},
{
"epoch": 0.8776310059515169,
"grad_norm": 3.445631265640259,
"learning_rate": 9.575194777696194e-06,
"loss": 1.1816,
"step": 3023
},
{
"epoch": 0.8779213238496153,
"grad_norm": 2.809177875518799,
"learning_rate": 9.57480738053067e-06,
"loss": 1.1858,
"step": 3024
},
{
"epoch": 0.8782116417477137,
"grad_norm": 3.311002254486084,
"learning_rate": 9.574419814648065e-06,
"loss": 1.2344,
"step": 3025
},
{
"epoch": 0.8785019596458121,
"grad_norm": 2.9318954944610596,
"learning_rate": 9.574032080062673e-06,
"loss": 1.1236,
"step": 3026
},
{
"epoch": 0.8787922775439105,
"grad_norm": 3.338117837905884,
"learning_rate": 9.573644176788795e-06,
"loss": 1.272,
"step": 3027
},
{
"epoch": 0.879082595442009,
"grad_norm": 3.30912446975708,
"learning_rate": 9.573256104840732e-06,
"loss": 1.1346,
"step": 3028
},
{
"epoch": 0.8793729133401074,
"grad_norm": 3.140470027923584,
"learning_rate": 9.572867864232799e-06,
"loss": 1.1724,
"step": 3029
},
{
"epoch": 0.8796632312382059,
"grad_norm": 3.1311466693878174,
"learning_rate": 9.572479454979315e-06,
"loss": 1.0638,
"step": 3030
},
{
"epoch": 0.8799535491363043,
"grad_norm": 3.1193671226501465,
"learning_rate": 9.572090877094604e-06,
"loss": 1.2142,
"step": 3031
},
{
"epoch": 0.8802438670344027,
"grad_norm": 3.0533499717712402,
"learning_rate": 9.571702130592994e-06,
"loss": 1.2326,
"step": 3032
},
{
"epoch": 0.8805341849325011,
"grad_norm": 3.523092269897461,
"learning_rate": 9.571313215488824e-06,
"loss": 1.0997,
"step": 3033
},
{
"epoch": 0.8808245028305995,
"grad_norm": 3.402045726776123,
"learning_rate": 9.570924131796437e-06,
"loss": 1.06,
"step": 3034
},
{
"epoch": 0.881114820728698,
"grad_norm": 3.0997350215911865,
"learning_rate": 9.570534879530182e-06,
"loss": 1.0053,
"step": 3035
},
{
"epoch": 0.8814051386267964,
"grad_norm": 2.9039306640625,
"learning_rate": 9.570145458704416e-06,
"loss": 1.0801,
"step": 3036
},
{
"epoch": 0.8816954565248948,
"grad_norm": 3.0941872596740723,
"learning_rate": 9.569755869333497e-06,
"loss": 1.16,
"step": 3037
},
{
"epoch": 0.8819857744229932,
"grad_norm": 3.2002017498016357,
"learning_rate": 9.569366111431794e-06,
"loss": 1.2813,
"step": 3038
},
{
"epoch": 0.8822760923210916,
"grad_norm": 3.665795087814331,
"learning_rate": 9.568976185013685e-06,
"loss": 1.3266,
"step": 3039
},
{
"epoch": 0.88256641021919,
"grad_norm": 3.3414106369018555,
"learning_rate": 9.568586090093545e-06,
"loss": 1.1968,
"step": 3040
},
{
"epoch": 0.8828567281172884,
"grad_norm": 3.1864659786224365,
"learning_rate": 9.568195826685765e-06,
"loss": 1.2351,
"step": 3041
},
{
"epoch": 0.8831470460153868,
"grad_norm": 3.338440179824829,
"learning_rate": 9.567805394804734e-06,
"loss": 1.1602,
"step": 3042
},
{
"epoch": 0.8834373639134853,
"grad_norm": 3.411781072616577,
"learning_rate": 9.567414794464854e-06,
"loss": 1.2741,
"step": 3043
},
{
"epoch": 0.8837276818115837,
"grad_norm": 2.922380208969116,
"learning_rate": 9.567024025680529e-06,
"loss": 1.0612,
"step": 3044
},
{
"epoch": 0.8840179997096821,
"grad_norm": 3.3472232818603516,
"learning_rate": 9.566633088466169e-06,
"loss": 1.0968,
"step": 3045
},
{
"epoch": 0.8843083176077805,
"grad_norm": 3.23529052734375,
"learning_rate": 9.566241982836193e-06,
"loss": 1.303,
"step": 3046
},
{
"epoch": 0.8845986355058789,
"grad_norm": 3.1247169971466064,
"learning_rate": 9.565850708805025e-06,
"loss": 1.2335,
"step": 3047
},
{
"epoch": 0.8848889534039773,
"grad_norm": 3.1896188259124756,
"learning_rate": 9.565459266387096e-06,
"loss": 1.2399,
"step": 3048
},
{
"epoch": 0.8851792713020757,
"grad_norm": 3.411284923553467,
"learning_rate": 9.56506765559684e-06,
"loss": 1.3471,
"step": 3049
},
{
"epoch": 0.8854695892001742,
"grad_norm": 3.114387273788452,
"learning_rate": 9.5646758764487e-06,
"loss": 1.195,
"step": 3050
},
{
"epoch": 0.8857599070982726,
"grad_norm": 3.2049310207366943,
"learning_rate": 9.564283928957126e-06,
"loss": 1.157,
"step": 3051
},
{
"epoch": 0.886050224996371,
"grad_norm": 3.156636953353882,
"learning_rate": 9.563891813136571e-06,
"loss": 1.1504,
"step": 3052
},
{
"epoch": 0.8863405428944694,
"grad_norm": 3.385990619659424,
"learning_rate": 9.563499529001498e-06,
"loss": 1.1591,
"step": 3053
},
{
"epoch": 0.8866308607925678,
"grad_norm": 3.049511671066284,
"learning_rate": 9.563107076566373e-06,
"loss": 1.1171,
"step": 3054
},
{
"epoch": 0.8869211786906663,
"grad_norm": 3.1001222133636475,
"learning_rate": 9.56271445584567e-06,
"loss": 1.0276,
"step": 3055
},
{
"epoch": 0.8872114965887647,
"grad_norm": 3.2549166679382324,
"learning_rate": 9.562321666853868e-06,
"loss": 1.1241,
"step": 3056
},
{
"epoch": 0.8875018144868632,
"grad_norm": 3.0443809032440186,
"learning_rate": 9.561928709605454e-06,
"loss": 1.0743,
"step": 3057
},
{
"epoch": 0.8877921323849616,
"grad_norm": 3.459932804107666,
"learning_rate": 9.561535584114919e-06,
"loss": 1.1445,
"step": 3058
},
{
"epoch": 0.88808245028306,
"grad_norm": 2.758932113647461,
"learning_rate": 9.561142290396763e-06,
"loss": 1.0656,
"step": 3059
},
{
"epoch": 0.8883727681811584,
"grad_norm": 2.894343852996826,
"learning_rate": 9.560748828465486e-06,
"loss": 1.1935,
"step": 3060
},
{
"epoch": 0.8886630860792568,
"grad_norm": 2.8865163326263428,
"learning_rate": 9.560355198335607e-06,
"loss": 0.9562,
"step": 3061
},
{
"epoch": 0.8889534039773552,
"grad_norm": 3.2808666229248047,
"learning_rate": 9.559961400021636e-06,
"loss": 1.0705,
"step": 3062
},
{
"epoch": 0.8892437218754536,
"grad_norm": 3.1613757610321045,
"learning_rate": 9.559567433538097e-06,
"loss": 1.1494,
"step": 3063
},
{
"epoch": 0.889534039773552,
"grad_norm": 3.128833532333374,
"learning_rate": 9.55917329889952e-06,
"loss": 1.1202,
"step": 3064
},
{
"epoch": 0.8898243576716505,
"grad_norm": 3.2559049129486084,
"learning_rate": 9.558778996120443e-06,
"loss": 1.2322,
"step": 3065
},
{
"epoch": 0.8901146755697489,
"grad_norm": 3.2830514907836914,
"learning_rate": 9.558384525215406e-06,
"loss": 1.2362,
"step": 3066
},
{
"epoch": 0.8904049934678473,
"grad_norm": 3.1671226024627686,
"learning_rate": 9.557989886198955e-06,
"loss": 1.3601,
"step": 3067
},
{
"epoch": 0.8906953113659457,
"grad_norm": 3.2132253646850586,
"learning_rate": 9.557595079085646e-06,
"loss": 0.9999,
"step": 3068
},
{
"epoch": 0.8909856292640441,
"grad_norm": 2.914524555206299,
"learning_rate": 9.557200103890038e-06,
"loss": 0.9415,
"step": 3069
},
{
"epoch": 0.8912759471621425,
"grad_norm": 3.0425221920013428,
"learning_rate": 9.556804960626702e-06,
"loss": 1.1311,
"step": 3070
},
{
"epoch": 0.891566265060241,
"grad_norm": 3.347184658050537,
"learning_rate": 9.556409649310206e-06,
"loss": 1.1673,
"step": 3071
},
{
"epoch": 0.8918565829583394,
"grad_norm": 3.4314563274383545,
"learning_rate": 9.556014169955128e-06,
"loss": 1.2945,
"step": 3072
},
{
"epoch": 0.8921469008564378,
"grad_norm": 2.9853997230529785,
"learning_rate": 9.555618522576058e-06,
"loss": 1.0987,
"step": 3073
},
{
"epoch": 0.8924372187545362,
"grad_norm": 3.1625750064849854,
"learning_rate": 9.555222707187584e-06,
"loss": 1.0362,
"step": 3074
},
{
"epoch": 0.8927275366526346,
"grad_norm": 3.226891279220581,
"learning_rate": 9.554826723804304e-06,
"loss": 1.2553,
"step": 3075
},
{
"epoch": 0.893017854550733,
"grad_norm": 3.2344210147857666,
"learning_rate": 9.554430572440822e-06,
"loss": 1.1399,
"step": 3076
},
{
"epoch": 0.8933081724488314,
"grad_norm": 3.3998959064483643,
"learning_rate": 9.554034253111747e-06,
"loss": 1.2145,
"step": 3077
},
{
"epoch": 0.8935984903469298,
"grad_norm": 3.6094846725463867,
"learning_rate": 9.553637765831697e-06,
"loss": 1.2089,
"step": 3078
},
{
"epoch": 0.8938888082450283,
"grad_norm": 2.996131181716919,
"learning_rate": 9.553241110615294e-06,
"loss": 1.0733,
"step": 3079
},
{
"epoch": 0.8941791261431268,
"grad_norm": 3.7459475994110107,
"learning_rate": 9.552844287477165e-06,
"loss": 1.3399,
"step": 3080
},
{
"epoch": 0.8944694440412252,
"grad_norm": 3.1052403450012207,
"learning_rate": 9.552447296431945e-06,
"loss": 1.1049,
"step": 3081
},
{
"epoch": 0.8947597619393236,
"grad_norm": 3.407588005065918,
"learning_rate": 9.552050137494275e-06,
"loss": 1.2035,
"step": 3082
},
{
"epoch": 0.895050079837422,
"grad_norm": 3.0574097633361816,
"learning_rate": 9.551652810678804e-06,
"loss": 1.0939,
"step": 3083
},
{
"epoch": 0.8953403977355204,
"grad_norm": 3.173433780670166,
"learning_rate": 9.551255316000183e-06,
"loss": 1.1121,
"step": 3084
},
{
"epoch": 0.8956307156336188,
"grad_norm": 3.04433274269104,
"learning_rate": 9.550857653473072e-06,
"loss": 1.0842,
"step": 3085
},
{
"epoch": 0.8959210335317173,
"grad_norm": 2.9734885692596436,
"learning_rate": 9.550459823112134e-06,
"loss": 1.0842,
"step": 3086
},
{
"epoch": 0.8962113514298157,
"grad_norm": 3.3427157402038574,
"learning_rate": 9.550061824932047e-06,
"loss": 1.1935,
"step": 3087
},
{
"epoch": 0.8965016693279141,
"grad_norm": 3.2677273750305176,
"learning_rate": 9.549663658947484e-06,
"loss": 1.2635,
"step": 3088
},
{
"epoch": 0.8967919872260125,
"grad_norm": 3.1517832279205322,
"learning_rate": 9.549265325173132e-06,
"loss": 1.3644,
"step": 3089
},
{
"epoch": 0.8970823051241109,
"grad_norm": 3.031965732574463,
"learning_rate": 9.548866823623679e-06,
"loss": 1.1241,
"step": 3090
},
{
"epoch": 0.8973726230222093,
"grad_norm": 3.4026827812194824,
"learning_rate": 9.548468154313822e-06,
"loss": 1.2084,
"step": 3091
},
{
"epoch": 0.8976629409203077,
"grad_norm": 3.157986879348755,
"learning_rate": 9.548069317258267e-06,
"loss": 1.016,
"step": 3092
},
{
"epoch": 0.8979532588184062,
"grad_norm": 3.4387762546539307,
"learning_rate": 9.547670312471718e-06,
"loss": 1.2204,
"step": 3093
},
{
"epoch": 0.8982435767165046,
"grad_norm": 3.1353819370269775,
"learning_rate": 9.547271139968893e-06,
"loss": 1.1181,
"step": 3094
},
{
"epoch": 0.898533894614603,
"grad_norm": 3.1333255767822266,
"learning_rate": 9.546871799764513e-06,
"loss": 1.2261,
"step": 3095
},
{
"epoch": 0.8988242125127014,
"grad_norm": 3.0457921028137207,
"learning_rate": 9.546472291873306e-06,
"loss": 1.0156,
"step": 3096
},
{
"epoch": 0.8991145304107998,
"grad_norm": 3.1292712688446045,
"learning_rate": 9.546072616310005e-06,
"loss": 1.0354,
"step": 3097
},
{
"epoch": 0.8994048483088982,
"grad_norm": 3.471691131591797,
"learning_rate": 9.54567277308935e-06,
"loss": 1.21,
"step": 3098
},
{
"epoch": 0.8996951662069966,
"grad_norm": 3.4814560413360596,
"learning_rate": 9.545272762226086e-06,
"loss": 1.2114,
"step": 3099
},
{
"epoch": 0.899985484105095,
"grad_norm": 3.2234396934509277,
"learning_rate": 9.544872583734967e-06,
"loss": 1.1872,
"step": 3100
},
{
"epoch": 0.9002758020031935,
"grad_norm": 3.178117275238037,
"learning_rate": 9.544472237630751e-06,
"loss": 1.0513,
"step": 3101
},
{
"epoch": 0.9005661199012919,
"grad_norm": 3.4485244750976562,
"learning_rate": 9.544071723928202e-06,
"loss": 1.3207,
"step": 3102
},
{
"epoch": 0.9008564377993903,
"grad_norm": 3.10819935798645,
"learning_rate": 9.54367104264209e-06,
"loss": 1.077,
"step": 3103
},
{
"epoch": 0.9011467556974887,
"grad_norm": 3.2871968746185303,
"learning_rate": 9.543270193787195e-06,
"loss": 1.1986,
"step": 3104
},
{
"epoch": 0.9014370735955871,
"grad_norm": 3.138451099395752,
"learning_rate": 9.542869177378298e-06,
"loss": 1.0721,
"step": 3105
},
{
"epoch": 0.9017273914936856,
"grad_norm": 3.0248279571533203,
"learning_rate": 9.542467993430189e-06,
"loss": 0.989,
"step": 3106
},
{
"epoch": 0.902017709391784,
"grad_norm": 2.8113856315612793,
"learning_rate": 9.542066641957661e-06,
"loss": 1.0949,
"step": 3107
},
{
"epoch": 0.9023080272898825,
"grad_norm": 2.728372573852539,
"learning_rate": 9.54166512297552e-06,
"loss": 0.9767,
"step": 3108
},
{
"epoch": 0.9025983451879809,
"grad_norm": 3.231879472732544,
"learning_rate": 9.541263436498568e-06,
"loss": 1.2046,
"step": 3109
},
{
"epoch": 0.9028886630860793,
"grad_norm": 3.1025683879852295,
"learning_rate": 9.540861582541624e-06,
"loss": 1.1099,
"step": 3110
},
{
"epoch": 0.9031789809841777,
"grad_norm": 3.0091891288757324,
"learning_rate": 9.540459561119508e-06,
"loss": 1.1656,
"step": 3111
},
{
"epoch": 0.9034692988822761,
"grad_norm": 3.297088861465454,
"learning_rate": 9.540057372247044e-06,
"loss": 1.0799,
"step": 3112
},
{
"epoch": 0.9037596167803745,
"grad_norm": 3.128406286239624,
"learning_rate": 9.539655015939068e-06,
"loss": 1.0659,
"step": 3113
},
{
"epoch": 0.9040499346784729,
"grad_norm": 3.099379777908325,
"learning_rate": 9.539252492210416e-06,
"loss": 1.1781,
"step": 3114
},
{
"epoch": 0.9043402525765714,
"grad_norm": 3.0667364597320557,
"learning_rate": 9.538849801075931e-06,
"loss": 1.0704,
"step": 3115
},
{
"epoch": 0.9046305704746698,
"grad_norm": 2.9172818660736084,
"learning_rate": 9.538446942550468e-06,
"loss": 0.9518,
"step": 3116
},
{
"epoch": 0.9049208883727682,
"grad_norm": 3.077747106552124,
"learning_rate": 9.538043916648884e-06,
"loss": 1.0487,
"step": 3117
},
{
"epoch": 0.9052112062708666,
"grad_norm": 3.0355618000030518,
"learning_rate": 9.53764072338604e-06,
"loss": 1.0977,
"step": 3118
},
{
"epoch": 0.905501524168965,
"grad_norm": 3.0987133979797363,
"learning_rate": 9.537237362776805e-06,
"loss": 1.2059,
"step": 3119
},
{
"epoch": 0.9057918420670634,
"grad_norm": 3.300485134124756,
"learning_rate": 9.53683383483606e-06,
"loss": 1.392,
"step": 3120
},
{
"epoch": 0.9060821599651618,
"grad_norm": 3.3400747776031494,
"learning_rate": 9.536430139578683e-06,
"loss": 1.251,
"step": 3121
},
{
"epoch": 0.9063724778632603,
"grad_norm": 3.356792688369751,
"learning_rate": 9.536026277019562e-06,
"loss": 1.3177,
"step": 3122
},
{
"epoch": 0.9066627957613587,
"grad_norm": 3.4476516246795654,
"learning_rate": 9.53562224717359e-06,
"loss": 1.2698,
"step": 3123
},
{
"epoch": 0.9069531136594571,
"grad_norm": 3.273559808731079,
"learning_rate": 9.535218050055672e-06,
"loss": 1.0991,
"step": 3124
},
{
"epoch": 0.9072434315575555,
"grad_norm": 3.0915908813476562,
"learning_rate": 9.53481368568071e-06,
"loss": 1.2781,
"step": 3125
},
{
"epoch": 0.9075337494556539,
"grad_norm": 3.1454083919525146,
"learning_rate": 9.53440915406362e-06,
"loss": 1.1556,
"step": 3126
},
{
"epoch": 0.9078240673537523,
"grad_norm": 3.109560966491699,
"learning_rate": 9.53400445521932e-06,
"loss": 0.9902,
"step": 3127
},
{
"epoch": 0.9081143852518507,
"grad_norm": 3.815458059310913,
"learning_rate": 9.533599589162735e-06,
"loss": 1.209,
"step": 3128
},
{
"epoch": 0.9084047031499491,
"grad_norm": 3.4106128215789795,
"learning_rate": 9.533194555908796e-06,
"loss": 1.2336,
"step": 3129
},
{
"epoch": 0.9086950210480476,
"grad_norm": 3.6380088329315186,
"learning_rate": 9.532789355472441e-06,
"loss": 1.3134,
"step": 3130
},
{
"epoch": 0.9089853389461461,
"grad_norm": 2.9199140071868896,
"learning_rate": 9.532383987868615e-06,
"loss": 1.0422,
"step": 3131
},
{
"epoch": 0.9092756568442445,
"grad_norm": 3.188913583755493,
"learning_rate": 9.531978453112263e-06,
"loss": 1.0525,
"step": 3132
},
{
"epoch": 0.9095659747423429,
"grad_norm": 3.872431516647339,
"learning_rate": 9.531572751218346e-06,
"loss": 1.2834,
"step": 3133
},
{
"epoch": 0.9098562926404413,
"grad_norm": 3.17043399810791,
"learning_rate": 9.531166882201823e-06,
"loss": 1.148,
"step": 3134
},
{
"epoch": 0.9101466105385397,
"grad_norm": 3.4306373596191406,
"learning_rate": 9.530760846077664e-06,
"loss": 1.0991,
"step": 3135
},
{
"epoch": 0.9104369284366381,
"grad_norm": 3.189354658126831,
"learning_rate": 9.530354642860845e-06,
"loss": 1.2444,
"step": 3136
},
{
"epoch": 0.9107272463347366,
"grad_norm": 3.085293769836426,
"learning_rate": 9.52994827256634e-06,
"loss": 1.2831,
"step": 3137
},
{
"epoch": 0.911017564232835,
"grad_norm": 3.2537155151367188,
"learning_rate": 9.529541735209145e-06,
"loss": 1.2515,
"step": 3138
},
{
"epoch": 0.9113078821309334,
"grad_norm": 3.4304065704345703,
"learning_rate": 9.529135030804246e-06,
"loss": 1.3192,
"step": 3139
},
{
"epoch": 0.9115982000290318,
"grad_norm": 3.0350377559661865,
"learning_rate": 9.528728159366644e-06,
"loss": 1.1985,
"step": 3140
},
{
"epoch": 0.9118885179271302,
"grad_norm": 3.5521934032440186,
"learning_rate": 9.528321120911345e-06,
"loss": 1.3126,
"step": 3141
},
{
"epoch": 0.9121788358252286,
"grad_norm": 3.580925941467285,
"learning_rate": 9.527913915453361e-06,
"loss": 1.2,
"step": 3142
},
{
"epoch": 0.912469153723327,
"grad_norm": 3.1894161701202393,
"learning_rate": 9.52750654300771e-06,
"loss": 1.2416,
"step": 3143
},
{
"epoch": 0.9127594716214255,
"grad_norm": 3.018322229385376,
"learning_rate": 9.52709900358941e-06,
"loss": 1.1492,
"step": 3144
},
{
"epoch": 0.9130497895195239,
"grad_norm": 3.544252634048462,
"learning_rate": 9.526691297213499e-06,
"loss": 1.2548,
"step": 3145
},
{
"epoch": 0.9133401074176223,
"grad_norm": 3.4180855751037598,
"learning_rate": 9.526283423895008e-06,
"loss": 1.3203,
"step": 3146
},
{
"epoch": 0.9136304253157207,
"grad_norm": 3.4566452503204346,
"learning_rate": 9.525875383648982e-06,
"loss": 1.1988,
"step": 3147
},
{
"epoch": 0.9139207432138191,
"grad_norm": 3.160930871963501,
"learning_rate": 9.525467176490467e-06,
"loss": 1.1696,
"step": 3148
},
{
"epoch": 0.9142110611119175,
"grad_norm": 3.328986167907715,
"learning_rate": 9.525058802434518e-06,
"loss": 1.2203,
"step": 3149
},
{
"epoch": 0.9145013790100159,
"grad_norm": 3.3570051193237305,
"learning_rate": 9.524650261496195e-06,
"loss": 1.1992,
"step": 3150
},
{
"epoch": 0.9147916969081143,
"grad_norm": 3.1143946647644043,
"learning_rate": 9.524241553690567e-06,
"loss": 1.0589,
"step": 3151
},
{
"epoch": 0.9150820148062128,
"grad_norm": 2.998553514480591,
"learning_rate": 9.523832679032705e-06,
"loss": 1.0533,
"step": 3152
},
{
"epoch": 0.9153723327043112,
"grad_norm": 3.413071632385254,
"learning_rate": 9.52342363753769e-06,
"loss": 1.2558,
"step": 3153
},
{
"epoch": 0.9156626506024096,
"grad_norm": 3.0415122509002686,
"learning_rate": 9.523014429220607e-06,
"loss": 1.1888,
"step": 3154
},
{
"epoch": 0.915952968500508,
"grad_norm": 3.035825490951538,
"learning_rate": 9.522605054096545e-06,
"loss": 1.018,
"step": 3155
},
{
"epoch": 0.9162432863986065,
"grad_norm": 3.2089812755584717,
"learning_rate": 9.522195512180606e-06,
"loss": 1.1775,
"step": 3156
},
{
"epoch": 0.9165336042967049,
"grad_norm": 3.3788814544677734,
"learning_rate": 9.521785803487888e-06,
"loss": 1.1407,
"step": 3157
},
{
"epoch": 0.9168239221948034,
"grad_norm": 3.256770133972168,
"learning_rate": 9.521375928033505e-06,
"loss": 1.2715,
"step": 3158
},
{
"epoch": 0.9171142400929018,
"grad_norm": 3.437924861907959,
"learning_rate": 9.520965885832574e-06,
"loss": 1.1269,
"step": 3159
},
{
"epoch": 0.9174045579910002,
"grad_norm": 3.3418171405792236,
"learning_rate": 9.520555676900214e-06,
"loss": 1.1122,
"step": 3160
},
{
"epoch": 0.9176948758890986,
"grad_norm": 3.2611937522888184,
"learning_rate": 9.520145301251554e-06,
"loss": 1.0641,
"step": 3161
},
{
"epoch": 0.917985193787197,
"grad_norm": 3.1774210929870605,
"learning_rate": 9.519734758901728e-06,
"loss": 1.1638,
"step": 3162
},
{
"epoch": 0.9182755116852954,
"grad_norm": 3.2918379306793213,
"learning_rate": 9.51932404986588e-06,
"loss": 1.2033,
"step": 3163
},
{
"epoch": 0.9185658295833938,
"grad_norm": 3.268033981323242,
"learning_rate": 9.518913174159153e-06,
"loss": 1.0939,
"step": 3164
},
{
"epoch": 0.9188561474814922,
"grad_norm": 3.0575218200683594,
"learning_rate": 9.518502131796701e-06,
"loss": 1.0925,
"step": 3165
},
{
"epoch": 0.9191464653795907,
"grad_norm": 3.339613914489746,
"learning_rate": 9.518090922793685e-06,
"loss": 1.2114,
"step": 3166
},
{
"epoch": 0.9194367832776891,
"grad_norm": 3.2413666248321533,
"learning_rate": 9.517679547165269e-06,
"loss": 1.1209,
"step": 3167
},
{
"epoch": 0.9197271011757875,
"grad_norm": 3.4668829441070557,
"learning_rate": 9.517268004926622e-06,
"loss": 1.13,
"step": 3168
},
{
"epoch": 0.9200174190738859,
"grad_norm": 3.3018696308135986,
"learning_rate": 9.516856296092925e-06,
"loss": 1.2597,
"step": 3169
},
{
"epoch": 0.9203077369719843,
"grad_norm": 3.127471923828125,
"learning_rate": 9.51644442067936e-06,
"loss": 1.069,
"step": 3170
},
{
"epoch": 0.9205980548700827,
"grad_norm": 2.9845657348632812,
"learning_rate": 9.516032378701117e-06,
"loss": 1.1097,
"step": 3171
},
{
"epoch": 0.9208883727681811,
"grad_norm": 3.2858119010925293,
"learning_rate": 9.515620170173392e-06,
"loss": 1.2764,
"step": 3172
},
{
"epoch": 0.9211786906662796,
"grad_norm": 2.8209214210510254,
"learning_rate": 9.515207795111387e-06,
"loss": 0.9764,
"step": 3173
},
{
"epoch": 0.921469008564378,
"grad_norm": 3.091514825820923,
"learning_rate": 9.51479525353031e-06,
"loss": 1.2961,
"step": 3174
},
{
"epoch": 0.9217593264624764,
"grad_norm": 2.9070065021514893,
"learning_rate": 9.514382545445376e-06,
"loss": 1.292,
"step": 3175
},
{
"epoch": 0.9220496443605748,
"grad_norm": 3.108344316482544,
"learning_rate": 9.513969670871805e-06,
"loss": 1.1846,
"step": 3176
},
{
"epoch": 0.9223399622586732,
"grad_norm": 3.2052361965179443,
"learning_rate": 9.513556629824825e-06,
"loss": 1.2653,
"step": 3177
},
{
"epoch": 0.9226302801567716,
"grad_norm": 3.103595018386841,
"learning_rate": 9.513143422319667e-06,
"loss": 1.1459,
"step": 3178
},
{
"epoch": 0.92292059805487,
"grad_norm": 2.842895984649658,
"learning_rate": 9.51273004837157e-06,
"loss": 1.0839,
"step": 3179
},
{
"epoch": 0.9232109159529684,
"grad_norm": 3.2208235263824463,
"learning_rate": 9.51231650799578e-06,
"loss": 1.1171,
"step": 3180
},
{
"epoch": 0.923501233851067,
"grad_norm": 2.9387643337249756,
"learning_rate": 9.511902801207548e-06,
"loss": 1.1748,
"step": 3181
},
{
"epoch": 0.9237915517491654,
"grad_norm": 3.3002710342407227,
"learning_rate": 9.51148892802213e-06,
"loss": 1.1812,
"step": 3182
},
{
"epoch": 0.9240818696472638,
"grad_norm": 3.609367847442627,
"learning_rate": 9.511074888454793e-06,
"loss": 1.1326,
"step": 3183
},
{
"epoch": 0.9243721875453622,
"grad_norm": 3.185091257095337,
"learning_rate": 9.510660682520803e-06,
"loss": 1.2802,
"step": 3184
},
{
"epoch": 0.9246625054434606,
"grad_norm": 3.3810675144195557,
"learning_rate": 9.510246310235438e-06,
"loss": 1.13,
"step": 3185
},
{
"epoch": 0.924952823341559,
"grad_norm": 2.905977725982666,
"learning_rate": 9.509831771613977e-06,
"loss": 0.9673,
"step": 3186
},
{
"epoch": 0.9252431412396575,
"grad_norm": 3.448277473449707,
"learning_rate": 9.50941706667171e-06,
"loss": 1.0962,
"step": 3187
},
{
"epoch": 0.9255334591377559,
"grad_norm": 3.034240961074829,
"learning_rate": 9.509002195423934e-06,
"loss": 1.1603,
"step": 3188
},
{
"epoch": 0.9258237770358543,
"grad_norm": 3.534836530685425,
"learning_rate": 9.508587157885944e-06,
"loss": 1.2476,
"step": 3189
},
{
"epoch": 0.9261140949339527,
"grad_norm": 3.2182629108428955,
"learning_rate": 9.508171954073049e-06,
"loss": 1.1697,
"step": 3190
},
{
"epoch": 0.9264044128320511,
"grad_norm": 3.3119056224823,
"learning_rate": 9.50775658400056e-06,
"loss": 1.1276,
"step": 3191
},
{
"epoch": 0.9266947307301495,
"grad_norm": 2.935210704803467,
"learning_rate": 9.5073410476838e-06,
"loss": 1.0133,
"step": 3192
},
{
"epoch": 0.9269850486282479,
"grad_norm": 2.970475912094116,
"learning_rate": 9.50692534513809e-06,
"loss": 1.3047,
"step": 3193
},
{
"epoch": 0.9272753665263463,
"grad_norm": 2.995439291000366,
"learning_rate": 9.50650947637876e-06,
"loss": 1.03,
"step": 3194
},
{
"epoch": 0.9275656844244448,
"grad_norm": 2.998599052429199,
"learning_rate": 9.50609344142115e-06,
"loss": 1.2295,
"step": 3195
},
{
"epoch": 0.9278560023225432,
"grad_norm": 3.299854040145874,
"learning_rate": 9.505677240280602e-06,
"loss": 1.2555,
"step": 3196
},
{
"epoch": 0.9281463202206416,
"grad_norm": 3.150684118270874,
"learning_rate": 9.505260872972466e-06,
"loss": 1.2473,
"step": 3197
},
{
"epoch": 0.92843663811874,
"grad_norm": 3.107889175415039,
"learning_rate": 9.504844339512096e-06,
"loss": 0.9754,
"step": 3198
},
{
"epoch": 0.9287269560168384,
"grad_norm": 3.0680747032165527,
"learning_rate": 9.504427639914856e-06,
"loss": 1.1238,
"step": 3199
},
{
"epoch": 0.9290172739149368,
"grad_norm": 3.120218276977539,
"learning_rate": 9.504010774196111e-06,
"loss": 1.1543,
"step": 3200
},
{
"epoch": 0.9293075918130352,
"grad_norm": 3.446390390396118,
"learning_rate": 9.503593742371236e-06,
"loss": 1.2022,
"step": 3201
},
{
"epoch": 0.9295979097111337,
"grad_norm": 3.453664541244507,
"learning_rate": 9.503176544455611e-06,
"loss": 1.2489,
"step": 3202
},
{
"epoch": 0.9298882276092321,
"grad_norm": 3.372509479522705,
"learning_rate": 9.502759180464621e-06,
"loss": 1.2709,
"step": 3203
},
{
"epoch": 0.9301785455073305,
"grad_norm": 3.100264072418213,
"learning_rate": 9.50234165041366e-06,
"loss": 1.1211,
"step": 3204
},
{
"epoch": 0.9304688634054289,
"grad_norm": 2.9130682945251465,
"learning_rate": 9.501923954318126e-06,
"loss": 1.0133,
"step": 3205
},
{
"epoch": 0.9307591813035274,
"grad_norm": 3.162043809890747,
"learning_rate": 9.501506092193424e-06,
"loss": 1.0223,
"step": 3206
},
{
"epoch": 0.9310494992016258,
"grad_norm": 3.3077001571655273,
"learning_rate": 9.501088064054963e-06,
"loss": 1.2443,
"step": 3207
},
{
"epoch": 0.9313398170997242,
"grad_norm": 3.330491781234741,
"learning_rate": 9.50066986991816e-06,
"loss": 1.128,
"step": 3208
},
{
"epoch": 0.9316301349978227,
"grad_norm": 3.372661828994751,
"learning_rate": 9.500251509798438e-06,
"loss": 1.3112,
"step": 3209
},
{
"epoch": 0.9319204528959211,
"grad_norm": 3.3673317432403564,
"learning_rate": 9.499832983711226e-06,
"loss": 1.2208,
"step": 3210
},
{
"epoch": 0.9322107707940195,
"grad_norm": 3.226531744003296,
"learning_rate": 9.499414291671961e-06,
"loss": 1.2343,
"step": 3211
},
{
"epoch": 0.9325010886921179,
"grad_norm": 3.247696876525879,
"learning_rate": 9.498995433696081e-06,
"loss": 1.1313,
"step": 3212
},
{
"epoch": 0.9327914065902163,
"grad_norm": 3.215843915939331,
"learning_rate": 9.498576409799034e-06,
"loss": 1.4321,
"step": 3213
},
{
"epoch": 0.9330817244883147,
"grad_norm": 3.0820136070251465,
"learning_rate": 9.498157219996275e-06,
"loss": 1.2786,
"step": 3214
},
{
"epoch": 0.9333720423864131,
"grad_norm": 3.309765100479126,
"learning_rate": 9.497737864303265e-06,
"loss": 1.1981,
"step": 3215
},
{
"epoch": 0.9336623602845115,
"grad_norm": 3.2941930294036865,
"learning_rate": 9.497318342735466e-06,
"loss": 1.1813,
"step": 3216
},
{
"epoch": 0.93395267818261,
"grad_norm": 3.4502313137054443,
"learning_rate": 9.49689865530835e-06,
"loss": 1.1647,
"step": 3217
},
{
"epoch": 0.9342429960807084,
"grad_norm": 3.085756778717041,
"learning_rate": 9.496478802037396e-06,
"loss": 1.1329,
"step": 3218
},
{
"epoch": 0.9345333139788068,
"grad_norm": 3.3223068714141846,
"learning_rate": 9.496058782938088e-06,
"loss": 1.2166,
"step": 3219
},
{
"epoch": 0.9348236318769052,
"grad_norm": 3.3261163234710693,
"learning_rate": 9.49563859802592e-06,
"loss": 1.1447,
"step": 3220
},
{
"epoch": 0.9351139497750036,
"grad_norm": 3.140730381011963,
"learning_rate": 9.495218247316381e-06,
"loss": 1.1553,
"step": 3221
},
{
"epoch": 0.935404267673102,
"grad_norm": 3.2012627124786377,
"learning_rate": 9.494797730824978e-06,
"loss": 1.0707,
"step": 3222
},
{
"epoch": 0.9356945855712004,
"grad_norm": 3.489518404006958,
"learning_rate": 9.494377048567218e-06,
"loss": 1.1577,
"step": 3223
},
{
"epoch": 0.9359849034692989,
"grad_norm": 3.089207172393799,
"learning_rate": 9.493956200558615e-06,
"loss": 1.202,
"step": 3224
},
{
"epoch": 0.9362752213673973,
"grad_norm": 3.1790692806243896,
"learning_rate": 9.493535186814693e-06,
"loss": 1.0798,
"step": 3225
},
{
"epoch": 0.9365655392654957,
"grad_norm": 3.196995496749878,
"learning_rate": 9.493114007350976e-06,
"loss": 1.1304,
"step": 3226
},
{
"epoch": 0.9368558571635941,
"grad_norm": 3.4054222106933594,
"learning_rate": 9.492692662182997e-06,
"loss": 1.0787,
"step": 3227
},
{
"epoch": 0.9371461750616925,
"grad_norm": 3.0863044261932373,
"learning_rate": 9.492271151326295e-06,
"loss": 1.0259,
"step": 3228
},
{
"epoch": 0.9374364929597909,
"grad_norm": 3.0026841163635254,
"learning_rate": 9.491849474796416e-06,
"loss": 1.0096,
"step": 3229
},
{
"epoch": 0.9377268108578893,
"grad_norm": 2.935014247894287,
"learning_rate": 9.49142763260891e-06,
"loss": 1.1263,
"step": 3230
},
{
"epoch": 0.9380171287559879,
"grad_norm": 2.8023691177368164,
"learning_rate": 9.491005624779337e-06,
"loss": 1.0752,
"step": 3231
},
{
"epoch": 0.9383074466540863,
"grad_norm": 3.2768187522888184,
"learning_rate": 9.490583451323258e-06,
"loss": 1.2187,
"step": 3232
},
{
"epoch": 0.9385977645521847,
"grad_norm": 3.004180431365967,
"learning_rate": 9.490161112256242e-06,
"loss": 1.1065,
"step": 3233
},
{
"epoch": 0.9388880824502831,
"grad_norm": 3.0199663639068604,
"learning_rate": 9.489738607593867e-06,
"loss": 1.175,
"step": 3234
},
{
"epoch": 0.9391784003483815,
"grad_norm": 3.029003381729126,
"learning_rate": 9.489315937351715e-06,
"loss": 1.079,
"step": 3235
},
{
"epoch": 0.9394687182464799,
"grad_norm": 3.2275280952453613,
"learning_rate": 9.488893101545372e-06,
"loss": 1.1521,
"step": 3236
},
{
"epoch": 0.9397590361445783,
"grad_norm": 2.9786007404327393,
"learning_rate": 9.488470100190432e-06,
"loss": 1.0745,
"step": 3237
},
{
"epoch": 0.9400493540426768,
"grad_norm": 3.209221839904785,
"learning_rate": 9.488046933302498e-06,
"loss": 1.271,
"step": 3238
},
{
"epoch": 0.9403396719407752,
"grad_norm": 3.101375102996826,
"learning_rate": 9.487623600897172e-06,
"loss": 1.1747,
"step": 3239
},
{
"epoch": 0.9406299898388736,
"grad_norm": 3.2204413414001465,
"learning_rate": 9.487200102990068e-06,
"loss": 1.205,
"step": 3240
},
{
"epoch": 0.940920307736972,
"grad_norm": 3.0944347381591797,
"learning_rate": 9.486776439596808e-06,
"loss": 1.1888,
"step": 3241
},
{
"epoch": 0.9412106256350704,
"grad_norm": 3.1230151653289795,
"learning_rate": 9.48635261073301e-06,
"loss": 1.1052,
"step": 3242
},
{
"epoch": 0.9415009435331688,
"grad_norm": 3.4646267890930176,
"learning_rate": 9.48592861641431e-06,
"loss": 1.3186,
"step": 3243
},
{
"epoch": 0.9417912614312672,
"grad_norm": 3.0284507274627686,
"learning_rate": 9.485504456656343e-06,
"loss": 1.0032,
"step": 3244
},
{
"epoch": 0.9420815793293656,
"grad_norm": 2.971484899520874,
"learning_rate": 9.48508013147475e-06,
"loss": 1.0545,
"step": 3245
},
{
"epoch": 0.9423718972274641,
"grad_norm": 3.0329430103302,
"learning_rate": 9.484655640885183e-06,
"loss": 1.143,
"step": 3246
},
{
"epoch": 0.9426622151255625,
"grad_norm": 3.1452481746673584,
"learning_rate": 9.484230984903296e-06,
"loss": 1.2393,
"step": 3247
},
{
"epoch": 0.9429525330236609,
"grad_norm": 3.5928149223327637,
"learning_rate": 9.483806163544749e-06,
"loss": 1.1103,
"step": 3248
},
{
"epoch": 0.9432428509217593,
"grad_norm": 3.611189126968384,
"learning_rate": 9.48338117682521e-06,
"loss": 1.3648,
"step": 3249
},
{
"epoch": 0.9435331688198577,
"grad_norm": 3.1281070709228516,
"learning_rate": 9.482956024760352e-06,
"loss": 0.9971,
"step": 3250
},
{
"epoch": 0.9438234867179561,
"grad_norm": 3.092606544494629,
"learning_rate": 9.482530707365856e-06,
"loss": 1.0551,
"step": 3251
},
{
"epoch": 0.9441138046160545,
"grad_norm": 3.4306132793426514,
"learning_rate": 9.482105224657406e-06,
"loss": 1.2839,
"step": 3252
},
{
"epoch": 0.944404122514153,
"grad_norm": 3.2871501445770264,
"learning_rate": 9.481679576650693e-06,
"loss": 1.0642,
"step": 3253
},
{
"epoch": 0.9446944404122514,
"grad_norm": 3.144798994064331,
"learning_rate": 9.481253763361415e-06,
"loss": 1.1322,
"step": 3254
},
{
"epoch": 0.9449847583103498,
"grad_norm": 4.029135227203369,
"learning_rate": 9.480827784805278e-06,
"loss": 1.1049,
"step": 3255
},
{
"epoch": 0.9452750762084483,
"grad_norm": 3.037443161010742,
"learning_rate": 9.480401640997991e-06,
"loss": 1.2186,
"step": 3256
},
{
"epoch": 0.9455653941065467,
"grad_norm": 3.2530734539031982,
"learning_rate": 9.479975331955269e-06,
"loss": 1.2415,
"step": 3257
},
{
"epoch": 0.9458557120046451,
"grad_norm": 3.5844802856445312,
"learning_rate": 9.479548857692836e-06,
"loss": 1.1883,
"step": 3258
},
{
"epoch": 0.9461460299027435,
"grad_norm": 2.8868770599365234,
"learning_rate": 9.479122218226415e-06,
"loss": 1.0488,
"step": 3259
},
{
"epoch": 0.946436347800842,
"grad_norm": 3.3103206157684326,
"learning_rate": 9.478695413571747e-06,
"loss": 1.2274,
"step": 3260
},
{
"epoch": 0.9467266656989404,
"grad_norm": 2.9122848510742188,
"learning_rate": 9.478268443744569e-06,
"loss": 1.0438,
"step": 3261
},
{
"epoch": 0.9470169835970388,
"grad_norm": 3.0058131217956543,
"learning_rate": 9.477841308760628e-06,
"loss": 1.027,
"step": 3262
},
{
"epoch": 0.9473073014951372,
"grad_norm": 2.9957618713378906,
"learning_rate": 9.477414008635675e-06,
"loss": 1.2333,
"step": 3263
},
{
"epoch": 0.9475976193932356,
"grad_norm": 3.0428504943847656,
"learning_rate": 9.476986543385472e-06,
"loss": 1.13,
"step": 3264
},
{
"epoch": 0.947887937291334,
"grad_norm": 2.8519036769866943,
"learning_rate": 9.47655891302578e-06,
"loss": 0.9563,
"step": 3265
},
{
"epoch": 0.9481782551894324,
"grad_norm": 2.8498032093048096,
"learning_rate": 9.476131117572373e-06,
"loss": 1.096,
"step": 3266
},
{
"epoch": 0.9484685730875309,
"grad_norm": 3.2216978073120117,
"learning_rate": 9.475703157041028e-06,
"loss": 1.2349,
"step": 3267
},
{
"epoch": 0.9487588909856293,
"grad_norm": 3.696192502975464,
"learning_rate": 9.475275031447525e-06,
"loss": 1.1619,
"step": 3268
},
{
"epoch": 0.9490492088837277,
"grad_norm": 3.411872625350952,
"learning_rate": 9.474846740807655e-06,
"loss": 1.1287,
"step": 3269
},
{
"epoch": 0.9493395267818261,
"grad_norm": 3.1810708045959473,
"learning_rate": 9.474418285137214e-06,
"loss": 1.1311,
"step": 3270
},
{
"epoch": 0.9496298446799245,
"grad_norm": 3.444535255432129,
"learning_rate": 9.473989664452001e-06,
"loss": 1.1452,
"step": 3271
},
{
"epoch": 0.9499201625780229,
"grad_norm": 3.02544903755188,
"learning_rate": 9.473560878767825e-06,
"loss": 1.1944,
"step": 3272
},
{
"epoch": 0.9502104804761213,
"grad_norm": 2.964012384414673,
"learning_rate": 9.4731319281005e-06,
"loss": 1.1086,
"step": 3273
},
{
"epoch": 0.9505007983742197,
"grad_norm": 3.4347403049468994,
"learning_rate": 9.472702812465843e-06,
"loss": 1.2453,
"step": 3274
},
{
"epoch": 0.9507911162723182,
"grad_norm": 3.0634796619415283,
"learning_rate": 9.47227353187968e-06,
"loss": 1.0154,
"step": 3275
},
{
"epoch": 0.9510814341704166,
"grad_norm": 3.2538411617279053,
"learning_rate": 9.471844086357848e-06,
"loss": 1.1605,
"step": 3276
},
{
"epoch": 0.951371752068515,
"grad_norm": 2.976386547088623,
"learning_rate": 9.471414475916179e-06,
"loss": 1.0983,
"step": 3277
},
{
"epoch": 0.9516620699666134,
"grad_norm": 3.2437491416931152,
"learning_rate": 9.470984700570518e-06,
"loss": 1.1463,
"step": 3278
},
{
"epoch": 0.9519523878647118,
"grad_norm": 3.283535957336426,
"learning_rate": 9.470554760336714e-06,
"loss": 1.0749,
"step": 3279
},
{
"epoch": 0.9522427057628102,
"grad_norm": 3.1635475158691406,
"learning_rate": 9.470124655230627e-06,
"loss": 1.1702,
"step": 3280
},
{
"epoch": 0.9525330236609086,
"grad_norm": 3.6238670349121094,
"learning_rate": 9.469694385268115e-06,
"loss": 1.2376,
"step": 3281
},
{
"epoch": 0.9528233415590072,
"grad_norm": 3.029278516769409,
"learning_rate": 9.469263950465048e-06,
"loss": 1.1066,
"step": 3282
},
{
"epoch": 0.9531136594571056,
"grad_norm": 2.8746628761291504,
"learning_rate": 9.468833350837301e-06,
"loss": 1.0827,
"step": 3283
},
{
"epoch": 0.953403977355204,
"grad_norm": 2.8631439208984375,
"learning_rate": 9.468402586400753e-06,
"loss": 0.8597,
"step": 3284
},
{
"epoch": 0.9536942952533024,
"grad_norm": 3.1171255111694336,
"learning_rate": 9.467971657171292e-06,
"loss": 1.086,
"step": 3285
},
{
"epoch": 0.9539846131514008,
"grad_norm": 3.133019208908081,
"learning_rate": 9.467540563164808e-06,
"loss": 1.1201,
"step": 3286
},
{
"epoch": 0.9542749310494992,
"grad_norm": 3.1883506774902344,
"learning_rate": 9.467109304397201e-06,
"loss": 1.1701,
"step": 3287
},
{
"epoch": 0.9545652489475976,
"grad_norm": 3.2414369583129883,
"learning_rate": 9.466677880884376e-06,
"loss": 1.1613,
"step": 3288
},
{
"epoch": 0.9548555668456961,
"grad_norm": 2.8469996452331543,
"learning_rate": 9.466246292642243e-06,
"loss": 0.9667,
"step": 3289
},
{
"epoch": 0.9551458847437945,
"grad_norm": 3.1720969676971436,
"learning_rate": 9.465814539686719e-06,
"loss": 1.1769,
"step": 3290
},
{
"epoch": 0.9554362026418929,
"grad_norm": 3.1476361751556396,
"learning_rate": 9.465382622033727e-06,
"loss": 1.2384,
"step": 3291
},
{
"epoch": 0.9557265205399913,
"grad_norm": 3.4708709716796875,
"learning_rate": 9.464950539699195e-06,
"loss": 1.4053,
"step": 3292
},
{
"epoch": 0.9560168384380897,
"grad_norm": 3.2307510375976562,
"learning_rate": 9.46451829269906e-06,
"loss": 1.0809,
"step": 3293
},
{
"epoch": 0.9563071563361881,
"grad_norm": 3.331270933151245,
"learning_rate": 9.464085881049262e-06,
"loss": 1.1588,
"step": 3294
},
{
"epoch": 0.9565974742342865,
"grad_norm": 3.047401189804077,
"learning_rate": 9.46365330476575e-06,
"loss": 1.2303,
"step": 3295
},
{
"epoch": 0.956887792132385,
"grad_norm": 2.589224338531494,
"learning_rate": 9.463220563864474e-06,
"loss": 0.9973,
"step": 3296
},
{
"epoch": 0.9571781100304834,
"grad_norm": 3.296471357345581,
"learning_rate": 9.462787658361394e-06,
"loss": 1.2449,
"step": 3297
},
{
"epoch": 0.9574684279285818,
"grad_norm": 3.164555788040161,
"learning_rate": 9.462354588272478e-06,
"loss": 1.1311,
"step": 3298
},
{
"epoch": 0.9577587458266802,
"grad_norm": 3.3225278854370117,
"learning_rate": 9.461921353613693e-06,
"loss": 1.2072,
"step": 3299
},
{
"epoch": 0.9580490637247786,
"grad_norm": 3.135514259338379,
"learning_rate": 9.461487954401021e-06,
"loss": 1.0418,
"step": 3300
},
{
"epoch": 0.958339381622877,
"grad_norm": 3.0921425819396973,
"learning_rate": 9.461054390650444e-06,
"loss": 1.2124,
"step": 3301
},
{
"epoch": 0.9586296995209754,
"grad_norm": 3.197275161743164,
"learning_rate": 9.460620662377949e-06,
"loss": 1.2466,
"step": 3302
},
{
"epoch": 0.9589200174190738,
"grad_norm": 3.615117311477661,
"learning_rate": 9.460186769599536e-06,
"loss": 1.239,
"step": 3303
},
{
"epoch": 0.9592103353171723,
"grad_norm": 3.303147077560425,
"learning_rate": 9.459752712331204e-06,
"loss": 1.2606,
"step": 3304
},
{
"epoch": 0.9595006532152707,
"grad_norm": 3.386007308959961,
"learning_rate": 9.459318490588964e-06,
"loss": 1.2938,
"step": 3305
},
{
"epoch": 0.9597909711133691,
"grad_norm": 3.0497190952301025,
"learning_rate": 9.458884104388826e-06,
"loss": 1.1553,
"step": 3306
},
{
"epoch": 0.9600812890114676,
"grad_norm": 2.7740349769592285,
"learning_rate": 9.458449553746812e-06,
"loss": 1.05,
"step": 3307
},
{
"epoch": 0.960371606909566,
"grad_norm": 3.255222797393799,
"learning_rate": 9.458014838678946e-06,
"loss": 0.9898,
"step": 3308
},
{
"epoch": 0.9606619248076644,
"grad_norm": 2.9783425331115723,
"learning_rate": 9.457579959201263e-06,
"loss": 1.0716,
"step": 3309
},
{
"epoch": 0.9609522427057628,
"grad_norm": 3.041851043701172,
"learning_rate": 9.457144915329802e-06,
"loss": 1.1695,
"step": 3310
},
{
"epoch": 0.9612425606038613,
"grad_norm": 3.023836851119995,
"learning_rate": 9.456709707080602e-06,
"loss": 1.0672,
"step": 3311
},
{
"epoch": 0.9615328785019597,
"grad_norm": 2.8885133266448975,
"learning_rate": 9.45627433446972e-06,
"loss": 1.0908,
"step": 3312
},
{
"epoch": 0.9618231964000581,
"grad_norm": 3.162452459335327,
"learning_rate": 9.455838797513206e-06,
"loss": 1.0913,
"step": 3313
},
{
"epoch": 0.9621135142981565,
"grad_norm": 3.567873239517212,
"learning_rate": 9.455403096227126e-06,
"loss": 1.2009,
"step": 3314
},
{
"epoch": 0.9624038321962549,
"grad_norm": 2.9521007537841797,
"learning_rate": 9.454967230627549e-06,
"loss": 1.0564,
"step": 3315
},
{
"epoch": 0.9626941500943533,
"grad_norm": 3.264430284500122,
"learning_rate": 9.45453120073055e-06,
"loss": 1.167,
"step": 3316
},
{
"epoch": 0.9629844679924517,
"grad_norm": 3.638040065765381,
"learning_rate": 9.454095006552204e-06,
"loss": 1.2732,
"step": 3317
},
{
"epoch": 0.9632747858905502,
"grad_norm": 3.109283208847046,
"learning_rate": 9.453658648108604e-06,
"loss": 1.0722,
"step": 3318
},
{
"epoch": 0.9635651037886486,
"grad_norm": 3.268758535385132,
"learning_rate": 9.45322212541584e-06,
"loss": 1.2237,
"step": 3319
},
{
"epoch": 0.963855421686747,
"grad_norm": 3.297163963317871,
"learning_rate": 9.452785438490011e-06,
"loss": 1.2266,
"step": 3320
},
{
"epoch": 0.9641457395848454,
"grad_norm": 3.4363367557525635,
"learning_rate": 9.452348587347224e-06,
"loss": 1.3593,
"step": 3321
},
{
"epoch": 0.9644360574829438,
"grad_norm": 2.6215686798095703,
"learning_rate": 9.451911572003586e-06,
"loss": 1.0826,
"step": 3322
},
{
"epoch": 0.9647263753810422,
"grad_norm": 3.397822380065918,
"learning_rate": 9.451474392475216e-06,
"loss": 1.1542,
"step": 3323
},
{
"epoch": 0.9650166932791406,
"grad_norm": 3.1584107875823975,
"learning_rate": 9.451037048778238e-06,
"loss": 1.113,
"step": 3324
},
{
"epoch": 0.965307011177239,
"grad_norm": 3.2262637615203857,
"learning_rate": 9.450599540928779e-06,
"loss": 1.1506,
"step": 3325
},
{
"epoch": 0.9655973290753375,
"grad_norm": 3.163564443588257,
"learning_rate": 9.450161868942975e-06,
"loss": 1.2236,
"step": 3326
},
{
"epoch": 0.9658876469734359,
"grad_norm": 3.1902246475219727,
"learning_rate": 9.449724032836968e-06,
"loss": 1.2597,
"step": 3327
},
{
"epoch": 0.9661779648715343,
"grad_norm": 3.5280227661132812,
"learning_rate": 9.449286032626904e-06,
"loss": 1.2247,
"step": 3328
},
{
"epoch": 0.9664682827696327,
"grad_norm": 3.1843626499176025,
"learning_rate": 9.448847868328936e-06,
"loss": 1.0195,
"step": 3329
},
{
"epoch": 0.9667586006677311,
"grad_norm": 3.1920642852783203,
"learning_rate": 9.448409539959225e-06,
"loss": 1.1452,
"step": 3330
},
{
"epoch": 0.9670489185658295,
"grad_norm": 4.158785343170166,
"learning_rate": 9.447971047533936e-06,
"loss": 1.2936,
"step": 3331
},
{
"epoch": 0.967339236463928,
"grad_norm": 3.061877727508545,
"learning_rate": 9.447532391069238e-06,
"loss": 1.1663,
"step": 3332
},
{
"epoch": 0.9676295543620265,
"grad_norm": 2.6941730976104736,
"learning_rate": 9.447093570581313e-06,
"loss": 0.9278,
"step": 3333
},
{
"epoch": 0.9679198722601249,
"grad_norm": 3.301288366317749,
"learning_rate": 9.44665458608634e-06,
"loss": 1.1854,
"step": 3334
},
{
"epoch": 0.9682101901582233,
"grad_norm": 3.001420021057129,
"learning_rate": 9.446215437600511e-06,
"loss": 1.0494,
"step": 3335
},
{
"epoch": 0.9685005080563217,
"grad_norm": 3.054023504257202,
"learning_rate": 9.44577612514002e-06,
"loss": 1.0131,
"step": 3336
},
{
"epoch": 0.9687908259544201,
"grad_norm": 3.395092010498047,
"learning_rate": 9.445336648721073e-06,
"loss": 1.2864,
"step": 3337
},
{
"epoch": 0.9690811438525185,
"grad_norm": 3.0327727794647217,
"learning_rate": 9.444897008359871e-06,
"loss": 1.0428,
"step": 3338
},
{
"epoch": 0.969371461750617,
"grad_norm": 2.8564300537109375,
"learning_rate": 9.444457204072632e-06,
"loss": 1.0468,
"step": 3339
},
{
"epoch": 0.9696617796487154,
"grad_norm": 3.084829330444336,
"learning_rate": 9.444017235875577e-06,
"loss": 1.0957,
"step": 3340
},
{
"epoch": 0.9699520975468138,
"grad_norm": 2.857167959213257,
"learning_rate": 9.443577103784927e-06,
"loss": 0.9776,
"step": 3341
},
{
"epoch": 0.9702424154449122,
"grad_norm": 2.935952663421631,
"learning_rate": 9.443136807816919e-06,
"loss": 1.1364,
"step": 3342
},
{
"epoch": 0.9705327333430106,
"grad_norm": 3.175546884536743,
"learning_rate": 9.442696347987787e-06,
"loss": 1.1864,
"step": 3343
},
{
"epoch": 0.970823051241109,
"grad_norm": 3.243807315826416,
"learning_rate": 9.442255724313778e-06,
"loss": 1.1785,
"step": 3344
},
{
"epoch": 0.9711133691392074,
"grad_norm": 2.8106155395507812,
"learning_rate": 9.441814936811142e-06,
"loss": 0.9373,
"step": 3345
},
{
"epoch": 0.9714036870373058,
"grad_norm": 3.255561113357544,
"learning_rate": 9.441373985496133e-06,
"loss": 1.0555,
"step": 3346
},
{
"epoch": 0.9716940049354043,
"grad_norm": 3.7151269912719727,
"learning_rate": 9.440932870385011e-06,
"loss": 1.3468,
"step": 3347
},
{
"epoch": 0.9719843228335027,
"grad_norm": 3.180406093597412,
"learning_rate": 9.44049159149405e-06,
"loss": 1.0678,
"step": 3348
},
{
"epoch": 0.9722746407316011,
"grad_norm": 3.1511247158050537,
"learning_rate": 9.440050148839521e-06,
"loss": 1.1926,
"step": 3349
},
{
"epoch": 0.9725649586296995,
"grad_norm": 3.3239786624908447,
"learning_rate": 9.439608542437704e-06,
"loss": 1.0599,
"step": 3350
},
{
"epoch": 0.9728552765277979,
"grad_norm": 3.1429460048675537,
"learning_rate": 9.439166772304886e-06,
"loss": 1.1076,
"step": 3351
},
{
"epoch": 0.9731455944258963,
"grad_norm": 3.073305368423462,
"learning_rate": 9.438724838457358e-06,
"loss": 1.0712,
"step": 3352
},
{
"epoch": 0.9734359123239947,
"grad_norm": 3.4165472984313965,
"learning_rate": 9.438282740911421e-06,
"loss": 1.1699,
"step": 3353
},
{
"epoch": 0.9737262302220931,
"grad_norm": 3.339623212814331,
"learning_rate": 9.437840479683377e-06,
"loss": 1.1977,
"step": 3354
},
{
"epoch": 0.9740165481201916,
"grad_norm": 2.8841733932495117,
"learning_rate": 9.437398054789537e-06,
"loss": 1.1156,
"step": 3355
},
{
"epoch": 0.97430686601829,
"grad_norm": 3.360177516937256,
"learning_rate": 9.436955466246218e-06,
"loss": 1.1148,
"step": 3356
},
{
"epoch": 0.9745971839163885,
"grad_norm": 3.3974556922912598,
"learning_rate": 9.436512714069742e-06,
"loss": 1.2665,
"step": 3357
},
{
"epoch": 0.9748875018144869,
"grad_norm": 2.819671154022217,
"learning_rate": 9.436069798276438e-06,
"loss": 1.1152,
"step": 3358
},
{
"epoch": 0.9751778197125853,
"grad_norm": 3.1836605072021484,
"learning_rate": 9.43562671888264e-06,
"loss": 1.2375,
"step": 3359
},
{
"epoch": 0.9754681376106837,
"grad_norm": 3.028640031814575,
"learning_rate": 9.435183475904688e-06,
"loss": 1.1392,
"step": 3360
},
{
"epoch": 0.9757584555087822,
"grad_norm": 3.1864209175109863,
"learning_rate": 9.434740069358931e-06,
"loss": 1.217,
"step": 3361
},
{
"epoch": 0.9760487734068806,
"grad_norm": 2.9835257530212402,
"learning_rate": 9.434296499261719e-06,
"loss": 1.0562,
"step": 3362
},
{
"epoch": 0.976339091304979,
"grad_norm": 3.0759634971618652,
"learning_rate": 9.433852765629412e-06,
"loss": 1.1193,
"step": 3363
},
{
"epoch": 0.9766294092030774,
"grad_norm": 3.088196277618408,
"learning_rate": 9.433408868478375e-06,
"loss": 1.1732,
"step": 3364
},
{
"epoch": 0.9769197271011758,
"grad_norm": 2.7689590454101562,
"learning_rate": 9.432964807824979e-06,
"loss": 1.0004,
"step": 3365
},
{
"epoch": 0.9772100449992742,
"grad_norm": 2.8842358589172363,
"learning_rate": 9.432520583685597e-06,
"loss": 1.0616,
"step": 3366
},
{
"epoch": 0.9775003628973726,
"grad_norm": 3.2516438961029053,
"learning_rate": 9.432076196076618e-06,
"loss": 1.1702,
"step": 3367
},
{
"epoch": 0.977790680795471,
"grad_norm": 2.9004099369049072,
"learning_rate": 9.431631645014427e-06,
"loss": 1.0521,
"step": 3368
},
{
"epoch": 0.9780809986935695,
"grad_norm": 3.162397861480713,
"learning_rate": 9.431186930515419e-06,
"loss": 1.1376,
"step": 3369
},
{
"epoch": 0.9783713165916679,
"grad_norm": 3.3717830181121826,
"learning_rate": 9.430742052595995e-06,
"loss": 1.1725,
"step": 3370
},
{
"epoch": 0.9786616344897663,
"grad_norm": 3.5331950187683105,
"learning_rate": 9.430297011272564e-06,
"loss": 1.2318,
"step": 3371
},
{
"epoch": 0.9789519523878647,
"grad_norm": 3.0815625190734863,
"learning_rate": 9.429851806561537e-06,
"loss": 0.9662,
"step": 3372
},
{
"epoch": 0.9792422702859631,
"grad_norm": 3.1928114891052246,
"learning_rate": 9.429406438479332e-06,
"loss": 1.2074,
"step": 3373
},
{
"epoch": 0.9795325881840615,
"grad_norm": 3.0204803943634033,
"learning_rate": 9.428960907042377e-06,
"loss": 1.0493,
"step": 3374
},
{
"epoch": 0.9798229060821599,
"grad_norm": 3.266531467437744,
"learning_rate": 9.4285152122671e-06,
"loss": 1.2921,
"step": 3375
},
{
"epoch": 0.9801132239802584,
"grad_norm": 2.9223287105560303,
"learning_rate": 9.42806935416994e-06,
"loss": 1.0824,
"step": 3376
},
{
"epoch": 0.9804035418783568,
"grad_norm": 3.335517168045044,
"learning_rate": 9.427623332767338e-06,
"loss": 1.3236,
"step": 3377
},
{
"epoch": 0.9806938597764552,
"grad_norm": 3.223524332046509,
"learning_rate": 9.427177148075746e-06,
"loss": 1.2141,
"step": 3378
},
{
"epoch": 0.9809841776745536,
"grad_norm": 3.1920454502105713,
"learning_rate": 9.426730800111618e-06,
"loss": 1.0862,
"step": 3379
},
{
"epoch": 0.981274495572652,
"grad_norm": 3.0022921562194824,
"learning_rate": 9.426284288891415e-06,
"loss": 1.1349,
"step": 3380
},
{
"epoch": 0.9815648134707504,
"grad_norm": 3.008728265762329,
"learning_rate": 9.425837614431601e-06,
"loss": 1.1163,
"step": 3381
},
{
"epoch": 0.9818551313688489,
"grad_norm": 2.845618724822998,
"learning_rate": 9.425390776748656e-06,
"loss": 1.0241,
"step": 3382
},
{
"epoch": 0.9821454492669474,
"grad_norm": 3.227717876434326,
"learning_rate": 9.424943775859052e-06,
"loss": 1.1405,
"step": 3383
},
{
"epoch": 0.9824357671650458,
"grad_norm": 3.4967589378356934,
"learning_rate": 9.424496611779279e-06,
"loss": 1.3153,
"step": 3384
},
{
"epoch": 0.9827260850631442,
"grad_norm": 3.4529168605804443,
"learning_rate": 9.424049284525827e-06,
"loss": 1.2027,
"step": 3385
},
{
"epoch": 0.9830164029612426,
"grad_norm": 3.211639404296875,
"learning_rate": 9.423601794115194e-06,
"loss": 1.0941,
"step": 3386
},
{
"epoch": 0.983306720859341,
"grad_norm": 3.719665765762329,
"learning_rate": 9.42315414056388e-06,
"loss": 1.332,
"step": 3387
},
{
"epoch": 0.9835970387574394,
"grad_norm": 3.154254674911499,
"learning_rate": 9.422706323888398e-06,
"loss": 1.1848,
"step": 3388
},
{
"epoch": 0.9838873566555378,
"grad_norm": 3.1426172256469727,
"learning_rate": 9.422258344105263e-06,
"loss": 1.1643,
"step": 3389
},
{
"epoch": 0.9841776745536363,
"grad_norm": 3.4022419452667236,
"learning_rate": 9.421810201230992e-06,
"loss": 1.3219,
"step": 3390
},
{
"epoch": 0.9844679924517347,
"grad_norm": 3.381171464920044,
"learning_rate": 9.421361895282117e-06,
"loss": 1.3257,
"step": 3391
},
{
"epoch": 0.9847583103498331,
"grad_norm": 3.2930431365966797,
"learning_rate": 9.42091342627517e-06,
"loss": 1.096,
"step": 3392
},
{
"epoch": 0.9850486282479315,
"grad_norm": 3.0404651165008545,
"learning_rate": 9.420464794226691e-06,
"loss": 1.1944,
"step": 3393
},
{
"epoch": 0.9853389461460299,
"grad_norm": 3.1677844524383545,
"learning_rate": 9.420015999153225e-06,
"loss": 1.1356,
"step": 3394
},
{
"epoch": 0.9856292640441283,
"grad_norm": 3.403318166732788,
"learning_rate": 9.41956704107132e-06,
"loss": 1.0833,
"step": 3395
},
{
"epoch": 0.9859195819422267,
"grad_norm": 3.0987493991851807,
"learning_rate": 9.419117919997538e-06,
"loss": 1.106,
"step": 3396
},
{
"epoch": 0.9862098998403251,
"grad_norm": 3.006129503250122,
"learning_rate": 9.418668635948443e-06,
"loss": 1.0986,
"step": 3397
},
{
"epoch": 0.9865002177384236,
"grad_norm": 3.6161084175109863,
"learning_rate": 9.4182191889406e-06,
"loss": 1.3971,
"step": 3398
},
{
"epoch": 0.986790535636522,
"grad_norm": 3.079556465148926,
"learning_rate": 9.417769578990586e-06,
"loss": 1.0629,
"step": 3399
},
{
"epoch": 0.9870808535346204,
"grad_norm": 3.1218533515930176,
"learning_rate": 9.417319806114984e-06,
"loss": 1.1182,
"step": 3400
},
{
"epoch": 0.9873711714327188,
"grad_norm": 2.991771697998047,
"learning_rate": 9.41686987033038e-06,
"loss": 1.0839,
"step": 3401
},
{
"epoch": 0.9876614893308172,
"grad_norm": 3.7504146099090576,
"learning_rate": 9.416419771653368e-06,
"loss": 1.4415,
"step": 3402
},
{
"epoch": 0.9879518072289156,
"grad_norm": 3.217874526977539,
"learning_rate": 9.415969510100549e-06,
"loss": 1.2136,
"step": 3403
},
{
"epoch": 0.988242125127014,
"grad_norm": 3.183932304382324,
"learning_rate": 9.415519085688526e-06,
"loss": 1.0926,
"step": 3404
},
{
"epoch": 0.9885324430251125,
"grad_norm": 3.3624684810638428,
"learning_rate": 9.415068498433912e-06,
"loss": 1.1281,
"step": 3405
},
{
"epoch": 0.9888227609232109,
"grad_norm": 3.2152488231658936,
"learning_rate": 9.414617748353324e-06,
"loss": 1.2438,
"step": 3406
},
{
"epoch": 0.9891130788213094,
"grad_norm": 3.27553391456604,
"learning_rate": 9.414166835463383e-06,
"loss": 1.128,
"step": 3407
},
{
"epoch": 0.9894033967194078,
"grad_norm": 3.2097506523132324,
"learning_rate": 9.413715759780722e-06,
"loss": 1.1601,
"step": 3408
},
{
"epoch": 0.9896937146175062,
"grad_norm": 3.083144187927246,
"learning_rate": 9.413264521321976e-06,
"loss": 1.0782,
"step": 3409
},
{
"epoch": 0.9899840325156046,
"grad_norm": 3.3622589111328125,
"learning_rate": 9.412813120103786e-06,
"loss": 1.1783,
"step": 3410
},
{
"epoch": 0.990274350413703,
"grad_norm": 3.3557496070861816,
"learning_rate": 9.412361556142797e-06,
"loss": 1.2824,
"step": 3411
},
{
"epoch": 0.9905646683118015,
"grad_norm": 3.4692952632904053,
"learning_rate": 9.411909829455667e-06,
"loss": 1.2376,
"step": 3412
},
{
"epoch": 0.9908549862098999,
"grad_norm": 2.9737493991851807,
"learning_rate": 9.411457940059053e-06,
"loss": 0.9969,
"step": 3413
},
{
"epoch": 0.9911453041079983,
"grad_norm": 3.2683541774749756,
"learning_rate": 9.41100588796962e-06,
"loss": 1.172,
"step": 3414
},
{
"epoch": 0.9914356220060967,
"grad_norm": 2.798372268676758,
"learning_rate": 9.41055367320404e-06,
"loss": 1.0958,
"step": 3415
},
{
"epoch": 0.9917259399041951,
"grad_norm": 3.1530799865722656,
"learning_rate": 9.410101295778992e-06,
"loss": 1.1092,
"step": 3416
},
{
"epoch": 0.9920162578022935,
"grad_norm": 3.589674711227417,
"learning_rate": 9.409648755711157e-06,
"loss": 1.4038,
"step": 3417
},
{
"epoch": 0.9923065757003919,
"grad_norm": 2.7075817584991455,
"learning_rate": 9.409196053017227e-06,
"loss": 1.0471,
"step": 3418
},
{
"epoch": 0.9925968935984903,
"grad_norm": 3.057220697402954,
"learning_rate": 9.408743187713895e-06,
"loss": 1.1861,
"step": 3419
},
{
"epoch": 0.9928872114965888,
"grad_norm": 2.9704697132110596,
"learning_rate": 9.408290159817865e-06,
"loss": 1.1141,
"step": 3420
},
{
"epoch": 0.9931775293946872,
"grad_norm": 3.118169069290161,
"learning_rate": 9.407836969345845e-06,
"loss": 1.0851,
"step": 3421
},
{
"epoch": 0.9934678472927856,
"grad_norm": 2.885435104370117,
"learning_rate": 9.407383616314545e-06,
"loss": 1.0472,
"step": 3422
},
{
"epoch": 0.993758165190884,
"grad_norm": 3.142916202545166,
"learning_rate": 9.406930100740686e-06,
"loss": 1.0709,
"step": 3423
},
{
"epoch": 0.9940484830889824,
"grad_norm": 3.0547609329223633,
"learning_rate": 9.406476422640994e-06,
"loss": 1.1419,
"step": 3424
},
{
"epoch": 0.9943388009870808,
"grad_norm": 3.3543431758880615,
"learning_rate": 9.4060225820322e-06,
"loss": 1.1565,
"step": 3425
},
{
"epoch": 0.9946291188851792,
"grad_norm": 3.0204522609710693,
"learning_rate": 9.405568578931042e-06,
"loss": 1.2616,
"step": 3426
},
{
"epoch": 0.9949194367832777,
"grad_norm": 3.07812237739563,
"learning_rate": 9.405114413354261e-06,
"loss": 1.0725,
"step": 3427
},
{
"epoch": 0.9952097546813761,
"grad_norm": 2.8966448307037354,
"learning_rate": 9.40466008531861e-06,
"loss": 1.0558,
"step": 3428
},
{
"epoch": 0.9955000725794745,
"grad_norm": 3.6422510147094727,
"learning_rate": 9.404205594840843e-06,
"loss": 1.0604,
"step": 3429
},
{
"epoch": 0.9957903904775729,
"grad_norm": 3.1371798515319824,
"learning_rate": 9.403750941937723e-06,
"loss": 1.0434,
"step": 3430
},
{
"epoch": 0.9960807083756713,
"grad_norm": 3.1310348510742188,
"learning_rate": 9.403296126626014e-06,
"loss": 1.0345,
"step": 3431
},
{
"epoch": 0.9963710262737698,
"grad_norm": 3.1864089965820312,
"learning_rate": 9.402841148922493e-06,
"loss": 1.1211,
"step": 3432
},
{
"epoch": 0.9966613441718682,
"grad_norm": 3.2112019062042236,
"learning_rate": 9.402386008843935e-06,
"loss": 1.0529,
"step": 3433
},
{
"epoch": 0.9969516620699667,
"grad_norm": 3.1958372592926025,
"learning_rate": 9.401930706407129e-06,
"loss": 1.1574,
"step": 3434
},
{
"epoch": 0.9972419799680651,
"grad_norm": 3.1686742305755615,
"learning_rate": 9.401475241628867e-06,
"loss": 0.9665,
"step": 3435
},
{
"epoch": 0.9975322978661635,
"grad_norm": 2.843740701675415,
"learning_rate": 9.401019614525944e-06,
"loss": 1.0863,
"step": 3436
},
{
"epoch": 0.9978226157642619,
"grad_norm": 2.8418521881103516,
"learning_rate": 9.400563825115163e-06,
"loss": 1.0813,
"step": 3437
},
{
"epoch": 0.9981129336623603,
"grad_norm": 3.322758913040161,
"learning_rate": 9.400107873413335e-06,
"loss": 1.0213,
"step": 3438
},
{
"epoch": 0.9984032515604587,
"grad_norm": 3.388033866882324,
"learning_rate": 9.399651759437276e-06,
"loss": 1.14,
"step": 3439
},
{
"epoch": 0.9986935694585571,
"grad_norm": 3.383345127105713,
"learning_rate": 9.399195483203805e-06,
"loss": 1.2244,
"step": 3440
},
{
"epoch": 0.9989838873566556,
"grad_norm": 3.070141315460205,
"learning_rate": 9.39873904472975e-06,
"loss": 1.1552,
"step": 3441
},
{
"epoch": 0.999274205254754,
"grad_norm": 3.090776205062866,
"learning_rate": 9.398282444031944e-06,
"loss": 1.1257,
"step": 3442
},
{
"epoch": 0.9995645231528524,
"grad_norm": 3.1344099044799805,
"learning_rate": 9.397825681127228e-06,
"loss": 1.278,
"step": 3443
},
{
"epoch": 0.9998548410509508,
"grad_norm": 2.9550633430480957,
"learning_rate": 9.397368756032445e-06,
"loss": 1.0698,
"step": 3444
},
{
"epoch": 1.0001451589490493,
"grad_norm": 3.0842957496643066,
"learning_rate": 9.39691166876445e-06,
"loss": 1.1084,
"step": 3445
},
{
"epoch": 1.0004354768471477,
"grad_norm": 2.8712656497955322,
"learning_rate": 9.396454419340096e-06,
"loss": 0.7726,
"step": 3446
},
{
"epoch": 1.0007257947452461,
"grad_norm": 2.9478588104248047,
"learning_rate": 9.395997007776247e-06,
"loss": 0.8716,
"step": 3447
},
{
"epoch": 1.0010161126433446,
"grad_norm": 3.1845040321350098,
"learning_rate": 9.395539434089773e-06,
"loss": 0.8577,
"step": 3448
},
{
"epoch": 1.001306430541443,
"grad_norm": 2.5706589221954346,
"learning_rate": 9.395081698297549e-06,
"loss": 0.6979,
"step": 3449
},
{
"epoch": 1.0015967484395414,
"grad_norm": 3.145312786102295,
"learning_rate": 9.394623800416456e-06,
"loss": 0.8096,
"step": 3450
},
{
"epoch": 1.0018870663376398,
"grad_norm": 2.9135537147521973,
"learning_rate": 9.394165740463382e-06,
"loss": 0.7561,
"step": 3451
},
{
"epoch": 1.0021773842357382,
"grad_norm": 3.1902127265930176,
"learning_rate": 9.39370751845522e-06,
"loss": 0.7947,
"step": 3452
},
{
"epoch": 1.0024677021338366,
"grad_norm": 3.7546684741973877,
"learning_rate": 9.393249134408866e-06,
"loss": 0.959,
"step": 3453
},
{
"epoch": 1.002758020031935,
"grad_norm": 3.009138584136963,
"learning_rate": 9.392790588341228e-06,
"loss": 0.8543,
"step": 3454
},
{
"epoch": 1.0030483379300335,
"grad_norm": 3.8401989936828613,
"learning_rate": 9.392331880269217e-06,
"loss": 0.9496,
"step": 3455
},
{
"epoch": 1.0033386558281319,
"grad_norm": 3.9304797649383545,
"learning_rate": 9.39187301020975e-06,
"loss": 0.8184,
"step": 3456
},
{
"epoch": 1.0036289737262303,
"grad_norm": 3.111929416656494,
"learning_rate": 9.391413978179748e-06,
"loss": 0.6968,
"step": 3457
},
{
"epoch": 1.0039192916243287,
"grad_norm": 3.6900084018707275,
"learning_rate": 9.390954784196143e-06,
"loss": 0.7946,
"step": 3458
},
{
"epoch": 1.004209609522427,
"grad_norm": 3.747096300125122,
"learning_rate": 9.390495428275866e-06,
"loss": 0.8256,
"step": 3459
},
{
"epoch": 1.0044999274205255,
"grad_norm": 3.514481782913208,
"learning_rate": 9.39003591043586e-06,
"loss": 0.7779,
"step": 3460
},
{
"epoch": 1.004790245318624,
"grad_norm": 3.580620050430298,
"learning_rate": 9.389576230693072e-06,
"loss": 0.8052,
"step": 3461
},
{
"epoch": 1.0050805632167223,
"grad_norm": 3.489169120788574,
"learning_rate": 9.389116389064454e-06,
"loss": 0.7664,
"step": 3462
},
{
"epoch": 1.0053708811148208,
"grad_norm": 3.3632068634033203,
"learning_rate": 9.388656385566967e-06,
"loss": 0.8239,
"step": 3463
},
{
"epoch": 1.0056611990129192,
"grad_norm": 3.4779982566833496,
"learning_rate": 9.388196220217574e-06,
"loss": 0.7322,
"step": 3464
},
{
"epoch": 1.0059515169110176,
"grad_norm": 3.2781569957733154,
"learning_rate": 9.387735893033244e-06,
"loss": 0.7248,
"step": 3465
},
{
"epoch": 1.006241834809116,
"grad_norm": 2.9717156887054443,
"learning_rate": 9.387275404030957e-06,
"loss": 0.6981,
"step": 3466
},
{
"epoch": 1.0065321527072144,
"grad_norm": 3.2096431255340576,
"learning_rate": 9.386814753227694e-06,
"loss": 0.691,
"step": 3467
},
{
"epoch": 1.0068224706053128,
"grad_norm": 3.4639768600463867,
"learning_rate": 9.386353940640442e-06,
"loss": 0.8206,
"step": 3468
},
{
"epoch": 1.0071127885034112,
"grad_norm": 3.3985044956207275,
"learning_rate": 9.3858929662862e-06,
"loss": 0.765,
"step": 3469
},
{
"epoch": 1.0074031064015097,
"grad_norm": 3.998185634613037,
"learning_rate": 9.385431830181963e-06,
"loss": 0.9247,
"step": 3470
},
{
"epoch": 1.007693424299608,
"grad_norm": 4.009119033813477,
"learning_rate": 9.384970532344744e-06,
"loss": 0.8434,
"step": 3471
},
{
"epoch": 1.0079837421977065,
"grad_norm": 3.4947729110717773,
"learning_rate": 9.38450907279155e-06,
"loss": 0.7898,
"step": 3472
},
{
"epoch": 1.008274060095805,
"grad_norm": 3.387531280517578,
"learning_rate": 9.3840474515394e-06,
"loss": 0.7269,
"step": 3473
},
{
"epoch": 1.0085643779939033,
"grad_norm": 3.3790810108184814,
"learning_rate": 9.383585668605321e-06,
"loss": 0.7782,
"step": 3474
},
{
"epoch": 1.0088546958920017,
"grad_norm": 3.5012128353118896,
"learning_rate": 9.383123724006343e-06,
"loss": 0.7547,
"step": 3475
},
{
"epoch": 1.0091450137901001,
"grad_norm": 3.605910539627075,
"learning_rate": 9.382661617759501e-06,
"loss": 0.7258,
"step": 3476
},
{
"epoch": 1.0094353316881985,
"grad_norm": 3.25126576423645,
"learning_rate": 9.382199349881838e-06,
"loss": 0.7431,
"step": 3477
},
{
"epoch": 1.009725649586297,
"grad_norm": 3.136561155319214,
"learning_rate": 9.3817369203904e-06,
"loss": 0.7035,
"step": 3478
},
{
"epoch": 1.0100159674843954,
"grad_norm": 3.353161334991455,
"learning_rate": 9.381274329302244e-06,
"loss": 0.6801,
"step": 3479
},
{
"epoch": 1.0103062853824938,
"grad_norm": 3.6189143657684326,
"learning_rate": 9.38081157663443e-06,
"loss": 0.702,
"step": 3480
},
{
"epoch": 1.0105966032805922,
"grad_norm": 3.855806350708008,
"learning_rate": 9.380348662404024e-06,
"loss": 0.7256,
"step": 3481
},
{
"epoch": 1.0108869211786906,
"grad_norm": 4.021921634674072,
"learning_rate": 9.379885586628098e-06,
"loss": 0.7767,
"step": 3482
},
{
"epoch": 1.011177239076789,
"grad_norm": 3.6086981296539307,
"learning_rate": 9.379422349323728e-06,
"loss": 0.8053,
"step": 3483
},
{
"epoch": 1.0114675569748874,
"grad_norm": 3.474881887435913,
"learning_rate": 9.378958950508001e-06,
"loss": 0.7292,
"step": 3484
},
{
"epoch": 1.0117578748729859,
"grad_norm": 3.6635396480560303,
"learning_rate": 9.378495390198005e-06,
"loss": 0.8161,
"step": 3485
},
{
"epoch": 1.0120481927710843,
"grad_norm": 3.293006420135498,
"learning_rate": 9.378031668410836e-06,
"loss": 0.6933,
"step": 3486
},
{
"epoch": 1.0123385106691827,
"grad_norm": 3.572141408920288,
"learning_rate": 9.377567785163597e-06,
"loss": 0.7402,
"step": 3487
},
{
"epoch": 1.012628828567281,
"grad_norm": 3.474271535873413,
"learning_rate": 9.377103740473396e-06,
"loss": 0.7938,
"step": 3488
},
{
"epoch": 1.0129191464653795,
"grad_norm": 3.1348941326141357,
"learning_rate": 9.376639534357346e-06,
"loss": 0.68,
"step": 3489
},
{
"epoch": 1.013209464363478,
"grad_norm": 3.269479990005493,
"learning_rate": 9.376175166832565e-06,
"loss": 0.7249,
"step": 3490
},
{
"epoch": 1.0134997822615763,
"grad_norm": 3.5079116821289062,
"learning_rate": 9.375710637916182e-06,
"loss": 0.8077,
"step": 3491
},
{
"epoch": 1.0137901001596747,
"grad_norm": 3.673961877822876,
"learning_rate": 9.375245947625326e-06,
"loss": 0.7918,
"step": 3492
},
{
"epoch": 1.0140804180577732,
"grad_norm": 3.6237893104553223,
"learning_rate": 9.374781095977137e-06,
"loss": 0.7134,
"step": 3493
},
{
"epoch": 1.0143707359558716,
"grad_norm": 3.540834903717041,
"learning_rate": 9.374316082988758e-06,
"loss": 0.8578,
"step": 3494
},
{
"epoch": 1.0146610538539702,
"grad_norm": 3.900315046310425,
"learning_rate": 9.373850908677335e-06,
"loss": 0.7959,
"step": 3495
},
{
"epoch": 1.0149513717520686,
"grad_norm": 3.6177544593811035,
"learning_rate": 9.373385573060028e-06,
"loss": 0.7218,
"step": 3496
},
{
"epoch": 1.015241689650167,
"grad_norm": 3.376136064529419,
"learning_rate": 9.372920076153996e-06,
"loss": 0.7929,
"step": 3497
},
{
"epoch": 1.0155320075482654,
"grad_norm": 3.4732577800750732,
"learning_rate": 9.372454417976407e-06,
"loss": 0.7308,
"step": 3498
},
{
"epoch": 1.0158223254463639,
"grad_norm": 3.1645116806030273,
"learning_rate": 9.371988598544434e-06,
"loss": 0.782,
"step": 3499
},
{
"epoch": 1.0161126433444623,
"grad_norm": 3.3945982456207275,
"learning_rate": 9.371522617875258e-06,
"loss": 0.826,
"step": 3500
},
{
"epoch": 1.0161126433444623,
"eval_loss": 1.20125412940979,
"eval_runtime": 13.5944,
"eval_samples_per_second": 29.424,
"eval_steps_per_second": 3.678,
"step": 3500
},
{
"epoch": 1.0164029612425607,
"grad_norm": 2.948904037475586,
"learning_rate": 9.371056475986062e-06,
"loss": 0.6515,
"step": 3501
},
{
"epoch": 1.016693279140659,
"grad_norm": 3.3584020137786865,
"learning_rate": 9.370590172894037e-06,
"loss": 0.757,
"step": 3502
},
{
"epoch": 1.0169835970387575,
"grad_norm": 3.512335777282715,
"learning_rate": 9.370123708616381e-06,
"loss": 0.7603,
"step": 3503
},
{
"epoch": 1.017273914936856,
"grad_norm": 3.194840908050537,
"learning_rate": 9.369657083170297e-06,
"loss": 0.6974,
"step": 3504
},
{
"epoch": 1.0175642328349543,
"grad_norm": 3.945988178253174,
"learning_rate": 9.369190296572994e-06,
"loss": 0.8559,
"step": 3505
},
{
"epoch": 1.0178545507330528,
"grad_norm": 3.3235080242156982,
"learning_rate": 9.368723348841687e-06,
"loss": 0.7431,
"step": 3506
},
{
"epoch": 1.0181448686311512,
"grad_norm": 5.137023448944092,
"learning_rate": 9.368256239993597e-06,
"loss": 0.8582,
"step": 3507
},
{
"epoch": 1.0184351865292496,
"grad_norm": 3.438002824783325,
"learning_rate": 9.367788970045947e-06,
"loss": 0.7126,
"step": 3508
},
{
"epoch": 1.018725504427348,
"grad_norm": 3.168781042098999,
"learning_rate": 9.367321539015977e-06,
"loss": 0.6956,
"step": 3509
},
{
"epoch": 1.0190158223254464,
"grad_norm": 3.538299322128296,
"learning_rate": 9.36685394692092e-06,
"loss": 0.6906,
"step": 3510
},
{
"epoch": 1.0193061402235448,
"grad_norm": 3.4802918434143066,
"learning_rate": 9.366386193778023e-06,
"loss": 0.6902,
"step": 3511
},
{
"epoch": 1.0195964581216432,
"grad_norm": 2.9979684352874756,
"learning_rate": 9.365918279604536e-06,
"loss": 0.744,
"step": 3512
},
{
"epoch": 1.0198867760197416,
"grad_norm": 3.391887664794922,
"learning_rate": 9.365450204417714e-06,
"loss": 0.7245,
"step": 3513
},
{
"epoch": 1.02017709391784,
"grad_norm": 3.5485000610351562,
"learning_rate": 9.364981968234823e-06,
"loss": 0.8228,
"step": 3514
},
{
"epoch": 1.0204674118159385,
"grad_norm": 3.3044545650482178,
"learning_rate": 9.364513571073129e-06,
"loss": 0.6746,
"step": 3515
},
{
"epoch": 1.0207577297140369,
"grad_norm": 3.7134881019592285,
"learning_rate": 9.364045012949904e-06,
"loss": 0.8221,
"step": 3516
},
{
"epoch": 1.0210480476121353,
"grad_norm": 3.1946160793304443,
"learning_rate": 9.363576293882432e-06,
"loss": 0.7176,
"step": 3517
},
{
"epoch": 1.0213383655102337,
"grad_norm": 3.5327720642089844,
"learning_rate": 9.363107413887999e-06,
"loss": 0.7192,
"step": 3518
},
{
"epoch": 1.0216286834083321,
"grad_norm": 3.509906053543091,
"learning_rate": 9.362638372983894e-06,
"loss": 0.7382,
"step": 3519
},
{
"epoch": 1.0219190013064305,
"grad_norm": 3.8694610595703125,
"learning_rate": 9.362169171187419e-06,
"loss": 0.7577,
"step": 3520
},
{
"epoch": 1.022209319204529,
"grad_norm": 3.1691699028015137,
"learning_rate": 9.361699808515877e-06,
"loss": 0.7352,
"step": 3521
},
{
"epoch": 1.0224996371026274,
"grad_norm": 3.552873134613037,
"learning_rate": 9.361230284986573e-06,
"loss": 0.8043,
"step": 3522
},
{
"epoch": 1.0227899550007258,
"grad_norm": 3.761043071746826,
"learning_rate": 9.36076060061683e-06,
"loss": 0.9012,
"step": 3523
},
{
"epoch": 1.0230802728988242,
"grad_norm": 3.4257898330688477,
"learning_rate": 9.360290755423966e-06,
"loss": 0.7829,
"step": 3524
},
{
"epoch": 1.0233705907969226,
"grad_norm": 3.318141460418701,
"learning_rate": 9.359820749425308e-06,
"loss": 0.6867,
"step": 3525
},
{
"epoch": 1.023660908695021,
"grad_norm": 3.2003114223480225,
"learning_rate": 9.359350582638193e-06,
"loss": 0.7361,
"step": 3526
},
{
"epoch": 1.0239512265931194,
"grad_norm": 3.5448482036590576,
"learning_rate": 9.358880255079957e-06,
"loss": 0.7987,
"step": 3527
},
{
"epoch": 1.0242415444912178,
"grad_norm": 3.18243145942688,
"learning_rate": 9.358409766767946e-06,
"loss": 0.7502,
"step": 3528
},
{
"epoch": 1.0245318623893163,
"grad_norm": 3.511103868484497,
"learning_rate": 9.357939117719515e-06,
"loss": 0.6952,
"step": 3529
},
{
"epoch": 1.0248221802874147,
"grad_norm": 3.4447379112243652,
"learning_rate": 9.357468307952019e-06,
"loss": 0.7581,
"step": 3530
},
{
"epoch": 1.025112498185513,
"grad_norm": 4.462029933929443,
"learning_rate": 9.356997337482818e-06,
"loss": 0.9036,
"step": 3531
},
{
"epoch": 1.0254028160836115,
"grad_norm": 4.024928092956543,
"learning_rate": 9.356526206329285e-06,
"loss": 0.7405,
"step": 3532
},
{
"epoch": 1.02569313398171,
"grad_norm": 3.3090834617614746,
"learning_rate": 9.356054914508796e-06,
"loss": 0.6529,
"step": 3533
},
{
"epoch": 1.0259834518798083,
"grad_norm": 3.7456352710723877,
"learning_rate": 9.355583462038728e-06,
"loss": 0.8039,
"step": 3534
},
{
"epoch": 1.0262737697779067,
"grad_norm": 3.3236465454101562,
"learning_rate": 9.355111848936472e-06,
"loss": 0.749,
"step": 3535
},
{
"epoch": 1.0265640876760052,
"grad_norm": 3.631131887435913,
"learning_rate": 9.354640075219419e-06,
"loss": 0.7229,
"step": 3536
},
{
"epoch": 1.0268544055741036,
"grad_norm": 3.345919132232666,
"learning_rate": 9.35416814090497e-06,
"loss": 0.689,
"step": 3537
},
{
"epoch": 1.027144723472202,
"grad_norm": 3.5057573318481445,
"learning_rate": 9.353696046010524e-06,
"loss": 0.6877,
"step": 3538
},
{
"epoch": 1.0274350413703004,
"grad_norm": 3.5284013748168945,
"learning_rate": 9.353223790553499e-06,
"loss": 0.7665,
"step": 3539
},
{
"epoch": 1.0277253592683988,
"grad_norm": 3.2629342079162598,
"learning_rate": 9.352751374551305e-06,
"loss": 0.7404,
"step": 3540
},
{
"epoch": 1.0280156771664972,
"grad_norm": 3.636103630065918,
"learning_rate": 9.35227879802137e-06,
"loss": 0.7259,
"step": 3541
},
{
"epoch": 1.0283059950645956,
"grad_norm": 3.3388805389404297,
"learning_rate": 9.35180606098112e-06,
"loss": 0.8092,
"step": 3542
},
{
"epoch": 1.028596312962694,
"grad_norm": 3.710493326187134,
"learning_rate": 9.351333163447989e-06,
"loss": 0.7778,
"step": 3543
},
{
"epoch": 1.0288866308607925,
"grad_norm": 3.360016345977783,
"learning_rate": 9.350860105439416e-06,
"loss": 0.8075,
"step": 3544
},
{
"epoch": 1.029176948758891,
"grad_norm": 3.6781256198883057,
"learning_rate": 9.35038688697285e-06,
"loss": 0.835,
"step": 3545
},
{
"epoch": 1.0294672666569895,
"grad_norm": 3.3641157150268555,
"learning_rate": 9.349913508065743e-06,
"loss": 0.8336,
"step": 3546
},
{
"epoch": 1.029757584555088,
"grad_norm": 3.334789752960205,
"learning_rate": 9.349439968735551e-06,
"loss": 0.6987,
"step": 3547
},
{
"epoch": 1.0300479024531863,
"grad_norm": 4.041718482971191,
"learning_rate": 9.34896626899974e-06,
"loss": 0.6966,
"step": 3548
},
{
"epoch": 1.0303382203512848,
"grad_norm": 3.1009633541107178,
"learning_rate": 9.348492408875779e-06,
"loss": 0.6439,
"step": 3549
},
{
"epoch": 1.0306285382493832,
"grad_norm": 3.5959973335266113,
"learning_rate": 9.348018388381142e-06,
"loss": 0.7712,
"step": 3550
},
{
"epoch": 1.0309188561474816,
"grad_norm": 3.441721200942993,
"learning_rate": 9.347544207533315e-06,
"loss": 0.6931,
"step": 3551
},
{
"epoch": 1.03120917404558,
"grad_norm": 3.2447519302368164,
"learning_rate": 9.34706986634978e-06,
"loss": 0.6328,
"step": 3552
},
{
"epoch": 1.0314994919436784,
"grad_norm": 3.586515188217163,
"learning_rate": 9.346595364848035e-06,
"loss": 0.8278,
"step": 3553
},
{
"epoch": 1.0317898098417768,
"grad_norm": 3.604525327682495,
"learning_rate": 9.346120703045576e-06,
"loss": 0.8527,
"step": 3554
},
{
"epoch": 1.0320801277398752,
"grad_norm": 3.301090717315674,
"learning_rate": 9.345645880959912e-06,
"loss": 0.6894,
"step": 3555
},
{
"epoch": 1.0323704456379736,
"grad_norm": 3.388200044631958,
"learning_rate": 9.345170898608553e-06,
"loss": 0.7878,
"step": 3556
},
{
"epoch": 1.032660763536072,
"grad_norm": 3.0278408527374268,
"learning_rate": 9.344695756009013e-06,
"loss": 0.7222,
"step": 3557
},
{
"epoch": 1.0329510814341705,
"grad_norm": 3.474755048751831,
"learning_rate": 9.344220453178821e-06,
"loss": 0.7424,
"step": 3558
},
{
"epoch": 1.0332413993322689,
"grad_norm": 3.2388312816619873,
"learning_rate": 9.3437449901355e-06,
"loss": 0.7189,
"step": 3559
},
{
"epoch": 1.0335317172303673,
"grad_norm": 3.3592824935913086,
"learning_rate": 9.343269366896588e-06,
"loss": 0.7467,
"step": 3560
},
{
"epoch": 1.0338220351284657,
"grad_norm": 3.4638187885284424,
"learning_rate": 9.342793583479625e-06,
"loss": 0.7311,
"step": 3561
},
{
"epoch": 1.0341123530265641,
"grad_norm": 3.9923784732818604,
"learning_rate": 9.342317639902158e-06,
"loss": 0.944,
"step": 3562
},
{
"epoch": 1.0344026709246625,
"grad_norm": 3.471781015396118,
"learning_rate": 9.341841536181742e-06,
"loss": 0.7335,
"step": 3563
},
{
"epoch": 1.034692988822761,
"grad_norm": 3.4282989501953125,
"learning_rate": 9.341365272335932e-06,
"loss": 0.8669,
"step": 3564
},
{
"epoch": 1.0349833067208594,
"grad_norm": 3.347621440887451,
"learning_rate": 9.340888848382292e-06,
"loss": 0.72,
"step": 3565
},
{
"epoch": 1.0352736246189578,
"grad_norm": 3.4983551502227783,
"learning_rate": 9.340412264338394e-06,
"loss": 0.8129,
"step": 3566
},
{
"epoch": 1.0355639425170562,
"grad_norm": 3.236875534057617,
"learning_rate": 9.339935520221816e-06,
"loss": 0.7324,
"step": 3567
},
{
"epoch": 1.0358542604151546,
"grad_norm": 3.8020715713500977,
"learning_rate": 9.339458616050137e-06,
"loss": 0.6812,
"step": 3568
},
{
"epoch": 1.036144578313253,
"grad_norm": 3.6184334754943848,
"learning_rate": 9.338981551840947e-06,
"loss": 0.6708,
"step": 3569
},
{
"epoch": 1.0364348962113514,
"grad_norm": 3.225571632385254,
"learning_rate": 9.338504327611839e-06,
"loss": 0.719,
"step": 3570
},
{
"epoch": 1.0367252141094498,
"grad_norm": 3.2746708393096924,
"learning_rate": 9.338026943380413e-06,
"loss": 0.7274,
"step": 3571
},
{
"epoch": 1.0370155320075483,
"grad_norm": 3.2747983932495117,
"learning_rate": 9.337549399164274e-06,
"loss": 0.7414,
"step": 3572
},
{
"epoch": 1.0373058499056467,
"grad_norm": 3.33699369430542,
"learning_rate": 9.337071694981038e-06,
"loss": 0.7898,
"step": 3573
},
{
"epoch": 1.037596167803745,
"grad_norm": 3.4813315868377686,
"learning_rate": 9.336593830848315e-06,
"loss": 0.6973,
"step": 3574
},
{
"epoch": 1.0378864857018435,
"grad_norm": 2.953972339630127,
"learning_rate": 9.336115806783734e-06,
"loss": 0.6768,
"step": 3575
},
{
"epoch": 1.038176803599942,
"grad_norm": 3.2962663173675537,
"learning_rate": 9.335637622804922e-06,
"loss": 0.7336,
"step": 3576
},
{
"epoch": 1.0384671214980403,
"grad_norm": 3.4844980239868164,
"learning_rate": 9.335159278929516e-06,
"loss": 0.7695,
"step": 3577
},
{
"epoch": 1.0387574393961387,
"grad_norm": 3.954115152359009,
"learning_rate": 9.334680775175154e-06,
"loss": 0.9909,
"step": 3578
},
{
"epoch": 1.0390477572942372,
"grad_norm": 3.5708436965942383,
"learning_rate": 9.334202111559487e-06,
"loss": 0.7544,
"step": 3579
},
{
"epoch": 1.0393380751923356,
"grad_norm": 2.7870044708251953,
"learning_rate": 9.333723288100167e-06,
"loss": 0.6855,
"step": 3580
},
{
"epoch": 1.039628393090434,
"grad_norm": 3.445352554321289,
"learning_rate": 9.33324430481485e-06,
"loss": 0.6626,
"step": 3581
},
{
"epoch": 1.0399187109885324,
"grad_norm": 3.9660799503326416,
"learning_rate": 9.332765161721203e-06,
"loss": 0.8,
"step": 3582
},
{
"epoch": 1.0402090288866308,
"grad_norm": 4.004605293273926,
"learning_rate": 9.332285858836898e-06,
"loss": 0.8748,
"step": 3583
},
{
"epoch": 1.0404993467847292,
"grad_norm": 3.285799980163574,
"learning_rate": 9.331806396179607e-06,
"loss": 0.8192,
"step": 3584
},
{
"epoch": 1.0407896646828276,
"grad_norm": 3.3582661151885986,
"learning_rate": 9.331326773767018e-06,
"loss": 0.6696,
"step": 3585
},
{
"epoch": 1.041079982580926,
"grad_norm": 3.596374273300171,
"learning_rate": 9.330846991616814e-06,
"loss": 0.7014,
"step": 3586
},
{
"epoch": 1.0413703004790245,
"grad_norm": 3.59114408493042,
"learning_rate": 9.330367049746693e-06,
"loss": 0.7166,
"step": 3587
},
{
"epoch": 1.0416606183771229,
"grad_norm": 3.740971565246582,
"learning_rate": 9.329886948174353e-06,
"loss": 0.7826,
"step": 3588
},
{
"epoch": 1.0419509362752213,
"grad_norm": 3.2020390033721924,
"learning_rate": 9.329406686917502e-06,
"loss": 0.643,
"step": 3589
},
{
"epoch": 1.0422412541733197,
"grad_norm": 3.364518404006958,
"learning_rate": 9.328926265993849e-06,
"loss": 0.8063,
"step": 3590
},
{
"epoch": 1.0425315720714181,
"grad_norm": 3.603043556213379,
"learning_rate": 9.328445685421113e-06,
"loss": 0.6926,
"step": 3591
},
{
"epoch": 1.0428218899695165,
"grad_norm": 3.999770164489746,
"learning_rate": 9.327964945217018e-06,
"loss": 0.8984,
"step": 3592
},
{
"epoch": 1.043112207867615,
"grad_norm": 3.8826112747192383,
"learning_rate": 9.327484045399294e-06,
"loss": 0.7575,
"step": 3593
},
{
"epoch": 1.0434025257657134,
"grad_norm": 3.6010074615478516,
"learning_rate": 9.327002985985676e-06,
"loss": 0.8438,
"step": 3594
},
{
"epoch": 1.0436928436638118,
"grad_norm": 3.9782824516296387,
"learning_rate": 9.326521766993904e-06,
"loss": 0.7927,
"step": 3595
},
{
"epoch": 1.0439831615619104,
"grad_norm": 3.465355157852173,
"learning_rate": 9.326040388441727e-06,
"loss": 0.6731,
"step": 3596
},
{
"epoch": 1.0442734794600088,
"grad_norm": 3.5577354431152344,
"learning_rate": 9.325558850346897e-06,
"loss": 0.8736,
"step": 3597
},
{
"epoch": 1.0445637973581072,
"grad_norm": 3.6358604431152344,
"learning_rate": 9.325077152727173e-06,
"loss": 0.7572,
"step": 3598
},
{
"epoch": 1.0448541152562056,
"grad_norm": 3.9167327880859375,
"learning_rate": 9.324595295600318e-06,
"loss": 0.7054,
"step": 3599
},
{
"epoch": 1.045144433154304,
"grad_norm": 4.315560340881348,
"learning_rate": 9.324113278984108e-06,
"loss": 0.9471,
"step": 3600
},
{
"epoch": 1.0454347510524025,
"grad_norm": 3.8556084632873535,
"learning_rate": 9.323631102896314e-06,
"loss": 0.7213,
"step": 3601
},
{
"epoch": 1.0457250689505009,
"grad_norm": 3.4413363933563232,
"learning_rate": 9.323148767354721e-06,
"loss": 0.7063,
"step": 3602
},
{
"epoch": 1.0460153868485993,
"grad_norm": 3.2421858310699463,
"learning_rate": 9.322666272377119e-06,
"loss": 0.7034,
"step": 3603
},
{
"epoch": 1.0463057047466977,
"grad_norm": 3.6639201641082764,
"learning_rate": 9.322183617981297e-06,
"loss": 0.8093,
"step": 3604
},
{
"epoch": 1.0465960226447961,
"grad_norm": 3.616205930709839,
"learning_rate": 9.321700804185061e-06,
"loss": 0.7865,
"step": 3605
},
{
"epoch": 1.0468863405428945,
"grad_norm": 3.593491554260254,
"learning_rate": 9.321217831006214e-06,
"loss": 0.8386,
"step": 3606
},
{
"epoch": 1.047176658440993,
"grad_norm": 3.3423163890838623,
"learning_rate": 9.320734698462569e-06,
"loss": 0.7197,
"step": 3607
},
{
"epoch": 1.0474669763390914,
"grad_norm": 3.197126865386963,
"learning_rate": 9.32025140657194e-06,
"loss": 0.6785,
"step": 3608
},
{
"epoch": 1.0477572942371898,
"grad_norm": 3.575289487838745,
"learning_rate": 9.319767955352154e-06,
"loss": 0.7922,
"step": 3609
},
{
"epoch": 1.0480476121352882,
"grad_norm": 3.8259365558624268,
"learning_rate": 9.319284344821042e-06,
"loss": 0.7762,
"step": 3610
},
{
"epoch": 1.0483379300333866,
"grad_norm": 3.8167777061462402,
"learning_rate": 9.318800574996437e-06,
"loss": 0.9812,
"step": 3611
},
{
"epoch": 1.048628247931485,
"grad_norm": 3.700352430343628,
"learning_rate": 9.318316645896182e-06,
"loss": 0.7656,
"step": 3612
},
{
"epoch": 1.0489185658295834,
"grad_norm": 3.7808494567871094,
"learning_rate": 9.31783255753812e-06,
"loss": 0.6732,
"step": 3613
},
{
"epoch": 1.0492088837276818,
"grad_norm": 3.1864030361175537,
"learning_rate": 9.317348309940109e-06,
"loss": 0.664,
"step": 3614
},
{
"epoch": 1.0494992016257803,
"grad_norm": 3.409240245819092,
"learning_rate": 9.316863903120004e-06,
"loss": 0.7459,
"step": 3615
},
{
"epoch": 1.0497895195238787,
"grad_norm": 3.466313362121582,
"learning_rate": 9.316379337095671e-06,
"loss": 0.7646,
"step": 3616
},
{
"epoch": 1.050079837421977,
"grad_norm": 3.3947641849517822,
"learning_rate": 9.315894611884982e-06,
"loss": 0.7207,
"step": 3617
},
{
"epoch": 1.0503701553200755,
"grad_norm": 3.1996078491210938,
"learning_rate": 9.315409727505813e-06,
"loss": 0.6923,
"step": 3618
},
{
"epoch": 1.050660473218174,
"grad_norm": 3.2390217781066895,
"learning_rate": 9.314924683976044e-06,
"loss": 0.6493,
"step": 3619
},
{
"epoch": 1.0509507911162723,
"grad_norm": 3.375798225402832,
"learning_rate": 9.314439481313567e-06,
"loss": 0.7514,
"step": 3620
},
{
"epoch": 1.0512411090143707,
"grad_norm": 3.334712028503418,
"learning_rate": 9.313954119536273e-06,
"loss": 0.7673,
"step": 3621
},
{
"epoch": 1.0515314269124691,
"grad_norm": 3.1791110038757324,
"learning_rate": 9.313468598662063e-06,
"loss": 0.6983,
"step": 3622
},
{
"epoch": 1.0518217448105676,
"grad_norm": 3.7215187549591064,
"learning_rate": 9.312982918708843e-06,
"loss": 0.878,
"step": 3623
},
{
"epoch": 1.052112062708666,
"grad_norm": 3.428053617477417,
"learning_rate": 9.312497079694524e-06,
"loss": 0.7427,
"step": 3624
},
{
"epoch": 1.0524023806067644,
"grad_norm": 3.332998752593994,
"learning_rate": 9.312011081637025e-06,
"loss": 0.6933,
"step": 3625
},
{
"epoch": 1.0526926985048628,
"grad_norm": 3.5585575103759766,
"learning_rate": 9.311524924554268e-06,
"loss": 0.7643,
"step": 3626
},
{
"epoch": 1.0529830164029612,
"grad_norm": 3.3463525772094727,
"learning_rate": 9.311038608464183e-06,
"loss": 0.6914,
"step": 3627
},
{
"epoch": 1.0532733343010596,
"grad_norm": 3.7298991680145264,
"learning_rate": 9.310552133384703e-06,
"loss": 0.8181,
"step": 3628
},
{
"epoch": 1.053563652199158,
"grad_norm": 3.674640655517578,
"learning_rate": 9.310065499333773e-06,
"loss": 0.7731,
"step": 3629
},
{
"epoch": 1.0538539700972565,
"grad_norm": 3.8359897136688232,
"learning_rate": 9.309578706329338e-06,
"loss": 0.79,
"step": 3630
},
{
"epoch": 1.0541442879953549,
"grad_norm": 3.7508792877197266,
"learning_rate": 9.30909175438935e-06,
"loss": 0.7695,
"step": 3631
},
{
"epoch": 1.0544346058934533,
"grad_norm": 3.3596932888031006,
"learning_rate": 9.308604643531767e-06,
"loss": 0.7073,
"step": 3632
},
{
"epoch": 1.0547249237915517,
"grad_norm": 3.5916035175323486,
"learning_rate": 9.308117373774555e-06,
"loss": 0.7361,
"step": 3633
},
{
"epoch": 1.05501524168965,
"grad_norm": 3.477250576019287,
"learning_rate": 9.307629945135686e-06,
"loss": 0.6548,
"step": 3634
},
{
"epoch": 1.0553055595877485,
"grad_norm": 3.5962586402893066,
"learning_rate": 9.307142357633132e-06,
"loss": 0.8024,
"step": 3635
},
{
"epoch": 1.055595877485847,
"grad_norm": 3.7356138229370117,
"learning_rate": 9.306654611284878e-06,
"loss": 0.7214,
"step": 3636
},
{
"epoch": 1.0558861953839453,
"grad_norm": 3.799440860748291,
"learning_rate": 9.30616670610891e-06,
"loss": 0.7302,
"step": 3637
},
{
"epoch": 1.0561765132820438,
"grad_norm": 4.045415878295898,
"learning_rate": 9.305678642123224e-06,
"loss": 0.8737,
"step": 3638
},
{
"epoch": 1.0564668311801422,
"grad_norm": 3.4359524250030518,
"learning_rate": 9.305190419345817e-06,
"loss": 0.6862,
"step": 3639
},
{
"epoch": 1.0567571490782406,
"grad_norm": 3.230022430419922,
"learning_rate": 9.304702037794696e-06,
"loss": 0.7209,
"step": 3640
},
{
"epoch": 1.057047466976339,
"grad_norm": 3.462850570678711,
"learning_rate": 9.304213497487873e-06,
"loss": 0.7218,
"step": 3641
},
{
"epoch": 1.0573377848744374,
"grad_norm": 4.064338684082031,
"learning_rate": 9.303724798443362e-06,
"loss": 0.915,
"step": 3642
},
{
"epoch": 1.0576281027725358,
"grad_norm": 3.556943416595459,
"learning_rate": 9.303235940679192e-06,
"loss": 0.7198,
"step": 3643
},
{
"epoch": 1.0579184206706342,
"grad_norm": 3.441154718399048,
"learning_rate": 9.302746924213386e-06,
"loss": 0.7755,
"step": 3644
},
{
"epoch": 1.0582087385687329,
"grad_norm": 3.428337812423706,
"learning_rate": 9.302257749063981e-06,
"loss": 0.7677,
"step": 3645
},
{
"epoch": 1.058499056466831,
"grad_norm": 3.435852289199829,
"learning_rate": 9.301768415249017e-06,
"loss": 0.7581,
"step": 3646
},
{
"epoch": 1.0587893743649297,
"grad_norm": 3.674840211868286,
"learning_rate": 9.301278922786543e-06,
"loss": 0.7458,
"step": 3647
},
{
"epoch": 1.0590796922630281,
"grad_norm": 3.1479077339172363,
"learning_rate": 9.300789271694607e-06,
"loss": 0.7086,
"step": 3648
},
{
"epoch": 1.0593700101611265,
"grad_norm": 3.5983262062072754,
"learning_rate": 9.30029946199127e-06,
"loss": 0.8011,
"step": 3649
},
{
"epoch": 1.059660328059225,
"grad_norm": 3.4171347618103027,
"learning_rate": 9.299809493694597e-06,
"loss": 0.7957,
"step": 3650
},
{
"epoch": 1.0599506459573234,
"grad_norm": 3.0307910442352295,
"learning_rate": 9.299319366822654e-06,
"loss": 0.6833,
"step": 3651
},
{
"epoch": 1.0602409638554218,
"grad_norm": 3.349909543991089,
"learning_rate": 9.29882908139352e-06,
"loss": 0.6931,
"step": 3652
},
{
"epoch": 1.0605312817535202,
"grad_norm": 3.658194065093994,
"learning_rate": 9.298338637425276e-06,
"loss": 0.8358,
"step": 3653
},
{
"epoch": 1.0608215996516186,
"grad_norm": 3.7426369190216064,
"learning_rate": 9.297848034936007e-06,
"loss": 0.807,
"step": 3654
},
{
"epoch": 1.061111917549717,
"grad_norm": 3.262444019317627,
"learning_rate": 9.297357273943809e-06,
"loss": 0.7332,
"step": 3655
},
{
"epoch": 1.0614022354478154,
"grad_norm": 3.7360541820526123,
"learning_rate": 9.29686635446678e-06,
"loss": 0.8238,
"step": 3656
},
{
"epoch": 1.0616925533459138,
"grad_norm": 3.4465503692626953,
"learning_rate": 9.296375276523024e-06,
"loss": 0.8175,
"step": 3657
},
{
"epoch": 1.0619828712440122,
"grad_norm": 3.6495959758758545,
"learning_rate": 9.295884040130656e-06,
"loss": 0.7113,
"step": 3658
},
{
"epoch": 1.0622731891421107,
"grad_norm": 4.032883167266846,
"learning_rate": 9.295392645307786e-06,
"loss": 0.9692,
"step": 3659
},
{
"epoch": 1.062563507040209,
"grad_norm": 3.732147216796875,
"learning_rate": 9.294901092072541e-06,
"loss": 0.906,
"step": 3660
},
{
"epoch": 1.0628538249383075,
"grad_norm": 3.5926883220672607,
"learning_rate": 9.294409380443047e-06,
"loss": 0.7899,
"step": 3661
},
{
"epoch": 1.063144142836406,
"grad_norm": 3.649583578109741,
"learning_rate": 9.293917510437442e-06,
"loss": 0.7995,
"step": 3662
},
{
"epoch": 1.0634344607345043,
"grad_norm": 3.221046209335327,
"learning_rate": 9.293425482073862e-06,
"loss": 0.6568,
"step": 3663
},
{
"epoch": 1.0637247786326027,
"grad_norm": 3.4746248722076416,
"learning_rate": 9.292933295370452e-06,
"loss": 0.7059,
"step": 3664
},
{
"epoch": 1.0640150965307011,
"grad_norm": 3.3903510570526123,
"learning_rate": 9.292440950345367e-06,
"loss": 0.8072,
"step": 3665
},
{
"epoch": 1.0643054144287996,
"grad_norm": 3.920558452606201,
"learning_rate": 9.291948447016764e-06,
"loss": 0.8547,
"step": 3666
},
{
"epoch": 1.064595732326898,
"grad_norm": 3.2678873538970947,
"learning_rate": 9.291455785402806e-06,
"loss": 0.7555,
"step": 3667
},
{
"epoch": 1.0648860502249964,
"grad_norm": 3.292327404022217,
"learning_rate": 9.29096296552166e-06,
"loss": 0.7205,
"step": 3668
},
{
"epoch": 1.0651763681230948,
"grad_norm": 3.6426451206207275,
"learning_rate": 9.290469987391503e-06,
"loss": 0.8298,
"step": 3669
},
{
"epoch": 1.0654666860211932,
"grad_norm": 3.2656807899475098,
"learning_rate": 9.289976851030516e-06,
"loss": 0.7498,
"step": 3670
},
{
"epoch": 1.0657570039192916,
"grad_norm": 3.449364423751831,
"learning_rate": 9.289483556456883e-06,
"loss": 0.6807,
"step": 3671
},
{
"epoch": 1.06604732181739,
"grad_norm": 3.5260181427001953,
"learning_rate": 9.288990103688803e-06,
"loss": 0.7635,
"step": 3672
},
{
"epoch": 1.0663376397154885,
"grad_norm": 3.302656650543213,
"learning_rate": 9.288496492744466e-06,
"loss": 0.7195,
"step": 3673
},
{
"epoch": 1.0666279576135869,
"grad_norm": 3.234776258468628,
"learning_rate": 9.288002723642082e-06,
"loss": 0.7321,
"step": 3674
},
{
"epoch": 1.0669182755116853,
"grad_norm": 3.6483352184295654,
"learning_rate": 9.287508796399858e-06,
"loss": 0.7607,
"step": 3675
},
{
"epoch": 1.0672085934097837,
"grad_norm": 3.533311367034912,
"learning_rate": 9.287014711036013e-06,
"loss": 0.771,
"step": 3676
},
{
"epoch": 1.067498911307882,
"grad_norm": 3.86702036857605,
"learning_rate": 9.286520467568765e-06,
"loss": 0.7407,
"step": 3677
},
{
"epoch": 1.0677892292059805,
"grad_norm": 3.479646921157837,
"learning_rate": 9.286026066016344e-06,
"loss": 0.7384,
"step": 3678
},
{
"epoch": 1.068079547104079,
"grad_norm": 3.4313340187072754,
"learning_rate": 9.285531506396981e-06,
"loss": 0.7239,
"step": 3679
},
{
"epoch": 1.0683698650021773,
"grad_norm": 3.6296842098236084,
"learning_rate": 9.28503678872892e-06,
"loss": 0.739,
"step": 3680
},
{
"epoch": 1.0686601829002758,
"grad_norm": 3.3509602546691895,
"learning_rate": 9.2845419130304e-06,
"loss": 0.7174,
"step": 3681
},
{
"epoch": 1.0689505007983742,
"grad_norm": 3.4982831478118896,
"learning_rate": 9.284046879319675e-06,
"loss": 0.689,
"step": 3682
},
{
"epoch": 1.0692408186964726,
"grad_norm": 3.420058488845825,
"learning_rate": 9.283551687615002e-06,
"loss": 0.8226,
"step": 3683
},
{
"epoch": 1.069531136594571,
"grad_norm": 3.6235501766204834,
"learning_rate": 9.283056337934642e-06,
"loss": 0.8152,
"step": 3684
},
{
"epoch": 1.0698214544926694,
"grad_norm": 3.484602212905884,
"learning_rate": 9.282560830296864e-06,
"loss": 0.6969,
"step": 3685
},
{
"epoch": 1.0701117723907678,
"grad_norm": 3.5631332397460938,
"learning_rate": 9.282065164719942e-06,
"loss": 0.6524,
"step": 3686
},
{
"epoch": 1.0704020902888662,
"grad_norm": 3.950852155685425,
"learning_rate": 9.281569341222157e-06,
"loss": 0.8027,
"step": 3687
},
{
"epoch": 1.0706924081869647,
"grad_norm": 3.7912333011627197,
"learning_rate": 9.281073359821793e-06,
"loss": 0.7996,
"step": 3688
},
{
"epoch": 1.070982726085063,
"grad_norm": 3.653871774673462,
"learning_rate": 9.280577220537141e-06,
"loss": 0.8104,
"step": 3689
},
{
"epoch": 1.0712730439831615,
"grad_norm": 3.430440902709961,
"learning_rate": 9.280080923386501e-06,
"loss": 0.7232,
"step": 3690
},
{
"epoch": 1.07156336188126,
"grad_norm": 4.061392784118652,
"learning_rate": 9.279584468388176e-06,
"loss": 0.971,
"step": 3691
},
{
"epoch": 1.0718536797793583,
"grad_norm": 4.008795261383057,
"learning_rate": 9.279087855560474e-06,
"loss": 1.0048,
"step": 3692
},
{
"epoch": 1.0721439976774567,
"grad_norm": 3.099137306213379,
"learning_rate": 9.278591084921707e-06,
"loss": 0.6123,
"step": 3693
},
{
"epoch": 1.0724343155755551,
"grad_norm": 3.285714864730835,
"learning_rate": 9.278094156490201e-06,
"loss": 0.8407,
"step": 3694
},
{
"epoch": 1.0727246334736535,
"grad_norm": 3.100593090057373,
"learning_rate": 9.277597070284281e-06,
"loss": 0.6844,
"step": 3695
},
{
"epoch": 1.0730149513717522,
"grad_norm": 3.205623149871826,
"learning_rate": 9.277099826322277e-06,
"loss": 0.7021,
"step": 3696
},
{
"epoch": 1.0733052692698504,
"grad_norm": 3.03759765625,
"learning_rate": 9.27660242462253e-06,
"loss": 0.7171,
"step": 3697
},
{
"epoch": 1.073595587167949,
"grad_norm": 3.6206579208374023,
"learning_rate": 9.276104865203381e-06,
"loss": 0.7852,
"step": 3698
},
{
"epoch": 1.0738859050660474,
"grad_norm": 3.3694751262664795,
"learning_rate": 9.275607148083183e-06,
"loss": 0.7441,
"step": 3699
},
{
"epoch": 1.0741762229641458,
"grad_norm": 3.1286754608154297,
"learning_rate": 9.27510927328029e-06,
"loss": 0.6822,
"step": 3700
},
{
"epoch": 1.0744665408622442,
"grad_norm": 3.711529493331909,
"learning_rate": 9.274611240813062e-06,
"loss": 0.8291,
"step": 3701
},
{
"epoch": 1.0747568587603427,
"grad_norm": 3.498225688934326,
"learning_rate": 9.27411305069987e-06,
"loss": 0.7177,
"step": 3702
},
{
"epoch": 1.075047176658441,
"grad_norm": 3.874438524246216,
"learning_rate": 9.273614702959084e-06,
"loss": 0.8755,
"step": 3703
},
{
"epoch": 1.0753374945565395,
"grad_norm": 3.329667091369629,
"learning_rate": 9.273116197609085e-06,
"loss": 0.7263,
"step": 3704
},
{
"epoch": 1.075627812454638,
"grad_norm": 3.8230533599853516,
"learning_rate": 9.272617534668253e-06,
"loss": 0.7387,
"step": 3705
},
{
"epoch": 1.0759181303527363,
"grad_norm": 3.4294612407684326,
"learning_rate": 9.272118714154985e-06,
"loss": 0.7991,
"step": 3706
},
{
"epoch": 1.0762084482508347,
"grad_norm": 3.3059473037719727,
"learning_rate": 9.271619736087672e-06,
"loss": 0.678,
"step": 3707
},
{
"epoch": 1.0764987661489331,
"grad_norm": 3.165100336074829,
"learning_rate": 9.271120600484719e-06,
"loss": 0.7196,
"step": 3708
},
{
"epoch": 1.0767890840470316,
"grad_norm": 3.8009140491485596,
"learning_rate": 9.270621307364534e-06,
"loss": 0.9077,
"step": 3709
},
{
"epoch": 1.07707940194513,
"grad_norm": 3.789745330810547,
"learning_rate": 9.270121856745529e-06,
"loss": 0.8262,
"step": 3710
},
{
"epoch": 1.0773697198432284,
"grad_norm": 3.822162628173828,
"learning_rate": 9.269622248646124e-06,
"loss": 0.8806,
"step": 3711
},
{
"epoch": 1.0776600377413268,
"grad_norm": 3.407487392425537,
"learning_rate": 9.269122483084748e-06,
"loss": 0.7972,
"step": 3712
},
{
"epoch": 1.0779503556394252,
"grad_norm": 3.5224902629852295,
"learning_rate": 9.268622560079825e-06,
"loss": 0.8497,
"step": 3713
},
{
"epoch": 1.0782406735375236,
"grad_norm": 3.553903102874756,
"learning_rate": 9.268122479649796e-06,
"loss": 0.7534,
"step": 3714
},
{
"epoch": 1.078530991435622,
"grad_norm": 3.266307830810547,
"learning_rate": 9.267622241813106e-06,
"loss": 0.707,
"step": 3715
},
{
"epoch": 1.0788213093337204,
"grad_norm": 3.3318376541137695,
"learning_rate": 9.267121846588201e-06,
"loss": 0.7378,
"step": 3716
},
{
"epoch": 1.0791116272318189,
"grad_norm": 3.259420871734619,
"learning_rate": 9.266621293993534e-06,
"loss": 0.7609,
"step": 3717
},
{
"epoch": 1.0794019451299173,
"grad_norm": 3.658750295639038,
"learning_rate": 9.26612058404757e-06,
"loss": 0.8624,
"step": 3718
},
{
"epoch": 1.0796922630280157,
"grad_norm": 3.5097463130950928,
"learning_rate": 9.265619716768769e-06,
"loss": 0.7934,
"step": 3719
},
{
"epoch": 1.079982580926114,
"grad_norm": 3.147826671600342,
"learning_rate": 9.265118692175605e-06,
"loss": 0.7036,
"step": 3720
},
{
"epoch": 1.0802728988242125,
"grad_norm": 3.7938437461853027,
"learning_rate": 9.264617510286558e-06,
"loss": 0.788,
"step": 3721
},
{
"epoch": 1.080563216722311,
"grad_norm": 3.502878189086914,
"learning_rate": 9.26411617112011e-06,
"loss": 0.6864,
"step": 3722
},
{
"epoch": 1.0808535346204093,
"grad_norm": 3.6998252868652344,
"learning_rate": 9.263614674694748e-06,
"loss": 0.8459,
"step": 3723
},
{
"epoch": 1.0811438525185078,
"grad_norm": 3.7824223041534424,
"learning_rate": 9.26311302102897e-06,
"loss": 0.8461,
"step": 3724
},
{
"epoch": 1.0814341704166062,
"grad_norm": 3.34706711769104,
"learning_rate": 9.262611210141276e-06,
"loss": 0.8156,
"step": 3725
},
{
"epoch": 1.0817244883147046,
"grad_norm": 3.4476208686828613,
"learning_rate": 9.262109242050172e-06,
"loss": 0.7911,
"step": 3726
},
{
"epoch": 1.082014806212803,
"grad_norm": 3.309239149093628,
"learning_rate": 9.26160711677417e-06,
"loss": 0.7158,
"step": 3727
},
{
"epoch": 1.0823051241109014,
"grad_norm": 3.8136990070343018,
"learning_rate": 9.261104834331788e-06,
"loss": 0.7803,
"step": 3728
},
{
"epoch": 1.0825954420089998,
"grad_norm": 3.3151988983154297,
"learning_rate": 9.260602394741551e-06,
"loss": 0.7313,
"step": 3729
},
{
"epoch": 1.0828857599070982,
"grad_norm": 3.0309386253356934,
"learning_rate": 9.260099798021988e-06,
"loss": 0.6643,
"step": 3730
},
{
"epoch": 1.0831760778051966,
"grad_norm": 3.5916686058044434,
"learning_rate": 9.259597044191635e-06,
"loss": 0.7616,
"step": 3731
},
{
"epoch": 1.083466395703295,
"grad_norm": 4.077143669128418,
"learning_rate": 9.259094133269036e-06,
"loss": 0.7774,
"step": 3732
},
{
"epoch": 1.0837567136013935,
"grad_norm": 3.529888391494751,
"learning_rate": 9.258591065272733e-06,
"loss": 0.7659,
"step": 3733
},
{
"epoch": 1.0840470314994919,
"grad_norm": 3.5668489933013916,
"learning_rate": 9.258087840221281e-06,
"loss": 0.8392,
"step": 3734
},
{
"epoch": 1.0843373493975903,
"grad_norm": 3.344179153442383,
"learning_rate": 9.257584458133242e-06,
"loss": 0.858,
"step": 3735
},
{
"epoch": 1.0846276672956887,
"grad_norm": 4.286630630493164,
"learning_rate": 9.257080919027175e-06,
"loss": 0.8578,
"step": 3736
},
{
"epoch": 1.0849179851937871,
"grad_norm": 3.0358517169952393,
"learning_rate": 9.256577222921654e-06,
"loss": 0.7462,
"step": 3737
},
{
"epoch": 1.0852083030918855,
"grad_norm": 3.1172049045562744,
"learning_rate": 9.256073369835255e-06,
"loss": 0.5998,
"step": 3738
},
{
"epoch": 1.085498620989984,
"grad_norm": 3.513422727584839,
"learning_rate": 9.255569359786558e-06,
"loss": 0.7894,
"step": 3739
},
{
"epoch": 1.0857889388880824,
"grad_norm": 3.954484462738037,
"learning_rate": 9.255065192794153e-06,
"loss": 0.9343,
"step": 3740
},
{
"epoch": 1.0860792567861808,
"grad_norm": 3.374732255935669,
"learning_rate": 9.254560868876633e-06,
"loss": 0.6729,
"step": 3741
},
{
"epoch": 1.0863695746842792,
"grad_norm": 3.012810230255127,
"learning_rate": 9.254056388052593e-06,
"loss": 0.7632,
"step": 3742
},
{
"epoch": 1.0866598925823776,
"grad_norm": 3.49700927734375,
"learning_rate": 9.253551750340643e-06,
"loss": 0.6696,
"step": 3743
},
{
"epoch": 1.086950210480476,
"grad_norm": 3.2697410583496094,
"learning_rate": 9.253046955759394e-06,
"loss": 0.6528,
"step": 3744
},
{
"epoch": 1.0872405283785747,
"grad_norm": 3.7874979972839355,
"learning_rate": 9.25254200432746e-06,
"loss": 0.8849,
"step": 3745
},
{
"epoch": 1.0875308462766728,
"grad_norm": 3.744913101196289,
"learning_rate": 9.252036896063464e-06,
"loss": 0.8497,
"step": 3746
},
{
"epoch": 1.0878211641747715,
"grad_norm": 3.6657257080078125,
"learning_rate": 9.251531630986036e-06,
"loss": 0.8023,
"step": 3747
},
{
"epoch": 1.08811148207287,
"grad_norm": 3.5472493171691895,
"learning_rate": 9.251026209113806e-06,
"loss": 0.7415,
"step": 3748
},
{
"epoch": 1.0884017999709683,
"grad_norm": 3.7813925743103027,
"learning_rate": 9.250520630465419e-06,
"loss": 0.9681,
"step": 3749
},
{
"epoch": 1.0886921178690667,
"grad_norm": 3.2687952518463135,
"learning_rate": 9.250014895059518e-06,
"loss": 0.7353,
"step": 3750
},
{
"epoch": 1.0889824357671651,
"grad_norm": 3.2803022861480713,
"learning_rate": 9.249509002914752e-06,
"loss": 0.681,
"step": 3751
},
{
"epoch": 1.0892727536652635,
"grad_norm": 3.0684728622436523,
"learning_rate": 9.249002954049781e-06,
"loss": 0.7091,
"step": 3752
},
{
"epoch": 1.089563071563362,
"grad_norm": 3.981271982192993,
"learning_rate": 9.24849674848327e-06,
"loss": 0.8076,
"step": 3753
},
{
"epoch": 1.0898533894614604,
"grad_norm": 3.0908520221710205,
"learning_rate": 9.247990386233883e-06,
"loss": 0.7367,
"step": 3754
},
{
"epoch": 1.0901437073595588,
"grad_norm": 3.574917793273926,
"learning_rate": 9.247483867320295e-06,
"loss": 0.6696,
"step": 3755
},
{
"epoch": 1.0904340252576572,
"grad_norm": 3.577314853668213,
"learning_rate": 9.246977191761188e-06,
"loss": 0.8258,
"step": 3756
},
{
"epoch": 1.0907243431557556,
"grad_norm": 3.003840446472168,
"learning_rate": 9.246470359575249e-06,
"loss": 0.6683,
"step": 3757
},
{
"epoch": 1.091014661053854,
"grad_norm": 3.4558334350585938,
"learning_rate": 9.245963370781168e-06,
"loss": 0.7331,
"step": 3758
},
{
"epoch": 1.0913049789519524,
"grad_norm": 4.03562593460083,
"learning_rate": 9.245456225397642e-06,
"loss": 0.868,
"step": 3759
},
{
"epoch": 1.0915952968500509,
"grad_norm": 3.536433458328247,
"learning_rate": 9.244948923443376e-06,
"loss": 0.8345,
"step": 3760
},
{
"epoch": 1.0918856147481493,
"grad_norm": 3.7021758556365967,
"learning_rate": 9.244441464937077e-06,
"loss": 0.717,
"step": 3761
},
{
"epoch": 1.0921759326462477,
"grad_norm": 3.741546869277954,
"learning_rate": 9.243933849897462e-06,
"loss": 0.7938,
"step": 3762
},
{
"epoch": 1.092466250544346,
"grad_norm": 3.5878963470458984,
"learning_rate": 9.243426078343251e-06,
"loss": 0.8451,
"step": 3763
},
{
"epoch": 1.0927565684424445,
"grad_norm": 3.552255630493164,
"learning_rate": 9.242918150293169e-06,
"loss": 0.8474,
"step": 3764
},
{
"epoch": 1.093046886340543,
"grad_norm": 3.8845558166503906,
"learning_rate": 9.24241006576595e-06,
"loss": 0.7834,
"step": 3765
},
{
"epoch": 1.0933372042386413,
"grad_norm": 3.360624074935913,
"learning_rate": 9.241901824780331e-06,
"loss": 0.7395,
"step": 3766
},
{
"epoch": 1.0936275221367397,
"grad_norm": 3.2982327938079834,
"learning_rate": 9.241393427355056e-06,
"loss": 0.7452,
"step": 3767
},
{
"epoch": 1.0939178400348382,
"grad_norm": 3.23142409324646,
"learning_rate": 9.240884873508876e-06,
"loss": 0.6713,
"step": 3768
},
{
"epoch": 1.0942081579329366,
"grad_norm": 3.441584348678589,
"learning_rate": 9.240376163260545e-06,
"loss": 0.8075,
"step": 3769
},
{
"epoch": 1.094498475831035,
"grad_norm": 3.3424441814422607,
"learning_rate": 9.239867296628821e-06,
"loss": 0.7221,
"step": 3770
},
{
"epoch": 1.0947887937291334,
"grad_norm": 3.5608901977539062,
"learning_rate": 9.239358273632476e-06,
"loss": 0.8401,
"step": 3771
},
{
"epoch": 1.0950791116272318,
"grad_norm": 3.4727823734283447,
"learning_rate": 9.238849094290279e-06,
"loss": 0.7322,
"step": 3772
},
{
"epoch": 1.0953694295253302,
"grad_norm": 3.133427858352661,
"learning_rate": 9.238339758621011e-06,
"loss": 0.7485,
"step": 3773
},
{
"epoch": 1.0956597474234286,
"grad_norm": 3.073030710220337,
"learning_rate": 9.237830266643453e-06,
"loss": 0.6532,
"step": 3774
},
{
"epoch": 1.095950065321527,
"grad_norm": 3.06816029548645,
"learning_rate": 9.237320618376398e-06,
"loss": 0.6492,
"step": 3775
},
{
"epoch": 1.0962403832196255,
"grad_norm": 3.501046657562256,
"learning_rate": 9.23681081383864e-06,
"loss": 0.8614,
"step": 3776
},
{
"epoch": 1.0965307011177239,
"grad_norm": 3.766171455383301,
"learning_rate": 9.236300853048978e-06,
"loss": 0.8673,
"step": 3777
},
{
"epoch": 1.0968210190158223,
"grad_norm": 4.0137553215026855,
"learning_rate": 9.235790736026225e-06,
"loss": 0.788,
"step": 3778
},
{
"epoch": 1.0971113369139207,
"grad_norm": 3.591977834701538,
"learning_rate": 9.235280462789188e-06,
"loss": 0.7047,
"step": 3779
},
{
"epoch": 1.0974016548120191,
"grad_norm": 3.4781503677368164,
"learning_rate": 9.23477003335669e-06,
"loss": 0.7515,
"step": 3780
},
{
"epoch": 1.0976919727101175,
"grad_norm": 3.477678060531616,
"learning_rate": 9.234259447747554e-06,
"loss": 0.7738,
"step": 3781
},
{
"epoch": 1.097982290608216,
"grad_norm": 3.9467685222625732,
"learning_rate": 9.233748705980607e-06,
"loss": 0.856,
"step": 3782
},
{
"epoch": 1.0982726085063144,
"grad_norm": 3.463690996170044,
"learning_rate": 9.233237808074691e-06,
"loss": 0.7639,
"step": 3783
},
{
"epoch": 1.0985629264044128,
"grad_norm": 3.620694875717163,
"learning_rate": 9.232726754048643e-06,
"loss": 0.8162,
"step": 3784
},
{
"epoch": 1.0988532443025112,
"grad_norm": 3.6893718242645264,
"learning_rate": 9.232215543921313e-06,
"loss": 0.8336,
"step": 3785
},
{
"epoch": 1.0991435622006096,
"grad_norm": 3.620185613632202,
"learning_rate": 9.231704177711552e-06,
"loss": 0.8067,
"step": 3786
},
{
"epoch": 1.099433880098708,
"grad_norm": 3.3584699630737305,
"learning_rate": 9.231192655438222e-06,
"loss": 0.7664,
"step": 3787
},
{
"epoch": 1.0997241979968064,
"grad_norm": 3.5024573802948,
"learning_rate": 9.230680977120184e-06,
"loss": 0.7521,
"step": 3788
},
{
"epoch": 1.1000145158949048,
"grad_norm": 3.554534435272217,
"learning_rate": 9.230169142776311e-06,
"loss": 0.7894,
"step": 3789
},
{
"epoch": 1.1003048337930033,
"grad_norm": 3.831371784210205,
"learning_rate": 9.22965715242548e-06,
"loss": 0.8239,
"step": 3790
},
{
"epoch": 1.1005951516911017,
"grad_norm": 3.851170778274536,
"learning_rate": 9.22914500608657e-06,
"loss": 0.7703,
"step": 3791
},
{
"epoch": 1.1008854695892,
"grad_norm": 3.348322868347168,
"learning_rate": 9.22863270377847e-06,
"loss": 0.7707,
"step": 3792
},
{
"epoch": 1.1011757874872985,
"grad_norm": 3.3687806129455566,
"learning_rate": 9.228120245520076e-06,
"loss": 0.7372,
"step": 3793
},
{
"epoch": 1.101466105385397,
"grad_norm": 3.2010092735290527,
"learning_rate": 9.227607631330285e-06,
"loss": 0.6718,
"step": 3794
},
{
"epoch": 1.1017564232834953,
"grad_norm": 4.082772731781006,
"learning_rate": 9.227094861228e-06,
"loss": 0.8161,
"step": 3795
},
{
"epoch": 1.102046741181594,
"grad_norm": 3.4030327796936035,
"learning_rate": 9.226581935232135e-06,
"loss": 0.8786,
"step": 3796
},
{
"epoch": 1.1023370590796921,
"grad_norm": 3.6011297702789307,
"learning_rate": 9.226068853361607e-06,
"loss": 0.7148,
"step": 3797
},
{
"epoch": 1.1026273769777908,
"grad_norm": 3.7094037532806396,
"learning_rate": 9.225555615635336e-06,
"loss": 0.7745,
"step": 3798
},
{
"epoch": 1.1029176948758892,
"grad_norm": 3.2469165325164795,
"learning_rate": 9.225042222072251e-06,
"loss": 0.6453,
"step": 3799
},
{
"epoch": 1.1032080127739876,
"grad_norm": 3.479039430618286,
"learning_rate": 9.224528672691284e-06,
"loss": 0.7451,
"step": 3800
},
{
"epoch": 1.103498330672086,
"grad_norm": 3.0856449604034424,
"learning_rate": 9.224014967511378e-06,
"loss": 0.7583,
"step": 3801
},
{
"epoch": 1.1037886485701844,
"grad_norm": 3.4856984615325928,
"learning_rate": 9.223501106551475e-06,
"loss": 0.6975,
"step": 3802
},
{
"epoch": 1.1040789664682829,
"grad_norm": 3.5641419887542725,
"learning_rate": 9.222987089830528e-06,
"loss": 0.7357,
"step": 3803
},
{
"epoch": 1.1043692843663813,
"grad_norm": 3.8171226978302,
"learning_rate": 9.222472917367492e-06,
"loss": 0.8233,
"step": 3804
},
{
"epoch": 1.1046596022644797,
"grad_norm": 3.733131170272827,
"learning_rate": 9.22195858918133e-06,
"loss": 0.7833,
"step": 3805
},
{
"epoch": 1.104949920162578,
"grad_norm": 3.9596691131591797,
"learning_rate": 9.221444105291013e-06,
"loss": 0.8639,
"step": 3806
},
{
"epoch": 1.1052402380606765,
"grad_norm": 3.4496874809265137,
"learning_rate": 9.22092946571551e-06,
"loss": 0.7427,
"step": 3807
},
{
"epoch": 1.105530555958775,
"grad_norm": 3.837810754776001,
"learning_rate": 9.220414670473806e-06,
"loss": 0.7449,
"step": 3808
},
{
"epoch": 1.1058208738568733,
"grad_norm": 3.513516902923584,
"learning_rate": 9.219899719584882e-06,
"loss": 0.8359,
"step": 3809
},
{
"epoch": 1.1061111917549717,
"grad_norm": 3.4239394664764404,
"learning_rate": 9.21938461306773e-06,
"loss": 0.6477,
"step": 3810
},
{
"epoch": 1.1064015096530702,
"grad_norm": 3.192553758621216,
"learning_rate": 9.21886935094135e-06,
"loss": 0.6797,
"step": 3811
},
{
"epoch": 1.1066918275511686,
"grad_norm": 3.2809319496154785,
"learning_rate": 9.218353933224743e-06,
"loss": 0.7457,
"step": 3812
},
{
"epoch": 1.106982145449267,
"grad_norm": 3.670210361480713,
"learning_rate": 9.217838359936914e-06,
"loss": 0.6784,
"step": 3813
},
{
"epoch": 1.1072724633473654,
"grad_norm": 3.770373582839966,
"learning_rate": 9.21732263109688e-06,
"loss": 0.8031,
"step": 3814
},
{
"epoch": 1.1075627812454638,
"grad_norm": 3.9848551750183105,
"learning_rate": 9.216806746723666e-06,
"loss": 0.8274,
"step": 3815
},
{
"epoch": 1.1078530991435622,
"grad_norm": 3.6225457191467285,
"learning_rate": 9.216290706836288e-06,
"loss": 0.8351,
"step": 3816
},
{
"epoch": 1.1081434170416606,
"grad_norm": 3.5515317916870117,
"learning_rate": 9.215774511453784e-06,
"loss": 0.6946,
"step": 3817
},
{
"epoch": 1.108433734939759,
"grad_norm": 3.5677294731140137,
"learning_rate": 9.215258160595187e-06,
"loss": 0.7142,
"step": 3818
},
{
"epoch": 1.1087240528378575,
"grad_norm": 3.2002451419830322,
"learning_rate": 9.214741654279543e-06,
"loss": 0.7483,
"step": 3819
},
{
"epoch": 1.1090143707359559,
"grad_norm": 3.1444714069366455,
"learning_rate": 9.2142249925259e-06,
"loss": 0.7511,
"step": 3820
},
{
"epoch": 1.1093046886340543,
"grad_norm": 3.7607555389404297,
"learning_rate": 9.213708175353311e-06,
"loss": 0.6861,
"step": 3821
},
{
"epoch": 1.1095950065321527,
"grad_norm": 3.2420289516448975,
"learning_rate": 9.213191202780835e-06,
"loss": 0.6305,
"step": 3822
},
{
"epoch": 1.1098853244302511,
"grad_norm": 3.4901387691497803,
"learning_rate": 9.212674074827542e-06,
"loss": 0.8123,
"step": 3823
},
{
"epoch": 1.1101756423283495,
"grad_norm": 3.4428091049194336,
"learning_rate": 9.212156791512502e-06,
"loss": 0.8259,
"step": 3824
},
{
"epoch": 1.110465960226448,
"grad_norm": 3.0317587852478027,
"learning_rate": 9.211639352854786e-06,
"loss": 0.67,
"step": 3825
},
{
"epoch": 1.1107562781245464,
"grad_norm": 3.2258551120758057,
"learning_rate": 9.211121758873487e-06,
"loss": 0.7019,
"step": 3826
},
{
"epoch": 1.1110465960226448,
"grad_norm": 3.6131057739257812,
"learning_rate": 9.210604009587687e-06,
"loss": 0.8236,
"step": 3827
},
{
"epoch": 1.1113369139207432,
"grad_norm": 3.5522913932800293,
"learning_rate": 9.21008610501648e-06,
"loss": 0.7453,
"step": 3828
},
{
"epoch": 1.1116272318188416,
"grad_norm": 3.3678643703460693,
"learning_rate": 9.20956804517897e-06,
"loss": 0.7088,
"step": 3829
},
{
"epoch": 1.11191754971694,
"grad_norm": 3.779475688934326,
"learning_rate": 9.20904983009426e-06,
"loss": 0.7937,
"step": 3830
},
{
"epoch": 1.1122078676150384,
"grad_norm": 3.308375597000122,
"learning_rate": 9.208531459781464e-06,
"loss": 0.7086,
"step": 3831
},
{
"epoch": 1.1124981855131368,
"grad_norm": 3.5668954849243164,
"learning_rate": 9.208012934259697e-06,
"loss": 0.7745,
"step": 3832
},
{
"epoch": 1.1127885034112353,
"grad_norm": 3.0808634757995605,
"learning_rate": 9.207494253548084e-06,
"loss": 0.6845,
"step": 3833
},
{
"epoch": 1.1130788213093337,
"grad_norm": 3.044464349746704,
"learning_rate": 9.206975417665751e-06,
"loss": 0.7371,
"step": 3834
},
{
"epoch": 1.113369139207432,
"grad_norm": 3.4729931354522705,
"learning_rate": 9.206456426631836e-06,
"loss": 0.7245,
"step": 3835
},
{
"epoch": 1.1136594571055305,
"grad_norm": 3.503591775894165,
"learning_rate": 9.205937280465476e-06,
"loss": 0.7385,
"step": 3836
},
{
"epoch": 1.113949775003629,
"grad_norm": 3.2636380195617676,
"learning_rate": 9.205417979185818e-06,
"loss": 0.7385,
"step": 3837
},
{
"epoch": 1.1142400929017273,
"grad_norm": 4.049813747406006,
"learning_rate": 9.204898522812015e-06,
"loss": 0.8251,
"step": 3838
},
{
"epoch": 1.1145304107998257,
"grad_norm": 3.246598958969116,
"learning_rate": 9.204378911363222e-06,
"loss": 0.6892,
"step": 3839
},
{
"epoch": 1.1148207286979241,
"grad_norm": 3.6350643634796143,
"learning_rate": 9.203859144858604e-06,
"loss": 0.8535,
"step": 3840
},
{
"epoch": 1.1151110465960226,
"grad_norm": 3.558542251586914,
"learning_rate": 9.203339223317328e-06,
"loss": 0.8299,
"step": 3841
},
{
"epoch": 1.115401364494121,
"grad_norm": 3.504409074783325,
"learning_rate": 9.20281914675857e-06,
"loss": 0.7694,
"step": 3842
},
{
"epoch": 1.1156916823922194,
"grad_norm": 3.365307569503784,
"learning_rate": 9.20229891520151e-06,
"loss": 0.7959,
"step": 3843
},
{
"epoch": 1.1159820002903178,
"grad_norm": 3.161320447921753,
"learning_rate": 9.201778528665333e-06,
"loss": 0.6549,
"step": 3844
},
{
"epoch": 1.1162723181884162,
"grad_norm": 3.2018449306488037,
"learning_rate": 9.201257987169233e-06,
"loss": 0.6626,
"step": 3845
},
{
"epoch": 1.1165626360865146,
"grad_norm": 3.6142992973327637,
"learning_rate": 9.200737290732402e-06,
"loss": 0.7719,
"step": 3846
},
{
"epoch": 1.1168529539846133,
"grad_norm": 3.2540829181671143,
"learning_rate": 9.20021643937405e-06,
"loss": 0.6995,
"step": 3847
},
{
"epoch": 1.1171432718827115,
"grad_norm": 3.530956268310547,
"learning_rate": 9.19969543311338e-06,
"loss": 0.857,
"step": 3848
},
{
"epoch": 1.11743358978081,
"grad_norm": 3.8063101768493652,
"learning_rate": 9.199174271969612e-06,
"loss": 0.8972,
"step": 3849
},
{
"epoch": 1.1177239076789085,
"grad_norm": 3.33796763420105,
"learning_rate": 9.198652955961961e-06,
"loss": 0.7059,
"step": 3850
},
{
"epoch": 1.118014225577007,
"grad_norm": 3.4572362899780273,
"learning_rate": 9.198131485109656e-06,
"loss": 0.7459,
"step": 3851
},
{
"epoch": 1.1183045434751053,
"grad_norm": 4.223832607269287,
"learning_rate": 9.197609859431928e-06,
"loss": 0.8582,
"step": 3852
},
{
"epoch": 1.1185948613732037,
"grad_norm": 3.749410390853882,
"learning_rate": 9.197088078948013e-06,
"loss": 0.7968,
"step": 3853
},
{
"epoch": 1.1188851792713022,
"grad_norm": 3.3402292728424072,
"learning_rate": 9.196566143677157e-06,
"loss": 0.7766,
"step": 3854
},
{
"epoch": 1.1191754971694006,
"grad_norm": 3.567389488220215,
"learning_rate": 9.196044053638607e-06,
"loss": 0.8716,
"step": 3855
},
{
"epoch": 1.119465815067499,
"grad_norm": 3.3039045333862305,
"learning_rate": 9.195521808851615e-06,
"loss": 0.6931,
"step": 3856
},
{
"epoch": 1.1197561329655974,
"grad_norm": 3.9325478076934814,
"learning_rate": 9.194999409335446e-06,
"loss": 0.8135,
"step": 3857
},
{
"epoch": 1.1200464508636958,
"grad_norm": 3.852951765060425,
"learning_rate": 9.194476855109362e-06,
"loss": 0.8106,
"step": 3858
},
{
"epoch": 1.1203367687617942,
"grad_norm": 3.6040732860565186,
"learning_rate": 9.193954146192638e-06,
"loss": 0.766,
"step": 3859
},
{
"epoch": 1.1206270866598926,
"grad_norm": 3.2979674339294434,
"learning_rate": 9.193431282604547e-06,
"loss": 0.7364,
"step": 3860
},
{
"epoch": 1.120917404557991,
"grad_norm": 3.225715160369873,
"learning_rate": 9.192908264364377e-06,
"loss": 0.7519,
"step": 3861
},
{
"epoch": 1.1212077224560895,
"grad_norm": 3.7926652431488037,
"learning_rate": 9.192385091491411e-06,
"loss": 0.7857,
"step": 3862
},
{
"epoch": 1.1214980403541879,
"grad_norm": 3.2855775356292725,
"learning_rate": 9.19186176400495e-06,
"loss": 0.674,
"step": 3863
},
{
"epoch": 1.1217883582522863,
"grad_norm": 3.847721815109253,
"learning_rate": 9.191338281924288e-06,
"loss": 0.826,
"step": 3864
},
{
"epoch": 1.1220786761503847,
"grad_norm": 3.684709072113037,
"learning_rate": 9.190814645268735e-06,
"loss": 0.8217,
"step": 3865
},
{
"epoch": 1.1223689940484831,
"grad_norm": 3.2224950790405273,
"learning_rate": 9.1902908540576e-06,
"loss": 0.7144,
"step": 3866
},
{
"epoch": 1.1226593119465815,
"grad_norm": 3.4135384559631348,
"learning_rate": 9.1897669083102e-06,
"loss": 0.8183,
"step": 3867
},
{
"epoch": 1.12294962984468,
"grad_norm": 3.310356616973877,
"learning_rate": 9.189242808045862e-06,
"loss": 0.8442,
"step": 3868
},
{
"epoch": 1.1232399477427784,
"grad_norm": 3.4118008613586426,
"learning_rate": 9.188718553283912e-06,
"loss": 0.7003,
"step": 3869
},
{
"epoch": 1.1235302656408768,
"grad_norm": 3.467306613922119,
"learning_rate": 9.18819414404368e-06,
"loss": 0.6666,
"step": 3870
},
{
"epoch": 1.1238205835389752,
"grad_norm": 3.144047737121582,
"learning_rate": 9.187669580344512e-06,
"loss": 0.7123,
"step": 3871
},
{
"epoch": 1.1241109014370736,
"grad_norm": 3.6717677116394043,
"learning_rate": 9.187144862205753e-06,
"loss": 0.812,
"step": 3872
},
{
"epoch": 1.124401219335172,
"grad_norm": 4.038080215454102,
"learning_rate": 9.186619989646753e-06,
"loss": 0.7922,
"step": 3873
},
{
"epoch": 1.1246915372332704,
"grad_norm": 3.4617326259613037,
"learning_rate": 9.186094962686867e-06,
"loss": 0.7475,
"step": 3874
},
{
"epoch": 1.1249818551313688,
"grad_norm": 3.546358823776245,
"learning_rate": 9.18556978134546e-06,
"loss": 0.675,
"step": 3875
},
{
"epoch": 1.1252721730294672,
"grad_norm": 3.410590648651123,
"learning_rate": 9.185044445641902e-06,
"loss": 0.7824,
"step": 3876
},
{
"epoch": 1.1255624909275657,
"grad_norm": 3.880183458328247,
"learning_rate": 9.184518955595567e-06,
"loss": 0.9077,
"step": 3877
},
{
"epoch": 1.125852808825664,
"grad_norm": 3.3557281494140625,
"learning_rate": 9.18399331122583e-06,
"loss": 0.699,
"step": 3878
},
{
"epoch": 1.1261431267237625,
"grad_norm": 3.676377773284912,
"learning_rate": 9.183467512552082e-06,
"loss": 0.8433,
"step": 3879
},
{
"epoch": 1.126433444621861,
"grad_norm": 3.825648069381714,
"learning_rate": 9.182941559593713e-06,
"loss": 0.7285,
"step": 3880
},
{
"epoch": 1.1267237625199593,
"grad_norm": 3.3879647254943848,
"learning_rate": 9.182415452370119e-06,
"loss": 0.7921,
"step": 3881
},
{
"epoch": 1.1270140804180577,
"grad_norm": 3.6778652667999268,
"learning_rate": 9.181889190900702e-06,
"loss": 0.7911,
"step": 3882
},
{
"epoch": 1.1273043983161561,
"grad_norm": 3.598294734954834,
"learning_rate": 9.181362775204871e-06,
"loss": 0.7536,
"step": 3883
},
{
"epoch": 1.1275947162142546,
"grad_norm": 3.4838759899139404,
"learning_rate": 9.18083620530204e-06,
"loss": 0.8197,
"step": 3884
},
{
"epoch": 1.127885034112353,
"grad_norm": 3.5205631256103516,
"learning_rate": 9.180309481211629e-06,
"loss": 0.7828,
"step": 3885
},
{
"epoch": 1.1281753520104514,
"grad_norm": 3.924164295196533,
"learning_rate": 9.179782602953065e-06,
"loss": 0.7685,
"step": 3886
},
{
"epoch": 1.1284656699085498,
"grad_norm": 3.336639881134033,
"learning_rate": 9.179255570545775e-06,
"loss": 0.7275,
"step": 3887
},
{
"epoch": 1.1287559878066482,
"grad_norm": 3.553356885910034,
"learning_rate": 9.178728384009199e-06,
"loss": 0.7881,
"step": 3888
},
{
"epoch": 1.1290463057047466,
"grad_norm": 3.6561996936798096,
"learning_rate": 9.178201043362778e-06,
"loss": 0.876,
"step": 3889
},
{
"epoch": 1.129336623602845,
"grad_norm": 7.617891788482666,
"learning_rate": 9.177673548625962e-06,
"loss": 0.6766,
"step": 3890
},
{
"epoch": 1.1296269415009434,
"grad_norm": 3.3711862564086914,
"learning_rate": 9.177145899818203e-06,
"loss": 0.79,
"step": 3891
},
{
"epoch": 1.1299172593990419,
"grad_norm": 3.308711528778076,
"learning_rate": 9.17661809695896e-06,
"loss": 0.7435,
"step": 3892
},
{
"epoch": 1.1302075772971403,
"grad_norm": 3.669429063796997,
"learning_rate": 9.176090140067699e-06,
"loss": 0.647,
"step": 3893
},
{
"epoch": 1.130497895195239,
"grad_norm": 3.878659248352051,
"learning_rate": 9.175562029163892e-06,
"loss": 0.7192,
"step": 3894
},
{
"epoch": 1.130788213093337,
"grad_norm": 3.555819272994995,
"learning_rate": 9.175033764267013e-06,
"loss": 0.7141,
"step": 3895
},
{
"epoch": 1.1310785309914357,
"grad_norm": 3.896650791168213,
"learning_rate": 9.174505345396546e-06,
"loss": 0.823,
"step": 3896
},
{
"epoch": 1.131368848889534,
"grad_norm": 3.3993911743164062,
"learning_rate": 9.173976772571978e-06,
"loss": 0.7859,
"step": 3897
},
{
"epoch": 1.1316591667876326,
"grad_norm": 3.185831069946289,
"learning_rate": 9.173448045812806e-06,
"loss": 0.8121,
"step": 3898
},
{
"epoch": 1.1319494846857308,
"grad_norm": 3.3628885746002197,
"learning_rate": 9.172919165138523e-06,
"loss": 0.6954,
"step": 3899
},
{
"epoch": 1.1322398025838294,
"grad_norm": 3.817692995071411,
"learning_rate": 9.172390130568638e-06,
"loss": 0.892,
"step": 3900
},
{
"epoch": 1.1325301204819278,
"grad_norm": 3.3503918647766113,
"learning_rate": 9.17186094212266e-06,
"loss": 0.6653,
"step": 3901
},
{
"epoch": 1.1328204383800262,
"grad_norm": 3.7152490615844727,
"learning_rate": 9.171331599820106e-06,
"loss": 0.7165,
"step": 3902
},
{
"epoch": 1.1331107562781246,
"grad_norm": 3.5846714973449707,
"learning_rate": 9.1708021036805e-06,
"loss": 0.7886,
"step": 3903
},
{
"epoch": 1.133401074176223,
"grad_norm": 3.3426952362060547,
"learning_rate": 9.170272453723365e-06,
"loss": 0.7662,
"step": 3904
},
{
"epoch": 1.1336913920743215,
"grad_norm": 3.628878355026245,
"learning_rate": 9.169742649968238e-06,
"loss": 0.7641,
"step": 3905
},
{
"epoch": 1.1339817099724199,
"grad_norm": 3.536870002746582,
"learning_rate": 9.169212692434658e-06,
"loss": 0.7743,
"step": 3906
},
{
"epoch": 1.1342720278705183,
"grad_norm": 3.908158540725708,
"learning_rate": 9.168682581142168e-06,
"loss": 0.8958,
"step": 3907
},
{
"epoch": 1.1345623457686167,
"grad_norm": 3.5317137241363525,
"learning_rate": 9.168152316110318e-06,
"loss": 0.7183,
"step": 3908
},
{
"epoch": 1.1348526636667151,
"grad_norm": 3.372509479522705,
"learning_rate": 9.167621897358665e-06,
"loss": 0.8437,
"step": 3909
},
{
"epoch": 1.1351429815648135,
"grad_norm": 3.9317944049835205,
"learning_rate": 9.16709132490677e-06,
"loss": 0.7876,
"step": 3910
},
{
"epoch": 1.135433299462912,
"grad_norm": 4.102498531341553,
"learning_rate": 9.166560598774201e-06,
"loss": 0.8697,
"step": 3911
},
{
"epoch": 1.1357236173610104,
"grad_norm": 4.032670974731445,
"learning_rate": 9.16602971898053e-06,
"loss": 0.8186,
"step": 3912
},
{
"epoch": 1.1360139352591088,
"grad_norm": 3.8047587871551514,
"learning_rate": 9.165498685545335e-06,
"loss": 0.7771,
"step": 3913
},
{
"epoch": 1.1363042531572072,
"grad_norm": 3.7372140884399414,
"learning_rate": 9.164967498488203e-06,
"loss": 0.8252,
"step": 3914
},
{
"epoch": 1.1365945710553056,
"grad_norm": 3.7121047973632812,
"learning_rate": 9.164436157828721e-06,
"loss": 0.7722,
"step": 3915
},
{
"epoch": 1.136884888953404,
"grad_norm": 3.0970237255096436,
"learning_rate": 9.16390466358649e-06,
"loss": 0.6937,
"step": 3916
},
{
"epoch": 1.1371752068515024,
"grad_norm": 3.5272154808044434,
"learning_rate": 9.163373015781104e-06,
"loss": 0.7701,
"step": 3917
},
{
"epoch": 1.1374655247496008,
"grad_norm": 3.2425544261932373,
"learning_rate": 9.162841214432174e-06,
"loss": 0.771,
"step": 3918
},
{
"epoch": 1.1377558426476992,
"grad_norm": 3.648613452911377,
"learning_rate": 9.162309259559313e-06,
"loss": 0.8285,
"step": 3919
},
{
"epoch": 1.1380461605457977,
"grad_norm": 3.265514373779297,
"learning_rate": 9.161777151182137e-06,
"loss": 0.7192,
"step": 3920
},
{
"epoch": 1.138336478443896,
"grad_norm": 3.608022689819336,
"learning_rate": 9.161244889320271e-06,
"loss": 0.6825,
"step": 3921
},
{
"epoch": 1.1386267963419945,
"grad_norm": 3.5768356323242188,
"learning_rate": 9.160712473993347e-06,
"loss": 0.7143,
"step": 3922
},
{
"epoch": 1.138917114240093,
"grad_norm": 3.050487518310547,
"learning_rate": 9.160179905220995e-06,
"loss": 0.6958,
"step": 3923
},
{
"epoch": 1.1392074321381913,
"grad_norm": 3.143773078918457,
"learning_rate": 9.159647183022862e-06,
"loss": 0.6979,
"step": 3924
},
{
"epoch": 1.1394977500362897,
"grad_norm": 3.6614866256713867,
"learning_rate": 9.159114307418589e-06,
"loss": 0.6862,
"step": 3925
},
{
"epoch": 1.1397880679343881,
"grad_norm": 4.121794700622559,
"learning_rate": 9.158581278427833e-06,
"loss": 0.9153,
"step": 3926
},
{
"epoch": 1.1400783858324866,
"grad_norm": 3.5927717685699463,
"learning_rate": 9.158048096070249e-06,
"loss": 0.7082,
"step": 3927
},
{
"epoch": 1.140368703730585,
"grad_norm": 3.526240825653076,
"learning_rate": 9.1575147603655e-06,
"loss": 0.7109,
"step": 3928
},
{
"epoch": 1.1406590216286834,
"grad_norm": 3.6357266902923584,
"learning_rate": 9.156981271333258e-06,
"loss": 0.8743,
"step": 3929
},
{
"epoch": 1.1409493395267818,
"grad_norm": 3.472874879837036,
"learning_rate": 9.156447628993197e-06,
"loss": 0.7222,
"step": 3930
},
{
"epoch": 1.1412396574248802,
"grad_norm": 3.201047420501709,
"learning_rate": 9.155913833364995e-06,
"loss": 0.7311,
"step": 3931
},
{
"epoch": 1.1415299753229786,
"grad_norm": 3.7483444213867188,
"learning_rate": 9.15537988446834e-06,
"loss": 0.8526,
"step": 3932
},
{
"epoch": 1.141820293221077,
"grad_norm": 3.605494737625122,
"learning_rate": 9.154845782322926e-06,
"loss": 0.8127,
"step": 3933
},
{
"epoch": 1.1421106111191754,
"grad_norm": 3.23360013961792,
"learning_rate": 9.154311526948446e-06,
"loss": 0.7632,
"step": 3934
},
{
"epoch": 1.1424009290172739,
"grad_norm": 3.5619661808013916,
"learning_rate": 9.153777118364607e-06,
"loss": 0.7391,
"step": 3935
},
{
"epoch": 1.1426912469153723,
"grad_norm": 3.4658992290496826,
"learning_rate": 9.153242556591115e-06,
"loss": 0.7462,
"step": 3936
},
{
"epoch": 1.1429815648134707,
"grad_norm": 3.5309834480285645,
"learning_rate": 9.152707841647687e-06,
"loss": 0.7752,
"step": 3937
},
{
"epoch": 1.143271882711569,
"grad_norm": 3.535386323928833,
"learning_rate": 9.15217297355404e-06,
"loss": 0.8618,
"step": 3938
},
{
"epoch": 1.1435622006096675,
"grad_norm": 3.6657485961914062,
"learning_rate": 9.151637952329903e-06,
"loss": 0.806,
"step": 3939
},
{
"epoch": 1.143852518507766,
"grad_norm": 3.7661120891571045,
"learning_rate": 9.151102777995007e-06,
"loss": 0.77,
"step": 3940
},
{
"epoch": 1.1441428364058643,
"grad_norm": 3.8279857635498047,
"learning_rate": 9.150567450569086e-06,
"loss": 0.8114,
"step": 3941
},
{
"epoch": 1.1444331543039628,
"grad_norm": 3.7410404682159424,
"learning_rate": 9.150031970071884e-06,
"loss": 0.7662,
"step": 3942
},
{
"epoch": 1.1447234722020612,
"grad_norm": 3.9251410961151123,
"learning_rate": 9.149496336523151e-06,
"loss": 0.7287,
"step": 3943
},
{
"epoch": 1.1450137901001596,
"grad_norm": 4.000790596008301,
"learning_rate": 9.14896054994264e-06,
"loss": 0.8683,
"step": 3944
},
{
"epoch": 1.1453041079982582,
"grad_norm": 3.3374154567718506,
"learning_rate": 9.148424610350111e-06,
"loss": 0.6767,
"step": 3945
},
{
"epoch": 1.1455944258963564,
"grad_norm": 3.548007011413574,
"learning_rate": 9.147888517765326e-06,
"loss": 0.7309,
"step": 3946
},
{
"epoch": 1.145884743794455,
"grad_norm": 3.259523630142212,
"learning_rate": 9.147352272208061e-06,
"loss": 0.7373,
"step": 3947
},
{
"epoch": 1.1461750616925532,
"grad_norm": 3.932647466659546,
"learning_rate": 9.14681587369809e-06,
"loss": 0.8901,
"step": 3948
},
{
"epoch": 1.1464653795906519,
"grad_norm": 4.032466411590576,
"learning_rate": 9.146279322255194e-06,
"loss": 0.8693,
"step": 3949
},
{
"epoch": 1.14675569748875,
"grad_norm": 2.9973812103271484,
"learning_rate": 9.14574261789916e-06,
"loss": 0.6747,
"step": 3950
},
{
"epoch": 1.1470460153868487,
"grad_norm": 3.4267399311065674,
"learning_rate": 9.145205760649787e-06,
"loss": 0.7947,
"step": 3951
},
{
"epoch": 1.147336333284947,
"grad_norm": 3.820967197418213,
"learning_rate": 9.14466875052687e-06,
"loss": 0.87,
"step": 3952
},
{
"epoch": 1.1476266511830455,
"grad_norm": 3.9774868488311768,
"learning_rate": 9.144131587550214e-06,
"loss": 0.7757,
"step": 3953
},
{
"epoch": 1.147916969081144,
"grad_norm": 3.902125597000122,
"learning_rate": 9.143594271739628e-06,
"loss": 0.8454,
"step": 3954
},
{
"epoch": 1.1482072869792423,
"grad_norm": 3.565986156463623,
"learning_rate": 9.14305680311493e-06,
"loss": 0.8091,
"step": 3955
},
{
"epoch": 1.1484976048773408,
"grad_norm": 3.5948240756988525,
"learning_rate": 9.142519181695943e-06,
"loss": 0.8775,
"step": 3956
},
{
"epoch": 1.1487879227754392,
"grad_norm": 3.324223279953003,
"learning_rate": 9.141981407502492e-06,
"loss": 0.5903,
"step": 3957
},
{
"epoch": 1.1490782406735376,
"grad_norm": 3.6919519901275635,
"learning_rate": 9.141443480554408e-06,
"loss": 0.6949,
"step": 3958
},
{
"epoch": 1.149368558571636,
"grad_norm": 4.20566987991333,
"learning_rate": 9.140905400871535e-06,
"loss": 0.9056,
"step": 3959
},
{
"epoch": 1.1496588764697344,
"grad_norm": 3.5956645011901855,
"learning_rate": 9.140367168473711e-06,
"loss": 0.8069,
"step": 3960
},
{
"epoch": 1.1499491943678328,
"grad_norm": 3.393167734146118,
"learning_rate": 9.139828783380791e-06,
"loss": 0.7518,
"step": 3961
},
{
"epoch": 1.1502395122659312,
"grad_norm": 3.863666534423828,
"learning_rate": 9.13929024561263e-06,
"loss": 0.8516,
"step": 3962
},
{
"epoch": 1.1505298301640297,
"grad_norm": 3.6960501670837402,
"learning_rate": 9.138751555189084e-06,
"loss": 0.8199,
"step": 3963
},
{
"epoch": 1.150820148062128,
"grad_norm": 3.536280393600464,
"learning_rate": 9.138212712130024e-06,
"loss": 0.7833,
"step": 3964
},
{
"epoch": 1.1511104659602265,
"grad_norm": 3.709526538848877,
"learning_rate": 9.137673716455322e-06,
"loss": 0.7854,
"step": 3965
},
{
"epoch": 1.151400783858325,
"grad_norm": 4.362963676452637,
"learning_rate": 9.137134568184855e-06,
"loss": 1.0421,
"step": 3966
},
{
"epoch": 1.1516911017564233,
"grad_norm": 3.34218168258667,
"learning_rate": 9.136595267338507e-06,
"loss": 0.7751,
"step": 3967
},
{
"epoch": 1.1519814196545217,
"grad_norm": 3.735380172729492,
"learning_rate": 9.136055813936167e-06,
"loss": 0.7086,
"step": 3968
},
{
"epoch": 1.1522717375526201,
"grad_norm": 3.6577022075653076,
"learning_rate": 9.13551620799773e-06,
"loss": 0.792,
"step": 3969
},
{
"epoch": 1.1525620554507185,
"grad_norm": 3.5989091396331787,
"learning_rate": 9.134976449543097e-06,
"loss": 0.8835,
"step": 3970
},
{
"epoch": 1.152852373348817,
"grad_norm": 3.620215654373169,
"learning_rate": 9.134436538592173e-06,
"loss": 0.8646,
"step": 3971
},
{
"epoch": 1.1531426912469154,
"grad_norm": 3.421151638031006,
"learning_rate": 9.13389647516487e-06,
"loss": 0.8433,
"step": 3972
},
{
"epoch": 1.1534330091450138,
"grad_norm": 3.528225898742676,
"learning_rate": 9.133356259281106e-06,
"loss": 0.7508,
"step": 3973
},
{
"epoch": 1.1537233270431122,
"grad_norm": 3.510094165802002,
"learning_rate": 9.132815890960802e-06,
"loss": 0.8679,
"step": 3974
},
{
"epoch": 1.1540136449412106,
"grad_norm": 3.6377060413360596,
"learning_rate": 9.132275370223889e-06,
"loss": 0.8165,
"step": 3975
},
{
"epoch": 1.154303962839309,
"grad_norm": 3.446504592895508,
"learning_rate": 9.1317346970903e-06,
"loss": 0.724,
"step": 3976
},
{
"epoch": 1.1545942807374074,
"grad_norm": 3.3458592891693115,
"learning_rate": 9.131193871579975e-06,
"loss": 0.7576,
"step": 3977
},
{
"epoch": 1.1548845986355059,
"grad_norm": 3.1824088096618652,
"learning_rate": 9.13065289371286e-06,
"loss": 0.7326,
"step": 3978
},
{
"epoch": 1.1551749165336043,
"grad_norm": 3.756395101547241,
"learning_rate": 9.130111763508905e-06,
"loss": 0.7555,
"step": 3979
},
{
"epoch": 1.1554652344317027,
"grad_norm": 3.877638816833496,
"learning_rate": 9.129570480988067e-06,
"loss": 0.7437,
"step": 3980
},
{
"epoch": 1.155755552329801,
"grad_norm": 4.016098499298096,
"learning_rate": 9.129029046170309e-06,
"loss": 0.8865,
"step": 3981
},
{
"epoch": 1.1560458702278995,
"grad_norm": 4.113951683044434,
"learning_rate": 9.128487459075596e-06,
"loss": 0.6799,
"step": 3982
},
{
"epoch": 1.156336188125998,
"grad_norm": 3.6073343753814697,
"learning_rate": 9.127945719723908e-06,
"loss": 0.7611,
"step": 3983
},
{
"epoch": 1.1566265060240963,
"grad_norm": 3.384596347808838,
"learning_rate": 9.127403828135217e-06,
"loss": 0.7778,
"step": 3984
},
{
"epoch": 1.1569168239221947,
"grad_norm": 3.194797992706299,
"learning_rate": 9.126861784329511e-06,
"loss": 0.5762,
"step": 3985
},
{
"epoch": 1.1572071418202932,
"grad_norm": 3.214726686477661,
"learning_rate": 9.12631958832678e-06,
"loss": 0.7325,
"step": 3986
},
{
"epoch": 1.1574974597183916,
"grad_norm": 3.8333182334899902,
"learning_rate": 9.12577724014702e-06,
"loss": 0.7784,
"step": 3987
},
{
"epoch": 1.15778777761649,
"grad_norm": 3.346073865890503,
"learning_rate": 9.125234739810235e-06,
"loss": 0.6485,
"step": 3988
},
{
"epoch": 1.1580780955145884,
"grad_norm": 3.903724431991577,
"learning_rate": 9.12469208733643e-06,
"loss": 0.7638,
"step": 3989
},
{
"epoch": 1.1583684134126868,
"grad_norm": 3.9996793270111084,
"learning_rate": 9.124149282745614e-06,
"loss": 0.842,
"step": 3990
},
{
"epoch": 1.1586587313107852,
"grad_norm": 3.5065417289733887,
"learning_rate": 9.12360632605781e-06,
"loss": 0.7256,
"step": 3991
},
{
"epoch": 1.1589490492088836,
"grad_norm": 3.8333420753479004,
"learning_rate": 9.123063217293043e-06,
"loss": 0.7925,
"step": 3992
},
{
"epoch": 1.159239367106982,
"grad_norm": 3.6747794151306152,
"learning_rate": 9.12251995647134e-06,
"loss": 0.8747,
"step": 3993
},
{
"epoch": 1.1595296850050805,
"grad_norm": 3.530374050140381,
"learning_rate": 9.121976543612736e-06,
"loss": 0.7956,
"step": 3994
},
{
"epoch": 1.1598200029031789,
"grad_norm": 3.2479302883148193,
"learning_rate": 9.121432978737273e-06,
"loss": 0.7378,
"step": 3995
},
{
"epoch": 1.1601103208012775,
"grad_norm": 3.1474578380584717,
"learning_rate": 9.120889261864999e-06,
"loss": 0.6483,
"step": 3996
},
{
"epoch": 1.1604006386993757,
"grad_norm": 3.481990098953247,
"learning_rate": 9.120345393015962e-06,
"loss": 0.7954,
"step": 3997
},
{
"epoch": 1.1606909565974743,
"grad_norm": 3.3965940475463867,
"learning_rate": 9.119801372210224e-06,
"loss": 0.8142,
"step": 3998
},
{
"epoch": 1.1609812744955725,
"grad_norm": 3.3146190643310547,
"learning_rate": 9.119257199467846e-06,
"loss": 0.6746,
"step": 3999
},
{
"epoch": 1.1612715923936712,
"grad_norm": 3.1795899868011475,
"learning_rate": 9.118712874808897e-06,
"loss": 0.6696,
"step": 4000
},
{
"epoch": 1.1612715923936712,
"eval_loss": 1.1864620447158813,
"eval_runtime": 13.2004,
"eval_samples_per_second": 30.302,
"eval_steps_per_second": 3.788,
"step": 4000
},
{
"epoch": 1.1615619102917696,
"grad_norm": 3.368516683578491,
"learning_rate": 9.11816839825345e-06,
"loss": 0.799,
"step": 4001
},
{
"epoch": 1.161852228189868,
"grad_norm": 3.838491201400757,
"learning_rate": 9.117623769821588e-06,
"loss": 0.8574,
"step": 4002
},
{
"epoch": 1.1621425460879664,
"grad_norm": 3.6480486392974854,
"learning_rate": 9.117078989533394e-06,
"loss": 0.7749,
"step": 4003
},
{
"epoch": 1.1624328639860648,
"grad_norm": 3.585958480834961,
"learning_rate": 9.116534057408964e-06,
"loss": 0.7411,
"step": 4004
},
{
"epoch": 1.1627231818841632,
"grad_norm": 3.195746898651123,
"learning_rate": 9.115988973468387e-06,
"loss": 0.64,
"step": 4005
},
{
"epoch": 1.1630134997822617,
"grad_norm": 3.85469913482666,
"learning_rate": 9.115443737731775e-06,
"loss": 0.7704,
"step": 4006
},
{
"epoch": 1.16330381768036,
"grad_norm": 3.8025283813476562,
"learning_rate": 9.114898350219227e-06,
"loss": 0.775,
"step": 4007
},
{
"epoch": 1.1635941355784585,
"grad_norm": 3.44447660446167,
"learning_rate": 9.114352810950864e-06,
"loss": 0.7815,
"step": 4008
},
{
"epoch": 1.163884453476557,
"grad_norm": 3.455094575881958,
"learning_rate": 9.1138071199468e-06,
"loss": 0.7137,
"step": 4009
},
{
"epoch": 1.1641747713746553,
"grad_norm": 4.332744121551514,
"learning_rate": 9.113261277227163e-06,
"loss": 0.8485,
"step": 4010
},
{
"epoch": 1.1644650892727537,
"grad_norm": 3.313493490219116,
"learning_rate": 9.112715282812081e-06,
"loss": 0.7353,
"step": 4011
},
{
"epoch": 1.1647554071708521,
"grad_norm": 3.056633472442627,
"learning_rate": 9.112169136721693e-06,
"loss": 0.7518,
"step": 4012
},
{
"epoch": 1.1650457250689505,
"grad_norm": 3.9191696643829346,
"learning_rate": 9.111622838976139e-06,
"loss": 0.8178,
"step": 4013
},
{
"epoch": 1.165336042967049,
"grad_norm": 3.181851387023926,
"learning_rate": 9.111076389595566e-06,
"loss": 0.6374,
"step": 4014
},
{
"epoch": 1.1656263608651474,
"grad_norm": 3.6287424564361572,
"learning_rate": 9.110529788600127e-06,
"loss": 0.8051,
"step": 4015
},
{
"epoch": 1.1659166787632458,
"grad_norm": 3.4764528274536133,
"learning_rate": 9.109983036009979e-06,
"loss": 0.6772,
"step": 4016
},
{
"epoch": 1.1662069966613442,
"grad_norm": 3.713264226913452,
"learning_rate": 9.109436131845291e-06,
"loss": 0.9324,
"step": 4017
},
{
"epoch": 1.1664973145594426,
"grad_norm": 3.6909563541412354,
"learning_rate": 9.108889076126226e-06,
"loss": 0.709,
"step": 4018
},
{
"epoch": 1.166787632457541,
"grad_norm": 3.3515591621398926,
"learning_rate": 9.108341868872966e-06,
"loss": 0.8808,
"step": 4019
},
{
"epoch": 1.1670779503556394,
"grad_norm": 3.6842029094696045,
"learning_rate": 9.107794510105685e-06,
"loss": 0.7281,
"step": 4020
},
{
"epoch": 1.1673682682537379,
"grad_norm": 3.2459568977355957,
"learning_rate": 9.107246999844573e-06,
"loss": 0.717,
"step": 4021
},
{
"epoch": 1.1676585861518363,
"grad_norm": 3.540125608444214,
"learning_rate": 9.106699338109824e-06,
"loss": 0.7114,
"step": 4022
},
{
"epoch": 1.1679489040499347,
"grad_norm": 3.283958911895752,
"learning_rate": 9.10615152492163e-06,
"loss": 0.8662,
"step": 4023
},
{
"epoch": 1.168239221948033,
"grad_norm": 2.9903454780578613,
"learning_rate": 9.105603560300199e-06,
"loss": 0.682,
"step": 4024
},
{
"epoch": 1.1685295398461315,
"grad_norm": 3.7494277954101562,
"learning_rate": 9.105055444265737e-06,
"loss": 0.8702,
"step": 4025
},
{
"epoch": 1.16881985774423,
"grad_norm": 3.8516342639923096,
"learning_rate": 9.10450717683846e-06,
"loss": 0.849,
"step": 4026
},
{
"epoch": 1.1691101756423283,
"grad_norm": 3.3459055423736572,
"learning_rate": 9.103958758038587e-06,
"loss": 0.7186,
"step": 4027
},
{
"epoch": 1.1694004935404267,
"grad_norm": 3.6910083293914795,
"learning_rate": 9.103410187886343e-06,
"loss": 0.7625,
"step": 4028
},
{
"epoch": 1.1696908114385252,
"grad_norm": 3.9832990169525146,
"learning_rate": 9.10286146640196e-06,
"loss": 0.8289,
"step": 4029
},
{
"epoch": 1.1699811293366236,
"grad_norm": 3.4876708984375,
"learning_rate": 9.102312593605675e-06,
"loss": 0.891,
"step": 4030
},
{
"epoch": 1.170271447234722,
"grad_norm": 3.19136643409729,
"learning_rate": 9.10176356951773e-06,
"loss": 0.726,
"step": 4031
},
{
"epoch": 1.1705617651328204,
"grad_norm": 4.043649673461914,
"learning_rate": 9.101214394158371e-06,
"loss": 0.7879,
"step": 4032
},
{
"epoch": 1.1708520830309188,
"grad_norm": 3.827148914337158,
"learning_rate": 9.100665067547854e-06,
"loss": 0.7717,
"step": 4033
},
{
"epoch": 1.1711424009290172,
"grad_norm": 3.3949193954467773,
"learning_rate": 9.100115589706436e-06,
"loss": 0.7799,
"step": 4034
},
{
"epoch": 1.1714327188271156,
"grad_norm": 3.4499807357788086,
"learning_rate": 9.09956596065438e-06,
"loss": 0.9016,
"step": 4035
},
{
"epoch": 1.171723036725214,
"grad_norm": 3.645195245742798,
"learning_rate": 9.09901618041196e-06,
"loss": 0.7207,
"step": 4036
},
{
"epoch": 1.1720133546233125,
"grad_norm": 3.701106071472168,
"learning_rate": 9.09846624899945e-06,
"loss": 0.7687,
"step": 4037
},
{
"epoch": 1.1723036725214109,
"grad_norm": 3.188385486602783,
"learning_rate": 9.097916166437131e-06,
"loss": 0.7065,
"step": 4038
},
{
"epoch": 1.1725939904195093,
"grad_norm": 4.226047992706299,
"learning_rate": 9.09736593274529e-06,
"loss": 0.9615,
"step": 4039
},
{
"epoch": 1.1728843083176077,
"grad_norm": 3.4825079441070557,
"learning_rate": 9.09681554794422e-06,
"loss": 0.7315,
"step": 4040
},
{
"epoch": 1.1731746262157061,
"grad_norm": 3.5694072246551514,
"learning_rate": 9.096265012054218e-06,
"loss": 0.7047,
"step": 4041
},
{
"epoch": 1.1734649441138045,
"grad_norm": 3.669870615005493,
"learning_rate": 9.095714325095587e-06,
"loss": 0.8166,
"step": 4042
},
{
"epoch": 1.173755262011903,
"grad_norm": 3.8622612953186035,
"learning_rate": 9.095163487088639e-06,
"loss": 0.8473,
"step": 4043
},
{
"epoch": 1.1740455799100014,
"grad_norm": 3.600687026977539,
"learning_rate": 9.094612498053684e-06,
"loss": 0.7861,
"step": 4044
},
{
"epoch": 1.1743358978081,
"grad_norm": 3.816171884536743,
"learning_rate": 9.094061358011047e-06,
"loss": 0.7794,
"step": 4045
},
{
"epoch": 1.1746262157061982,
"grad_norm": 3.986691474914551,
"learning_rate": 9.09351006698105e-06,
"loss": 0.8641,
"step": 4046
},
{
"epoch": 1.1749165336042968,
"grad_norm": 3.3331282138824463,
"learning_rate": 9.092958624984029e-06,
"loss": 0.7402,
"step": 4047
},
{
"epoch": 1.175206851502395,
"grad_norm": 3.6391406059265137,
"learning_rate": 9.092407032040316e-06,
"loss": 0.8001,
"step": 4048
},
{
"epoch": 1.1754971694004936,
"grad_norm": 3.1407461166381836,
"learning_rate": 9.091855288170257e-06,
"loss": 0.6524,
"step": 4049
},
{
"epoch": 1.1757874872985918,
"grad_norm": 3.806478977203369,
"learning_rate": 9.091303393394197e-06,
"loss": 0.858,
"step": 4050
},
{
"epoch": 1.1760778051966905,
"grad_norm": 3.330761194229126,
"learning_rate": 9.090751347732492e-06,
"loss": 0.6516,
"step": 4051
},
{
"epoch": 1.1763681230947889,
"grad_norm": 4.271059513092041,
"learning_rate": 9.090199151205502e-06,
"loss": 0.721,
"step": 4052
},
{
"epoch": 1.1766584409928873,
"grad_norm": 3.2130672931671143,
"learning_rate": 9.089646803833589e-06,
"loss": 0.7209,
"step": 4053
},
{
"epoch": 1.1769487588909857,
"grad_norm": 3.837550163269043,
"learning_rate": 9.089094305637125e-06,
"loss": 0.7907,
"step": 4054
},
{
"epoch": 1.1772390767890841,
"grad_norm": 3.642279863357544,
"learning_rate": 9.088541656636487e-06,
"loss": 0.7112,
"step": 4055
},
{
"epoch": 1.1775293946871825,
"grad_norm": 3.739576816558838,
"learning_rate": 9.087988856852054e-06,
"loss": 0.8681,
"step": 4056
},
{
"epoch": 1.177819712585281,
"grad_norm": 3.580559015274048,
"learning_rate": 9.087435906304214e-06,
"loss": 0.9132,
"step": 4057
},
{
"epoch": 1.1781100304833794,
"grad_norm": 3.414616584777832,
"learning_rate": 9.08688280501336e-06,
"loss": 0.715,
"step": 4058
},
{
"epoch": 1.1784003483814778,
"grad_norm": 3.5943245887756348,
"learning_rate": 9.08632955299989e-06,
"loss": 0.8107,
"step": 4059
},
{
"epoch": 1.1786906662795762,
"grad_norm": 3.487362861633301,
"learning_rate": 9.085776150284209e-06,
"loss": 0.7891,
"step": 4060
},
{
"epoch": 1.1789809841776746,
"grad_norm": 3.6045470237731934,
"learning_rate": 9.085222596886724e-06,
"loss": 0.7728,
"step": 4061
},
{
"epoch": 1.179271302075773,
"grad_norm": 3.904658079147339,
"learning_rate": 9.08466889282785e-06,
"loss": 0.8763,
"step": 4062
},
{
"epoch": 1.1795616199738714,
"grad_norm": 3.8356258869171143,
"learning_rate": 9.08411503812801e-06,
"loss": 0.8663,
"step": 4063
},
{
"epoch": 1.1798519378719698,
"grad_norm": 3.4146289825439453,
"learning_rate": 9.083561032807626e-06,
"loss": 0.7986,
"step": 4064
},
{
"epoch": 1.1801422557700683,
"grad_norm": 3.3314566612243652,
"learning_rate": 9.083006876887132e-06,
"loss": 0.8305,
"step": 4065
},
{
"epoch": 1.1804325736681667,
"grad_norm": 3.6700377464294434,
"learning_rate": 9.082452570386966e-06,
"loss": 0.8067,
"step": 4066
},
{
"epoch": 1.180722891566265,
"grad_norm": 3.317873954772949,
"learning_rate": 9.08189811332757e-06,
"loss": 0.7411,
"step": 4067
},
{
"epoch": 1.1810132094643635,
"grad_norm": 3.274186134338379,
"learning_rate": 9.08134350572939e-06,
"loss": 0.738,
"step": 4068
},
{
"epoch": 1.181303527362462,
"grad_norm": 3.3086955547332764,
"learning_rate": 9.08078874761288e-06,
"loss": 0.7723,
"step": 4069
},
{
"epoch": 1.1815938452605603,
"grad_norm": 3.8123908042907715,
"learning_rate": 9.080233838998503e-06,
"loss": 0.8489,
"step": 4070
},
{
"epoch": 1.1818841631586587,
"grad_norm": 3.341263771057129,
"learning_rate": 9.079678779906718e-06,
"loss": 0.7099,
"step": 4071
},
{
"epoch": 1.1821744810567572,
"grad_norm": 3.642395496368408,
"learning_rate": 9.079123570358e-06,
"loss": 0.6924,
"step": 4072
},
{
"epoch": 1.1824647989548556,
"grad_norm": 3.3351449966430664,
"learning_rate": 9.078568210372825e-06,
"loss": 0.7104,
"step": 4073
},
{
"epoch": 1.182755116852954,
"grad_norm": 3.6893389225006104,
"learning_rate": 9.078012699971673e-06,
"loss": 0.6957,
"step": 4074
},
{
"epoch": 1.1830454347510524,
"grad_norm": 3.6875810623168945,
"learning_rate": 9.077457039175028e-06,
"loss": 0.7803,
"step": 4075
},
{
"epoch": 1.1833357526491508,
"grad_norm": 3.341475248336792,
"learning_rate": 9.076901228003387e-06,
"loss": 0.8119,
"step": 4076
},
{
"epoch": 1.1836260705472492,
"grad_norm": 3.684300422668457,
"learning_rate": 9.076345266477247e-06,
"loss": 0.8527,
"step": 4077
},
{
"epoch": 1.1839163884453476,
"grad_norm": 3.4260594844818115,
"learning_rate": 9.075789154617112e-06,
"loss": 0.6892,
"step": 4078
},
{
"epoch": 1.184206706343446,
"grad_norm": 3.6972508430480957,
"learning_rate": 9.075232892443488e-06,
"loss": 0.6416,
"step": 4079
},
{
"epoch": 1.1844970242415445,
"grad_norm": 3.9194812774658203,
"learning_rate": 9.074676479976894e-06,
"loss": 0.8281,
"step": 4080
},
{
"epoch": 1.1847873421396429,
"grad_norm": 3.2946715354919434,
"learning_rate": 9.074119917237849e-06,
"loss": 0.7115,
"step": 4081
},
{
"epoch": 1.1850776600377413,
"grad_norm": 3.7364883422851562,
"learning_rate": 9.073563204246877e-06,
"loss": 0.6713,
"step": 4082
},
{
"epoch": 1.1853679779358397,
"grad_norm": 3.7229502201080322,
"learning_rate": 9.07300634102451e-06,
"loss": 0.7803,
"step": 4083
},
{
"epoch": 1.1856582958339381,
"grad_norm": 3.2690937519073486,
"learning_rate": 9.072449327591285e-06,
"loss": 0.6948,
"step": 4084
},
{
"epoch": 1.1859486137320365,
"grad_norm": 3.791633367538452,
"learning_rate": 9.071892163967749e-06,
"loss": 0.863,
"step": 4085
},
{
"epoch": 1.186238931630135,
"grad_norm": 3.2623965740203857,
"learning_rate": 9.071334850174442e-06,
"loss": 0.6323,
"step": 4086
},
{
"epoch": 1.1865292495282334,
"grad_norm": 3.938901901245117,
"learning_rate": 9.070777386231921e-06,
"loss": 0.8053,
"step": 4087
},
{
"epoch": 1.1868195674263318,
"grad_norm": 3.3571414947509766,
"learning_rate": 9.070219772160748e-06,
"loss": 0.7462,
"step": 4088
},
{
"epoch": 1.1871098853244302,
"grad_norm": 3.772347927093506,
"learning_rate": 9.069662007981483e-06,
"loss": 0.8494,
"step": 4089
},
{
"epoch": 1.1874002032225286,
"grad_norm": 3.5584139823913574,
"learning_rate": 9.0691040937147e-06,
"loss": 0.7863,
"step": 4090
},
{
"epoch": 1.187690521120627,
"grad_norm": 3.906470775604248,
"learning_rate": 9.068546029380971e-06,
"loss": 0.8599,
"step": 4091
},
{
"epoch": 1.1879808390187254,
"grad_norm": 3.395383834838867,
"learning_rate": 9.06798781500088e-06,
"loss": 0.7968,
"step": 4092
},
{
"epoch": 1.1882711569168238,
"grad_norm": 3.3741462230682373,
"learning_rate": 9.067429450595014e-06,
"loss": 0.7056,
"step": 4093
},
{
"epoch": 1.1885614748149222,
"grad_norm": 3.172368049621582,
"learning_rate": 9.066870936183962e-06,
"loss": 0.7439,
"step": 4094
},
{
"epoch": 1.1888517927130207,
"grad_norm": 3.850167751312256,
"learning_rate": 9.066312271788323e-06,
"loss": 0.8851,
"step": 4095
},
{
"epoch": 1.1891421106111193,
"grad_norm": 3.6464662551879883,
"learning_rate": 9.065753457428703e-06,
"loss": 0.7846,
"step": 4096
},
{
"epoch": 1.1894324285092175,
"grad_norm": 4.118659973144531,
"learning_rate": 9.065194493125708e-06,
"loss": 0.9087,
"step": 4097
},
{
"epoch": 1.1897227464073161,
"grad_norm": 3.62093448638916,
"learning_rate": 9.064635378899954e-06,
"loss": 0.8598,
"step": 4098
},
{
"epoch": 1.1900130643054143,
"grad_norm": 3.810291051864624,
"learning_rate": 9.06407611477206e-06,
"loss": 0.8275,
"step": 4099
},
{
"epoch": 1.190303382203513,
"grad_norm": 3.34863018989563,
"learning_rate": 9.06351670076265e-06,
"loss": 0.6317,
"step": 4100
},
{
"epoch": 1.1905937001016111,
"grad_norm": 3.578842878341675,
"learning_rate": 9.06295713689236e-06,
"loss": 0.7453,
"step": 4101
},
{
"epoch": 1.1908840179997098,
"grad_norm": 3.7192788124084473,
"learning_rate": 9.06239742318182e-06,
"loss": 0.8762,
"step": 4102
},
{
"epoch": 1.1911743358978082,
"grad_norm": 3.813288450241089,
"learning_rate": 9.061837559651676e-06,
"loss": 0.8466,
"step": 4103
},
{
"epoch": 1.1914646537959066,
"grad_norm": 3.4084653854370117,
"learning_rate": 9.061277546322576e-06,
"loss": 0.8022,
"step": 4104
},
{
"epoch": 1.191754971694005,
"grad_norm": 3.501131057739258,
"learning_rate": 9.060717383215169e-06,
"loss": 0.7563,
"step": 4105
},
{
"epoch": 1.1920452895921034,
"grad_norm": 3.5633366107940674,
"learning_rate": 9.060157070350119e-06,
"loss": 0.8084,
"step": 4106
},
{
"epoch": 1.1923356074902018,
"grad_norm": 3.551622152328491,
"learning_rate": 9.059596607748087e-06,
"loss": 0.7899,
"step": 4107
},
{
"epoch": 1.1926259253883003,
"grad_norm": 3.22371244430542,
"learning_rate": 9.059035995429743e-06,
"loss": 0.6764,
"step": 4108
},
{
"epoch": 1.1929162432863987,
"grad_norm": 3.7044527530670166,
"learning_rate": 9.058475233415763e-06,
"loss": 0.9281,
"step": 4109
},
{
"epoch": 1.193206561184497,
"grad_norm": 3.9586267471313477,
"learning_rate": 9.057914321726824e-06,
"loss": 0.9419,
"step": 4110
},
{
"epoch": 1.1934968790825955,
"grad_norm": 2.908240556716919,
"learning_rate": 9.057353260383617e-06,
"loss": 0.8072,
"step": 4111
},
{
"epoch": 1.193787196980694,
"grad_norm": 3.484663724899292,
"learning_rate": 9.056792049406833e-06,
"loss": 0.804,
"step": 4112
},
{
"epoch": 1.1940775148787923,
"grad_norm": 3.4705088138580322,
"learning_rate": 9.056230688817168e-06,
"loss": 0.7696,
"step": 4113
},
{
"epoch": 1.1943678327768907,
"grad_norm": 3.4565269947052,
"learning_rate": 9.055669178635322e-06,
"loss": 0.7479,
"step": 4114
},
{
"epoch": 1.1946581506749891,
"grad_norm": 3.331815719604492,
"learning_rate": 9.055107518882009e-06,
"loss": 0.6769,
"step": 4115
},
{
"epoch": 1.1949484685730876,
"grad_norm": 3.844775438308716,
"learning_rate": 9.054545709577939e-06,
"loss": 0.9125,
"step": 4116
},
{
"epoch": 1.195238786471186,
"grad_norm": 3.518406867980957,
"learning_rate": 9.053983750743831e-06,
"loss": 0.7155,
"step": 4117
},
{
"epoch": 1.1955291043692844,
"grad_norm": 3.5197741985321045,
"learning_rate": 9.053421642400414e-06,
"loss": 0.786,
"step": 4118
},
{
"epoch": 1.1958194222673828,
"grad_norm": 3.6934590339660645,
"learning_rate": 9.052859384568414e-06,
"loss": 0.778,
"step": 4119
},
{
"epoch": 1.1961097401654812,
"grad_norm": 3.5394248962402344,
"learning_rate": 9.052296977268566e-06,
"loss": 0.755,
"step": 4120
},
{
"epoch": 1.1964000580635796,
"grad_norm": 3.7590219974517822,
"learning_rate": 9.051734420521616e-06,
"loss": 0.8084,
"step": 4121
},
{
"epoch": 1.196690375961678,
"grad_norm": 3.022731304168701,
"learning_rate": 9.051171714348309e-06,
"loss": 0.7038,
"step": 4122
},
{
"epoch": 1.1969806938597765,
"grad_norm": 3.880645990371704,
"learning_rate": 9.050608858769395e-06,
"loss": 0.7077,
"step": 4123
},
{
"epoch": 1.1972710117578749,
"grad_norm": 3.356694459915161,
"learning_rate": 9.050045853805634e-06,
"loss": 0.7646,
"step": 4124
},
{
"epoch": 1.1975613296559733,
"grad_norm": 3.812464714050293,
"learning_rate": 9.04948269947779e-06,
"loss": 0.8239,
"step": 4125
},
{
"epoch": 1.1978516475540717,
"grad_norm": 3.7726550102233887,
"learning_rate": 9.04891939580663e-06,
"loss": 0.8597,
"step": 4126
},
{
"epoch": 1.19814196545217,
"grad_norm": 3.775982141494751,
"learning_rate": 9.048355942812929e-06,
"loss": 0.797,
"step": 4127
},
{
"epoch": 1.1984322833502685,
"grad_norm": 3.6224353313446045,
"learning_rate": 9.04779234051747e-06,
"loss": 0.676,
"step": 4128
},
{
"epoch": 1.198722601248367,
"grad_norm": 3.9695451259613037,
"learning_rate": 9.047228588941034e-06,
"loss": 0.8476,
"step": 4129
},
{
"epoch": 1.1990129191464654,
"grad_norm": 3.48233962059021,
"learning_rate": 9.046664688104414e-06,
"loss": 0.7039,
"step": 4130
},
{
"epoch": 1.1993032370445638,
"grad_norm": 3.5250630378723145,
"learning_rate": 9.046100638028406e-06,
"loss": 0.7195,
"step": 4131
},
{
"epoch": 1.1995935549426622,
"grad_norm": 4.188467502593994,
"learning_rate": 9.045536438733814e-06,
"loss": 0.8922,
"step": 4132
},
{
"epoch": 1.1998838728407606,
"grad_norm": 3.3059566020965576,
"learning_rate": 9.044972090241439e-06,
"loss": 0.791,
"step": 4133
},
{
"epoch": 1.200174190738859,
"grad_norm": 3.44315505027771,
"learning_rate": 9.044407592572102e-06,
"loss": 0.7476,
"step": 4134
},
{
"epoch": 1.2004645086369574,
"grad_norm": 3.908571481704712,
"learning_rate": 9.043842945746617e-06,
"loss": 0.8055,
"step": 4135
},
{
"epoch": 1.2007548265350558,
"grad_norm": 3.499602794647217,
"learning_rate": 9.04327814978581e-06,
"loss": 0.7689,
"step": 4136
},
{
"epoch": 1.2010451444331542,
"grad_norm": 3.504218578338623,
"learning_rate": 9.042713204710509e-06,
"loss": 0.7161,
"step": 4137
},
{
"epoch": 1.2013354623312527,
"grad_norm": 3.1022610664367676,
"learning_rate": 9.04214811054155e-06,
"loss": 0.7635,
"step": 4138
},
{
"epoch": 1.201625780229351,
"grad_norm": 3.5882506370544434,
"learning_rate": 9.04158286729977e-06,
"loss": 0.7621,
"step": 4139
},
{
"epoch": 1.2019160981274495,
"grad_norm": 3.5278327465057373,
"learning_rate": 9.04101747500602e-06,
"loss": 0.7782,
"step": 4140
},
{
"epoch": 1.202206416025548,
"grad_norm": 3.5033469200134277,
"learning_rate": 9.040451933681148e-06,
"loss": 0.7269,
"step": 4141
},
{
"epoch": 1.2024967339236463,
"grad_norm": 3.472656488418579,
"learning_rate": 9.039886243346013e-06,
"loss": 0.7632,
"step": 4142
},
{
"epoch": 1.2027870518217447,
"grad_norm": 3.2979049682617188,
"learning_rate": 9.039320404021475e-06,
"loss": 0.765,
"step": 4143
},
{
"epoch": 1.2030773697198431,
"grad_norm": 3.6671695709228516,
"learning_rate": 9.038754415728405e-06,
"loss": 0.6898,
"step": 4144
},
{
"epoch": 1.2033676876179416,
"grad_norm": 3.387666940689087,
"learning_rate": 9.038188278487673e-06,
"loss": 0.662,
"step": 4145
},
{
"epoch": 1.20365800551604,
"grad_norm": 3.3943850994110107,
"learning_rate": 9.037621992320162e-06,
"loss": 0.7152,
"step": 4146
},
{
"epoch": 1.2039483234141386,
"grad_norm": 3.2745096683502197,
"learning_rate": 9.037055557246754e-06,
"loss": 0.7477,
"step": 4147
},
{
"epoch": 1.2042386413122368,
"grad_norm": 3.368821859359741,
"learning_rate": 9.036488973288339e-06,
"loss": 0.7086,
"step": 4148
},
{
"epoch": 1.2045289592103354,
"grad_norm": 3.569892644882202,
"learning_rate": 9.035922240465813e-06,
"loss": 0.8061,
"step": 4149
},
{
"epoch": 1.2048192771084336,
"grad_norm": 4.035867214202881,
"learning_rate": 9.035355358800073e-06,
"loss": 0.8411,
"step": 4150
},
{
"epoch": 1.2051095950065323,
"grad_norm": 3.9796719551086426,
"learning_rate": 9.034788328312031e-06,
"loss": 0.8424,
"step": 4151
},
{
"epoch": 1.2053999129046307,
"grad_norm": 3.9051156044006348,
"learning_rate": 9.034221149022599e-06,
"loss": 0.8068,
"step": 4152
},
{
"epoch": 1.205690230802729,
"grad_norm": 3.869713068008423,
"learning_rate": 9.033653820952689e-06,
"loss": 0.8491,
"step": 4153
},
{
"epoch": 1.2059805487008275,
"grad_norm": 2.9886488914489746,
"learning_rate": 9.033086344123227e-06,
"loss": 0.7795,
"step": 4154
},
{
"epoch": 1.206270866598926,
"grad_norm": 4.163388252258301,
"learning_rate": 9.032518718555142e-06,
"loss": 0.8913,
"step": 4155
},
{
"epoch": 1.2065611844970243,
"grad_norm": 3.384000539779663,
"learning_rate": 9.031950944269366e-06,
"loss": 0.8076,
"step": 4156
},
{
"epoch": 1.2068515023951227,
"grad_norm": 4.030092239379883,
"learning_rate": 9.03138302128684e-06,
"loss": 0.8349,
"step": 4157
},
{
"epoch": 1.2071418202932211,
"grad_norm": 3.787898540496826,
"learning_rate": 9.030814949628509e-06,
"loss": 0.7586,
"step": 4158
},
{
"epoch": 1.2074321381913196,
"grad_norm": 3.355987787246704,
"learning_rate": 9.03024672931532e-06,
"loss": 0.7544,
"step": 4159
},
{
"epoch": 1.207722456089418,
"grad_norm": 3.9991297721862793,
"learning_rate": 9.029678360368232e-06,
"loss": 0.7545,
"step": 4160
},
{
"epoch": 1.2080127739875164,
"grad_norm": 3.7311341762542725,
"learning_rate": 9.029109842808205e-06,
"loss": 0.7447,
"step": 4161
},
{
"epoch": 1.2083030918856148,
"grad_norm": 4.173926830291748,
"learning_rate": 9.028541176656206e-06,
"loss": 0.9467,
"step": 4162
},
{
"epoch": 1.2085934097837132,
"grad_norm": 3.6992671489715576,
"learning_rate": 9.027972361933206e-06,
"loss": 0.7205,
"step": 4163
},
{
"epoch": 1.2088837276818116,
"grad_norm": 3.7675483226776123,
"learning_rate": 9.027403398660186e-06,
"loss": 0.8685,
"step": 4164
},
{
"epoch": 1.20917404557991,
"grad_norm": 3.525923490524292,
"learning_rate": 9.026834286858125e-06,
"loss": 0.8266,
"step": 4165
},
{
"epoch": 1.2094643634780085,
"grad_norm": 3.47044038772583,
"learning_rate": 9.026265026548016e-06,
"loss": 0.8065,
"step": 4166
},
{
"epoch": 1.2097546813761069,
"grad_norm": 3.7477779388427734,
"learning_rate": 9.025695617750848e-06,
"loss": 0.7428,
"step": 4167
},
{
"epoch": 1.2100449992742053,
"grad_norm": 3.2594008445739746,
"learning_rate": 9.025126060487623e-06,
"loss": 0.7125,
"step": 4168
},
{
"epoch": 1.2103353171723037,
"grad_norm": 3.4195213317871094,
"learning_rate": 9.024556354779348e-06,
"loss": 0.8543,
"step": 4169
},
{
"epoch": 1.210625635070402,
"grad_norm": 2.9705264568328857,
"learning_rate": 9.02398650064703e-06,
"loss": 0.6412,
"step": 4170
},
{
"epoch": 1.2109159529685005,
"grad_norm": 3.3002724647521973,
"learning_rate": 9.023416498111688e-06,
"loss": 0.7906,
"step": 4171
},
{
"epoch": 1.211206270866599,
"grad_norm": 3.0194554328918457,
"learning_rate": 9.022846347194343e-06,
"loss": 0.7628,
"step": 4172
},
{
"epoch": 1.2114965887646973,
"grad_norm": 3.412965774536133,
"learning_rate": 9.02227604791602e-06,
"loss": 0.7688,
"step": 4173
},
{
"epoch": 1.2117869066627958,
"grad_norm": 3.7909467220306396,
"learning_rate": 9.021705600297753e-06,
"loss": 0.8916,
"step": 4174
},
{
"epoch": 1.2120772245608942,
"grad_norm": 3.2401669025421143,
"learning_rate": 9.021135004360578e-06,
"loss": 0.6957,
"step": 4175
},
{
"epoch": 1.2123675424589926,
"grad_norm": 3.907761812210083,
"learning_rate": 9.020564260125542e-06,
"loss": 0.8673,
"step": 4176
},
{
"epoch": 1.212657860357091,
"grad_norm": 3.2626876831054688,
"learning_rate": 9.019993367613689e-06,
"loss": 0.7596,
"step": 4177
},
{
"epoch": 1.2129481782551894,
"grad_norm": 3.8206748962402344,
"learning_rate": 9.019422326846078e-06,
"loss": 0.8473,
"step": 4178
},
{
"epoch": 1.2132384961532878,
"grad_norm": 3.7625372409820557,
"learning_rate": 9.018851137843765e-06,
"loss": 0.8529,
"step": 4179
},
{
"epoch": 1.2135288140513862,
"grad_norm": 3.553237199783325,
"learning_rate": 9.018279800627818e-06,
"loss": 0.8849,
"step": 4180
},
{
"epoch": 1.2138191319494847,
"grad_norm": 3.6299870014190674,
"learning_rate": 9.017708315219307e-06,
"loss": 0.7347,
"step": 4181
},
{
"epoch": 1.214109449847583,
"grad_norm": 3.9615767002105713,
"learning_rate": 9.017136681639307e-06,
"loss": 0.8044,
"step": 4182
},
{
"epoch": 1.2143997677456815,
"grad_norm": 3.804377555847168,
"learning_rate": 9.0165648999089e-06,
"loss": 0.7135,
"step": 4183
},
{
"epoch": 1.21469008564378,
"grad_norm": 3.876023054122925,
"learning_rate": 9.015992970049175e-06,
"loss": 0.8958,
"step": 4184
},
{
"epoch": 1.2149804035418783,
"grad_norm": 3.5934906005859375,
"learning_rate": 9.015420892081222e-06,
"loss": 0.7761,
"step": 4185
},
{
"epoch": 1.2152707214399767,
"grad_norm": 3.36338210105896,
"learning_rate": 9.014848666026138e-06,
"loss": 0.722,
"step": 4186
},
{
"epoch": 1.2155610393380751,
"grad_norm": 3.8048529624938965,
"learning_rate": 9.01427629190503e-06,
"loss": 0.8724,
"step": 4187
},
{
"epoch": 1.2158513572361735,
"grad_norm": 3.8319287300109863,
"learning_rate": 9.013703769739007e-06,
"loss": 0.8544,
"step": 4188
},
{
"epoch": 1.216141675134272,
"grad_norm": 3.9430227279663086,
"learning_rate": 9.01313109954918e-06,
"loss": 0.7627,
"step": 4189
},
{
"epoch": 1.2164319930323704,
"grad_norm": 3.7642529010772705,
"learning_rate": 9.01255828135667e-06,
"loss": 0.7264,
"step": 4190
},
{
"epoch": 1.2167223109304688,
"grad_norm": 3.522141933441162,
"learning_rate": 9.011985315182605e-06,
"loss": 0.8301,
"step": 4191
},
{
"epoch": 1.2170126288285672,
"grad_norm": 3.0998566150665283,
"learning_rate": 9.011412201048113e-06,
"loss": 0.7483,
"step": 4192
},
{
"epoch": 1.2173029467266656,
"grad_norm": 3.6285431385040283,
"learning_rate": 9.010838938974329e-06,
"loss": 0.7769,
"step": 4193
},
{
"epoch": 1.217593264624764,
"grad_norm": 4.2689337730407715,
"learning_rate": 9.010265528982398e-06,
"loss": 0.9484,
"step": 4194
},
{
"epoch": 1.2178835825228624,
"grad_norm": 3.3270440101623535,
"learning_rate": 9.009691971093467e-06,
"loss": 0.8008,
"step": 4195
},
{
"epoch": 1.218173900420961,
"grad_norm": 3.4125139713287354,
"learning_rate": 9.009118265328684e-06,
"loss": 0.7329,
"step": 4196
},
{
"epoch": 1.2184642183190593,
"grad_norm": 3.2748773097991943,
"learning_rate": 9.008544411709214e-06,
"loss": 0.69,
"step": 4197
},
{
"epoch": 1.218754536217158,
"grad_norm": 3.5631113052368164,
"learning_rate": 9.007970410256216e-06,
"loss": 0.7348,
"step": 4198
},
{
"epoch": 1.219044854115256,
"grad_norm": 3.6760542392730713,
"learning_rate": 9.007396260990857e-06,
"loss": 0.8198,
"step": 4199
},
{
"epoch": 1.2193351720133547,
"grad_norm": 3.3203012943267822,
"learning_rate": 9.006821963934316e-06,
"loss": 0.7226,
"step": 4200
},
{
"epoch": 1.219625489911453,
"grad_norm": 4.029517650604248,
"learning_rate": 9.006247519107771e-06,
"loss": 0.7686,
"step": 4201
},
{
"epoch": 1.2199158078095516,
"grad_norm": 4.306983470916748,
"learning_rate": 9.005672926532408e-06,
"loss": 0.8475,
"step": 4202
},
{
"epoch": 1.22020612570765,
"grad_norm": 3.5306789875030518,
"learning_rate": 9.005098186229417e-06,
"loss": 0.7178,
"step": 4203
},
{
"epoch": 1.2204964436057484,
"grad_norm": 3.456655502319336,
"learning_rate": 9.004523298219993e-06,
"loss": 0.7594,
"step": 4204
},
{
"epoch": 1.2207867615038468,
"grad_norm": 3.8073463439941406,
"learning_rate": 9.003948262525341e-06,
"loss": 0.82,
"step": 4205
},
{
"epoch": 1.2210770794019452,
"grad_norm": 3.5894739627838135,
"learning_rate": 9.003373079166664e-06,
"loss": 0.7883,
"step": 4206
},
{
"epoch": 1.2213673973000436,
"grad_norm": 3.461728572845459,
"learning_rate": 9.002797748165178e-06,
"loss": 0.8509,
"step": 4207
},
{
"epoch": 1.221657715198142,
"grad_norm": 3.460731267929077,
"learning_rate": 9.002222269542098e-06,
"loss": 0.8584,
"step": 4208
},
{
"epoch": 1.2219480330962404,
"grad_norm": 3.5668509006500244,
"learning_rate": 9.00164664331865e-06,
"loss": 0.8295,
"step": 4209
},
{
"epoch": 1.2222383509943389,
"grad_norm": 3.156965970993042,
"learning_rate": 9.001070869516062e-06,
"loss": 0.7822,
"step": 4210
},
{
"epoch": 1.2225286688924373,
"grad_norm": 3.166682720184326,
"learning_rate": 9.000494948155567e-06,
"loss": 0.7692,
"step": 4211
},
{
"epoch": 1.2228189867905357,
"grad_norm": 3.3912453651428223,
"learning_rate": 8.999918879258406e-06,
"loss": 0.7951,
"step": 4212
},
{
"epoch": 1.223109304688634,
"grad_norm": 3.546839952468872,
"learning_rate": 8.999342662845826e-06,
"loss": 0.7712,
"step": 4213
},
{
"epoch": 1.2233996225867325,
"grad_norm": 3.8041069507598877,
"learning_rate": 8.998766298939074e-06,
"loss": 0.8666,
"step": 4214
},
{
"epoch": 1.223689940484831,
"grad_norm": 3.5458247661590576,
"learning_rate": 8.998189787559408e-06,
"loss": 0.8102,
"step": 4215
},
{
"epoch": 1.2239802583829293,
"grad_norm": 3.452237367630005,
"learning_rate": 8.997613128728089e-06,
"loss": 0.7241,
"step": 4216
},
{
"epoch": 1.2242705762810278,
"grad_norm": 3.775862216949463,
"learning_rate": 8.997036322466385e-06,
"loss": 0.7433,
"step": 4217
},
{
"epoch": 1.2245608941791262,
"grad_norm": 3.6754865646362305,
"learning_rate": 8.996459368795567e-06,
"loss": 0.8025,
"step": 4218
},
{
"epoch": 1.2248512120772246,
"grad_norm": 3.375824213027954,
"learning_rate": 8.995882267736913e-06,
"loss": 0.7066,
"step": 4219
},
{
"epoch": 1.225141529975323,
"grad_norm": 3.4623117446899414,
"learning_rate": 8.995305019311708e-06,
"loss": 0.785,
"step": 4220
},
{
"epoch": 1.2254318478734214,
"grad_norm": 3.7280542850494385,
"learning_rate": 8.994727623541237e-06,
"loss": 0.7869,
"step": 4221
},
{
"epoch": 1.2257221657715198,
"grad_norm": 4.037339210510254,
"learning_rate": 8.9941500804468e-06,
"loss": 0.8466,
"step": 4222
},
{
"epoch": 1.2260124836696182,
"grad_norm": 3.8792598247528076,
"learning_rate": 8.99357239004969e-06,
"loss": 0.9094,
"step": 4223
},
{
"epoch": 1.2263028015677166,
"grad_norm": 3.7027788162231445,
"learning_rate": 8.992994552371217e-06,
"loss": 0.7475,
"step": 4224
},
{
"epoch": 1.226593119465815,
"grad_norm": 3.8787484169006348,
"learning_rate": 8.992416567432688e-06,
"loss": 0.9464,
"step": 4225
},
{
"epoch": 1.2268834373639135,
"grad_norm": 3.166562080383301,
"learning_rate": 8.991838435255422e-06,
"loss": 0.762,
"step": 4226
},
{
"epoch": 1.227173755262012,
"grad_norm": 3.317545175552368,
"learning_rate": 8.991260155860737e-06,
"loss": 0.6764,
"step": 4227
},
{
"epoch": 1.2274640731601103,
"grad_norm": 3.3221254348754883,
"learning_rate": 8.990681729269962e-06,
"loss": 0.8601,
"step": 4228
},
{
"epoch": 1.2277543910582087,
"grad_norm": 3.914020299911499,
"learning_rate": 8.990103155504428e-06,
"loss": 0.8584,
"step": 4229
},
{
"epoch": 1.2280447089563071,
"grad_norm": 3.6654372215270996,
"learning_rate": 8.989524434585473e-06,
"loss": 0.7289,
"step": 4230
},
{
"epoch": 1.2283350268544055,
"grad_norm": 3.4380693435668945,
"learning_rate": 8.988945566534442e-06,
"loss": 0.7692,
"step": 4231
},
{
"epoch": 1.228625344752504,
"grad_norm": 3.8467538356781006,
"learning_rate": 8.98836655137268e-06,
"loss": 0.9227,
"step": 4232
},
{
"epoch": 1.2289156626506024,
"grad_norm": 3.577817916870117,
"learning_rate": 8.987787389121542e-06,
"loss": 0.7317,
"step": 4233
},
{
"epoch": 1.2292059805487008,
"grad_norm": 3.5391640663146973,
"learning_rate": 8.987208079802387e-06,
"loss": 0.7497,
"step": 4234
},
{
"epoch": 1.2294962984467992,
"grad_norm": 3.71026611328125,
"learning_rate": 8.986628623436583e-06,
"loss": 0.7541,
"step": 4235
},
{
"epoch": 1.2297866163448976,
"grad_norm": 3.2825422286987305,
"learning_rate": 8.986049020045495e-06,
"loss": 0.8143,
"step": 4236
},
{
"epoch": 1.230076934242996,
"grad_norm": 3.931927442550659,
"learning_rate": 8.9854692696505e-06,
"loss": 0.8363,
"step": 4237
},
{
"epoch": 1.2303672521410944,
"grad_norm": 3.6304123401641846,
"learning_rate": 8.984889372272982e-06,
"loss": 0.7422,
"step": 4238
},
{
"epoch": 1.2306575700391928,
"grad_norm": 3.913593053817749,
"learning_rate": 8.984309327934326e-06,
"loss": 0.7626,
"step": 4239
},
{
"epoch": 1.2309478879372913,
"grad_norm": 3.2616569995880127,
"learning_rate": 8.983729136655921e-06,
"loss": 0.6163,
"step": 4240
},
{
"epoch": 1.2312382058353897,
"grad_norm": 4.207817554473877,
"learning_rate": 8.983148798459167e-06,
"loss": 0.8562,
"step": 4241
},
{
"epoch": 1.231528523733488,
"grad_norm": 3.02081561088562,
"learning_rate": 8.982568313365467e-06,
"loss": 0.6839,
"step": 4242
},
{
"epoch": 1.2318188416315865,
"grad_norm": 3.8226892948150635,
"learning_rate": 8.981987681396226e-06,
"loss": 0.8784,
"step": 4243
},
{
"epoch": 1.232109159529685,
"grad_norm": 3.748441696166992,
"learning_rate": 8.981406902572862e-06,
"loss": 0.8386,
"step": 4244
},
{
"epoch": 1.2323994774277833,
"grad_norm": 3.492546319961548,
"learning_rate": 8.98082597691679e-06,
"loss": 0.7597,
"step": 4245
},
{
"epoch": 1.2326897953258817,
"grad_norm": 3.4718661308288574,
"learning_rate": 8.980244904449436e-06,
"loss": 0.7796,
"step": 4246
},
{
"epoch": 1.2329801132239804,
"grad_norm": 3.1242318153381348,
"learning_rate": 8.97966368519223e-06,
"loss": 0.5742,
"step": 4247
},
{
"epoch": 1.2332704311220786,
"grad_norm": 3.907931327819824,
"learning_rate": 8.979082319166605e-06,
"loss": 0.8138,
"step": 4248
},
{
"epoch": 1.2335607490201772,
"grad_norm": 3.067992925643921,
"learning_rate": 8.978500806394004e-06,
"loss": 0.6971,
"step": 4249
},
{
"epoch": 1.2338510669182754,
"grad_norm": 3.232266664505005,
"learning_rate": 8.977919146895872e-06,
"loss": 0.7405,
"step": 4250
},
{
"epoch": 1.234141384816374,
"grad_norm": 3.50213623046875,
"learning_rate": 8.977337340693662e-06,
"loss": 0.686,
"step": 4251
},
{
"epoch": 1.2344317027144722,
"grad_norm": 3.8020687103271484,
"learning_rate": 8.976755387808826e-06,
"loss": 0.7404,
"step": 4252
},
{
"epoch": 1.2347220206125709,
"grad_norm": 3.3541903495788574,
"learning_rate": 8.976173288262832e-06,
"loss": 0.7247,
"step": 4253
},
{
"epoch": 1.2350123385106693,
"grad_norm": 3.84443736076355,
"learning_rate": 8.975591042077144e-06,
"loss": 0.8052,
"step": 4254
},
{
"epoch": 1.2353026564087677,
"grad_norm": 3.4659833908081055,
"learning_rate": 8.975008649273238e-06,
"loss": 0.7656,
"step": 4255
},
{
"epoch": 1.235592974306866,
"grad_norm": 3.320693254470825,
"learning_rate": 8.974426109872587e-06,
"loss": 0.6717,
"step": 4256
},
{
"epoch": 1.2358832922049645,
"grad_norm": 3.577528953552246,
"learning_rate": 8.97384342389668e-06,
"loss": 0.7556,
"step": 4257
},
{
"epoch": 1.236173610103063,
"grad_norm": 3.8595802783966064,
"learning_rate": 8.973260591367006e-06,
"loss": 0.8209,
"step": 4258
},
{
"epoch": 1.2364639280011613,
"grad_norm": 3.4095239639282227,
"learning_rate": 8.972677612305056e-06,
"loss": 0.733,
"step": 4259
},
{
"epoch": 1.2367542458992598,
"grad_norm": 3.280168294906616,
"learning_rate": 8.972094486732332e-06,
"loss": 0.6605,
"step": 4260
},
{
"epoch": 1.2370445637973582,
"grad_norm": 2.979154586791992,
"learning_rate": 8.971511214670342e-06,
"loss": 0.6957,
"step": 4261
},
{
"epoch": 1.2373348816954566,
"grad_norm": 3.2444956302642822,
"learning_rate": 8.970927796140592e-06,
"loss": 0.8197,
"step": 4262
},
{
"epoch": 1.237625199593555,
"grad_norm": 3.193018913269043,
"learning_rate": 8.970344231164602e-06,
"loss": 0.7737,
"step": 4263
},
{
"epoch": 1.2379155174916534,
"grad_norm": 3.533512830734253,
"learning_rate": 8.969760519763891e-06,
"loss": 0.8184,
"step": 4264
},
{
"epoch": 1.2382058353897518,
"grad_norm": 3.282985210418701,
"learning_rate": 8.969176661959989e-06,
"loss": 0.7852,
"step": 4265
},
{
"epoch": 1.2384961532878502,
"grad_norm": 3.325979471206665,
"learning_rate": 8.968592657774427e-06,
"loss": 0.7307,
"step": 4266
},
{
"epoch": 1.2387864711859486,
"grad_norm": 3.227482318878174,
"learning_rate": 8.96800850722874e-06,
"loss": 0.7528,
"step": 4267
},
{
"epoch": 1.239076789084047,
"grad_norm": 3.809748888015747,
"learning_rate": 8.967424210344475e-06,
"loss": 0.8771,
"step": 4268
},
{
"epoch": 1.2393671069821455,
"grad_norm": 3.711108684539795,
"learning_rate": 8.96683976714318e-06,
"loss": 0.7809,
"step": 4269
},
{
"epoch": 1.2396574248802439,
"grad_norm": 3.6016719341278076,
"learning_rate": 8.96625517764641e-06,
"loss": 0.8463,
"step": 4270
},
{
"epoch": 1.2399477427783423,
"grad_norm": 3.190556049346924,
"learning_rate": 8.965670441875722e-06,
"loss": 0.7897,
"step": 4271
},
{
"epoch": 1.2402380606764407,
"grad_norm": 3.8056397438049316,
"learning_rate": 8.965085559852682e-06,
"loss": 0.7555,
"step": 4272
},
{
"epoch": 1.2405283785745391,
"grad_norm": 3.822848081588745,
"learning_rate": 8.964500531598859e-06,
"loss": 0.7953,
"step": 4273
},
{
"epoch": 1.2408186964726375,
"grad_norm": 3.6595678329467773,
"learning_rate": 8.963915357135831e-06,
"loss": 0.8042,
"step": 4274
},
{
"epoch": 1.241109014370736,
"grad_norm": 3.2902088165283203,
"learning_rate": 8.963330036485177e-06,
"loss": 0.6457,
"step": 4275
},
{
"epoch": 1.2413993322688344,
"grad_norm": 3.0377769470214844,
"learning_rate": 8.962744569668485e-06,
"loss": 0.7047,
"step": 4276
},
{
"epoch": 1.2416896501669328,
"grad_norm": 3.4491989612579346,
"learning_rate": 8.962158956707343e-06,
"loss": 0.7604,
"step": 4277
},
{
"epoch": 1.2419799680650312,
"grad_norm": 3.833693027496338,
"learning_rate": 8.961573197623353e-06,
"loss": 0.7477,
"step": 4278
},
{
"epoch": 1.2422702859631296,
"grad_norm": 3.5604989528656006,
"learning_rate": 8.960987292438117e-06,
"loss": 0.7044,
"step": 4279
},
{
"epoch": 1.242560603861228,
"grad_norm": 4.023108959197998,
"learning_rate": 8.96040124117324e-06,
"loss": 0.8121,
"step": 4280
},
{
"epoch": 1.2428509217593264,
"grad_norm": 4.016019821166992,
"learning_rate": 8.959815043850336e-06,
"loss": 0.8181,
"step": 4281
},
{
"epoch": 1.2431412396574248,
"grad_norm": 3.4648163318634033,
"learning_rate": 8.959228700491025e-06,
"loss": 0.7576,
"step": 4282
},
{
"epoch": 1.2434315575555233,
"grad_norm": 3.7959625720977783,
"learning_rate": 8.958642211116932e-06,
"loss": 0.8032,
"step": 4283
},
{
"epoch": 1.2437218754536217,
"grad_norm": 3.156304121017456,
"learning_rate": 8.958055575749685e-06,
"loss": 0.6847,
"step": 4284
},
{
"epoch": 1.24401219335172,
"grad_norm": 3.544156789779663,
"learning_rate": 8.957468794410918e-06,
"loss": 0.8136,
"step": 4285
},
{
"epoch": 1.2443025112498185,
"grad_norm": 3.452969551086426,
"learning_rate": 8.956881867122272e-06,
"loss": 0.8339,
"step": 4286
},
{
"epoch": 1.244592829147917,
"grad_norm": 3.346737861633301,
"learning_rate": 8.956294793905394e-06,
"loss": 0.6818,
"step": 4287
},
{
"epoch": 1.2448831470460153,
"grad_norm": 3.5661866664886475,
"learning_rate": 8.955707574781934e-06,
"loss": 0.8036,
"step": 4288
},
{
"epoch": 1.2451734649441137,
"grad_norm": 3.5071399211883545,
"learning_rate": 8.955120209773549e-06,
"loss": 0.7945,
"step": 4289
},
{
"epoch": 1.2454637828422122,
"grad_norm": 3.2883074283599854,
"learning_rate": 8.954532698901899e-06,
"loss": 0.7716,
"step": 4290
},
{
"epoch": 1.2457541007403106,
"grad_norm": 3.3931667804718018,
"learning_rate": 8.953945042188652e-06,
"loss": 0.7448,
"step": 4291
},
{
"epoch": 1.246044418638409,
"grad_norm": 3.219741106033325,
"learning_rate": 8.953357239655482e-06,
"loss": 0.739,
"step": 4292
},
{
"epoch": 1.2463347365365074,
"grad_norm": 3.6574721336364746,
"learning_rate": 8.952769291324065e-06,
"loss": 0.842,
"step": 4293
},
{
"epoch": 1.2466250544346058,
"grad_norm": 3.3695685863494873,
"learning_rate": 8.952181197216086e-06,
"loss": 0.7608,
"step": 4294
},
{
"epoch": 1.2469153723327042,
"grad_norm": 3.4170355796813965,
"learning_rate": 8.951592957353233e-06,
"loss": 0.7691,
"step": 4295
},
{
"epoch": 1.2472056902308026,
"grad_norm": 3.5159530639648438,
"learning_rate": 8.9510045717572e-06,
"loss": 0.7036,
"step": 4296
},
{
"epoch": 1.247496008128901,
"grad_norm": 3.3947741985321045,
"learning_rate": 8.950416040449684e-06,
"loss": 0.7098,
"step": 4297
},
{
"epoch": 1.2477863260269997,
"grad_norm": 3.538968801498413,
"learning_rate": 8.949827363452394e-06,
"loss": 0.7997,
"step": 4298
},
{
"epoch": 1.2480766439250979,
"grad_norm": 3.8506956100463867,
"learning_rate": 8.949238540787038e-06,
"loss": 0.8263,
"step": 4299
},
{
"epoch": 1.2483669618231965,
"grad_norm": 3.439701795578003,
"learning_rate": 8.948649572475332e-06,
"loss": 0.8389,
"step": 4300
},
{
"epoch": 1.2486572797212947,
"grad_norm": 3.6517250537872314,
"learning_rate": 8.948060458538996e-06,
"loss": 0.8981,
"step": 4301
},
{
"epoch": 1.2489475976193933,
"grad_norm": 3.491595983505249,
"learning_rate": 8.947471198999758e-06,
"loss": 0.729,
"step": 4302
},
{
"epoch": 1.2492379155174917,
"grad_norm": 3.2227985858917236,
"learning_rate": 8.946881793879348e-06,
"loss": 0.7198,
"step": 4303
},
{
"epoch": 1.2495282334155902,
"grad_norm": 3.37418532371521,
"learning_rate": 8.946292243199504e-06,
"loss": 0.7225,
"step": 4304
},
{
"epoch": 1.2498185513136886,
"grad_norm": 3.6257195472717285,
"learning_rate": 8.94570254698197e-06,
"loss": 0.8104,
"step": 4305
},
{
"epoch": 1.250108869211787,
"grad_norm": 3.424806833267212,
"learning_rate": 8.945112705248488e-06,
"loss": 0.7668,
"step": 4306
},
{
"epoch": 1.2503991871098854,
"grad_norm": 3.6353793144226074,
"learning_rate": 8.944522718020818e-06,
"loss": 0.6752,
"step": 4307
},
{
"epoch": 1.2506895050079838,
"grad_norm": 3.7617337703704834,
"learning_rate": 8.943932585320714e-06,
"loss": 0.9097,
"step": 4308
},
{
"epoch": 1.2509798229060822,
"grad_norm": 3.1361441612243652,
"learning_rate": 8.943342307169942e-06,
"loss": 0.6137,
"step": 4309
},
{
"epoch": 1.2512701408041806,
"grad_norm": 3.2930431365966797,
"learning_rate": 8.94275188359027e-06,
"loss": 0.6702,
"step": 4310
},
{
"epoch": 1.251560458702279,
"grad_norm": 3.5887277126312256,
"learning_rate": 8.942161314603475e-06,
"loss": 0.7784,
"step": 4311
},
{
"epoch": 1.2518507766003775,
"grad_norm": 3.7460267543792725,
"learning_rate": 8.941570600231333e-06,
"loss": 0.8589,
"step": 4312
},
{
"epoch": 1.2521410944984759,
"grad_norm": 3.7701773643493652,
"learning_rate": 8.940979740495632e-06,
"loss": 0.8413,
"step": 4313
},
{
"epoch": 1.2524314123965743,
"grad_norm": 3.804666519165039,
"learning_rate": 8.940388735418163e-06,
"loss": 0.7439,
"step": 4314
},
{
"epoch": 1.2527217302946727,
"grad_norm": 3.4871022701263428,
"learning_rate": 8.93979758502072e-06,
"loss": 0.6554,
"step": 4315
},
{
"epoch": 1.2530120481927711,
"grad_norm": 4.020226001739502,
"learning_rate": 8.939206289325107e-06,
"loss": 0.8006,
"step": 4316
},
{
"epoch": 1.2533023660908695,
"grad_norm": 3.413485527038574,
"learning_rate": 8.938614848353127e-06,
"loss": 0.7265,
"step": 4317
},
{
"epoch": 1.253592683988968,
"grad_norm": 3.1707980632781982,
"learning_rate": 8.938023262126596e-06,
"loss": 0.727,
"step": 4318
},
{
"epoch": 1.2538830018870664,
"grad_norm": 3.4203269481658936,
"learning_rate": 8.937431530667329e-06,
"loss": 0.7808,
"step": 4319
},
{
"epoch": 1.2541733197851648,
"grad_norm": 3.5568814277648926,
"learning_rate": 8.93683965399715e-06,
"loss": 0.8797,
"step": 4320
},
{
"epoch": 1.2544636376832632,
"grad_norm": 3.493055820465088,
"learning_rate": 8.936247632137886e-06,
"loss": 0.7317,
"step": 4321
},
{
"epoch": 1.2547539555813616,
"grad_norm": 3.5168776512145996,
"learning_rate": 8.935655465111372e-06,
"loss": 0.7399,
"step": 4322
},
{
"epoch": 1.25504427347946,
"grad_norm": 3.694639205932617,
"learning_rate": 8.935063152939446e-06,
"loss": 0.7509,
"step": 4323
},
{
"epoch": 1.2553345913775584,
"grad_norm": 3.880681276321411,
"learning_rate": 8.934470695643955e-06,
"loss": 0.7885,
"step": 4324
},
{
"epoch": 1.2556249092756568,
"grad_norm": 3.654292345046997,
"learning_rate": 8.933878093246744e-06,
"loss": 0.7816,
"step": 4325
},
{
"epoch": 1.2559152271737553,
"grad_norm": 3.8426339626312256,
"learning_rate": 8.933285345769671e-06,
"loss": 0.7581,
"step": 4326
},
{
"epoch": 1.2562055450718537,
"grad_norm": 4.196420192718506,
"learning_rate": 8.932692453234596e-06,
"loss": 0.9055,
"step": 4327
},
{
"epoch": 1.256495862969952,
"grad_norm": 3.6766929626464844,
"learning_rate": 8.93209941566338e-06,
"loss": 0.7715,
"step": 4328
},
{
"epoch": 1.2567861808680505,
"grad_norm": 3.5587241649627686,
"learning_rate": 8.9315062330779e-06,
"loss": 0.7941,
"step": 4329
},
{
"epoch": 1.257076498766149,
"grad_norm": 3.5319676399230957,
"learning_rate": 8.930912905500032e-06,
"loss": 0.7719,
"step": 4330
},
{
"epoch": 1.2573668166642473,
"grad_norm": 3.6964783668518066,
"learning_rate": 8.930319432951655e-06,
"loss": 0.8323,
"step": 4331
},
{
"epoch": 1.2576571345623457,
"grad_norm": 3.3253002166748047,
"learning_rate": 8.929725815454656e-06,
"loss": 0.7429,
"step": 4332
},
{
"epoch": 1.2579474524604441,
"grad_norm": 3.380309581756592,
"learning_rate": 8.929132053030928e-06,
"loss": 0.6763,
"step": 4333
},
{
"epoch": 1.2582377703585426,
"grad_norm": 3.194960117340088,
"learning_rate": 8.928538145702372e-06,
"loss": 0.6991,
"step": 4334
},
{
"epoch": 1.258528088256641,
"grad_norm": 3.830277681350708,
"learning_rate": 8.927944093490886e-06,
"loss": 0.8593,
"step": 4335
},
{
"epoch": 1.2588184061547394,
"grad_norm": 3.335928440093994,
"learning_rate": 8.92734989641838e-06,
"loss": 0.7855,
"step": 4336
},
{
"epoch": 1.2591087240528378,
"grad_norm": 3.180267572402954,
"learning_rate": 8.92675555450677e-06,
"loss": 0.6565,
"step": 4337
},
{
"epoch": 1.2593990419509362,
"grad_norm": 3.597320795059204,
"learning_rate": 8.926161067777973e-06,
"loss": 0.8024,
"step": 4338
},
{
"epoch": 1.2596893598490346,
"grad_norm": 3.2640135288238525,
"learning_rate": 8.925566436253915e-06,
"loss": 0.6889,
"step": 4339
},
{
"epoch": 1.259979677747133,
"grad_norm": 3.3412210941314697,
"learning_rate": 8.924971659956523e-06,
"loss": 0.703,
"step": 4340
},
{
"epoch": 1.2602699956452315,
"grad_norm": 3.2234513759613037,
"learning_rate": 8.924376738907734e-06,
"loss": 0.8093,
"step": 4341
},
{
"epoch": 1.2605603135433299,
"grad_norm": 3.5414047241210938,
"learning_rate": 8.923781673129488e-06,
"loss": 0.7886,
"step": 4342
},
{
"epoch": 1.2608506314414283,
"grad_norm": 3.6356825828552246,
"learning_rate": 8.923186462643732e-06,
"loss": 0.8428,
"step": 4343
},
{
"epoch": 1.2611409493395267,
"grad_norm": 3.2509765625,
"learning_rate": 8.922591107472413e-06,
"loss": 0.6025,
"step": 4344
},
{
"epoch": 1.2614312672376253,
"grad_norm": 3.6975150108337402,
"learning_rate": 8.921995607637494e-06,
"loss": 0.8912,
"step": 4345
},
{
"epoch": 1.2617215851357235,
"grad_norm": 3.27187180519104,
"learning_rate": 8.921399963160934e-06,
"loss": 0.8242,
"step": 4346
},
{
"epoch": 1.2620119030338222,
"grad_norm": 3.6707258224487305,
"learning_rate": 8.920804174064697e-06,
"loss": 0.7924,
"step": 4347
},
{
"epoch": 1.2623022209319203,
"grad_norm": 3.329015016555786,
"learning_rate": 8.920208240370757e-06,
"loss": 0.6646,
"step": 4348
},
{
"epoch": 1.262592538830019,
"grad_norm": 3.4273433685302734,
"learning_rate": 8.919612162101096e-06,
"loss": 0.7172,
"step": 4349
},
{
"epoch": 1.2628828567281172,
"grad_norm": 3.6761045455932617,
"learning_rate": 8.919015939277693e-06,
"loss": 0.7967,
"step": 4350
},
{
"epoch": 1.2631731746262158,
"grad_norm": 3.431152105331421,
"learning_rate": 8.918419571922536e-06,
"loss": 0.7262,
"step": 4351
},
{
"epoch": 1.263463492524314,
"grad_norm": 3.728382110595703,
"learning_rate": 8.917823060057622e-06,
"loss": 0.8809,
"step": 4352
},
{
"epoch": 1.2637538104224126,
"grad_norm": 3.5108156204223633,
"learning_rate": 8.917226403704947e-06,
"loss": 0.8824,
"step": 4353
},
{
"epoch": 1.2640441283205108,
"grad_norm": 4.058180809020996,
"learning_rate": 8.916629602886518e-06,
"loss": 0.9238,
"step": 4354
},
{
"epoch": 1.2643344462186095,
"grad_norm": 3.4847519397735596,
"learning_rate": 8.916032657624342e-06,
"loss": 0.7447,
"step": 4355
},
{
"epoch": 1.2646247641167079,
"grad_norm": 3.2892417907714844,
"learning_rate": 8.915435567940436e-06,
"loss": 0.8063,
"step": 4356
},
{
"epoch": 1.2649150820148063,
"grad_norm": 3.6869657039642334,
"learning_rate": 8.914838333856822e-06,
"loss": 0.7635,
"step": 4357
},
{
"epoch": 1.2652053999129047,
"grad_norm": 3.4175963401794434,
"learning_rate": 8.914240955395522e-06,
"loss": 0.69,
"step": 4358
},
{
"epoch": 1.2654957178110031,
"grad_norm": 3.2602951526641846,
"learning_rate": 8.913643432578567e-06,
"loss": 0.7531,
"step": 4359
},
{
"epoch": 1.2657860357091015,
"grad_norm": 3.464566469192505,
"learning_rate": 8.913045765428e-06,
"loss": 0.7623,
"step": 4360
},
{
"epoch": 1.2660763536072,
"grad_norm": 3.740095615386963,
"learning_rate": 8.912447953965854e-06,
"loss": 0.7427,
"step": 4361
},
{
"epoch": 1.2663666715052984,
"grad_norm": 3.2100818157196045,
"learning_rate": 8.911849998214182e-06,
"loss": 0.7076,
"step": 4362
},
{
"epoch": 1.2666569894033968,
"grad_norm": 4.004035472869873,
"learning_rate": 8.911251898195033e-06,
"loss": 0.9656,
"step": 4363
},
{
"epoch": 1.2669473073014952,
"grad_norm": 3.215731143951416,
"learning_rate": 8.910653653930466e-06,
"loss": 0.7425,
"step": 4364
},
{
"epoch": 1.2672376251995936,
"grad_norm": 3.169572353363037,
"learning_rate": 8.910055265442546e-06,
"loss": 0.606,
"step": 4365
},
{
"epoch": 1.267527943097692,
"grad_norm": 3.384373903274536,
"learning_rate": 8.909456732753339e-06,
"loss": 0.7641,
"step": 4366
},
{
"epoch": 1.2678182609957904,
"grad_norm": 3.2704479694366455,
"learning_rate": 8.908858055884919e-06,
"loss": 0.7908,
"step": 4367
},
{
"epoch": 1.2681085788938888,
"grad_norm": 3.2683961391448975,
"learning_rate": 8.908259234859365e-06,
"loss": 0.7381,
"step": 4368
},
{
"epoch": 1.2683988967919873,
"grad_norm": 3.749446153640747,
"learning_rate": 8.90766026969876e-06,
"loss": 0.7697,
"step": 4369
},
{
"epoch": 1.2686892146900857,
"grad_norm": 3.1948935985565186,
"learning_rate": 8.907061160425196e-06,
"loss": 0.7704,
"step": 4370
},
{
"epoch": 1.268979532588184,
"grad_norm": 3.549154281616211,
"learning_rate": 8.906461907060766e-06,
"loss": 0.7984,
"step": 4371
},
{
"epoch": 1.2692698504862825,
"grad_norm": 3.7574455738067627,
"learning_rate": 8.905862509627573e-06,
"loss": 0.8247,
"step": 4372
},
{
"epoch": 1.269560168384381,
"grad_norm": 3.7579362392425537,
"learning_rate": 8.905262968147719e-06,
"loss": 0.8506,
"step": 4373
},
{
"epoch": 1.2698504862824793,
"grad_norm": 3.5681581497192383,
"learning_rate": 8.904663282643317e-06,
"loss": 0.8562,
"step": 4374
},
{
"epoch": 1.2701408041805777,
"grad_norm": 3.9688186645507812,
"learning_rate": 8.904063453136483e-06,
"loss": 0.7506,
"step": 4375
},
{
"epoch": 1.2704311220786761,
"grad_norm": 3.3955612182617188,
"learning_rate": 8.90346347964934e-06,
"loss": 0.8032,
"step": 4376
},
{
"epoch": 1.2707214399767746,
"grad_norm": 3.876274585723877,
"learning_rate": 8.90286336220401e-06,
"loss": 0.8659,
"step": 4377
},
{
"epoch": 1.271011757874873,
"grad_norm": 3.3711607456207275,
"learning_rate": 8.902263100822628e-06,
"loss": 0.8466,
"step": 4378
},
{
"epoch": 1.2713020757729714,
"grad_norm": 3.78266978263855,
"learning_rate": 8.901662695527333e-06,
"loss": 0.7602,
"step": 4379
},
{
"epoch": 1.2715923936710698,
"grad_norm": 3.5354392528533936,
"learning_rate": 8.901062146340264e-06,
"loss": 0.7627,
"step": 4380
},
{
"epoch": 1.2718827115691682,
"grad_norm": 3.4958252906799316,
"learning_rate": 8.900461453283573e-06,
"loss": 0.7408,
"step": 4381
},
{
"epoch": 1.2721730294672666,
"grad_norm": 3.33056902885437,
"learning_rate": 8.899860616379413e-06,
"loss": 0.6797,
"step": 4382
},
{
"epoch": 1.272463347365365,
"grad_norm": 3.6068787574768066,
"learning_rate": 8.899259635649937e-06,
"loss": 0.7534,
"step": 4383
},
{
"epoch": 1.2727536652634635,
"grad_norm": 3.752138376235962,
"learning_rate": 8.898658511117316e-06,
"loss": 0.862,
"step": 4384
},
{
"epoch": 1.2730439831615619,
"grad_norm": 4.157615661621094,
"learning_rate": 8.898057242803715e-06,
"loss": 0.9252,
"step": 4385
},
{
"epoch": 1.2733343010596603,
"grad_norm": 3.7800021171569824,
"learning_rate": 8.89745583073131e-06,
"loss": 0.9327,
"step": 4386
},
{
"epoch": 1.2736246189577587,
"grad_norm": 3.5581021308898926,
"learning_rate": 8.89685427492228e-06,
"loss": 0.7756,
"step": 4387
},
{
"epoch": 1.273914936855857,
"grad_norm": 4.283809185028076,
"learning_rate": 8.896252575398812e-06,
"loss": 0.7042,
"step": 4388
},
{
"epoch": 1.2742052547539555,
"grad_norm": 3.8366518020629883,
"learning_rate": 8.895650732183094e-06,
"loss": 0.7813,
"step": 4389
},
{
"epoch": 1.274495572652054,
"grad_norm": 3.6063332557678223,
"learning_rate": 8.895048745297324e-06,
"loss": 0.8001,
"step": 4390
},
{
"epoch": 1.2747858905501523,
"grad_norm": 3.7101552486419678,
"learning_rate": 8.894446614763703e-06,
"loss": 0.8196,
"step": 4391
},
{
"epoch": 1.2750762084482508,
"grad_norm": 3.490100145339966,
"learning_rate": 8.893844340604433e-06,
"loss": 0.6849,
"step": 4392
},
{
"epoch": 1.2753665263463492,
"grad_norm": 3.1747055053710938,
"learning_rate": 8.89324192284173e-06,
"loss": 0.7158,
"step": 4393
},
{
"epoch": 1.2756568442444478,
"grad_norm": 3.8452651500701904,
"learning_rate": 8.892639361497812e-06,
"loss": 0.8298,
"step": 4394
},
{
"epoch": 1.275947162142546,
"grad_norm": 3.712412118911743,
"learning_rate": 8.892036656594898e-06,
"loss": 0.8208,
"step": 4395
},
{
"epoch": 1.2762374800406446,
"grad_norm": 3.924801826477051,
"learning_rate": 8.891433808155217e-06,
"loss": 0.7733,
"step": 4396
},
{
"epoch": 1.2765277979387428,
"grad_norm": 3.4314823150634766,
"learning_rate": 8.890830816201002e-06,
"loss": 0.7885,
"step": 4397
},
{
"epoch": 1.2768181158368415,
"grad_norm": 3.6019883155822754,
"learning_rate": 8.890227680754488e-06,
"loss": 0.8482,
"step": 4398
},
{
"epoch": 1.2771084337349397,
"grad_norm": 3.7721011638641357,
"learning_rate": 8.889624401837922e-06,
"loss": 0.8683,
"step": 4399
},
{
"epoch": 1.2773987516330383,
"grad_norm": 4.242115497589111,
"learning_rate": 8.889020979473552e-06,
"loss": 0.7933,
"step": 4400
},
{
"epoch": 1.2776890695311365,
"grad_norm": 3.3585760593414307,
"learning_rate": 8.888417413683632e-06,
"loss": 0.7908,
"step": 4401
},
{
"epoch": 1.2779793874292351,
"grad_norm": 3.427093744277954,
"learning_rate": 8.88781370449042e-06,
"loss": 0.7503,
"step": 4402
},
{
"epoch": 1.2782697053273333,
"grad_norm": 3.113924264907837,
"learning_rate": 8.887209851916184e-06,
"loss": 0.6797,
"step": 4403
},
{
"epoch": 1.278560023225432,
"grad_norm": 3.53076171875,
"learning_rate": 8.886605855983186e-06,
"loss": 0.8397,
"step": 4404
},
{
"epoch": 1.2788503411235301,
"grad_norm": 3.538825273513794,
"learning_rate": 8.88600171671371e-06,
"loss": 0.8238,
"step": 4405
},
{
"epoch": 1.2791406590216288,
"grad_norm": 3.9378812313079834,
"learning_rate": 8.885397434130032e-06,
"loss": 0.93,
"step": 4406
},
{
"epoch": 1.2794309769197272,
"grad_norm": 3.679234743118286,
"learning_rate": 8.88479300825444e-06,
"loss": 0.8222,
"step": 4407
},
{
"epoch": 1.2797212948178256,
"grad_norm": 3.579631805419922,
"learning_rate": 8.884188439109221e-06,
"loss": 0.712,
"step": 4408
},
{
"epoch": 1.280011612715924,
"grad_norm": 3.1883227825164795,
"learning_rate": 8.883583726716675e-06,
"loss": 0.7363,
"step": 4409
},
{
"epoch": 1.2803019306140224,
"grad_norm": 3.2501161098480225,
"learning_rate": 8.882978871099104e-06,
"loss": 0.7167,
"step": 4410
},
{
"epoch": 1.2805922485121208,
"grad_norm": 3.2505548000335693,
"learning_rate": 8.882373872278811e-06,
"loss": 0.7979,
"step": 4411
},
{
"epoch": 1.2808825664102192,
"grad_norm": 3.244330644607544,
"learning_rate": 8.881768730278112e-06,
"loss": 0.7045,
"step": 4412
},
{
"epoch": 1.2811728843083177,
"grad_norm": 3.533038854598999,
"learning_rate": 8.88116344511932e-06,
"loss": 0.7283,
"step": 4413
},
{
"epoch": 1.281463202206416,
"grad_norm": 3.818068027496338,
"learning_rate": 8.88055801682476e-06,
"loss": 0.7732,
"step": 4414
},
{
"epoch": 1.2817535201045145,
"grad_norm": 3.346083164215088,
"learning_rate": 8.879952445416763e-06,
"loss": 0.7547,
"step": 4415
},
{
"epoch": 1.282043838002613,
"grad_norm": 3.3234782218933105,
"learning_rate": 8.87934673091766e-06,
"loss": 0.7011,
"step": 4416
},
{
"epoch": 1.2823341559007113,
"grad_norm": 3.6858558654785156,
"learning_rate": 8.878740873349786e-06,
"loss": 0.7762,
"step": 4417
},
{
"epoch": 1.2826244737988097,
"grad_norm": 3.9861769676208496,
"learning_rate": 8.878134872735488e-06,
"loss": 0.7367,
"step": 4418
},
{
"epoch": 1.2829147916969081,
"grad_norm": 3.2009475231170654,
"learning_rate": 8.877528729097119e-06,
"loss": 0.6656,
"step": 4419
},
{
"epoch": 1.2832051095950066,
"grad_norm": 3.757075071334839,
"learning_rate": 8.876922442457026e-06,
"loss": 0.8415,
"step": 4420
},
{
"epoch": 1.283495427493105,
"grad_norm": 3.684903383255005,
"learning_rate": 8.87631601283757e-06,
"loss": 0.7047,
"step": 4421
},
{
"epoch": 1.2837857453912034,
"grad_norm": 3.873124122619629,
"learning_rate": 8.875709440261122e-06,
"loss": 0.9507,
"step": 4422
},
{
"epoch": 1.2840760632893018,
"grad_norm": 3.6405625343322754,
"learning_rate": 8.875102724750046e-06,
"loss": 0.7636,
"step": 4423
},
{
"epoch": 1.2843663811874002,
"grad_norm": 3.4353067874908447,
"learning_rate": 8.874495866326717e-06,
"loss": 0.7197,
"step": 4424
},
{
"epoch": 1.2846566990854986,
"grad_norm": 3.651857376098633,
"learning_rate": 8.873888865013522e-06,
"loss": 0.7654,
"step": 4425
},
{
"epoch": 1.284947016983597,
"grad_norm": 3.4452688694000244,
"learning_rate": 8.873281720832841e-06,
"loss": 0.7886,
"step": 4426
},
{
"epoch": 1.2852373348816954,
"grad_norm": 3.2013700008392334,
"learning_rate": 8.872674433807066e-06,
"loss": 0.7016,
"step": 4427
},
{
"epoch": 1.2855276527797939,
"grad_norm": 3.624314546585083,
"learning_rate": 8.872067003958597e-06,
"loss": 0.7305,
"step": 4428
},
{
"epoch": 1.2858179706778923,
"grad_norm": 3.3400025367736816,
"learning_rate": 8.871459431309832e-06,
"loss": 0.7687,
"step": 4429
},
{
"epoch": 1.2861082885759907,
"grad_norm": 3.594221353530884,
"learning_rate": 8.870851715883181e-06,
"loss": 0.7492,
"step": 4430
},
{
"epoch": 1.286398606474089,
"grad_norm": 3.681166172027588,
"learning_rate": 8.870243857701054e-06,
"loss": 0.7178,
"step": 4431
},
{
"epoch": 1.2866889243721875,
"grad_norm": 3.8900341987609863,
"learning_rate": 8.86963585678587e-06,
"loss": 0.7441,
"step": 4432
},
{
"epoch": 1.286979242270286,
"grad_norm": 3.9225640296936035,
"learning_rate": 8.86902771316005e-06,
"loss": 0.8456,
"step": 4433
},
{
"epoch": 1.2872695601683843,
"grad_norm": 4.030943393707275,
"learning_rate": 8.868419426846023e-06,
"loss": 0.917,
"step": 4434
},
{
"epoch": 1.2875598780664828,
"grad_norm": 3.665842294692993,
"learning_rate": 8.867810997866224e-06,
"loss": 0.7861,
"step": 4435
},
{
"epoch": 1.2878501959645812,
"grad_norm": 3.1855833530426025,
"learning_rate": 8.867202426243089e-06,
"loss": 0.7015,
"step": 4436
},
{
"epoch": 1.2881405138626796,
"grad_norm": 3.545858860015869,
"learning_rate": 8.866593711999065e-06,
"loss": 0.6991,
"step": 4437
},
{
"epoch": 1.288430831760778,
"grad_norm": 3.6752161979675293,
"learning_rate": 8.865984855156597e-06,
"loss": 0.9095,
"step": 4438
},
{
"epoch": 1.2887211496588764,
"grad_norm": 3.5139942169189453,
"learning_rate": 8.865375855738144e-06,
"loss": 0.7329,
"step": 4439
},
{
"epoch": 1.2890114675569748,
"grad_norm": 3.157313346862793,
"learning_rate": 8.864766713766163e-06,
"loss": 0.7239,
"step": 4440
},
{
"epoch": 1.2893017854550732,
"grad_norm": 3.623577117919922,
"learning_rate": 8.864157429263117e-06,
"loss": 0.8599,
"step": 4441
},
{
"epoch": 1.2895921033531716,
"grad_norm": 3.468719959259033,
"learning_rate": 8.86354800225148e-06,
"loss": 0.7423,
"step": 4442
},
{
"epoch": 1.28988242125127,
"grad_norm": 3.5650932788848877,
"learning_rate": 8.862938432753727e-06,
"loss": 0.7737,
"step": 4443
},
{
"epoch": 1.2901727391493685,
"grad_norm": 3.9904751777648926,
"learning_rate": 8.862328720792336e-06,
"loss": 0.8928,
"step": 4444
},
{
"epoch": 1.290463057047467,
"grad_norm": 3.572465419769287,
"learning_rate": 8.861718866389794e-06,
"loss": 0.7338,
"step": 4445
},
{
"epoch": 1.2907533749455653,
"grad_norm": 3.5529489517211914,
"learning_rate": 8.861108869568595e-06,
"loss": 0.7628,
"step": 4446
},
{
"epoch": 1.291043692843664,
"grad_norm": 4.1549763679504395,
"learning_rate": 8.860498730351232e-06,
"loss": 0.8803,
"step": 4447
},
{
"epoch": 1.2913340107417621,
"grad_norm": 3.6090340614318848,
"learning_rate": 8.859888448760207e-06,
"loss": 0.7089,
"step": 4448
},
{
"epoch": 1.2916243286398608,
"grad_norm": 3.5773282051086426,
"learning_rate": 8.859278024818028e-06,
"loss": 0.7114,
"step": 4449
},
{
"epoch": 1.291914646537959,
"grad_norm": 3.5102736949920654,
"learning_rate": 8.858667458547207e-06,
"loss": 0.6933,
"step": 4450
},
{
"epoch": 1.2922049644360576,
"grad_norm": 3.4220693111419678,
"learning_rate": 8.858056749970263e-06,
"loss": 0.8308,
"step": 4451
},
{
"epoch": 1.2924952823341558,
"grad_norm": 3.735527992248535,
"learning_rate": 8.857445899109716e-06,
"loss": 0.8135,
"step": 4452
},
{
"epoch": 1.2927856002322544,
"grad_norm": 3.3440489768981934,
"learning_rate": 8.856834905988095e-06,
"loss": 0.8015,
"step": 4453
},
{
"epoch": 1.2930759181303526,
"grad_norm": 3.7086057662963867,
"learning_rate": 8.856223770627932e-06,
"loss": 0.7704,
"step": 4454
},
{
"epoch": 1.2933662360284512,
"grad_norm": 3.410614490509033,
"learning_rate": 8.855612493051768e-06,
"loss": 0.7604,
"step": 4455
},
{
"epoch": 1.2936565539265494,
"grad_norm": 3.5582363605499268,
"learning_rate": 8.855001073282145e-06,
"loss": 0.7961,
"step": 4456
},
{
"epoch": 1.293946871824648,
"grad_norm": 3.860466241836548,
"learning_rate": 8.854389511341613e-06,
"loss": 0.8195,
"step": 4457
},
{
"epoch": 1.2942371897227465,
"grad_norm": 3.4531681537628174,
"learning_rate": 8.853777807252724e-06,
"loss": 0.7939,
"step": 4458
},
{
"epoch": 1.294527507620845,
"grad_norm": 3.2805068492889404,
"learning_rate": 8.85316596103804e-06,
"loss": 0.7967,
"step": 4459
},
{
"epoch": 1.2948178255189433,
"grad_norm": 3.298468828201294,
"learning_rate": 8.852553972720123e-06,
"loss": 0.7372,
"step": 4460
},
{
"epoch": 1.2951081434170417,
"grad_norm": 3.193430185317993,
"learning_rate": 8.851941842321545e-06,
"loss": 0.6366,
"step": 4461
},
{
"epoch": 1.2953984613151401,
"grad_norm": 3.1615333557128906,
"learning_rate": 8.851329569864882e-06,
"loss": 0.6768,
"step": 4462
},
{
"epoch": 1.2956887792132386,
"grad_norm": 3.9289627075195312,
"learning_rate": 8.85071715537271e-06,
"loss": 0.8529,
"step": 4463
},
{
"epoch": 1.295979097111337,
"grad_norm": 3.5651650428771973,
"learning_rate": 8.85010459886762e-06,
"loss": 0.7418,
"step": 4464
},
{
"epoch": 1.2962694150094354,
"grad_norm": 3.642563819885254,
"learning_rate": 8.849491900372199e-06,
"loss": 0.7399,
"step": 4465
},
{
"epoch": 1.2965597329075338,
"grad_norm": 4.064639568328857,
"learning_rate": 8.848879059909043e-06,
"loss": 0.7291,
"step": 4466
},
{
"epoch": 1.2968500508056322,
"grad_norm": 3.841418504714966,
"learning_rate": 8.848266077500757e-06,
"loss": 0.7529,
"step": 4467
},
{
"epoch": 1.2971403687037306,
"grad_norm": 3.663754463195801,
"learning_rate": 8.847652953169944e-06,
"loss": 0.8091,
"step": 4468
},
{
"epoch": 1.297430686601829,
"grad_norm": 3.4412853717803955,
"learning_rate": 8.847039686939218e-06,
"loss": 0.7146,
"step": 4469
},
{
"epoch": 1.2977210044999274,
"grad_norm": 3.2278478145599365,
"learning_rate": 8.846426278831193e-06,
"loss": 0.7616,
"step": 4470
},
{
"epoch": 1.2980113223980259,
"grad_norm": 3.4113316535949707,
"learning_rate": 8.845812728868496e-06,
"loss": 0.7473,
"step": 4471
},
{
"epoch": 1.2983016402961243,
"grad_norm": 3.9025003910064697,
"learning_rate": 8.845199037073748e-06,
"loss": 0.8915,
"step": 4472
},
{
"epoch": 1.2985919581942227,
"grad_norm": 4.0561957359313965,
"learning_rate": 8.84458520346959e-06,
"loss": 0.906,
"step": 4473
},
{
"epoch": 1.298882276092321,
"grad_norm": 3.336223840713501,
"learning_rate": 8.843971228078652e-06,
"loss": 0.713,
"step": 4474
},
{
"epoch": 1.2991725939904195,
"grad_norm": 2.9069418907165527,
"learning_rate": 8.843357110923582e-06,
"loss": 0.6755,
"step": 4475
},
{
"epoch": 1.299462911888518,
"grad_norm": 3.5413739681243896,
"learning_rate": 8.842742852027027e-06,
"loss": 0.7062,
"step": 4476
},
{
"epoch": 1.2997532297866163,
"grad_norm": 3.9641714096069336,
"learning_rate": 8.84212845141164e-06,
"loss": 0.7989,
"step": 4477
},
{
"epoch": 1.3000435476847148,
"grad_norm": 3.441683292388916,
"learning_rate": 8.84151390910008e-06,
"loss": 0.7618,
"step": 4478
},
{
"epoch": 1.3003338655828132,
"grad_norm": 3.9780099391937256,
"learning_rate": 8.840899225115012e-06,
"loss": 0.924,
"step": 4479
},
{
"epoch": 1.3006241834809116,
"grad_norm": 3.294429302215576,
"learning_rate": 8.840284399479104e-06,
"loss": 0.8258,
"step": 4480
},
{
"epoch": 1.30091450137901,
"grad_norm": 3.246403455734253,
"learning_rate": 8.839669432215032e-06,
"loss": 0.7254,
"step": 4481
},
{
"epoch": 1.3012048192771084,
"grad_norm": 3.34118390083313,
"learning_rate": 8.839054323345475e-06,
"loss": 0.6937,
"step": 4482
},
{
"epoch": 1.3014951371752068,
"grad_norm": 3.6143157482147217,
"learning_rate": 8.83843907289312e-06,
"loss": 0.8604,
"step": 4483
},
{
"epoch": 1.3017854550733052,
"grad_norm": 3.6616508960723877,
"learning_rate": 8.837823680880653e-06,
"loss": 0.7709,
"step": 4484
},
{
"epoch": 1.3020757729714036,
"grad_norm": 3.775017499923706,
"learning_rate": 8.837208147330772e-06,
"loss": 0.9203,
"step": 4485
},
{
"epoch": 1.302366090869502,
"grad_norm": 3.4333863258361816,
"learning_rate": 8.836592472266177e-06,
"loss": 0.693,
"step": 4486
},
{
"epoch": 1.3026564087676005,
"grad_norm": 3.2192022800445557,
"learning_rate": 8.835976655709574e-06,
"loss": 0.7076,
"step": 4487
},
{
"epoch": 1.3029467266656989,
"grad_norm": 3.2752268314361572,
"learning_rate": 8.835360697683675e-06,
"loss": 0.6535,
"step": 4488
},
{
"epoch": 1.3032370445637973,
"grad_norm": 3.480109691619873,
"learning_rate": 8.834744598211195e-06,
"loss": 0.7052,
"step": 4489
},
{
"epoch": 1.3035273624618957,
"grad_norm": 3.7845897674560547,
"learning_rate": 8.834128357314856e-06,
"loss": 0.7524,
"step": 4490
},
{
"epoch": 1.3038176803599941,
"grad_norm": 3.100076198577881,
"learning_rate": 8.833511975017385e-06,
"loss": 0.6737,
"step": 4491
},
{
"epoch": 1.3041079982580925,
"grad_norm": 3.8748559951782227,
"learning_rate": 8.832895451341514e-06,
"loss": 0.7552,
"step": 4492
},
{
"epoch": 1.304398316156191,
"grad_norm": 3.216489315032959,
"learning_rate": 8.832278786309979e-06,
"loss": 0.7168,
"step": 4493
},
{
"epoch": 1.3046886340542894,
"grad_norm": 3.7815732955932617,
"learning_rate": 8.831661979945522e-06,
"loss": 0.7442,
"step": 4494
},
{
"epoch": 1.3049789519523878,
"grad_norm": 3.8493406772613525,
"learning_rate": 8.831045032270895e-06,
"loss": 0.7968,
"step": 4495
},
{
"epoch": 1.3052692698504864,
"grad_norm": 3.2961323261260986,
"learning_rate": 8.830427943308846e-06,
"loss": 0.8342,
"step": 4496
},
{
"epoch": 1.3055595877485846,
"grad_norm": 3.404946804046631,
"learning_rate": 8.829810713082134e-06,
"loss": 0.6763,
"step": 4497
},
{
"epoch": 1.3058499056466832,
"grad_norm": 3.4007487297058105,
"learning_rate": 8.829193341613522e-06,
"loss": 0.6758,
"step": 4498
},
{
"epoch": 1.3061402235447814,
"grad_norm": 3.0017502307891846,
"learning_rate": 8.82857582892578e-06,
"loss": 0.629,
"step": 4499
},
{
"epoch": 1.30643054144288,
"grad_norm": 3.563961982727051,
"learning_rate": 8.827958175041682e-06,
"loss": 0.7526,
"step": 4500
},
{
"epoch": 1.30643054144288,
"eval_loss": 1.1974629163742065,
"eval_runtime": 13.5571,
"eval_samples_per_second": 29.505,
"eval_steps_per_second": 3.688,
"step": 4500
},
{
"epoch": 1.3067208593409783,
"grad_norm": 3.8173577785491943,
"learning_rate": 8.827340379984003e-06,
"loss": 0.8251,
"step": 4501
},
{
"epoch": 1.307011177239077,
"grad_norm": 3.867654323577881,
"learning_rate": 8.826722443775531e-06,
"loss": 0.8697,
"step": 4502
},
{
"epoch": 1.307301495137175,
"grad_norm": 3.384533405303955,
"learning_rate": 8.826104366439054e-06,
"loss": 0.6338,
"step": 4503
},
{
"epoch": 1.3075918130352737,
"grad_norm": 3.7659904956817627,
"learning_rate": 8.825486147997366e-06,
"loss": 0.7178,
"step": 4504
},
{
"epoch": 1.307882130933372,
"grad_norm": 3.433115243911743,
"learning_rate": 8.824867788473267e-06,
"loss": 0.7663,
"step": 4505
},
{
"epoch": 1.3081724488314705,
"grad_norm": 3.6183979511260986,
"learning_rate": 8.824249287889563e-06,
"loss": 0.789,
"step": 4506
},
{
"epoch": 1.308462766729569,
"grad_norm": 3.6479341983795166,
"learning_rate": 8.823630646269061e-06,
"loss": 0.8397,
"step": 4507
},
{
"epoch": 1.3087530846276674,
"grad_norm": 3.5444366931915283,
"learning_rate": 8.82301186363458e-06,
"loss": 0.7653,
"step": 4508
},
{
"epoch": 1.3090434025257658,
"grad_norm": 3.838498830795288,
"learning_rate": 8.822392940008937e-06,
"loss": 0.7974,
"step": 4509
},
{
"epoch": 1.3093337204238642,
"grad_norm": 3.370309352874756,
"learning_rate": 8.82177387541496e-06,
"loss": 0.7828,
"step": 4510
},
{
"epoch": 1.3096240383219626,
"grad_norm": 4.022466659545898,
"learning_rate": 8.82115466987548e-06,
"loss": 0.7971,
"step": 4511
},
{
"epoch": 1.309914356220061,
"grad_norm": 3.3781039714813232,
"learning_rate": 8.820535323413331e-06,
"loss": 0.7885,
"step": 4512
},
{
"epoch": 1.3102046741181594,
"grad_norm": 3.4412195682525635,
"learning_rate": 8.819915836051354e-06,
"loss": 0.7148,
"step": 4513
},
{
"epoch": 1.3104949920162579,
"grad_norm": 3.6797571182250977,
"learning_rate": 8.8192962078124e-06,
"loss": 0.7848,
"step": 4514
},
{
"epoch": 1.3107853099143563,
"grad_norm": 3.71624755859375,
"learning_rate": 8.818676438719314e-06,
"loss": 0.882,
"step": 4515
},
{
"epoch": 1.3110756278124547,
"grad_norm": 3.6649434566497803,
"learning_rate": 8.818056528794958e-06,
"loss": 0.8181,
"step": 4516
},
{
"epoch": 1.311365945710553,
"grad_norm": 3.5233776569366455,
"learning_rate": 8.817436478062193e-06,
"loss": 0.7826,
"step": 4517
},
{
"epoch": 1.3116562636086515,
"grad_norm": 3.272698402404785,
"learning_rate": 8.816816286543886e-06,
"loss": 0.8691,
"step": 4518
},
{
"epoch": 1.31194658150675,
"grad_norm": 3.329058885574341,
"learning_rate": 8.816195954262907e-06,
"loss": 0.755,
"step": 4519
},
{
"epoch": 1.3122368994048483,
"grad_norm": 3.4336793422698975,
"learning_rate": 8.815575481242137e-06,
"loss": 0.8395,
"step": 4520
},
{
"epoch": 1.3125272173029467,
"grad_norm": 3.476872444152832,
"learning_rate": 8.814954867504457e-06,
"loss": 0.7582,
"step": 4521
},
{
"epoch": 1.3128175352010452,
"grad_norm": 3.659498453140259,
"learning_rate": 8.814334113072755e-06,
"loss": 0.751,
"step": 4522
},
{
"epoch": 1.3131078530991436,
"grad_norm": 3.768644332885742,
"learning_rate": 8.813713217969926e-06,
"loss": 0.7894,
"step": 4523
},
{
"epoch": 1.313398170997242,
"grad_norm": 3.286921977996826,
"learning_rate": 8.813092182218866e-06,
"loss": 0.7101,
"step": 4524
},
{
"epoch": 1.3136884888953404,
"grad_norm": 3.3848443031311035,
"learning_rate": 8.81247100584248e-06,
"loss": 0.7923,
"step": 4525
},
{
"epoch": 1.3139788067934388,
"grad_norm": 3.8643271923065186,
"learning_rate": 8.811849688863674e-06,
"loss": 0.7354,
"step": 4526
},
{
"epoch": 1.3142691246915372,
"grad_norm": 3.531477212905884,
"learning_rate": 8.811228231305368e-06,
"loss": 0.7571,
"step": 4527
},
{
"epoch": 1.3145594425896356,
"grad_norm": 3.487464189529419,
"learning_rate": 8.810606633190475e-06,
"loss": 0.803,
"step": 4528
},
{
"epoch": 1.314849760487734,
"grad_norm": 3.5375609397888184,
"learning_rate": 8.80998489454192e-06,
"loss": 0.747,
"step": 4529
},
{
"epoch": 1.3151400783858325,
"grad_norm": 3.17202091217041,
"learning_rate": 8.809363015382636e-06,
"loss": 0.7476,
"step": 4530
},
{
"epoch": 1.3154303962839309,
"grad_norm": 3.4418551921844482,
"learning_rate": 8.808740995735556e-06,
"loss": 0.8416,
"step": 4531
},
{
"epoch": 1.3157207141820293,
"grad_norm": 3.173208713531494,
"learning_rate": 8.80811883562362e-06,
"loss": 0.6681,
"step": 4532
},
{
"epoch": 1.3160110320801277,
"grad_norm": 3.4615097045898438,
"learning_rate": 8.80749653506977e-06,
"loss": 0.7732,
"step": 4533
},
{
"epoch": 1.3163013499782261,
"grad_norm": 3.5268373489379883,
"learning_rate": 8.806874094096962e-06,
"loss": 0.7281,
"step": 4534
},
{
"epoch": 1.3165916678763245,
"grad_norm": 3.3505427837371826,
"learning_rate": 8.806251512728145e-06,
"loss": 0.8716,
"step": 4535
},
{
"epoch": 1.316881985774423,
"grad_norm": 4.016345977783203,
"learning_rate": 8.805628790986284e-06,
"loss": 0.9032,
"step": 4536
},
{
"epoch": 1.3171723036725214,
"grad_norm": 3.4283342361450195,
"learning_rate": 8.805005928894346e-06,
"loss": 0.7144,
"step": 4537
},
{
"epoch": 1.3174626215706198,
"grad_norm": 3.8195414543151855,
"learning_rate": 8.804382926475296e-06,
"loss": 0.8395,
"step": 4538
},
{
"epoch": 1.3177529394687182,
"grad_norm": 3.2591845989227295,
"learning_rate": 8.803759783752113e-06,
"loss": 0.8047,
"step": 4539
},
{
"epoch": 1.3180432573668166,
"grad_norm": 3.5877437591552734,
"learning_rate": 8.80313650074778e-06,
"loss": 0.8031,
"step": 4540
},
{
"epoch": 1.318333575264915,
"grad_norm": 3.386138439178467,
"learning_rate": 8.802513077485283e-06,
"loss": 0.6563,
"step": 4541
},
{
"epoch": 1.3186238931630134,
"grad_norm": 3.528615951538086,
"learning_rate": 8.801889513987612e-06,
"loss": 0.8133,
"step": 4542
},
{
"epoch": 1.3189142110611118,
"grad_norm": 3.881578207015991,
"learning_rate": 8.801265810277764e-06,
"loss": 0.8733,
"step": 4543
},
{
"epoch": 1.3192045289592103,
"grad_norm": 3.4517362117767334,
"learning_rate": 8.800641966378742e-06,
"loss": 0.7932,
"step": 4544
},
{
"epoch": 1.319494846857309,
"grad_norm": 3.6806721687316895,
"learning_rate": 8.800017982313552e-06,
"loss": 0.7803,
"step": 4545
},
{
"epoch": 1.319785164755407,
"grad_norm": 3.730502128601074,
"learning_rate": 8.799393858105206e-06,
"loss": 0.7542,
"step": 4546
},
{
"epoch": 1.3200754826535057,
"grad_norm": 3.9465389251708984,
"learning_rate": 8.798769593776723e-06,
"loss": 0.9533,
"step": 4547
},
{
"epoch": 1.320365800551604,
"grad_norm": 3.670346975326538,
"learning_rate": 8.798145189351127e-06,
"loss": 0.7445,
"step": 4548
},
{
"epoch": 1.3206561184497025,
"grad_norm": 3.567537784576416,
"learning_rate": 8.797520644851441e-06,
"loss": 0.8044,
"step": 4549
},
{
"epoch": 1.3209464363478007,
"grad_norm": 4.003215312957764,
"learning_rate": 8.7968959603007e-06,
"loss": 0.7814,
"step": 4550
},
{
"epoch": 1.3212367542458994,
"grad_norm": 3.2233598232269287,
"learning_rate": 8.796271135721944e-06,
"loss": 0.763,
"step": 4551
},
{
"epoch": 1.3215270721439976,
"grad_norm": 3.6300904750823975,
"learning_rate": 8.795646171138215e-06,
"loss": 0.7442,
"step": 4552
},
{
"epoch": 1.3218173900420962,
"grad_norm": 3.644545555114746,
"learning_rate": 8.795021066572562e-06,
"loss": 0.7269,
"step": 4553
},
{
"epoch": 1.3221077079401944,
"grad_norm": 3.8695108890533447,
"learning_rate": 8.794395822048036e-06,
"loss": 0.8088,
"step": 4554
},
{
"epoch": 1.322398025838293,
"grad_norm": 4.05075216293335,
"learning_rate": 8.7937704375877e-06,
"loss": 0.9627,
"step": 4555
},
{
"epoch": 1.3226883437363912,
"grad_norm": 4.718074798583984,
"learning_rate": 8.793144913214616e-06,
"loss": 0.9465,
"step": 4556
},
{
"epoch": 1.3229786616344898,
"grad_norm": 3.9765820503234863,
"learning_rate": 8.792519248951851e-06,
"loss": 0.7774,
"step": 4557
},
{
"epoch": 1.3232689795325883,
"grad_norm": 3.4047420024871826,
"learning_rate": 8.791893444822483e-06,
"loss": 0.7692,
"step": 4558
},
{
"epoch": 1.3235592974306867,
"grad_norm": 3.7445778846740723,
"learning_rate": 8.791267500849589e-06,
"loss": 0.7714,
"step": 4559
},
{
"epoch": 1.323849615328785,
"grad_norm": 3.5737650394439697,
"learning_rate": 8.790641417056254e-06,
"loss": 0.8386,
"step": 4560
},
{
"epoch": 1.3241399332268835,
"grad_norm": 3.6184606552124023,
"learning_rate": 8.790015193465566e-06,
"loss": 0.8462,
"step": 4561
},
{
"epoch": 1.324430251124982,
"grad_norm": 3.231999158859253,
"learning_rate": 8.789388830100625e-06,
"loss": 0.7059,
"step": 4562
},
{
"epoch": 1.3247205690230803,
"grad_norm": 3.858499765396118,
"learning_rate": 8.788762326984525e-06,
"loss": 0.9108,
"step": 4563
},
{
"epoch": 1.3250108869211787,
"grad_norm": 3.4451212882995605,
"learning_rate": 8.788135684140375e-06,
"loss": 0.7431,
"step": 4564
},
{
"epoch": 1.3253012048192772,
"grad_norm": 3.238949775695801,
"learning_rate": 8.787508901591283e-06,
"loss": 0.7886,
"step": 4565
},
{
"epoch": 1.3255915227173756,
"grad_norm": 3.2312495708465576,
"learning_rate": 8.786881979360368e-06,
"loss": 0.7297,
"step": 4566
},
{
"epoch": 1.325881840615474,
"grad_norm": 4.028390407562256,
"learning_rate": 8.786254917470749e-06,
"loss": 0.8983,
"step": 4567
},
{
"epoch": 1.3261721585135724,
"grad_norm": 3.7783362865448,
"learning_rate": 8.785627715945549e-06,
"loss": 0.8377,
"step": 4568
},
{
"epoch": 1.3264624764116708,
"grad_norm": 3.3699865341186523,
"learning_rate": 8.7850003748079e-06,
"loss": 0.7171,
"step": 4569
},
{
"epoch": 1.3267527943097692,
"grad_norm": 4.025466442108154,
"learning_rate": 8.784372894080942e-06,
"loss": 0.7516,
"step": 4570
},
{
"epoch": 1.3270431122078676,
"grad_norm": 3.33362078666687,
"learning_rate": 8.783745273787811e-06,
"loss": 0.7302,
"step": 4571
},
{
"epoch": 1.327333430105966,
"grad_norm": 4.020394325256348,
"learning_rate": 8.783117513951658e-06,
"loss": 0.8613,
"step": 4572
},
{
"epoch": 1.3276237480040645,
"grad_norm": 3.2039053440093994,
"learning_rate": 8.78248961459563e-06,
"loss": 0.7226,
"step": 4573
},
{
"epoch": 1.3279140659021629,
"grad_norm": 3.7454745769500732,
"learning_rate": 8.781861575742888e-06,
"loss": 0.7889,
"step": 4574
},
{
"epoch": 1.3282043838002613,
"grad_norm": 3.397183895111084,
"learning_rate": 8.78123339741659e-06,
"loss": 0.7233,
"step": 4575
},
{
"epoch": 1.3284947016983597,
"grad_norm": 3.3106231689453125,
"learning_rate": 8.780605079639909e-06,
"loss": 0.7288,
"step": 4576
},
{
"epoch": 1.3287850195964581,
"grad_norm": 3.925104856491089,
"learning_rate": 8.779976622436008e-06,
"loss": 0.7683,
"step": 4577
},
{
"epoch": 1.3290753374945565,
"grad_norm": 3.4288625717163086,
"learning_rate": 8.779348025828071e-06,
"loss": 0.8009,
"step": 4578
},
{
"epoch": 1.329365655392655,
"grad_norm": 3.6718027591705322,
"learning_rate": 8.77871928983928e-06,
"loss": 0.8048,
"step": 4579
},
{
"epoch": 1.3296559732907534,
"grad_norm": 3.671327829360962,
"learning_rate": 8.77809041449282e-06,
"loss": 0.7686,
"step": 4580
},
{
"epoch": 1.3299462911888518,
"grad_norm": 3.493149757385254,
"learning_rate": 8.777461399811886e-06,
"loss": 0.8484,
"step": 4581
},
{
"epoch": 1.3302366090869502,
"grad_norm": 3.3519535064697266,
"learning_rate": 8.776832245819672e-06,
"loss": 0.8071,
"step": 4582
},
{
"epoch": 1.3305269269850486,
"grad_norm": 3.5099620819091797,
"learning_rate": 8.776202952539386e-06,
"loss": 0.7594,
"step": 4583
},
{
"epoch": 1.330817244883147,
"grad_norm": 3.586620569229126,
"learning_rate": 8.775573519994232e-06,
"loss": 0.8284,
"step": 4584
},
{
"epoch": 1.3311075627812454,
"grad_norm": 3.604992151260376,
"learning_rate": 8.774943948207427e-06,
"loss": 0.8269,
"step": 4585
},
{
"epoch": 1.3313978806793438,
"grad_norm": 4.07960844039917,
"learning_rate": 8.774314237202183e-06,
"loss": 0.8021,
"step": 4586
},
{
"epoch": 1.3316881985774423,
"grad_norm": 3.2540876865386963,
"learning_rate": 8.773684387001734e-06,
"loss": 0.6545,
"step": 4587
},
{
"epoch": 1.3319785164755407,
"grad_norm": 3.480595111846924,
"learning_rate": 8.773054397629297e-06,
"loss": 0.8309,
"step": 4588
},
{
"epoch": 1.332268834373639,
"grad_norm": 3.6046059131622314,
"learning_rate": 8.772424269108113e-06,
"loss": 0.722,
"step": 4589
},
{
"epoch": 1.3325591522717375,
"grad_norm": 3.5399723052978516,
"learning_rate": 8.77179400146142e-06,
"loss": 0.7857,
"step": 4590
},
{
"epoch": 1.332849470169836,
"grad_norm": 3.6178572177886963,
"learning_rate": 8.77116359471246e-06,
"loss": 0.7778,
"step": 4591
},
{
"epoch": 1.3331397880679343,
"grad_norm": 3.778470277786255,
"learning_rate": 8.770533048884483e-06,
"loss": 0.7767,
"step": 4592
},
{
"epoch": 1.3334301059660327,
"grad_norm": 3.622446298599243,
"learning_rate": 8.769902364000741e-06,
"loss": 0.8007,
"step": 4593
},
{
"epoch": 1.3337204238641311,
"grad_norm": 3.512143611907959,
"learning_rate": 8.7692715400845e-06,
"loss": 0.7634,
"step": 4594
},
{
"epoch": 1.3340107417622296,
"grad_norm": 3.5384085178375244,
"learning_rate": 8.768640577159018e-06,
"loss": 0.6932,
"step": 4595
},
{
"epoch": 1.3343010596603282,
"grad_norm": 3.703672170639038,
"learning_rate": 8.76800947524757e-06,
"loss": 0.8353,
"step": 4596
},
{
"epoch": 1.3345913775584264,
"grad_norm": 3.3167338371276855,
"learning_rate": 8.767378234373425e-06,
"loss": 0.7462,
"step": 4597
},
{
"epoch": 1.334881695456525,
"grad_norm": 3.785743236541748,
"learning_rate": 8.766746854559866e-06,
"loss": 0.8629,
"step": 4598
},
{
"epoch": 1.3351720133546232,
"grad_norm": 3.8739917278289795,
"learning_rate": 8.766115335830178e-06,
"loss": 0.8669,
"step": 4599
},
{
"epoch": 1.3354623312527218,
"grad_norm": 4.050196647644043,
"learning_rate": 8.76548367820765e-06,
"loss": 0.8905,
"step": 4600
},
{
"epoch": 1.33575264915082,
"grad_norm": 3.502135753631592,
"learning_rate": 8.764851881715581e-06,
"loss": 0.6934,
"step": 4601
},
{
"epoch": 1.3360429670489187,
"grad_norm": 3.472646713256836,
"learning_rate": 8.764219946377268e-06,
"loss": 0.7761,
"step": 4602
},
{
"epoch": 1.3363332849470169,
"grad_norm": 3.4790096282958984,
"learning_rate": 8.763587872216016e-06,
"loss": 0.6904,
"step": 4603
},
{
"epoch": 1.3366236028451155,
"grad_norm": 3.4734671115875244,
"learning_rate": 8.762955659255137e-06,
"loss": 0.7641,
"step": 4604
},
{
"epoch": 1.3369139207432137,
"grad_norm": 3.610750913619995,
"learning_rate": 8.762323307517946e-06,
"loss": 0.7647,
"step": 4605
},
{
"epoch": 1.3372042386413123,
"grad_norm": 3.5902762413024902,
"learning_rate": 8.761690817027764e-06,
"loss": 0.7836,
"step": 4606
},
{
"epoch": 1.3374945565394105,
"grad_norm": 3.4237771034240723,
"learning_rate": 8.761058187807921e-06,
"loss": 0.798,
"step": 4607
},
{
"epoch": 1.3377848744375092,
"grad_norm": 3.5920920372009277,
"learning_rate": 8.760425419881742e-06,
"loss": 0.8194,
"step": 4608
},
{
"epoch": 1.3380751923356076,
"grad_norm": 3.539668321609497,
"learning_rate": 8.759792513272564e-06,
"loss": 0.7582,
"step": 4609
},
{
"epoch": 1.338365510233706,
"grad_norm": 3.6332497596740723,
"learning_rate": 8.759159468003734e-06,
"loss": 0.814,
"step": 4610
},
{
"epoch": 1.3386558281318044,
"grad_norm": 3.294271469116211,
"learning_rate": 8.758526284098591e-06,
"loss": 0.7436,
"step": 4611
},
{
"epoch": 1.3389461460299028,
"grad_norm": 3.5333640575408936,
"learning_rate": 8.757892961580492e-06,
"loss": 0.8189,
"step": 4612
},
{
"epoch": 1.3392364639280012,
"grad_norm": 3.9221789836883545,
"learning_rate": 8.757259500472793e-06,
"loss": 0.7984,
"step": 4613
},
{
"epoch": 1.3395267818260996,
"grad_norm": 3.526892900466919,
"learning_rate": 8.756625900798852e-06,
"loss": 0.7433,
"step": 4614
},
{
"epoch": 1.339817099724198,
"grad_norm": 3.8629260063171387,
"learning_rate": 8.75599216258204e-06,
"loss": 0.8027,
"step": 4615
},
{
"epoch": 1.3401074176222965,
"grad_norm": 3.3638052940368652,
"learning_rate": 8.755358285845728e-06,
"loss": 0.8417,
"step": 4616
},
{
"epoch": 1.3403977355203949,
"grad_norm": 3.4001290798187256,
"learning_rate": 8.754724270613291e-06,
"loss": 0.7387,
"step": 4617
},
{
"epoch": 1.3406880534184933,
"grad_norm": 3.7218117713928223,
"learning_rate": 8.754090116908115e-06,
"loss": 0.7018,
"step": 4618
},
{
"epoch": 1.3409783713165917,
"grad_norm": 3.930997848510742,
"learning_rate": 8.753455824753584e-06,
"loss": 0.7548,
"step": 4619
},
{
"epoch": 1.3412686892146901,
"grad_norm": 3.2416226863861084,
"learning_rate": 8.752821394173092e-06,
"loss": 0.7009,
"step": 4620
},
{
"epoch": 1.3415590071127885,
"grad_norm": 3.5444040298461914,
"learning_rate": 8.752186825190037e-06,
"loss": 0.7432,
"step": 4621
},
{
"epoch": 1.341849325010887,
"grad_norm": 3.347137451171875,
"learning_rate": 8.751552117827819e-06,
"loss": 0.7666,
"step": 4622
},
{
"epoch": 1.3421396429089854,
"grad_norm": 3.45306396484375,
"learning_rate": 8.750917272109849e-06,
"loss": 0.7356,
"step": 4623
},
{
"epoch": 1.3424299608070838,
"grad_norm": 3.755613327026367,
"learning_rate": 8.750282288059538e-06,
"loss": 0.7731,
"step": 4624
},
{
"epoch": 1.3427202787051822,
"grad_norm": 3.633800745010376,
"learning_rate": 8.749647165700306e-06,
"loss": 0.785,
"step": 4625
},
{
"epoch": 1.3430105966032806,
"grad_norm": 3.169142961502075,
"learning_rate": 8.749011905055572e-06,
"loss": 0.7931,
"step": 4626
},
{
"epoch": 1.343300914501379,
"grad_norm": 3.288231611251831,
"learning_rate": 8.748376506148768e-06,
"loss": 0.7093,
"step": 4627
},
{
"epoch": 1.3435912323994774,
"grad_norm": 3.5134363174438477,
"learning_rate": 8.747740969003327e-06,
"loss": 0.7093,
"step": 4628
},
{
"epoch": 1.3438815502975758,
"grad_norm": 3.5111641883850098,
"learning_rate": 8.747105293642686e-06,
"loss": 0.7451,
"step": 4629
},
{
"epoch": 1.3441718681956742,
"grad_norm": 4.0172343254089355,
"learning_rate": 8.746469480090287e-06,
"loss": 0.9514,
"step": 4630
},
{
"epoch": 1.3444621860937727,
"grad_norm": 3.5807735919952393,
"learning_rate": 8.74583352836958e-06,
"loss": 0.7602,
"step": 4631
},
{
"epoch": 1.344752503991871,
"grad_norm": 3.807891368865967,
"learning_rate": 8.745197438504021e-06,
"loss": 0.7435,
"step": 4632
},
{
"epoch": 1.3450428218899695,
"grad_norm": 3.7621943950653076,
"learning_rate": 8.744561210517067e-06,
"loss": 0.7656,
"step": 4633
},
{
"epoch": 1.345333139788068,
"grad_norm": 3.7001357078552246,
"learning_rate": 8.743924844432178e-06,
"loss": 0.7488,
"step": 4634
},
{
"epoch": 1.3456234576861663,
"grad_norm": 3.6607542037963867,
"learning_rate": 8.74328834027283e-06,
"loss": 0.788,
"step": 4635
},
{
"epoch": 1.3459137755842647,
"grad_norm": 3.2181572914123535,
"learning_rate": 8.742651698062492e-06,
"loss": 0.7679,
"step": 4636
},
{
"epoch": 1.3462040934823631,
"grad_norm": 3.7429494857788086,
"learning_rate": 8.742014917824646e-06,
"loss": 0.8146,
"step": 4637
},
{
"epoch": 1.3464944113804616,
"grad_norm": 3.507017135620117,
"learning_rate": 8.741377999582774e-06,
"loss": 0.6924,
"step": 4638
},
{
"epoch": 1.34678472927856,
"grad_norm": 3.102918863296509,
"learning_rate": 8.740740943360367e-06,
"loss": 0.66,
"step": 4639
},
{
"epoch": 1.3470750471766584,
"grad_norm": 3.8772389888763428,
"learning_rate": 8.740103749180916e-06,
"loss": 0.7636,
"step": 4640
},
{
"epoch": 1.3473653650747568,
"grad_norm": 3.7670934200286865,
"learning_rate": 8.739466417067926e-06,
"loss": 0.8094,
"step": 4641
},
{
"epoch": 1.3476556829728552,
"grad_norm": 3.172104597091675,
"learning_rate": 8.738828947044895e-06,
"loss": 0.7114,
"step": 4642
},
{
"epoch": 1.3479460008709536,
"grad_norm": 3.3927054405212402,
"learning_rate": 8.738191339135339e-06,
"loss": 0.7699,
"step": 4643
},
{
"epoch": 1.348236318769052,
"grad_norm": 3.920764207839966,
"learning_rate": 8.737553593362769e-06,
"loss": 0.8753,
"step": 4644
},
{
"epoch": 1.3485266366671504,
"grad_norm": 3.6030490398406982,
"learning_rate": 8.736915709750704e-06,
"loss": 0.7227,
"step": 4645
},
{
"epoch": 1.3488169545652489,
"grad_norm": 3.684602975845337,
"learning_rate": 8.736277688322675e-06,
"loss": 0.7326,
"step": 4646
},
{
"epoch": 1.3491072724633475,
"grad_norm": 3.836862802505493,
"learning_rate": 8.735639529102203e-06,
"loss": 0.8414,
"step": 4647
},
{
"epoch": 1.3493975903614457,
"grad_norm": 3.029850959777832,
"learning_rate": 8.73500123211283e-06,
"loss": 0.662,
"step": 4648
},
{
"epoch": 1.3496879082595443,
"grad_norm": 3.983029842376709,
"learning_rate": 8.734362797378094e-06,
"loss": 0.8949,
"step": 4649
},
{
"epoch": 1.3499782261576425,
"grad_norm": 3.8315110206604004,
"learning_rate": 8.733724224921539e-06,
"loss": 0.769,
"step": 4650
},
{
"epoch": 1.3502685440557411,
"grad_norm": 3.8778200149536133,
"learning_rate": 8.733085514766715e-06,
"loss": 0.8529,
"step": 4651
},
{
"epoch": 1.3505588619538393,
"grad_norm": 3.760114908218384,
"learning_rate": 8.73244666693718e-06,
"loss": 0.7106,
"step": 4652
},
{
"epoch": 1.350849179851938,
"grad_norm": 3.9810686111450195,
"learning_rate": 8.731807681456493e-06,
"loss": 0.9468,
"step": 4653
},
{
"epoch": 1.3511394977500362,
"grad_norm": 3.370008707046509,
"learning_rate": 8.73116855834822e-06,
"loss": 0.7507,
"step": 4654
},
{
"epoch": 1.3514298156481348,
"grad_norm": 3.6600735187530518,
"learning_rate": 8.73052929763593e-06,
"loss": 0.7098,
"step": 4655
},
{
"epoch": 1.351720133546233,
"grad_norm": 3.648756742477417,
"learning_rate": 8.7298898993432e-06,
"loss": 0.748,
"step": 4656
},
{
"epoch": 1.3520104514443316,
"grad_norm": 3.0947530269622803,
"learning_rate": 8.729250363493613e-06,
"loss": 0.7099,
"step": 4657
},
{
"epoch": 1.35230076934243,
"grad_norm": 3.60309100151062,
"learning_rate": 8.72861069011075e-06,
"loss": 0.9044,
"step": 4658
},
{
"epoch": 1.3525910872405285,
"grad_norm": 4.018613815307617,
"learning_rate": 8.727970879218207e-06,
"loss": 0.9816,
"step": 4659
},
{
"epoch": 1.3528814051386269,
"grad_norm": 3.9331774711608887,
"learning_rate": 8.727330930839575e-06,
"loss": 0.8805,
"step": 4660
},
{
"epoch": 1.3531717230367253,
"grad_norm": 3.5565247535705566,
"learning_rate": 8.726690844998457e-06,
"loss": 0.7209,
"step": 4661
},
{
"epoch": 1.3534620409348237,
"grad_norm": 3.6686999797821045,
"learning_rate": 8.726050621718462e-06,
"loss": 0.9374,
"step": 4662
},
{
"epoch": 1.353752358832922,
"grad_norm": 3.3711307048797607,
"learning_rate": 8.725410261023198e-06,
"loss": 0.7055,
"step": 4663
},
{
"epoch": 1.3540426767310205,
"grad_norm": 3.3798294067382812,
"learning_rate": 8.72476976293628e-06,
"loss": 0.8147,
"step": 4664
},
{
"epoch": 1.354332994629119,
"grad_norm": 3.112764596939087,
"learning_rate": 8.724129127481333e-06,
"loss": 0.6867,
"step": 4665
},
{
"epoch": 1.3546233125272173,
"grad_norm": 3.7623043060302734,
"learning_rate": 8.723488354681981e-06,
"loss": 0.8379,
"step": 4666
},
{
"epoch": 1.3549136304253158,
"grad_norm": 3.341522216796875,
"learning_rate": 8.722847444561857e-06,
"loss": 0.7471,
"step": 4667
},
{
"epoch": 1.3552039483234142,
"grad_norm": 3.6576430797576904,
"learning_rate": 8.722206397144596e-06,
"loss": 0.8535,
"step": 4668
},
{
"epoch": 1.3554942662215126,
"grad_norm": 3.5284230709075928,
"learning_rate": 8.721565212453841e-06,
"loss": 0.748,
"step": 4669
},
{
"epoch": 1.355784584119611,
"grad_norm": 3.8579068183898926,
"learning_rate": 8.720923890513237e-06,
"loss": 0.9345,
"step": 4670
},
{
"epoch": 1.3560749020177094,
"grad_norm": 3.495478630065918,
"learning_rate": 8.720282431346437e-06,
"loss": 0.8069,
"step": 4671
},
{
"epoch": 1.3563652199158078,
"grad_norm": 3.690916061401367,
"learning_rate": 8.719640834977097e-06,
"loss": 0.8264,
"step": 4672
},
{
"epoch": 1.3566555378139062,
"grad_norm": 3.7047739028930664,
"learning_rate": 8.718999101428878e-06,
"loss": 0.8304,
"step": 4673
},
{
"epoch": 1.3569458557120047,
"grad_norm": 4.055042743682861,
"learning_rate": 8.71835723072545e-06,
"loss": 0.9142,
"step": 4674
},
{
"epoch": 1.357236173610103,
"grad_norm": 4.049088478088379,
"learning_rate": 8.717715222890481e-06,
"loss": 0.9886,
"step": 4675
},
{
"epoch": 1.3575264915082015,
"grad_norm": 3.6256749629974365,
"learning_rate": 8.71707307794765e-06,
"loss": 0.8418,
"step": 4676
},
{
"epoch": 1.3578168094063,
"grad_norm": 3.9590277671813965,
"learning_rate": 8.71643079592064e-06,
"loss": 0.9358,
"step": 4677
},
{
"epoch": 1.3581071273043983,
"grad_norm": 3.463407278060913,
"learning_rate": 8.715788376833136e-06,
"loss": 0.8553,
"step": 4678
},
{
"epoch": 1.3583974452024967,
"grad_norm": 3.912795066833496,
"learning_rate": 8.715145820708834e-06,
"loss": 0.8467,
"step": 4679
},
{
"epoch": 1.3586877631005951,
"grad_norm": 3.678302526473999,
"learning_rate": 8.714503127571425e-06,
"loss": 0.7989,
"step": 4680
},
{
"epoch": 1.3589780809986935,
"grad_norm": 3.615201711654663,
"learning_rate": 8.713860297444617e-06,
"loss": 0.8054,
"step": 4681
},
{
"epoch": 1.359268398896792,
"grad_norm": 3.3816769123077393,
"learning_rate": 8.713217330352116e-06,
"loss": 0.7385,
"step": 4682
},
{
"epoch": 1.3595587167948904,
"grad_norm": 3.387265682220459,
"learning_rate": 8.71257422631763e-06,
"loss": 0.6743,
"step": 4683
},
{
"epoch": 1.3598490346929888,
"grad_norm": 3.653581142425537,
"learning_rate": 8.711930985364882e-06,
"loss": 0.7954,
"step": 4684
},
{
"epoch": 1.3601393525910872,
"grad_norm": 2.788815498352051,
"learning_rate": 8.711287607517592e-06,
"loss": 0.6557,
"step": 4685
},
{
"epoch": 1.3604296704891856,
"grad_norm": 3.226760149002075,
"learning_rate": 8.710644092799486e-06,
"loss": 0.7571,
"step": 4686
},
{
"epoch": 1.360719988387284,
"grad_norm": 3.787449836730957,
"learning_rate": 8.7100004412343e-06,
"loss": 0.8545,
"step": 4687
},
{
"epoch": 1.3610103062853824,
"grad_norm": 3.5828921794891357,
"learning_rate": 8.70935665284577e-06,
"loss": 0.8057,
"step": 4688
},
{
"epoch": 1.3613006241834809,
"grad_norm": 3.35209321975708,
"learning_rate": 8.70871272765764e-06,
"loss": 0.6767,
"step": 4689
},
{
"epoch": 1.3615909420815793,
"grad_norm": 3.3819470405578613,
"learning_rate": 8.708068665693654e-06,
"loss": 0.7925,
"step": 4690
},
{
"epoch": 1.3618812599796777,
"grad_norm": 3.8231050968170166,
"learning_rate": 8.707424466977568e-06,
"loss": 0.792,
"step": 4691
},
{
"epoch": 1.362171577877776,
"grad_norm": 3.430509567260742,
"learning_rate": 8.706780131533139e-06,
"loss": 0.6875,
"step": 4692
},
{
"epoch": 1.3624618957758745,
"grad_norm": 3.384800910949707,
"learning_rate": 8.70613565938413e-06,
"loss": 0.8151,
"step": 4693
},
{
"epoch": 1.362752213673973,
"grad_norm": 3.6406893730163574,
"learning_rate": 8.705491050554308e-06,
"loss": 0.6947,
"step": 4694
},
{
"epoch": 1.3630425315720713,
"grad_norm": 3.7997663021087646,
"learning_rate": 8.704846305067446e-06,
"loss": 0.7631,
"step": 4695
},
{
"epoch": 1.36333284947017,
"grad_norm": 3.516602039337158,
"learning_rate": 8.704201422947325e-06,
"loss": 0.6366,
"step": 4696
},
{
"epoch": 1.3636231673682682,
"grad_norm": 3.8194212913513184,
"learning_rate": 8.703556404217723e-06,
"loss": 0.8989,
"step": 4697
},
{
"epoch": 1.3639134852663668,
"grad_norm": 3.5797340869903564,
"learning_rate": 8.702911248902432e-06,
"loss": 0.8461,
"step": 4698
},
{
"epoch": 1.364203803164465,
"grad_norm": 3.9624059200286865,
"learning_rate": 8.702265957025241e-06,
"loss": 0.8728,
"step": 4699
},
{
"epoch": 1.3644941210625636,
"grad_norm": 3.4519660472869873,
"learning_rate": 8.701620528609953e-06,
"loss": 0.7457,
"step": 4700
},
{
"epoch": 1.3647844389606618,
"grad_norm": 3.042926788330078,
"learning_rate": 8.70097496368037e-06,
"loss": 0.7291,
"step": 4701
},
{
"epoch": 1.3650747568587605,
"grad_norm": 3.8642966747283936,
"learning_rate": 8.700329262260296e-06,
"loss": 0.7738,
"step": 4702
},
{
"epoch": 1.3653650747568586,
"grad_norm": 3.7110559940338135,
"learning_rate": 8.69968342437355e-06,
"loss": 0.8148,
"step": 4703
},
{
"epoch": 1.3656553926549573,
"grad_norm": 3.6700289249420166,
"learning_rate": 8.699037450043945e-06,
"loss": 0.8303,
"step": 4704
},
{
"epoch": 1.3659457105530555,
"grad_norm": 3.7049381732940674,
"learning_rate": 8.698391339295308e-06,
"loss": 0.7679,
"step": 4705
},
{
"epoch": 1.366236028451154,
"grad_norm": 3.5809037685394287,
"learning_rate": 8.697745092151467e-06,
"loss": 0.8532,
"step": 4706
},
{
"epoch": 1.3665263463492523,
"grad_norm": 3.510563373565674,
"learning_rate": 8.697098708636254e-06,
"loss": 0.7873,
"step": 4707
},
{
"epoch": 1.366816664247351,
"grad_norm": 3.510591745376587,
"learning_rate": 8.696452188773506e-06,
"loss": 0.7684,
"step": 4708
},
{
"epoch": 1.3671069821454493,
"grad_norm": 2.9833831787109375,
"learning_rate": 8.69580553258707e-06,
"loss": 0.641,
"step": 4709
},
{
"epoch": 1.3673973000435478,
"grad_norm": 3.475123882293701,
"learning_rate": 8.695158740100792e-06,
"loss": 0.7174,
"step": 4710
},
{
"epoch": 1.3676876179416462,
"grad_norm": 3.5613787174224854,
"learning_rate": 8.694511811338526e-06,
"loss": 0.8575,
"step": 4711
},
{
"epoch": 1.3679779358397446,
"grad_norm": 3.813519239425659,
"learning_rate": 8.69386474632413e-06,
"loss": 0.7598,
"step": 4712
},
{
"epoch": 1.368268253737843,
"grad_norm": 3.702981472015381,
"learning_rate": 8.69321754508147e-06,
"loss": 0.7989,
"step": 4713
},
{
"epoch": 1.3685585716359414,
"grad_norm": 3.176640033721924,
"learning_rate": 8.692570207634411e-06,
"loss": 0.7673,
"step": 4714
},
{
"epoch": 1.3688488895340398,
"grad_norm": 4.3239874839782715,
"learning_rate": 8.691922734006828e-06,
"loss": 0.8715,
"step": 4715
},
{
"epoch": 1.3691392074321382,
"grad_norm": 3.727651357650757,
"learning_rate": 8.6912751242226e-06,
"loss": 0.8308,
"step": 4716
},
{
"epoch": 1.3694295253302367,
"grad_norm": 3.114671468734741,
"learning_rate": 8.690627378305609e-06,
"loss": 0.7163,
"step": 4717
},
{
"epoch": 1.369719843228335,
"grad_norm": 3.7326815128326416,
"learning_rate": 8.689979496279747e-06,
"loss": 0.763,
"step": 4718
},
{
"epoch": 1.3700101611264335,
"grad_norm": 3.3478212356567383,
"learning_rate": 8.689331478168906e-06,
"loss": 0.7826,
"step": 4719
},
{
"epoch": 1.370300479024532,
"grad_norm": 3.6909072399139404,
"learning_rate": 8.68868332399698e-06,
"loss": 0.8426,
"step": 4720
},
{
"epoch": 1.3705907969226303,
"grad_norm": 3.614936351776123,
"learning_rate": 8.688035033787881e-06,
"loss": 0.6808,
"step": 4721
},
{
"epoch": 1.3708811148207287,
"grad_norm": 2.883828639984131,
"learning_rate": 8.68738660756551e-06,
"loss": 0.622,
"step": 4722
},
{
"epoch": 1.3711714327188271,
"grad_norm": 3.4927828311920166,
"learning_rate": 8.686738045353788e-06,
"loss": 0.7631,
"step": 4723
},
{
"epoch": 1.3714617506169255,
"grad_norm": 3.9574286937713623,
"learning_rate": 8.686089347176628e-06,
"loss": 0.8098,
"step": 4724
},
{
"epoch": 1.371752068515024,
"grad_norm": 3.645223379135132,
"learning_rate": 8.685440513057955e-06,
"loss": 0.7543,
"step": 4725
},
{
"epoch": 1.3720423864131224,
"grad_norm": 3.428619146347046,
"learning_rate": 8.6847915430217e-06,
"loss": 0.7329,
"step": 4726
},
{
"epoch": 1.3723327043112208,
"grad_norm": 3.4425439834594727,
"learning_rate": 8.684142437091793e-06,
"loss": 0.7378,
"step": 4727
},
{
"epoch": 1.3726230222093192,
"grad_norm": 3.855262041091919,
"learning_rate": 8.683493195292177e-06,
"loss": 0.8353,
"step": 4728
},
{
"epoch": 1.3729133401074176,
"grad_norm": 3.844834327697754,
"learning_rate": 8.682843817646793e-06,
"loss": 0.8555,
"step": 4729
},
{
"epoch": 1.373203658005516,
"grad_norm": 3.6634161472320557,
"learning_rate": 8.682194304179592e-06,
"loss": 0.7366,
"step": 4730
},
{
"epoch": 1.3734939759036144,
"grad_norm": 3.33919358253479,
"learning_rate": 8.681544654914525e-06,
"loss": 0.7108,
"step": 4731
},
{
"epoch": 1.3737842938017129,
"grad_norm": 3.5919675827026367,
"learning_rate": 8.680894869875551e-06,
"loss": 0.7798,
"step": 4732
},
{
"epoch": 1.3740746116998113,
"grad_norm": 3.4425220489501953,
"learning_rate": 8.680244949086635e-06,
"loss": 0.7974,
"step": 4733
},
{
"epoch": 1.3743649295979097,
"grad_norm": 3.5473134517669678,
"learning_rate": 8.679594892571748e-06,
"loss": 0.8122,
"step": 4734
},
{
"epoch": 1.374655247496008,
"grad_norm": 3.0709292888641357,
"learning_rate": 8.678944700354858e-06,
"loss": 0.702,
"step": 4735
},
{
"epoch": 1.3749455653941065,
"grad_norm": 3.581000566482544,
"learning_rate": 8.678294372459951e-06,
"loss": 0.7628,
"step": 4736
},
{
"epoch": 1.375235883292205,
"grad_norm": 3.376634120941162,
"learning_rate": 8.677643908911007e-06,
"loss": 0.7925,
"step": 4737
},
{
"epoch": 1.3755262011903033,
"grad_norm": 3.8900506496429443,
"learning_rate": 8.676993309732013e-06,
"loss": 0.9143,
"step": 4738
},
{
"epoch": 1.3758165190884017,
"grad_norm": 3.7068428993225098,
"learning_rate": 8.676342574946966e-06,
"loss": 0.8024,
"step": 4739
},
{
"epoch": 1.3761068369865002,
"grad_norm": 3.0612878799438477,
"learning_rate": 8.675691704579862e-06,
"loss": 0.7633,
"step": 4740
},
{
"epoch": 1.3763971548845986,
"grad_norm": 3.1773834228515625,
"learning_rate": 8.675040698654708e-06,
"loss": 0.6102,
"step": 4741
},
{
"epoch": 1.376687472782697,
"grad_norm": 3.589904308319092,
"learning_rate": 8.674389557195513e-06,
"loss": 0.7074,
"step": 4742
},
{
"epoch": 1.3769777906807954,
"grad_norm": 2.9823384284973145,
"learning_rate": 8.673738280226287e-06,
"loss": 0.6443,
"step": 4743
},
{
"epoch": 1.3772681085788938,
"grad_norm": 3.825936794281006,
"learning_rate": 8.673086867771051e-06,
"loss": 0.8828,
"step": 4744
},
{
"epoch": 1.3775584264769922,
"grad_norm": 4.055813312530518,
"learning_rate": 8.672435319853831e-06,
"loss": 0.879,
"step": 4745
},
{
"epoch": 1.3778487443750906,
"grad_norm": 3.423865556716919,
"learning_rate": 8.671783636498652e-06,
"loss": 0.6735,
"step": 4746
},
{
"epoch": 1.3781390622731893,
"grad_norm": 3.7734615802764893,
"learning_rate": 8.67113181772955e-06,
"loss": 0.8025,
"step": 4747
},
{
"epoch": 1.3784293801712875,
"grad_norm": 4.315977573394775,
"learning_rate": 8.670479863570565e-06,
"loss": 0.9297,
"step": 4748
},
{
"epoch": 1.378719698069386,
"grad_norm": 3.550494432449341,
"learning_rate": 8.669827774045738e-06,
"loss": 0.7168,
"step": 4749
},
{
"epoch": 1.3790100159674843,
"grad_norm": 3.4887938499450684,
"learning_rate": 8.669175549179117e-06,
"loss": 0.7436,
"step": 4750
},
{
"epoch": 1.379300333865583,
"grad_norm": 3.508185863494873,
"learning_rate": 8.66852318899476e-06,
"loss": 0.9205,
"step": 4751
},
{
"epoch": 1.3795906517636811,
"grad_norm": 3.516390800476074,
"learning_rate": 8.667870693516723e-06,
"loss": 0.8103,
"step": 4752
},
{
"epoch": 1.3798809696617798,
"grad_norm": 3.781928300857544,
"learning_rate": 8.667218062769071e-06,
"loss": 0.8471,
"step": 4753
},
{
"epoch": 1.380171287559878,
"grad_norm": 3.511101722717285,
"learning_rate": 8.66656529677587e-06,
"loss": 0.8162,
"step": 4754
},
{
"epoch": 1.3804616054579766,
"grad_norm": 3.4614226818084717,
"learning_rate": 8.665912395561199e-06,
"loss": 0.7267,
"step": 4755
},
{
"epoch": 1.3807519233560748,
"grad_norm": 3.698258638381958,
"learning_rate": 8.665259359149132e-06,
"loss": 0.8931,
"step": 4756
},
{
"epoch": 1.3810422412541734,
"grad_norm": 3.715919017791748,
"learning_rate": 8.664606187563755e-06,
"loss": 0.8431,
"step": 4757
},
{
"epoch": 1.3813325591522716,
"grad_norm": 3.9970555305480957,
"learning_rate": 8.663952880829156e-06,
"loss": 0.8736,
"step": 4758
},
{
"epoch": 1.3816228770503702,
"grad_norm": 3.340646982192993,
"learning_rate": 8.663299438969429e-06,
"loss": 0.8427,
"step": 4759
},
{
"epoch": 1.3819131949484686,
"grad_norm": 3.6539206504821777,
"learning_rate": 8.66264586200867e-06,
"loss": 0.8327,
"step": 4760
},
{
"epoch": 1.382203512846567,
"grad_norm": 3.351881980895996,
"learning_rate": 8.661992149970987e-06,
"loss": 0.7164,
"step": 4761
},
{
"epoch": 1.3824938307446655,
"grad_norm": 3.7830088138580322,
"learning_rate": 8.661338302880486e-06,
"loss": 0.8005,
"step": 4762
},
{
"epoch": 1.3827841486427639,
"grad_norm": 3.4923183917999268,
"learning_rate": 8.660684320761283e-06,
"loss": 0.8499,
"step": 4763
},
{
"epoch": 1.3830744665408623,
"grad_norm": 3.5462584495544434,
"learning_rate": 8.660030203637495e-06,
"loss": 0.8269,
"step": 4764
},
{
"epoch": 1.3833647844389607,
"grad_norm": 3.958484172821045,
"learning_rate": 8.659375951533244e-06,
"loss": 0.8645,
"step": 4765
},
{
"epoch": 1.3836551023370591,
"grad_norm": 3.3109965324401855,
"learning_rate": 8.658721564472661e-06,
"loss": 0.7037,
"step": 4766
},
{
"epoch": 1.3839454202351575,
"grad_norm": 3.4409313201904297,
"learning_rate": 8.658067042479877e-06,
"loss": 0.7239,
"step": 4767
},
{
"epoch": 1.384235738133256,
"grad_norm": 3.4091596603393555,
"learning_rate": 8.657412385579034e-06,
"loss": 0.8077,
"step": 4768
},
{
"epoch": 1.3845260560313544,
"grad_norm": 3.524073362350464,
"learning_rate": 8.656757593794273e-06,
"loss": 0.8358,
"step": 4769
},
{
"epoch": 1.3848163739294528,
"grad_norm": 3.48581862449646,
"learning_rate": 8.656102667149742e-06,
"loss": 0.7484,
"step": 4770
},
{
"epoch": 1.3851066918275512,
"grad_norm": 3.575439929962158,
"learning_rate": 8.655447605669596e-06,
"loss": 0.8364,
"step": 4771
},
{
"epoch": 1.3853970097256496,
"grad_norm": 3.5409598350524902,
"learning_rate": 8.654792409377995e-06,
"loss": 0.817,
"step": 4772
},
{
"epoch": 1.385687327623748,
"grad_norm": 3.779784679412842,
"learning_rate": 8.654137078299099e-06,
"loss": 0.8296,
"step": 4773
},
{
"epoch": 1.3859776455218464,
"grad_norm": 3.9878501892089844,
"learning_rate": 8.653481612457077e-06,
"loss": 0.9375,
"step": 4774
},
{
"epoch": 1.3862679634199448,
"grad_norm": 3.504255533218384,
"learning_rate": 8.652826011876104e-06,
"loss": 0.7396,
"step": 4775
},
{
"epoch": 1.3865582813180433,
"grad_norm": 3.366769313812256,
"learning_rate": 8.652170276580357e-06,
"loss": 0.7795,
"step": 4776
},
{
"epoch": 1.3868485992161417,
"grad_norm": 3.392413377761841,
"learning_rate": 8.651514406594017e-06,
"loss": 0.7361,
"step": 4777
},
{
"epoch": 1.38713891711424,
"grad_norm": 3.5908358097076416,
"learning_rate": 8.650858401941278e-06,
"loss": 0.8597,
"step": 4778
},
{
"epoch": 1.3874292350123385,
"grad_norm": 3.968031406402588,
"learning_rate": 8.650202262646327e-06,
"loss": 0.9725,
"step": 4779
},
{
"epoch": 1.387719552910437,
"grad_norm": 4.526281356811523,
"learning_rate": 8.649545988733367e-06,
"loss": 0.7876,
"step": 4780
},
{
"epoch": 1.3880098708085353,
"grad_norm": 3.4686625003814697,
"learning_rate": 8.648889580226601e-06,
"loss": 0.8439,
"step": 4781
},
{
"epoch": 1.3883001887066337,
"grad_norm": 3.479299545288086,
"learning_rate": 8.648233037150233e-06,
"loss": 0.7461,
"step": 4782
},
{
"epoch": 1.3885905066047322,
"grad_norm": 3.5957489013671875,
"learning_rate": 8.647576359528479e-06,
"loss": 0.8737,
"step": 4783
},
{
"epoch": 1.3888808245028306,
"grad_norm": 3.4597275257110596,
"learning_rate": 8.646919547385554e-06,
"loss": 0.7269,
"step": 4784
},
{
"epoch": 1.389171142400929,
"grad_norm": 3.6386306285858154,
"learning_rate": 8.646262600745687e-06,
"loss": 0.9262,
"step": 4785
},
{
"epoch": 1.3894614602990274,
"grad_norm": 3.6978306770324707,
"learning_rate": 8.6456055196331e-06,
"loss": 0.757,
"step": 4786
},
{
"epoch": 1.3897517781971258,
"grad_norm": 3.7922861576080322,
"learning_rate": 8.64494830407203e-06,
"loss": 0.8727,
"step": 4787
},
{
"epoch": 1.3900420960952242,
"grad_norm": 3.6173031330108643,
"learning_rate": 8.644290954086711e-06,
"loss": 0.9186,
"step": 4788
},
{
"epoch": 1.3903324139933226,
"grad_norm": 3.2797791957855225,
"learning_rate": 8.643633469701389e-06,
"loss": 0.7659,
"step": 4789
},
{
"epoch": 1.390622731891421,
"grad_norm": 3.4677507877349854,
"learning_rate": 8.64297585094031e-06,
"loss": 0.8224,
"step": 4790
},
{
"epoch": 1.3909130497895195,
"grad_norm": 3.7027604579925537,
"learning_rate": 8.642318097827728e-06,
"loss": 0.8528,
"step": 4791
},
{
"epoch": 1.3912033676876179,
"grad_norm": 3.7726094722747803,
"learning_rate": 8.6416602103879e-06,
"loss": 0.817,
"step": 4792
},
{
"epoch": 1.3914936855857163,
"grad_norm": 2.998366117477417,
"learning_rate": 8.641002188645087e-06,
"loss": 0.6437,
"step": 4793
},
{
"epoch": 1.3917840034838147,
"grad_norm": 3.7641310691833496,
"learning_rate": 8.64034403262356e-06,
"loss": 0.8539,
"step": 4794
},
{
"epoch": 1.3920743213819131,
"grad_norm": 3.446791648864746,
"learning_rate": 8.639685742347588e-06,
"loss": 0.7193,
"step": 4795
},
{
"epoch": 1.3923646392800118,
"grad_norm": 3.87794828414917,
"learning_rate": 8.639027317841453e-06,
"loss": 0.8783,
"step": 4796
},
{
"epoch": 1.39265495717811,
"grad_norm": 3.71549129486084,
"learning_rate": 8.638368759129433e-06,
"loss": 0.7826,
"step": 4797
},
{
"epoch": 1.3929452750762086,
"grad_norm": 3.6566429138183594,
"learning_rate": 8.637710066235816e-06,
"loss": 0.7971,
"step": 4798
},
{
"epoch": 1.3932355929743068,
"grad_norm": 3.5347371101379395,
"learning_rate": 8.637051239184896e-06,
"loss": 0.7795,
"step": 4799
},
{
"epoch": 1.3935259108724054,
"grad_norm": 3.63020920753479,
"learning_rate": 8.63639227800097e-06,
"loss": 0.7689,
"step": 4800
},
{
"epoch": 1.3938162287705036,
"grad_norm": 3.451944589614868,
"learning_rate": 8.635733182708339e-06,
"loss": 0.7747,
"step": 4801
},
{
"epoch": 1.3941065466686022,
"grad_norm": 3.4489200115203857,
"learning_rate": 8.635073953331312e-06,
"loss": 0.7529,
"step": 4802
},
{
"epoch": 1.3943968645667004,
"grad_norm": 3.329653024673462,
"learning_rate": 8.6344145898942e-06,
"loss": 0.7888,
"step": 4803
},
{
"epoch": 1.394687182464799,
"grad_norm": 3.4779586791992188,
"learning_rate": 8.633755092421319e-06,
"loss": 0.7773,
"step": 4804
},
{
"epoch": 1.3949775003628972,
"grad_norm": 3.3433725833892822,
"learning_rate": 8.633095460936993e-06,
"loss": 0.7696,
"step": 4805
},
{
"epoch": 1.3952678182609959,
"grad_norm": 3.6129310131073,
"learning_rate": 8.632435695465549e-06,
"loss": 0.7715,
"step": 4806
},
{
"epoch": 1.395558136159094,
"grad_norm": 3.8630218505859375,
"learning_rate": 8.631775796031316e-06,
"loss": 0.8732,
"step": 4807
},
{
"epoch": 1.3958484540571927,
"grad_norm": 3.8463497161865234,
"learning_rate": 8.631115762658635e-06,
"loss": 0.7539,
"step": 4808
},
{
"epoch": 1.3961387719552911,
"grad_norm": 3.2628061771392822,
"learning_rate": 8.630455595371846e-06,
"loss": 0.7529,
"step": 4809
},
{
"epoch": 1.3964290898533895,
"grad_norm": 3.669912338256836,
"learning_rate": 8.629795294195293e-06,
"loss": 0.8761,
"step": 4810
},
{
"epoch": 1.396719407751488,
"grad_norm": 3.5903918743133545,
"learning_rate": 8.629134859153331e-06,
"loss": 0.7032,
"step": 4811
},
{
"epoch": 1.3970097256495864,
"grad_norm": 3.8013930320739746,
"learning_rate": 8.628474290270316e-06,
"loss": 0.8091,
"step": 4812
},
{
"epoch": 1.3973000435476848,
"grad_norm": 3.3065521717071533,
"learning_rate": 8.627813587570609e-06,
"loss": 0.7613,
"step": 4813
},
{
"epoch": 1.3975903614457832,
"grad_norm": 3.47182035446167,
"learning_rate": 8.627152751078576e-06,
"loss": 0.7276,
"step": 4814
},
{
"epoch": 1.3978806793438816,
"grad_norm": 3.4294252395629883,
"learning_rate": 8.62649178081859e-06,
"loss": 0.6753,
"step": 4815
},
{
"epoch": 1.39817099724198,
"grad_norm": 3.6028592586517334,
"learning_rate": 8.625830676815026e-06,
"loss": 0.8833,
"step": 4816
},
{
"epoch": 1.3984613151400784,
"grad_norm": 3.3166987895965576,
"learning_rate": 8.625169439092265e-06,
"loss": 0.6944,
"step": 4817
},
{
"epoch": 1.3987516330381768,
"grad_norm": 3.5034635066986084,
"learning_rate": 8.624508067674692e-06,
"loss": 0.8244,
"step": 4818
},
{
"epoch": 1.3990419509362753,
"grad_norm": 3.2709362506866455,
"learning_rate": 8.623846562586701e-06,
"loss": 0.7226,
"step": 4819
},
{
"epoch": 1.3993322688343737,
"grad_norm": 3.6299030780792236,
"learning_rate": 8.623184923852688e-06,
"loss": 0.7935,
"step": 4820
},
{
"epoch": 1.399622586732472,
"grad_norm": 3.91402268409729,
"learning_rate": 8.622523151497052e-06,
"loss": 0.8692,
"step": 4821
},
{
"epoch": 1.3999129046305705,
"grad_norm": 3.647177219390869,
"learning_rate": 8.6218612455442e-06,
"loss": 0.7919,
"step": 4822
},
{
"epoch": 1.400203222528669,
"grad_norm": 4.167767524719238,
"learning_rate": 8.621199206018544e-06,
"loss": 0.8089,
"step": 4823
},
{
"epoch": 1.4004935404267673,
"grad_norm": 3.5647425651550293,
"learning_rate": 8.620537032944495e-06,
"loss": 0.6652,
"step": 4824
},
{
"epoch": 1.4007838583248657,
"grad_norm": 3.2275984287261963,
"learning_rate": 8.619874726346479e-06,
"loss": 0.6856,
"step": 4825
},
{
"epoch": 1.4010741762229642,
"grad_norm": 3.2474308013916016,
"learning_rate": 8.61921228624892e-06,
"loss": 0.7441,
"step": 4826
},
{
"epoch": 1.4013644941210626,
"grad_norm": 4.000554084777832,
"learning_rate": 8.618549712676247e-06,
"loss": 0.7875,
"step": 4827
},
{
"epoch": 1.401654812019161,
"grad_norm": 3.8246102333068848,
"learning_rate": 8.617887005652898e-06,
"loss": 0.7176,
"step": 4828
},
{
"epoch": 1.4019451299172594,
"grad_norm": 3.5609772205352783,
"learning_rate": 8.61722416520331e-06,
"loss": 0.8243,
"step": 4829
},
{
"epoch": 1.4022354478153578,
"grad_norm": 3.6110541820526123,
"learning_rate": 8.616561191351934e-06,
"loss": 0.7761,
"step": 4830
},
{
"epoch": 1.4025257657134562,
"grad_norm": 3.7265875339508057,
"learning_rate": 8.615898084123214e-06,
"loss": 0.7602,
"step": 4831
},
{
"epoch": 1.4028160836115546,
"grad_norm": 3.745316505432129,
"learning_rate": 8.615234843541606e-06,
"loss": 0.8678,
"step": 4832
},
{
"epoch": 1.403106401509653,
"grad_norm": 3.5089032649993896,
"learning_rate": 8.614571469631573e-06,
"loss": 0.7717,
"step": 4833
},
{
"epoch": 1.4033967194077515,
"grad_norm": 3.9560272693634033,
"learning_rate": 8.613907962417578e-06,
"loss": 0.9322,
"step": 4834
},
{
"epoch": 1.4036870373058499,
"grad_norm": 3.922571897506714,
"learning_rate": 8.613244321924092e-06,
"loss": 0.8043,
"step": 4835
},
{
"epoch": 1.4039773552039483,
"grad_norm": 3.940345525741577,
"learning_rate": 8.612580548175588e-06,
"loss": 0.9217,
"step": 4836
},
{
"epoch": 1.4042676731020467,
"grad_norm": 3.3031015396118164,
"learning_rate": 8.61191664119655e-06,
"loss": 0.7364,
"step": 4837
},
{
"epoch": 1.4045579910001451,
"grad_norm": 3.5342633724212646,
"learning_rate": 8.611252601011457e-06,
"loss": 0.8785,
"step": 4838
},
{
"epoch": 1.4048483088982435,
"grad_norm": 3.5416972637176514,
"learning_rate": 8.610588427644803e-06,
"loss": 0.7948,
"step": 4839
},
{
"epoch": 1.405138626796342,
"grad_norm": 3.5838162899017334,
"learning_rate": 8.60992412112108e-06,
"loss": 0.799,
"step": 4840
},
{
"epoch": 1.4054289446944404,
"grad_norm": 3.579805850982666,
"learning_rate": 8.609259681464788e-06,
"loss": 0.6866,
"step": 4841
},
{
"epoch": 1.4057192625925388,
"grad_norm": 3.6548197269439697,
"learning_rate": 8.60859510870043e-06,
"loss": 0.7634,
"step": 4842
},
{
"epoch": 1.4060095804906372,
"grad_norm": 3.1477739810943604,
"learning_rate": 8.607930402852518e-06,
"loss": 0.7293,
"step": 4843
},
{
"epoch": 1.4062998983887356,
"grad_norm": 3.979515790939331,
"learning_rate": 8.607265563945563e-06,
"loss": 0.8599,
"step": 4844
},
{
"epoch": 1.406590216286834,
"grad_norm": 3.6897566318511963,
"learning_rate": 8.606600592004086e-06,
"loss": 0.7855,
"step": 4845
},
{
"epoch": 1.4068805341849324,
"grad_norm": 3.6874310970306396,
"learning_rate": 8.60593548705261e-06,
"loss": 0.828,
"step": 4846
},
{
"epoch": 1.407170852083031,
"grad_norm": 3.679901123046875,
"learning_rate": 8.605270249115668e-06,
"loss": 0.8838,
"step": 4847
},
{
"epoch": 1.4074611699811292,
"grad_norm": 3.7150042057037354,
"learning_rate": 8.604604878217786e-06,
"loss": 0.7686,
"step": 4848
},
{
"epoch": 1.4077514878792279,
"grad_norm": 3.672172784805298,
"learning_rate": 8.603939374383507e-06,
"loss": 0.687,
"step": 4849
},
{
"epoch": 1.408041805777326,
"grad_norm": 3.7549571990966797,
"learning_rate": 8.603273737637374e-06,
"loss": 0.8388,
"step": 4850
},
{
"epoch": 1.4083321236754247,
"grad_norm": 4.318403720855713,
"learning_rate": 8.602607968003935e-06,
"loss": 0.9144,
"step": 4851
},
{
"epoch": 1.408622441573523,
"grad_norm": 3.597714424133301,
"learning_rate": 8.601942065507746e-06,
"loss": 0.7885,
"step": 4852
},
{
"epoch": 1.4089127594716215,
"grad_norm": 3.403085947036743,
"learning_rate": 8.601276030173361e-06,
"loss": 0.8434,
"step": 4853
},
{
"epoch": 1.4092030773697197,
"grad_norm": 3.6063506603240967,
"learning_rate": 8.600609862025346e-06,
"loss": 0.8667,
"step": 4854
},
{
"epoch": 1.4094933952678184,
"grad_norm": 3.697525978088379,
"learning_rate": 8.599943561088268e-06,
"loss": 0.84,
"step": 4855
},
{
"epoch": 1.4097837131659166,
"grad_norm": 3.562664031982422,
"learning_rate": 8.5992771273867e-06,
"loss": 0.7553,
"step": 4856
},
{
"epoch": 1.4100740310640152,
"grad_norm": 3.5420081615448,
"learning_rate": 8.59861056094522e-06,
"loss": 0.7472,
"step": 4857
},
{
"epoch": 1.4103643489621134,
"grad_norm": 3.676253080368042,
"learning_rate": 8.59794386178841e-06,
"loss": 0.8556,
"step": 4858
},
{
"epoch": 1.410654666860212,
"grad_norm": 3.7087533473968506,
"learning_rate": 8.59727702994086e-06,
"loss": 0.7977,
"step": 4859
},
{
"epoch": 1.4109449847583104,
"grad_norm": 3.540095806121826,
"learning_rate": 8.596610065427158e-06,
"loss": 0.815,
"step": 4860
},
{
"epoch": 1.4112353026564088,
"grad_norm": 2.9336438179016113,
"learning_rate": 8.595942968271907e-06,
"loss": 0.7382,
"step": 4861
},
{
"epoch": 1.4115256205545073,
"grad_norm": 3.024334669113159,
"learning_rate": 8.595275738499704e-06,
"loss": 0.8273,
"step": 4862
},
{
"epoch": 1.4118159384526057,
"grad_norm": 3.550865650177002,
"learning_rate": 8.594608376135159e-06,
"loss": 0.7818,
"step": 4863
},
{
"epoch": 1.412106256350704,
"grad_norm": 3.29832124710083,
"learning_rate": 8.593940881202885e-06,
"loss": 0.7025,
"step": 4864
},
{
"epoch": 1.4123965742488025,
"grad_norm": 3.7970573902130127,
"learning_rate": 8.593273253727495e-06,
"loss": 0.831,
"step": 4865
},
{
"epoch": 1.412686892146901,
"grad_norm": 3.563462257385254,
"learning_rate": 8.592605493733614e-06,
"loss": 0.7108,
"step": 4866
},
{
"epoch": 1.4129772100449993,
"grad_norm": 3.863367795944214,
"learning_rate": 8.59193760124587e-06,
"loss": 0.7942,
"step": 4867
},
{
"epoch": 1.4132675279430977,
"grad_norm": 3.109443426132202,
"learning_rate": 8.591269576288892e-06,
"loss": 0.7006,
"step": 4868
},
{
"epoch": 1.4135578458411961,
"grad_norm": 3.792145252227783,
"learning_rate": 8.590601418887316e-06,
"loss": 0.8134,
"step": 4869
},
{
"epoch": 1.4138481637392946,
"grad_norm": 3.6752769947052,
"learning_rate": 8.589933129065786e-06,
"loss": 0.7159,
"step": 4870
},
{
"epoch": 1.414138481637393,
"grad_norm": 3.0564382076263428,
"learning_rate": 8.589264706848946e-06,
"loss": 0.7533,
"step": 4871
},
{
"epoch": 1.4144287995354914,
"grad_norm": 3.0098416805267334,
"learning_rate": 8.588596152261447e-06,
"loss": 0.6984,
"step": 4872
},
{
"epoch": 1.4147191174335898,
"grad_norm": 3.4505839347839355,
"learning_rate": 8.587927465327948e-06,
"loss": 0.7734,
"step": 4873
},
{
"epoch": 1.4150094353316882,
"grad_norm": 3.9714856147766113,
"learning_rate": 8.587258646073107e-06,
"loss": 0.8756,
"step": 4874
},
{
"epoch": 1.4152997532297866,
"grad_norm": 3.669161081314087,
"learning_rate": 8.58658969452159e-06,
"loss": 0.8002,
"step": 4875
},
{
"epoch": 1.415590071127885,
"grad_norm": 3.4111788272857666,
"learning_rate": 8.585920610698068e-06,
"loss": 0.79,
"step": 4876
},
{
"epoch": 1.4158803890259835,
"grad_norm": 3.534163236618042,
"learning_rate": 8.585251394627217e-06,
"loss": 0.6854,
"step": 4877
},
{
"epoch": 1.4161707069240819,
"grad_norm": 3.521871566772461,
"learning_rate": 8.584582046333719e-06,
"loss": 0.7174,
"step": 4878
},
{
"epoch": 1.4164610248221803,
"grad_norm": 3.245898962020874,
"learning_rate": 8.583912565842258e-06,
"loss": 0.7329,
"step": 4879
},
{
"epoch": 1.4167513427202787,
"grad_norm": 3.9191839694976807,
"learning_rate": 8.583242953177522e-06,
"loss": 0.8377,
"step": 4880
},
{
"epoch": 1.417041660618377,
"grad_norm": 3.0914013385772705,
"learning_rate": 8.582573208364209e-06,
"loss": 0.7686,
"step": 4881
},
{
"epoch": 1.4173319785164755,
"grad_norm": 3.8165574073791504,
"learning_rate": 8.581903331427016e-06,
"loss": 0.7768,
"step": 4882
},
{
"epoch": 1.417622296414574,
"grad_norm": 3.884101152420044,
"learning_rate": 8.581233322390652e-06,
"loss": 0.8283,
"step": 4883
},
{
"epoch": 1.4179126143126723,
"grad_norm": 4.394293308258057,
"learning_rate": 8.580563181279822e-06,
"loss": 0.9988,
"step": 4884
},
{
"epoch": 1.4182029322107708,
"grad_norm": 3.411958694458008,
"learning_rate": 8.579892908119244e-06,
"loss": 0.7588,
"step": 4885
},
{
"epoch": 1.4184932501088692,
"grad_norm": 3.832937002182007,
"learning_rate": 8.579222502933635e-06,
"loss": 0.7294,
"step": 4886
},
{
"epoch": 1.4187835680069676,
"grad_norm": 3.814302921295166,
"learning_rate": 8.578551965747722e-06,
"loss": 0.7515,
"step": 4887
},
{
"epoch": 1.419073885905066,
"grad_norm": 3.579897403717041,
"learning_rate": 8.577881296586233e-06,
"loss": 0.8351,
"step": 4888
},
{
"epoch": 1.4193642038031644,
"grad_norm": 3.93332576751709,
"learning_rate": 8.5772104954739e-06,
"loss": 0.727,
"step": 4889
},
{
"epoch": 1.4196545217012628,
"grad_norm": 3.954401731491089,
"learning_rate": 8.576539562435464e-06,
"loss": 0.7004,
"step": 4890
},
{
"epoch": 1.4199448395993612,
"grad_norm": 3.2439942359924316,
"learning_rate": 8.575868497495668e-06,
"loss": 0.7239,
"step": 4891
},
{
"epoch": 1.4202351574974597,
"grad_norm": 3.3064539432525635,
"learning_rate": 8.575197300679262e-06,
"loss": 0.8092,
"step": 4892
},
{
"epoch": 1.420525475395558,
"grad_norm": 3.907304525375366,
"learning_rate": 8.574525972010997e-06,
"loss": 0.851,
"step": 4893
},
{
"epoch": 1.4208157932936565,
"grad_norm": 3.5380594730377197,
"learning_rate": 8.573854511515633e-06,
"loss": 0.7994,
"step": 4894
},
{
"epoch": 1.421106111191755,
"grad_norm": 3.559415817260742,
"learning_rate": 8.573182919217936e-06,
"loss": 0.76,
"step": 4895
},
{
"epoch": 1.4213964290898533,
"grad_norm": 3.537963628768921,
"learning_rate": 8.572511195142665e-06,
"loss": 0.7259,
"step": 4896
},
{
"epoch": 1.4216867469879517,
"grad_norm": 3.594255208969116,
"learning_rate": 8.571839339314602e-06,
"loss": 0.856,
"step": 4897
},
{
"epoch": 1.4219770648860504,
"grad_norm": 3.629476308822632,
"learning_rate": 8.571167351758522e-06,
"loss": 0.7807,
"step": 4898
},
{
"epoch": 1.4222673827841485,
"grad_norm": 3.595150947570801,
"learning_rate": 8.570495232499207e-06,
"loss": 0.801,
"step": 4899
},
{
"epoch": 1.4225577006822472,
"grad_norm": 3.8158557415008545,
"learning_rate": 8.569822981561445e-06,
"loss": 0.8622,
"step": 4900
},
{
"epoch": 1.4228480185803454,
"grad_norm": 3.8504481315612793,
"learning_rate": 8.569150598970027e-06,
"loss": 0.7183,
"step": 4901
},
{
"epoch": 1.423138336478444,
"grad_norm": 3.875899076461792,
"learning_rate": 8.568478084749752e-06,
"loss": 0.7786,
"step": 4902
},
{
"epoch": 1.4234286543765422,
"grad_norm": 3.6759371757507324,
"learning_rate": 8.56780543892542e-06,
"loss": 0.8178,
"step": 4903
},
{
"epoch": 1.4237189722746408,
"grad_norm": 3.799499034881592,
"learning_rate": 8.567132661521841e-06,
"loss": 0.854,
"step": 4904
},
{
"epoch": 1.424009290172739,
"grad_norm": 3.120879888534546,
"learning_rate": 8.566459752563825e-06,
"loss": 0.7493,
"step": 4905
},
{
"epoch": 1.4242996080708377,
"grad_norm": 3.856126070022583,
"learning_rate": 8.56578671207619e-06,
"loss": 0.777,
"step": 4906
},
{
"epoch": 1.4245899259689359,
"grad_norm": 3.700613021850586,
"learning_rate": 8.565113540083751e-06,
"loss": 0.8536,
"step": 4907
},
{
"epoch": 1.4248802438670345,
"grad_norm": 3.3016512393951416,
"learning_rate": 8.564440236611344e-06,
"loss": 0.7961,
"step": 4908
},
{
"epoch": 1.4251705617651327,
"grad_norm": 3.592452049255371,
"learning_rate": 8.563766801683794e-06,
"loss": 0.9353,
"step": 4909
},
{
"epoch": 1.4254608796632313,
"grad_norm": 2.960012674331665,
"learning_rate": 8.56309323532594e-06,
"loss": 0.6846,
"step": 4910
},
{
"epoch": 1.4257511975613297,
"grad_norm": 3.6264259815216064,
"learning_rate": 8.56241953756262e-06,
"loss": 0.727,
"step": 4911
},
{
"epoch": 1.4260415154594281,
"grad_norm": 3.664760112762451,
"learning_rate": 8.56174570841868e-06,
"loss": 0.7984,
"step": 4912
},
{
"epoch": 1.4263318333575266,
"grad_norm": 3.2246367931365967,
"learning_rate": 8.561071747918973e-06,
"loss": 0.6332,
"step": 4913
},
{
"epoch": 1.426622151255625,
"grad_norm": 3.133545160293579,
"learning_rate": 8.560397656088353e-06,
"loss": 0.7211,
"step": 4914
},
{
"epoch": 1.4269124691537234,
"grad_norm": 3.770587205886841,
"learning_rate": 8.55972343295168e-06,
"loss": 0.7908,
"step": 4915
},
{
"epoch": 1.4272027870518218,
"grad_norm": 3.3660528659820557,
"learning_rate": 8.559049078533821e-06,
"loss": 0.7996,
"step": 4916
},
{
"epoch": 1.4274931049499202,
"grad_norm": 3.4238767623901367,
"learning_rate": 8.558374592859644e-06,
"loss": 0.817,
"step": 4917
},
{
"epoch": 1.4277834228480186,
"grad_norm": 3.7060892581939697,
"learning_rate": 8.557699975954023e-06,
"loss": 0.7631,
"step": 4918
},
{
"epoch": 1.428073740746117,
"grad_norm": 3.3508338928222656,
"learning_rate": 8.557025227841839e-06,
"loss": 0.7387,
"step": 4919
},
{
"epoch": 1.4283640586442155,
"grad_norm": 3.907799243927002,
"learning_rate": 8.556350348547978e-06,
"loss": 0.6976,
"step": 4920
},
{
"epoch": 1.4286543765423139,
"grad_norm": 3.8321168422698975,
"learning_rate": 8.555675338097324e-06,
"loss": 0.8515,
"step": 4921
},
{
"epoch": 1.4289446944404123,
"grad_norm": 3.4706666469573975,
"learning_rate": 8.555000196514776e-06,
"loss": 0.8331,
"step": 4922
},
{
"epoch": 1.4292350123385107,
"grad_norm": 3.963350534439087,
"learning_rate": 8.554324923825233e-06,
"loss": 0.8487,
"step": 4923
},
{
"epoch": 1.429525330236609,
"grad_norm": 3.9221112728118896,
"learning_rate": 8.553649520053596e-06,
"loss": 0.8157,
"step": 4924
},
{
"epoch": 1.4298156481347075,
"grad_norm": 3.6907260417938232,
"learning_rate": 8.552973985224774e-06,
"loss": 0.8462,
"step": 4925
},
{
"epoch": 1.430105966032806,
"grad_norm": 3.558818817138672,
"learning_rate": 8.552298319363682e-06,
"loss": 0.754,
"step": 4926
},
{
"epoch": 1.4303962839309043,
"grad_norm": 3.271465539932251,
"learning_rate": 8.551622522495238e-06,
"loss": 0.746,
"step": 4927
},
{
"epoch": 1.4306866018290028,
"grad_norm": 3.642778158187866,
"learning_rate": 8.550946594644365e-06,
"loss": 0.7517,
"step": 4928
},
{
"epoch": 1.4309769197271012,
"grad_norm": 3.227018117904663,
"learning_rate": 8.550270535835992e-06,
"loss": 0.5879,
"step": 4929
},
{
"epoch": 1.4312672376251996,
"grad_norm": 3.576512098312378,
"learning_rate": 8.549594346095049e-06,
"loss": 0.7585,
"step": 4930
},
{
"epoch": 1.431557555523298,
"grad_norm": 3.381173849105835,
"learning_rate": 8.548918025446474e-06,
"loss": 0.7194,
"step": 4931
},
{
"epoch": 1.4318478734213964,
"grad_norm": 3.5712335109710693,
"learning_rate": 8.548241573915213e-06,
"loss": 0.7103,
"step": 4932
},
{
"epoch": 1.4321381913194948,
"grad_norm": 4.106939315795898,
"learning_rate": 8.54756499152621e-06,
"loss": 0.7445,
"step": 4933
},
{
"epoch": 1.4324285092175932,
"grad_norm": 3.6397581100463867,
"learning_rate": 8.546888278304416e-06,
"loss": 0.9127,
"step": 4934
},
{
"epoch": 1.4327188271156917,
"grad_norm": 3.9541220664978027,
"learning_rate": 8.546211434274791e-06,
"loss": 0.8085,
"step": 4935
},
{
"epoch": 1.43300914501379,
"grad_norm": 3.7158708572387695,
"learning_rate": 8.545534459462297e-06,
"loss": 0.7887,
"step": 4936
},
{
"epoch": 1.4332994629118885,
"grad_norm": 3.8351891040802,
"learning_rate": 8.544857353891898e-06,
"loss": 0.8938,
"step": 4937
},
{
"epoch": 1.433589780809987,
"grad_norm": 3.1466290950775146,
"learning_rate": 8.544180117588567e-06,
"loss": 0.6964,
"step": 4938
},
{
"epoch": 1.4338800987080853,
"grad_norm": 3.5582618713378906,
"learning_rate": 8.54350275057728e-06,
"loss": 0.7432,
"step": 4939
},
{
"epoch": 1.4341704166061837,
"grad_norm": 3.1632747650146484,
"learning_rate": 8.542825252883015e-06,
"loss": 0.6981,
"step": 4940
},
{
"epoch": 1.4344607345042821,
"grad_norm": 3.2447924613952637,
"learning_rate": 8.542147624530763e-06,
"loss": 0.7172,
"step": 4941
},
{
"epoch": 1.4347510524023805,
"grad_norm": 3.235755443572998,
"learning_rate": 8.541469865545513e-06,
"loss": 0.7927,
"step": 4942
},
{
"epoch": 1.435041370300479,
"grad_norm": 3.388984203338623,
"learning_rate": 8.540791975952258e-06,
"loss": 0.733,
"step": 4943
},
{
"epoch": 1.4353316881985774,
"grad_norm": 3.0334298610687256,
"learning_rate": 8.540113955776001e-06,
"loss": 0.5858,
"step": 4944
},
{
"epoch": 1.4356220060966758,
"grad_norm": 3.707620859146118,
"learning_rate": 8.539435805041745e-06,
"loss": 0.7823,
"step": 4945
},
{
"epoch": 1.4359123239947742,
"grad_norm": 3.4698052406311035,
"learning_rate": 8.538757523774503e-06,
"loss": 0.8276,
"step": 4946
},
{
"epoch": 1.4362026418928728,
"grad_norm": 3.6473255157470703,
"learning_rate": 8.538079111999287e-06,
"loss": 0.7954,
"step": 4947
},
{
"epoch": 1.436492959790971,
"grad_norm": 3.7372074127197266,
"learning_rate": 8.537400569741117e-06,
"loss": 0.841,
"step": 4948
},
{
"epoch": 1.4367832776890697,
"grad_norm": 4.107751369476318,
"learning_rate": 8.536721897025018e-06,
"loss": 0.8634,
"step": 4949
},
{
"epoch": 1.4370735955871679,
"grad_norm": 3.484713077545166,
"learning_rate": 8.536043093876018e-06,
"loss": 0.8296,
"step": 4950
},
{
"epoch": 1.4373639134852665,
"grad_norm": 3.7558670043945312,
"learning_rate": 8.535364160319154e-06,
"loss": 0.8254,
"step": 4951
},
{
"epoch": 1.4376542313833647,
"grad_norm": 3.655763864517212,
"learning_rate": 8.534685096379463e-06,
"loss": 0.7879,
"step": 4952
},
{
"epoch": 1.4379445492814633,
"grad_norm": 3.9244983196258545,
"learning_rate": 8.534005902081985e-06,
"loss": 0.7759,
"step": 4953
},
{
"epoch": 1.4382348671795615,
"grad_norm": 3.526134490966797,
"learning_rate": 8.533326577451775e-06,
"loss": 0.8024,
"step": 4954
},
{
"epoch": 1.4385251850776601,
"grad_norm": 3.7379188537597656,
"learning_rate": 8.53264712251388e-06,
"loss": 0.7485,
"step": 4955
},
{
"epoch": 1.4388155029757583,
"grad_norm": 4.165005683898926,
"learning_rate": 8.531967537293365e-06,
"loss": 0.9631,
"step": 4956
},
{
"epoch": 1.439105820873857,
"grad_norm": 3.4370205402374268,
"learning_rate": 8.531287821815286e-06,
"loss": 0.6982,
"step": 4957
},
{
"epoch": 1.4393961387719552,
"grad_norm": 3.3375890254974365,
"learning_rate": 8.530607976104712e-06,
"loss": 0.7578,
"step": 4958
},
{
"epoch": 1.4396864566700538,
"grad_norm": 3.7006642818450928,
"learning_rate": 8.529928000186721e-06,
"loss": 0.832,
"step": 4959
},
{
"epoch": 1.4399767745681522,
"grad_norm": 3.493058443069458,
"learning_rate": 8.529247894086383e-06,
"loss": 0.8828,
"step": 4960
},
{
"epoch": 1.4402670924662506,
"grad_norm": 3.9224722385406494,
"learning_rate": 8.528567657828785e-06,
"loss": 0.9021,
"step": 4961
},
{
"epoch": 1.440557410364349,
"grad_norm": 3.570800542831421,
"learning_rate": 8.527887291439012e-06,
"loss": 0.7967,
"step": 4962
},
{
"epoch": 1.4408477282624474,
"grad_norm": 4.029253959655762,
"learning_rate": 8.527206794942154e-06,
"loss": 0.7519,
"step": 4963
},
{
"epoch": 1.4411380461605459,
"grad_norm": 3.2075116634368896,
"learning_rate": 8.52652616836331e-06,
"loss": 0.673,
"step": 4964
},
{
"epoch": 1.4414283640586443,
"grad_norm": 3.6427388191223145,
"learning_rate": 8.525845411727581e-06,
"loss": 0.7974,
"step": 4965
},
{
"epoch": 1.4417186819567427,
"grad_norm": 3.2091753482818604,
"learning_rate": 8.525164525060072e-06,
"loss": 0.7223,
"step": 4966
},
{
"epoch": 1.442008999854841,
"grad_norm": 3.3279550075531006,
"learning_rate": 8.524483508385895e-06,
"loss": 0.7353,
"step": 4967
},
{
"epoch": 1.4422993177529395,
"grad_norm": 3.2981271743774414,
"learning_rate": 8.523802361730162e-06,
"loss": 0.7777,
"step": 4968
},
{
"epoch": 1.442589635651038,
"grad_norm": 3.850630760192871,
"learning_rate": 8.523121085118001e-06,
"loss": 0.8775,
"step": 4969
},
{
"epoch": 1.4428799535491363,
"grad_norm": 3.483059883117676,
"learning_rate": 8.522439678574528e-06,
"loss": 0.7326,
"step": 4970
},
{
"epoch": 1.4431702714472348,
"grad_norm": 3.390303611755371,
"learning_rate": 8.52175814212488e-06,
"loss": 0.7247,
"step": 4971
},
{
"epoch": 1.4434605893453332,
"grad_norm": 3.6529483795166016,
"learning_rate": 8.521076475794188e-06,
"loss": 0.7653,
"step": 4972
},
{
"epoch": 1.4437509072434316,
"grad_norm": 3.635930061340332,
"learning_rate": 8.520394679607592e-06,
"loss": 0.8241,
"step": 4973
},
{
"epoch": 1.44404122514153,
"grad_norm": 3.3492178916931152,
"learning_rate": 8.519712753590241e-06,
"loss": 0.7107,
"step": 4974
},
{
"epoch": 1.4443315430396284,
"grad_norm": 4.295066833496094,
"learning_rate": 8.519030697767278e-06,
"loss": 0.8889,
"step": 4975
},
{
"epoch": 1.4446218609377268,
"grad_norm": 3.8008925914764404,
"learning_rate": 8.51834851216386e-06,
"loss": 0.8281,
"step": 4976
},
{
"epoch": 1.4449121788358252,
"grad_norm": 3.6782050132751465,
"learning_rate": 8.517666196805142e-06,
"loss": 0.7278,
"step": 4977
},
{
"epoch": 1.4452024967339236,
"grad_norm": 3.2875430583953857,
"learning_rate": 8.516983751716294e-06,
"loss": 0.7124,
"step": 4978
},
{
"epoch": 1.445492814632022,
"grad_norm": 3.449599027633667,
"learning_rate": 8.516301176922482e-06,
"loss": 0.6499,
"step": 4979
},
{
"epoch": 1.4457831325301205,
"grad_norm": 3.2835583686828613,
"learning_rate": 8.515618472448875e-06,
"loss": 0.7154,
"step": 4980
},
{
"epoch": 1.4460734504282189,
"grad_norm": 3.622060537338257,
"learning_rate": 8.514935638320656e-06,
"loss": 0.8061,
"step": 4981
},
{
"epoch": 1.4463637683263173,
"grad_norm": 3.7743592262268066,
"learning_rate": 8.514252674563003e-06,
"loss": 0.781,
"step": 4982
},
{
"epoch": 1.4466540862244157,
"grad_norm": 3.5391032695770264,
"learning_rate": 8.513569581201109e-06,
"loss": 0.7509,
"step": 4983
},
{
"epoch": 1.4469444041225141,
"grad_norm": 3.4815375804901123,
"learning_rate": 8.512886358260162e-06,
"loss": 0.8138,
"step": 4984
},
{
"epoch": 1.4472347220206125,
"grad_norm": 3.804208755493164,
"learning_rate": 8.512203005765358e-06,
"loss": 0.7921,
"step": 4985
},
{
"epoch": 1.447525039918711,
"grad_norm": 3.3835744857788086,
"learning_rate": 8.511519523741903e-06,
"loss": 0.7415,
"step": 4986
},
{
"epoch": 1.4478153578168094,
"grad_norm": 3.5784029960632324,
"learning_rate": 8.510835912215001e-06,
"loss": 0.7147,
"step": 4987
},
{
"epoch": 1.4481056757149078,
"grad_norm": 3.8594770431518555,
"learning_rate": 8.510152171209864e-06,
"loss": 0.718,
"step": 4988
},
{
"epoch": 1.4483959936130062,
"grad_norm": 3.8807501792907715,
"learning_rate": 8.509468300751709e-06,
"loss": 0.7239,
"step": 4989
},
{
"epoch": 1.4486863115111046,
"grad_norm": 3.600749969482422,
"learning_rate": 8.508784300865754e-06,
"loss": 0.7901,
"step": 4990
},
{
"epoch": 1.448976629409203,
"grad_norm": 3.7116174697875977,
"learning_rate": 8.508100171577226e-06,
"loss": 0.8248,
"step": 4991
},
{
"epoch": 1.4492669473073014,
"grad_norm": 4.034679889678955,
"learning_rate": 8.507415912911357e-06,
"loss": 1.0043,
"step": 4992
},
{
"epoch": 1.4495572652053998,
"grad_norm": 3.861468553543091,
"learning_rate": 8.50673152489338e-06,
"loss": 0.8495,
"step": 4993
},
{
"epoch": 1.4498475831034983,
"grad_norm": 3.6064560413360596,
"learning_rate": 8.506047007548537e-06,
"loss": 0.776,
"step": 4994
},
{
"epoch": 1.4501379010015967,
"grad_norm": 3.368307113647461,
"learning_rate": 8.505362360902071e-06,
"loss": 0.8076,
"step": 4995
},
{
"epoch": 1.450428218899695,
"grad_norm": 3.307891845703125,
"learning_rate": 8.504677584979233e-06,
"loss": 0.7516,
"step": 4996
},
{
"epoch": 1.4507185367977935,
"grad_norm": 3.733379602432251,
"learning_rate": 8.503992679805277e-06,
"loss": 0.8998,
"step": 4997
},
{
"epoch": 1.4510088546958921,
"grad_norm": 3.367964029312134,
"learning_rate": 8.503307645405461e-06,
"loss": 0.7692,
"step": 4998
},
{
"epoch": 1.4512991725939903,
"grad_norm": 3.5045888423919678,
"learning_rate": 8.502622481805047e-06,
"loss": 0.8215,
"step": 4999
},
{
"epoch": 1.451589490492089,
"grad_norm": 3.624884605407715,
"learning_rate": 8.501937189029309e-06,
"loss": 0.8049,
"step": 5000
},
{
"epoch": 1.451589490492089,
"eval_loss": 1.1851102113723755,
"eval_runtime": 13.235,
"eval_samples_per_second": 30.223,
"eval_steps_per_second": 3.778,
"step": 5000
},
{
"epoch": 1.4518798083901872,
"grad_norm": 3.5284340381622314,
"learning_rate": 8.501251767103515e-06,
"loss": 0.8034,
"step": 5001
},
{
"epoch": 1.4521701262882858,
"grad_norm": 3.5684597492218018,
"learning_rate": 8.500566216052948e-06,
"loss": 0.7959,
"step": 5002
},
{
"epoch": 1.452460444186384,
"grad_norm": 3.7145283222198486,
"learning_rate": 8.499880535902885e-06,
"loss": 0.9445,
"step": 5003
},
{
"epoch": 1.4527507620844826,
"grad_norm": 3.89518666267395,
"learning_rate": 8.499194726678619e-06,
"loss": 0.7677,
"step": 5004
},
{
"epoch": 1.4530410799825808,
"grad_norm": 3.8414015769958496,
"learning_rate": 8.498508788405438e-06,
"loss": 0.9152,
"step": 5005
},
{
"epoch": 1.4533313978806794,
"grad_norm": 3.748683214187622,
"learning_rate": 8.497822721108642e-06,
"loss": 0.7538,
"step": 5006
},
{
"epoch": 1.4536217157787776,
"grad_norm": 3.3457822799682617,
"learning_rate": 8.497136524813534e-06,
"loss": 0.7947,
"step": 5007
},
{
"epoch": 1.4539120336768763,
"grad_norm": 3.300783157348633,
"learning_rate": 8.496450199545417e-06,
"loss": 0.7006,
"step": 5008
},
{
"epoch": 1.4542023515749745,
"grad_norm": 3.2852492332458496,
"learning_rate": 8.495763745329604e-06,
"loss": 0.6321,
"step": 5009
},
{
"epoch": 1.454492669473073,
"grad_norm": 3.0854744911193848,
"learning_rate": 8.49507716219141e-06,
"loss": 0.6212,
"step": 5010
},
{
"epoch": 1.4547829873711715,
"grad_norm": 4.364450931549072,
"learning_rate": 8.49439045015616e-06,
"loss": 0.9948,
"step": 5011
},
{
"epoch": 1.45507330526927,
"grad_norm": 3.982003927230835,
"learning_rate": 8.493703609249175e-06,
"loss": 0.8609,
"step": 5012
},
{
"epoch": 1.4553636231673683,
"grad_norm": 3.6758294105529785,
"learning_rate": 8.49301663949579e-06,
"loss": 0.7805,
"step": 5013
},
{
"epoch": 1.4556539410654667,
"grad_norm": 3.922879934310913,
"learning_rate": 8.492329540921335e-06,
"loss": 0.9171,
"step": 5014
},
{
"epoch": 1.4559442589635652,
"grad_norm": 3.4253084659576416,
"learning_rate": 8.491642313551153e-06,
"loss": 0.7327,
"step": 5015
},
{
"epoch": 1.4562345768616636,
"grad_norm": 3.4870643615722656,
"learning_rate": 8.490954957410588e-06,
"loss": 0.7023,
"step": 5016
},
{
"epoch": 1.456524894759762,
"grad_norm": 3.2392799854278564,
"learning_rate": 8.490267472524989e-06,
"loss": 0.6963,
"step": 5017
},
{
"epoch": 1.4568152126578604,
"grad_norm": 3.677802324295044,
"learning_rate": 8.489579858919711e-06,
"loss": 0.8241,
"step": 5018
},
{
"epoch": 1.4571055305559588,
"grad_norm": 3.4841086864471436,
"learning_rate": 8.488892116620114e-06,
"loss": 0.7841,
"step": 5019
},
{
"epoch": 1.4573958484540572,
"grad_norm": 3.652825117111206,
"learning_rate": 8.48820424565156e-06,
"loss": 0.8396,
"step": 5020
},
{
"epoch": 1.4576861663521556,
"grad_norm": 4.243154048919678,
"learning_rate": 8.487516246039415e-06,
"loss": 0.9935,
"step": 5021
},
{
"epoch": 1.457976484250254,
"grad_norm": 3.3527235984802246,
"learning_rate": 8.486828117809057e-06,
"loss": 0.7414,
"step": 5022
},
{
"epoch": 1.4582668021483525,
"grad_norm": 3.2306318283081055,
"learning_rate": 8.486139860985862e-06,
"loss": 0.7676,
"step": 5023
},
{
"epoch": 1.4585571200464509,
"grad_norm": 3.4278311729431152,
"learning_rate": 8.485451475595211e-06,
"loss": 0.7074,
"step": 5024
},
{
"epoch": 1.4588474379445493,
"grad_norm": 3.2792117595672607,
"learning_rate": 8.484762961662494e-06,
"loss": 0.7377,
"step": 5025
},
{
"epoch": 1.4591377558426477,
"grad_norm": 3.4412848949432373,
"learning_rate": 8.4840743192131e-06,
"loss": 0.7358,
"step": 5026
},
{
"epoch": 1.4594280737407461,
"grad_norm": 3.700155258178711,
"learning_rate": 8.48338554827243e-06,
"loss": 0.7138,
"step": 5027
},
{
"epoch": 1.4597183916388445,
"grad_norm": 3.4831392765045166,
"learning_rate": 8.482696648865883e-06,
"loss": 0.795,
"step": 5028
},
{
"epoch": 1.460008709536943,
"grad_norm": 3.9102721214294434,
"learning_rate": 8.482007621018865e-06,
"loss": 0.7914,
"step": 5029
},
{
"epoch": 1.4602990274350414,
"grad_norm": 3.5112287998199463,
"learning_rate": 8.481318464756787e-06,
"loss": 0.6755,
"step": 5030
},
{
"epoch": 1.4605893453331398,
"grad_norm": 3.3797972202301025,
"learning_rate": 8.480629180105067e-06,
"loss": 0.7752,
"step": 5031
},
{
"epoch": 1.4608796632312382,
"grad_norm": 3.8857204914093018,
"learning_rate": 8.479939767089124e-06,
"loss": 0.7878,
"step": 5032
},
{
"epoch": 1.4611699811293366,
"grad_norm": 3.759293556213379,
"learning_rate": 8.479250225734382e-06,
"loss": 0.767,
"step": 5033
},
{
"epoch": 1.461460299027435,
"grad_norm": 3.3629093170166016,
"learning_rate": 8.478560556066274e-06,
"loss": 0.7772,
"step": 5034
},
{
"epoch": 1.4617506169255334,
"grad_norm": 3.661879062652588,
"learning_rate": 8.477870758110231e-06,
"loss": 0.8362,
"step": 5035
},
{
"epoch": 1.4620409348236318,
"grad_norm": 3.17903995513916,
"learning_rate": 8.477180831891696e-06,
"loss": 0.7094,
"step": 5036
},
{
"epoch": 1.4623312527217303,
"grad_norm": 3.840388536453247,
"learning_rate": 8.476490777436113e-06,
"loss": 0.7962,
"step": 5037
},
{
"epoch": 1.4626215706198287,
"grad_norm": 3.8354861736297607,
"learning_rate": 8.475800594768929e-06,
"loss": 0.7228,
"step": 5038
},
{
"epoch": 1.462911888517927,
"grad_norm": 3.448528289794922,
"learning_rate": 8.475110283915597e-06,
"loss": 0.8893,
"step": 5039
},
{
"epoch": 1.4632022064160255,
"grad_norm": 3.4191551208496094,
"learning_rate": 8.474419844901575e-06,
"loss": 0.8896,
"step": 5040
},
{
"epoch": 1.463492524314124,
"grad_norm": 3.802597999572754,
"learning_rate": 8.473729277752331e-06,
"loss": 0.7941,
"step": 5041
},
{
"epoch": 1.4637828422122223,
"grad_norm": 3.8264427185058594,
"learning_rate": 8.47303858249333e-06,
"loss": 0.823,
"step": 5042
},
{
"epoch": 1.4640731601103207,
"grad_norm": 3.2838687896728516,
"learning_rate": 8.472347759150044e-06,
"loss": 0.7341,
"step": 5043
},
{
"epoch": 1.4643634780084192,
"grad_norm": 3.4499127864837646,
"learning_rate": 8.47165680774795e-06,
"loss": 0.7497,
"step": 5044
},
{
"epoch": 1.4646537959065176,
"grad_norm": 3.102621078491211,
"learning_rate": 8.47096572831253e-06,
"loss": 0.6851,
"step": 5045
},
{
"epoch": 1.464944113804616,
"grad_norm": 3.695542097091675,
"learning_rate": 8.470274520869273e-06,
"loss": 0.7494,
"step": 5046
},
{
"epoch": 1.4652344317027144,
"grad_norm": 3.250293254852295,
"learning_rate": 8.469583185443669e-06,
"loss": 0.7554,
"step": 5047
},
{
"epoch": 1.4655247496008128,
"grad_norm": 3.8266215324401855,
"learning_rate": 8.468891722061211e-06,
"loss": 0.8187,
"step": 5048
},
{
"epoch": 1.4658150674989114,
"grad_norm": 3.5755343437194824,
"learning_rate": 8.468200130747406e-06,
"loss": 0.7568,
"step": 5049
},
{
"epoch": 1.4661053853970096,
"grad_norm": 3.6069979667663574,
"learning_rate": 8.467508411527754e-06,
"loss": 0.8279,
"step": 5050
},
{
"epoch": 1.4663957032951083,
"grad_norm": 3.417710542678833,
"learning_rate": 8.46681656442777e-06,
"loss": 0.7938,
"step": 5051
},
{
"epoch": 1.4666860211932065,
"grad_norm": 3.6008191108703613,
"learning_rate": 8.466124589472967e-06,
"loss": 0.8101,
"step": 5052
},
{
"epoch": 1.466976339091305,
"grad_norm": 3.4891951084136963,
"learning_rate": 8.465432486688863e-06,
"loss": 0.8224,
"step": 5053
},
{
"epoch": 1.4672666569894033,
"grad_norm": 3.3960723876953125,
"learning_rate": 8.464740256100984e-06,
"loss": 0.8218,
"step": 5054
},
{
"epoch": 1.467556974887502,
"grad_norm": 3.9240763187408447,
"learning_rate": 8.46404789773486e-06,
"loss": 0.7348,
"step": 5055
},
{
"epoch": 1.4678472927856,
"grad_norm": 3.406634569168091,
"learning_rate": 8.463355411616024e-06,
"loss": 0.7603,
"step": 5056
},
{
"epoch": 1.4681376106836987,
"grad_norm": 2.8764095306396484,
"learning_rate": 8.462662797770016e-06,
"loss": 0.5915,
"step": 5057
},
{
"epoch": 1.468427928581797,
"grad_norm": 3.4037272930145264,
"learning_rate": 8.461970056222375e-06,
"loss": 0.6647,
"step": 5058
},
{
"epoch": 1.4687182464798956,
"grad_norm": 3.4545750617980957,
"learning_rate": 8.461277186998656e-06,
"loss": 0.7738,
"step": 5059
},
{
"epoch": 1.469008564377994,
"grad_norm": 3.645581007003784,
"learning_rate": 8.460584190124405e-06,
"loss": 0.7971,
"step": 5060
},
{
"epoch": 1.4692988822760924,
"grad_norm": 3.7972629070281982,
"learning_rate": 8.459891065625184e-06,
"loss": 0.7959,
"step": 5061
},
{
"epoch": 1.4695892001741908,
"grad_norm": 3.7442901134490967,
"learning_rate": 8.459197813526554e-06,
"loss": 0.8311,
"step": 5062
},
{
"epoch": 1.4698795180722892,
"grad_norm": 3.545626640319824,
"learning_rate": 8.45850443385408e-06,
"loss": 0.8391,
"step": 5063
},
{
"epoch": 1.4701698359703876,
"grad_norm": 3.4700894355773926,
"learning_rate": 8.457810926633336e-06,
"loss": 0.7605,
"step": 5064
},
{
"epoch": 1.470460153868486,
"grad_norm": 3.531576633453369,
"learning_rate": 8.457117291889895e-06,
"loss": 0.706,
"step": 5065
},
{
"epoch": 1.4707504717665845,
"grad_norm": 3.6248650550842285,
"learning_rate": 8.456423529649343e-06,
"loss": 0.9177,
"step": 5066
},
{
"epoch": 1.4710407896646829,
"grad_norm": 3.7519371509552,
"learning_rate": 8.45572963993726e-06,
"loss": 0.8181,
"step": 5067
},
{
"epoch": 1.4713311075627813,
"grad_norm": 3.677908420562744,
"learning_rate": 8.455035622779242e-06,
"loss": 0.8197,
"step": 5068
},
{
"epoch": 1.4716214254608797,
"grad_norm": 3.604118824005127,
"learning_rate": 8.45434147820088e-06,
"loss": 0.7423,
"step": 5069
},
{
"epoch": 1.4719117433589781,
"grad_norm": 3.7687582969665527,
"learning_rate": 8.453647206227776e-06,
"loss": 0.8346,
"step": 5070
},
{
"epoch": 1.4722020612570765,
"grad_norm": 3.278323173522949,
"learning_rate": 8.452952806885533e-06,
"loss": 0.6388,
"step": 5071
},
{
"epoch": 1.472492379155175,
"grad_norm": 3.315422296524048,
"learning_rate": 8.45225828019976e-06,
"loss": 0.7371,
"step": 5072
},
{
"epoch": 1.4727826970532734,
"grad_norm": 4.38593864440918,
"learning_rate": 8.451563626196072e-06,
"loss": 0.9145,
"step": 5073
},
{
"epoch": 1.4730730149513718,
"grad_norm": 3.3235526084899902,
"learning_rate": 8.450868844900088e-06,
"loss": 0.6989,
"step": 5074
},
{
"epoch": 1.4733633328494702,
"grad_norm": 3.56598162651062,
"learning_rate": 8.450173936337429e-06,
"loss": 0.7485,
"step": 5075
},
{
"epoch": 1.4736536507475686,
"grad_norm": 3.6153061389923096,
"learning_rate": 8.449478900533726e-06,
"loss": 0.8819,
"step": 5076
},
{
"epoch": 1.473943968645667,
"grad_norm": 3.7739386558532715,
"learning_rate": 8.448783737514609e-06,
"loss": 0.6986,
"step": 5077
},
{
"epoch": 1.4742342865437654,
"grad_norm": 3.4768197536468506,
"learning_rate": 8.448088447305716e-06,
"loss": 0.7281,
"step": 5078
},
{
"epoch": 1.4745246044418638,
"grad_norm": 3.376514196395874,
"learning_rate": 8.447393029932692e-06,
"loss": 0.7537,
"step": 5079
},
{
"epoch": 1.4748149223399623,
"grad_norm": 3.229945421218872,
"learning_rate": 8.446697485421179e-06,
"loss": 0.7705,
"step": 5080
},
{
"epoch": 1.4751052402380607,
"grad_norm": 3.3229260444641113,
"learning_rate": 8.446001813796829e-06,
"loss": 0.8065,
"step": 5081
},
{
"epoch": 1.475395558136159,
"grad_norm": 4.087240695953369,
"learning_rate": 8.445306015085301e-06,
"loss": 0.8067,
"step": 5082
},
{
"epoch": 1.4756858760342575,
"grad_norm": 3.625922203063965,
"learning_rate": 8.444610089312255e-06,
"loss": 0.8401,
"step": 5083
},
{
"epoch": 1.475976193932356,
"grad_norm": 3.589026689529419,
"learning_rate": 8.443914036503356e-06,
"loss": 0.7364,
"step": 5084
},
{
"epoch": 1.4762665118304543,
"grad_norm": 3.892855405807495,
"learning_rate": 8.443217856684273e-06,
"loss": 0.8431,
"step": 5085
},
{
"epoch": 1.4765568297285527,
"grad_norm": 3.6631200313568115,
"learning_rate": 8.442521549880682e-06,
"loss": 0.6817,
"step": 5086
},
{
"epoch": 1.4768471476266511,
"grad_norm": 3.349924325942993,
"learning_rate": 8.441825116118264e-06,
"loss": 0.8062,
"step": 5087
},
{
"epoch": 1.4771374655247496,
"grad_norm": 3.383465051651001,
"learning_rate": 8.4411285554227e-06,
"loss": 0.8242,
"step": 5088
},
{
"epoch": 1.477427783422848,
"grad_norm": 3.3581674098968506,
"learning_rate": 8.44043186781968e-06,
"loss": 0.73,
"step": 5089
},
{
"epoch": 1.4777181013209464,
"grad_norm": 3.511465072631836,
"learning_rate": 8.439735053334899e-06,
"loss": 0.7939,
"step": 5090
},
{
"epoch": 1.4780084192190448,
"grad_norm": 3.3580431938171387,
"learning_rate": 8.439038111994055e-06,
"loss": 0.7183,
"step": 5091
},
{
"epoch": 1.4782987371171432,
"grad_norm": 3.0493764877319336,
"learning_rate": 8.43834104382285e-06,
"loss": 0.6271,
"step": 5092
},
{
"epoch": 1.4785890550152416,
"grad_norm": 3.401853561401367,
"learning_rate": 8.43764384884699e-06,
"loss": 0.7419,
"step": 5093
},
{
"epoch": 1.47887937291334,
"grad_norm": 3.6405131816864014,
"learning_rate": 8.43694652709219e-06,
"loss": 0.932,
"step": 5094
},
{
"epoch": 1.4791696908114385,
"grad_norm": 3.652693510055542,
"learning_rate": 8.436249078584166e-06,
"loss": 0.8069,
"step": 5095
},
{
"epoch": 1.4794600087095369,
"grad_norm": 3.3175535202026367,
"learning_rate": 8.43555150334864e-06,
"loss": 0.7172,
"step": 5096
},
{
"epoch": 1.4797503266076353,
"grad_norm": 3.5787837505340576,
"learning_rate": 8.434853801411337e-06,
"loss": 0.7607,
"step": 5097
},
{
"epoch": 1.480040644505734,
"grad_norm": 3.6244215965270996,
"learning_rate": 8.43415597279799e-06,
"loss": 0.7861,
"step": 5098
},
{
"epoch": 1.480330962403832,
"grad_norm": 2.996455430984497,
"learning_rate": 8.433458017534332e-06,
"loss": 0.6984,
"step": 5099
},
{
"epoch": 1.4806212803019307,
"grad_norm": 3.1511950492858887,
"learning_rate": 8.432759935646107e-06,
"loss": 0.6542,
"step": 5100
},
{
"epoch": 1.480911598200029,
"grad_norm": 3.5917961597442627,
"learning_rate": 8.432061727159056e-06,
"loss": 0.6977,
"step": 5101
},
{
"epoch": 1.4812019160981276,
"grad_norm": 3.706416130065918,
"learning_rate": 8.431363392098931e-06,
"loss": 0.7762,
"step": 5102
},
{
"epoch": 1.4814922339962258,
"grad_norm": 3.645132303237915,
"learning_rate": 8.430664930491485e-06,
"loss": 0.7918,
"step": 5103
},
{
"epoch": 1.4817825518943244,
"grad_norm": 3.448289155960083,
"learning_rate": 8.429966342362478e-06,
"loss": 0.8402,
"step": 5104
},
{
"epoch": 1.4820728697924226,
"grad_norm": 3.323197364807129,
"learning_rate": 8.429267627737675e-06,
"loss": 0.7244,
"step": 5105
},
{
"epoch": 1.4823631876905212,
"grad_norm": 3.8272650241851807,
"learning_rate": 8.428568786642842e-06,
"loss": 0.9625,
"step": 5106
},
{
"epoch": 1.4826535055886194,
"grad_norm": 3.6143205165863037,
"learning_rate": 8.427869819103753e-06,
"loss": 0.7005,
"step": 5107
},
{
"epoch": 1.482943823486718,
"grad_norm": 3.774230718612671,
"learning_rate": 8.427170725146184e-06,
"loss": 0.8041,
"step": 5108
},
{
"epoch": 1.4832341413848162,
"grad_norm": 3.8581020832061768,
"learning_rate": 8.42647150479592e-06,
"loss": 0.8116,
"step": 5109
},
{
"epoch": 1.4835244592829149,
"grad_norm": 3.6557369232177734,
"learning_rate": 8.425772158078747e-06,
"loss": 0.8101,
"step": 5110
},
{
"epoch": 1.4838147771810133,
"grad_norm": 3.73181414604187,
"learning_rate": 8.425072685020454e-06,
"loss": 0.7939,
"step": 5111
},
{
"epoch": 1.4841050950791117,
"grad_norm": 3.7906265258789062,
"learning_rate": 8.424373085646842e-06,
"loss": 0.8461,
"step": 5112
},
{
"epoch": 1.4843954129772101,
"grad_norm": 3.1719911098480225,
"learning_rate": 8.423673359983708e-06,
"loss": 0.7403,
"step": 5113
},
{
"epoch": 1.4846857308753085,
"grad_norm": 3.553091049194336,
"learning_rate": 8.42297350805686e-06,
"loss": 0.8006,
"step": 5114
},
{
"epoch": 1.484976048773407,
"grad_norm": 3.4175894260406494,
"learning_rate": 8.42227352989211e-06,
"loss": 0.8031,
"step": 5115
},
{
"epoch": 1.4852663666715054,
"grad_norm": 3.7422873973846436,
"learning_rate": 8.421573425515267e-06,
"loss": 0.8509,
"step": 5116
},
{
"epoch": 1.4855566845696038,
"grad_norm": 3.5525622367858887,
"learning_rate": 8.420873194952153e-06,
"loss": 0.8151,
"step": 5117
},
{
"epoch": 1.4858470024677022,
"grad_norm": 3.4313485622406006,
"learning_rate": 8.420172838228595e-06,
"loss": 0.7339,
"step": 5118
},
{
"epoch": 1.4861373203658006,
"grad_norm": 3.8493919372558594,
"learning_rate": 8.41947235537042e-06,
"loss": 0.7356,
"step": 5119
},
{
"epoch": 1.486427638263899,
"grad_norm": 3.501344919204712,
"learning_rate": 8.41877174640346e-06,
"loss": 0.8211,
"step": 5120
},
{
"epoch": 1.4867179561619974,
"grad_norm": 3.3096718788146973,
"learning_rate": 8.418071011353556e-06,
"loss": 0.699,
"step": 5121
},
{
"epoch": 1.4870082740600958,
"grad_norm": 3.5474376678466797,
"learning_rate": 8.417370150246548e-06,
"loss": 0.8234,
"step": 5122
},
{
"epoch": 1.4872985919581942,
"grad_norm": 3.771197557449341,
"learning_rate": 8.416669163108287e-06,
"loss": 0.8363,
"step": 5123
},
{
"epoch": 1.4875889098562927,
"grad_norm": 3.500458240509033,
"learning_rate": 8.415968049964623e-06,
"loss": 0.7884,
"step": 5124
},
{
"epoch": 1.487879227754391,
"grad_norm": 4.015684127807617,
"learning_rate": 8.415266810841412e-06,
"loss": 0.8161,
"step": 5125
},
{
"epoch": 1.4881695456524895,
"grad_norm": 3.5269722938537598,
"learning_rate": 8.414565445764517e-06,
"loss": 0.7957,
"step": 5126
},
{
"epoch": 1.488459863550588,
"grad_norm": 3.418762683868408,
"learning_rate": 8.413863954759802e-06,
"loss": 0.9305,
"step": 5127
},
{
"epoch": 1.4887501814486863,
"grad_norm": 3.1369898319244385,
"learning_rate": 8.41316233785314e-06,
"loss": 0.7075,
"step": 5128
},
{
"epoch": 1.4890404993467847,
"grad_norm": 3.5826096534729004,
"learning_rate": 8.412460595070405e-06,
"loss": 0.8197,
"step": 5129
},
{
"epoch": 1.4893308172448831,
"grad_norm": 3.6522650718688965,
"learning_rate": 8.411758726437478e-06,
"loss": 0.8644,
"step": 5130
},
{
"epoch": 1.4896211351429816,
"grad_norm": 3.8462347984313965,
"learning_rate": 8.411056731980243e-06,
"loss": 0.7973,
"step": 5131
},
{
"epoch": 1.48991145304108,
"grad_norm": 4.156594753265381,
"learning_rate": 8.41035461172459e-06,
"loss": 0.8661,
"step": 5132
},
{
"epoch": 1.4902017709391784,
"grad_norm": 4.024465084075928,
"learning_rate": 8.409652365696411e-06,
"loss": 0.7934,
"step": 5133
},
{
"epoch": 1.4904920888372768,
"grad_norm": 3.535364866256714,
"learning_rate": 8.408949993921607e-06,
"loss": 0.76,
"step": 5134
},
{
"epoch": 1.4907824067353752,
"grad_norm": 3.9186315536499023,
"learning_rate": 8.40824749642608e-06,
"loss": 0.763,
"step": 5135
},
{
"epoch": 1.4910727246334736,
"grad_norm": 3.914283514022827,
"learning_rate": 8.407544873235736e-06,
"loss": 0.7664,
"step": 5136
},
{
"epoch": 1.491363042531572,
"grad_norm": 3.8266966342926025,
"learning_rate": 8.40684212437649e-06,
"loss": 0.7919,
"step": 5137
},
{
"epoch": 1.4916533604296704,
"grad_norm": 3.40091609954834,
"learning_rate": 8.406139249874261e-06,
"loss": 0.7519,
"step": 5138
},
{
"epoch": 1.4919436783277689,
"grad_norm": 3.6243724822998047,
"learning_rate": 8.405436249754965e-06,
"loss": 0.731,
"step": 5139
},
{
"epoch": 1.4922339962258673,
"grad_norm": 3.760503053665161,
"learning_rate": 8.404733124044532e-06,
"loss": 0.8834,
"step": 5140
},
{
"epoch": 1.4925243141239657,
"grad_norm": 3.4237194061279297,
"learning_rate": 8.404029872768895e-06,
"loss": 0.7376,
"step": 5141
},
{
"epoch": 1.492814632022064,
"grad_norm": 4.07653284072876,
"learning_rate": 8.403326495953985e-06,
"loss": 0.9527,
"step": 5142
},
{
"epoch": 1.4931049499201625,
"grad_norm": 3.364163875579834,
"learning_rate": 8.402622993625744e-06,
"loss": 0.744,
"step": 5143
},
{
"epoch": 1.493395267818261,
"grad_norm": 3.8818440437316895,
"learning_rate": 8.40191936581012e-06,
"loss": 0.8061,
"step": 5144
},
{
"epoch": 1.4936855857163593,
"grad_norm": 3.259274482727051,
"learning_rate": 8.401215612533056e-06,
"loss": 0.7186,
"step": 5145
},
{
"epoch": 1.4939759036144578,
"grad_norm": 3.872246265411377,
"learning_rate": 8.400511733820513e-06,
"loss": 0.9133,
"step": 5146
},
{
"epoch": 1.4942662215125562,
"grad_norm": 4.047363758087158,
"learning_rate": 8.399807729698446e-06,
"loss": 0.9393,
"step": 5147
},
{
"epoch": 1.4945565394106546,
"grad_norm": 3.743149995803833,
"learning_rate": 8.399103600192817e-06,
"loss": 0.8807,
"step": 5148
},
{
"epoch": 1.4948468573087532,
"grad_norm": 3.3832364082336426,
"learning_rate": 8.398399345329598e-06,
"loss": 0.7135,
"step": 5149
},
{
"epoch": 1.4951371752068514,
"grad_norm": 3.180245876312256,
"learning_rate": 8.397694965134759e-06,
"loss": 0.767,
"step": 5150
},
{
"epoch": 1.49542749310495,
"grad_norm": 3.3784093856811523,
"learning_rate": 8.39699045963428e-06,
"loss": 0.8157,
"step": 5151
},
{
"epoch": 1.4957178110030482,
"grad_norm": 3.921077251434326,
"learning_rate": 8.39628582885414e-06,
"loss": 0.8413,
"step": 5152
},
{
"epoch": 1.4960081289011469,
"grad_norm": 3.9500279426574707,
"learning_rate": 8.395581072820325e-06,
"loss": 0.7991,
"step": 5153
},
{
"epoch": 1.496298446799245,
"grad_norm": 3.412574052810669,
"learning_rate": 8.394876191558828e-06,
"loss": 0.6968,
"step": 5154
},
{
"epoch": 1.4965887646973437,
"grad_norm": 3.49398136138916,
"learning_rate": 8.394171185095646e-06,
"loss": 0.7868,
"step": 5155
},
{
"epoch": 1.496879082595442,
"grad_norm": 3.4007065296173096,
"learning_rate": 8.393466053456775e-06,
"loss": 0.7535,
"step": 5156
},
{
"epoch": 1.4971694004935405,
"grad_norm": 3.4070701599121094,
"learning_rate": 8.392760796668225e-06,
"loss": 0.7558,
"step": 5157
},
{
"epoch": 1.4974597183916387,
"grad_norm": 3.5991742610931396,
"learning_rate": 8.392055414756e-06,
"loss": 0.807,
"step": 5158
},
{
"epoch": 1.4977500362897374,
"grad_norm": 3.701852560043335,
"learning_rate": 8.39134990774612e-06,
"loss": 0.8775,
"step": 5159
},
{
"epoch": 1.4980403541878355,
"grad_norm": 3.2740137577056885,
"learning_rate": 8.390644275664602e-06,
"loss": 0.7085,
"step": 5160
},
{
"epoch": 1.4983306720859342,
"grad_norm": 3.118898868560791,
"learning_rate": 8.389938518537468e-06,
"loss": 0.7442,
"step": 5161
},
{
"epoch": 1.4986209899840326,
"grad_norm": 3.790092945098877,
"learning_rate": 8.389232636390744e-06,
"loss": 0.7488,
"step": 5162
},
{
"epoch": 1.498911307882131,
"grad_norm": 3.3232455253601074,
"learning_rate": 8.388526629250469e-06,
"loss": 0.8099,
"step": 5163
},
{
"epoch": 1.4992016257802294,
"grad_norm": 3.8602137565612793,
"learning_rate": 8.387820497142674e-06,
"loss": 0.7779,
"step": 5164
},
{
"epoch": 1.4994919436783278,
"grad_norm": 3.3287713527679443,
"learning_rate": 8.387114240093406e-06,
"loss": 0.7674,
"step": 5165
},
{
"epoch": 1.4997822615764262,
"grad_norm": 3.3892929553985596,
"learning_rate": 8.386407858128707e-06,
"loss": 0.8281,
"step": 5166
},
{
"epoch": 1.5000725794745247,
"grad_norm": 3.7774667739868164,
"learning_rate": 8.38570135127463e-06,
"loss": 0.9089,
"step": 5167
},
{
"epoch": 1.500362897372623,
"grad_norm": 3.992173194885254,
"learning_rate": 8.384994719557232e-06,
"loss": 0.8284,
"step": 5168
},
{
"epoch": 1.5006532152707215,
"grad_norm": 3.2167437076568604,
"learning_rate": 8.38428796300257e-06,
"loss": 0.7445,
"step": 5169
},
{
"epoch": 1.50094353316882,
"grad_norm": 3.703031301498413,
"learning_rate": 8.383581081636712e-06,
"loss": 0.8307,
"step": 5170
},
{
"epoch": 1.5012338510669183,
"grad_norm": 3.630709409713745,
"learning_rate": 8.382874075485728e-06,
"loss": 0.7981,
"step": 5171
},
{
"epoch": 1.5015241689650167,
"grad_norm": 3.494649887084961,
"learning_rate": 8.382166944575689e-06,
"loss": 0.7494,
"step": 5172
},
{
"epoch": 1.5018144868631151,
"grad_norm": 3.6578376293182373,
"learning_rate": 8.381459688932674e-06,
"loss": 0.9244,
"step": 5173
},
{
"epoch": 1.5021048047612136,
"grad_norm": 3.397042989730835,
"learning_rate": 8.38075230858277e-06,
"loss": 0.7623,
"step": 5174
},
{
"epoch": 1.502395122659312,
"grad_norm": 3.3813118934631348,
"learning_rate": 8.38004480355206e-06,
"loss": 0.6921,
"step": 5175
},
{
"epoch": 1.5026854405574104,
"grad_norm": 3.190324306488037,
"learning_rate": 8.379337173866642e-06,
"loss": 0.7639,
"step": 5176
},
{
"epoch": 1.5029757584555088,
"grad_norm": 3.264589786529541,
"learning_rate": 8.37862941955261e-06,
"loss": 0.7104,
"step": 5177
},
{
"epoch": 1.5032660763536072,
"grad_norm": 3.360027551651001,
"learning_rate": 8.377921540636062e-06,
"loss": 0.7006,
"step": 5178
},
{
"epoch": 1.5035563942517056,
"grad_norm": 3.662677049636841,
"learning_rate": 8.37721353714311e-06,
"loss": 0.7836,
"step": 5179
},
{
"epoch": 1.503846712149804,
"grad_norm": 3.459056854248047,
"learning_rate": 8.376505409099865e-06,
"loss": 0.7282,
"step": 5180
},
{
"epoch": 1.5041370300479024,
"grad_norm": 3.67429256439209,
"learning_rate": 8.375797156532436e-06,
"loss": 0.8587,
"step": 5181
},
{
"epoch": 1.5044273479460009,
"grad_norm": 3.853055715560913,
"learning_rate": 8.375088779466953e-06,
"loss": 0.8487,
"step": 5182
},
{
"epoch": 1.5047176658440993,
"grad_norm": 3.8401939868927,
"learning_rate": 8.374380277929532e-06,
"loss": 0.9323,
"step": 5183
},
{
"epoch": 1.5050079837421977,
"grad_norm": 3.4322612285614014,
"learning_rate": 8.373671651946306e-06,
"loss": 0.7913,
"step": 5184
},
{
"epoch": 1.505298301640296,
"grad_norm": 3.570939064025879,
"learning_rate": 8.372962901543409e-06,
"loss": 0.8441,
"step": 5185
},
{
"epoch": 1.5055886195383945,
"grad_norm": 3.5912673473358154,
"learning_rate": 8.372254026746977e-06,
"loss": 0.7455,
"step": 5186
},
{
"epoch": 1.505878937436493,
"grad_norm": 3.4715113639831543,
"learning_rate": 8.371545027583154e-06,
"loss": 0.7535,
"step": 5187
},
{
"epoch": 1.5061692553345913,
"grad_norm": 3.537951707839966,
"learning_rate": 8.370835904078092e-06,
"loss": 0.7693,
"step": 5188
},
{
"epoch": 1.5064595732326898,
"grad_norm": 3.579514265060425,
"learning_rate": 8.370126656257938e-06,
"loss": 0.8167,
"step": 5189
},
{
"epoch": 1.5067498911307882,
"grad_norm": 3.547579050064087,
"learning_rate": 8.369417284148849e-06,
"loss": 0.721,
"step": 5190
},
{
"epoch": 1.5070402090288866,
"grad_norm": 3.8502068519592285,
"learning_rate": 8.368707787776988e-06,
"loss": 0.8689,
"step": 5191
},
{
"epoch": 1.5073305269269852,
"grad_norm": 3.8900763988494873,
"learning_rate": 8.367998167168521e-06,
"loss": 0.8262,
"step": 5192
},
{
"epoch": 1.5076208448250834,
"grad_norm": 3.818483591079712,
"learning_rate": 8.367288422349617e-06,
"loss": 0.8494,
"step": 5193
},
{
"epoch": 1.507911162723182,
"grad_norm": 3.8860888481140137,
"learning_rate": 8.366578553346455e-06,
"loss": 0.881,
"step": 5194
},
{
"epoch": 1.5082014806212802,
"grad_norm": 3.470583915710449,
"learning_rate": 8.365868560185209e-06,
"loss": 0.7415,
"step": 5195
},
{
"epoch": 1.5084917985193789,
"grad_norm": 3.4204583168029785,
"learning_rate": 8.365158442892069e-06,
"loss": 0.7979,
"step": 5196
},
{
"epoch": 1.508782116417477,
"grad_norm": 4.06003999710083,
"learning_rate": 8.36444820149322e-06,
"loss": 0.8262,
"step": 5197
},
{
"epoch": 1.5090724343155757,
"grad_norm": 3.7327427864074707,
"learning_rate": 8.363737836014855e-06,
"loss": 0.8375,
"step": 5198
},
{
"epoch": 1.5093627522136739,
"grad_norm": 3.577173948287964,
"learning_rate": 8.363027346483174e-06,
"loss": 0.8699,
"step": 5199
},
{
"epoch": 1.5096530701117725,
"grad_norm": 3.675419569015503,
"learning_rate": 8.36231673292438e-06,
"loss": 0.7442,
"step": 5200
},
{
"epoch": 1.5099433880098707,
"grad_norm": 3.533881425857544,
"learning_rate": 8.36160599536468e-06,
"loss": 0.8418,
"step": 5201
},
{
"epoch": 1.5102337059079693,
"grad_norm": 3.6481122970581055,
"learning_rate": 8.360895133830284e-06,
"loss": 0.7421,
"step": 5202
},
{
"epoch": 1.5105240238060675,
"grad_norm": 3.723921298980713,
"learning_rate": 8.360184148347409e-06,
"loss": 0.7413,
"step": 5203
},
{
"epoch": 1.5108143417041662,
"grad_norm": 3.4912638664245605,
"learning_rate": 8.359473038942275e-06,
"loss": 0.834,
"step": 5204
},
{
"epoch": 1.5111046596022644,
"grad_norm": 3.025022506713867,
"learning_rate": 8.358761805641109e-06,
"loss": 0.64,
"step": 5205
},
{
"epoch": 1.511394977500363,
"grad_norm": 3.581099033355713,
"learning_rate": 8.358050448470143e-06,
"loss": 0.8429,
"step": 5206
},
{
"epoch": 1.5116852953984612,
"grad_norm": 3.6286072731018066,
"learning_rate": 8.357338967455605e-06,
"loss": 0.778,
"step": 5207
},
{
"epoch": 1.5119756132965598,
"grad_norm": 3.345937967300415,
"learning_rate": 8.356627362623742e-06,
"loss": 0.845,
"step": 5208
},
{
"epoch": 1.512265931194658,
"grad_norm": 3.4083850383758545,
"learning_rate": 8.35591563400079e-06,
"loss": 0.842,
"step": 5209
},
{
"epoch": 1.5125562490927567,
"grad_norm": 3.297445774078369,
"learning_rate": 8.355203781613004e-06,
"loss": 0.6617,
"step": 5210
},
{
"epoch": 1.5128465669908548,
"grad_norm": 3.6352899074554443,
"learning_rate": 8.354491805486633e-06,
"loss": 0.8348,
"step": 5211
},
{
"epoch": 1.5131368848889535,
"grad_norm": 3.588831663131714,
"learning_rate": 8.353779705647936e-06,
"loss": 0.8025,
"step": 5212
},
{
"epoch": 1.5134272027870517,
"grad_norm": 3.7391092777252197,
"learning_rate": 8.353067482123174e-06,
"loss": 0.77,
"step": 5213
},
{
"epoch": 1.5137175206851503,
"grad_norm": 4.033049583435059,
"learning_rate": 8.352355134938615e-06,
"loss": 0.8908,
"step": 5214
},
{
"epoch": 1.5140078385832485,
"grad_norm": 3.4990336894989014,
"learning_rate": 8.351642664120527e-06,
"loss": 0.6708,
"step": 5215
},
{
"epoch": 1.5142981564813471,
"grad_norm": 3.54728627204895,
"learning_rate": 8.35093006969519e-06,
"loss": 0.7484,
"step": 5216
},
{
"epoch": 1.5145884743794453,
"grad_norm": 3.496731758117676,
"learning_rate": 8.35021735168888e-06,
"loss": 0.7755,
"step": 5217
},
{
"epoch": 1.514878792277544,
"grad_norm": 3.043483257293701,
"learning_rate": 8.349504510127884e-06,
"loss": 0.675,
"step": 5218
},
{
"epoch": 1.5151691101756424,
"grad_norm": 3.824181079864502,
"learning_rate": 8.34879154503849e-06,
"loss": 0.8784,
"step": 5219
},
{
"epoch": 1.5154594280737408,
"grad_norm": 3.797044515609741,
"learning_rate": 8.348078456446992e-06,
"loss": 0.9087,
"step": 5220
},
{
"epoch": 1.5157497459718392,
"grad_norm": 3.5474209785461426,
"learning_rate": 8.347365244379693e-06,
"loss": 0.7362,
"step": 5221
},
{
"epoch": 1.5160400638699376,
"grad_norm": 3.3133018016815186,
"learning_rate": 8.346651908862888e-06,
"loss": 0.668,
"step": 5222
},
{
"epoch": 1.516330381768036,
"grad_norm": 3.8621597290039062,
"learning_rate": 8.345938449922892e-06,
"loss": 0.8732,
"step": 5223
},
{
"epoch": 1.5166206996661344,
"grad_norm": 3.851616382598877,
"learning_rate": 8.345224867586012e-06,
"loss": 0.927,
"step": 5224
},
{
"epoch": 1.5169110175642329,
"grad_norm": 3.438823938369751,
"learning_rate": 8.344511161878567e-06,
"loss": 0.7236,
"step": 5225
},
{
"epoch": 1.5172013354623313,
"grad_norm": 3.7797598838806152,
"learning_rate": 8.343797332826877e-06,
"loss": 0.8414,
"step": 5226
},
{
"epoch": 1.5174916533604297,
"grad_norm": 3.4886631965637207,
"learning_rate": 8.343083380457269e-06,
"loss": 0.8468,
"step": 5227
},
{
"epoch": 1.517781971258528,
"grad_norm": 3.6253554821014404,
"learning_rate": 8.342369304796072e-06,
"loss": 0.8223,
"step": 5228
},
{
"epoch": 1.5180722891566265,
"grad_norm": 3.529344081878662,
"learning_rate": 8.341655105869622e-06,
"loss": 0.7312,
"step": 5229
},
{
"epoch": 1.518362607054725,
"grad_norm": 3.156813383102417,
"learning_rate": 8.340940783704257e-06,
"loss": 0.7537,
"step": 5230
},
{
"epoch": 1.5186529249528233,
"grad_norm": 3.1911001205444336,
"learning_rate": 8.340226338326321e-06,
"loss": 0.7023,
"step": 5231
},
{
"epoch": 1.5189432428509217,
"grad_norm": 3.402534246444702,
"learning_rate": 8.339511769762166e-06,
"loss": 0.822,
"step": 5232
},
{
"epoch": 1.5192335607490202,
"grad_norm": 3.5590410232543945,
"learning_rate": 8.338797078038139e-06,
"loss": 0.8028,
"step": 5233
},
{
"epoch": 1.5195238786471186,
"grad_norm": 3.573758840560913,
"learning_rate": 8.338082263180602e-06,
"loss": 0.8886,
"step": 5234
},
{
"epoch": 1.519814196545217,
"grad_norm": 3.4216904640197754,
"learning_rate": 8.337367325215917e-06,
"loss": 0.7472,
"step": 5235
},
{
"epoch": 1.5201045144433154,
"grad_norm": 3.222221851348877,
"learning_rate": 8.336652264170447e-06,
"loss": 0.816,
"step": 5236
},
{
"epoch": 1.5203948323414138,
"grad_norm": 3.748356342315674,
"learning_rate": 8.335937080070567e-06,
"loss": 0.7983,
"step": 5237
},
{
"epoch": 1.5206851502395122,
"grad_norm": 3.761164903640747,
"learning_rate": 8.335221772942652e-06,
"loss": 0.7149,
"step": 5238
},
{
"epoch": 1.5209754681376106,
"grad_norm": 3.916020393371582,
"learning_rate": 8.334506342813081e-06,
"loss": 0.9039,
"step": 5239
},
{
"epoch": 1.521265786035709,
"grad_norm": 4.239342212677002,
"learning_rate": 8.333790789708238e-06,
"loss": 0.8167,
"step": 5240
},
{
"epoch": 1.5215561039338075,
"grad_norm": 3.4121599197387695,
"learning_rate": 8.333075113654516e-06,
"loss": 0.8067,
"step": 5241
},
{
"epoch": 1.5218464218319059,
"grad_norm": 3.260080575942993,
"learning_rate": 8.332359314678306e-06,
"loss": 0.7618,
"step": 5242
},
{
"epoch": 1.5221367397300045,
"grad_norm": 3.816723346710205,
"learning_rate": 8.331643392806006e-06,
"loss": 0.7984,
"step": 5243
},
{
"epoch": 1.5224270576281027,
"grad_norm": 3.672610282897949,
"learning_rate": 8.33092734806402e-06,
"loss": 0.7288,
"step": 5244
},
{
"epoch": 1.5227173755262013,
"grad_norm": 3.4358227252960205,
"learning_rate": 8.330211180478754e-06,
"loss": 0.6884,
"step": 5245
},
{
"epoch": 1.5230076934242995,
"grad_norm": 3.3935177326202393,
"learning_rate": 8.329494890076623e-06,
"loss": 0.7529,
"step": 5246
},
{
"epoch": 1.5232980113223982,
"grad_norm": 3.87324595451355,
"learning_rate": 8.32877847688404e-06,
"loss": 0.8658,
"step": 5247
},
{
"epoch": 1.5235883292204964,
"grad_norm": 3.859293222427368,
"learning_rate": 8.32806194092743e-06,
"loss": 0.8443,
"step": 5248
},
{
"epoch": 1.523878647118595,
"grad_norm": 3.1775994300842285,
"learning_rate": 8.327345282233217e-06,
"loss": 0.6733,
"step": 5249
},
{
"epoch": 1.5241689650166932,
"grad_norm": 3.546396255493164,
"learning_rate": 8.326628500827826e-06,
"loss": 0.739,
"step": 5250
},
{
"epoch": 1.5244592829147918,
"grad_norm": 3.3907859325408936,
"learning_rate": 8.3259115967377e-06,
"loss": 0.8505,
"step": 5251
},
{
"epoch": 1.52474960081289,
"grad_norm": 3.738556146621704,
"learning_rate": 8.325194569989273e-06,
"loss": 0.8121,
"step": 5252
},
{
"epoch": 1.5250399187109887,
"grad_norm": 3.676562547683716,
"learning_rate": 8.324477420608989e-06,
"loss": 0.7887,
"step": 5253
},
{
"epoch": 1.5253302366090868,
"grad_norm": 3.9967105388641357,
"learning_rate": 8.323760148623298e-06,
"loss": 0.9404,
"step": 5254
},
{
"epoch": 1.5256205545071855,
"grad_norm": 3.4851815700531006,
"learning_rate": 8.323042754058652e-06,
"loss": 0.7178,
"step": 5255
},
{
"epoch": 1.5259108724052837,
"grad_norm": 3.8005199432373047,
"learning_rate": 8.322325236941507e-06,
"loss": 0.8294,
"step": 5256
},
{
"epoch": 1.5262011903033823,
"grad_norm": 3.8124680519104004,
"learning_rate": 8.321607597298326e-06,
"loss": 0.8139,
"step": 5257
},
{
"epoch": 1.5264915082014805,
"grad_norm": 3.2474987506866455,
"learning_rate": 8.320889835155577e-06,
"loss": 0.7376,
"step": 5258
},
{
"epoch": 1.5267818260995791,
"grad_norm": 3.5540499687194824,
"learning_rate": 8.320171950539726e-06,
"loss": 0.7025,
"step": 5259
},
{
"epoch": 1.5270721439976773,
"grad_norm": 3.8556888103485107,
"learning_rate": 8.319453943477252e-06,
"loss": 0.8861,
"step": 5260
},
{
"epoch": 1.527362461895776,
"grad_norm": 3.5100462436676025,
"learning_rate": 8.318735813994633e-06,
"loss": 0.8822,
"step": 5261
},
{
"epoch": 1.5276527797938741,
"grad_norm": 3.7765297889709473,
"learning_rate": 8.318017562118354e-06,
"loss": 0.908,
"step": 5262
},
{
"epoch": 1.5279430976919728,
"grad_norm": 3.2733256816864014,
"learning_rate": 8.317299187874906e-06,
"loss": 0.7915,
"step": 5263
},
{
"epoch": 1.528233415590071,
"grad_norm": 3.604302167892456,
"learning_rate": 8.31658069129078e-06,
"loss": 0.7866,
"step": 5264
},
{
"epoch": 1.5285237334881696,
"grad_norm": 3.134767532348633,
"learning_rate": 8.315862072392471e-06,
"loss": 0.7795,
"step": 5265
},
{
"epoch": 1.5288140513862678,
"grad_norm": 3.620120048522949,
"learning_rate": 8.315143331206488e-06,
"loss": 0.8672,
"step": 5266
},
{
"epoch": 1.5291043692843664,
"grad_norm": 3.1708273887634277,
"learning_rate": 8.314424467759334e-06,
"loss": 0.7367,
"step": 5267
},
{
"epoch": 1.5293946871824649,
"grad_norm": 3.5994269847869873,
"learning_rate": 8.313705482077521e-06,
"loss": 0.934,
"step": 5268
},
{
"epoch": 1.5296850050805633,
"grad_norm": 3.8919529914855957,
"learning_rate": 8.312986374187563e-06,
"loss": 0.7943,
"step": 5269
},
{
"epoch": 1.5299753229786617,
"grad_norm": 3.3385488986968994,
"learning_rate": 8.312267144115984e-06,
"loss": 0.6968,
"step": 5270
},
{
"epoch": 1.53026564087676,
"grad_norm": 3.434603691101074,
"learning_rate": 8.311547791889307e-06,
"loss": 0.713,
"step": 5271
},
{
"epoch": 1.5305559587748585,
"grad_norm": 3.5553691387176514,
"learning_rate": 8.310828317534061e-06,
"loss": 0.78,
"step": 5272
},
{
"epoch": 1.530846276672957,
"grad_norm": 3.590174436569214,
"learning_rate": 8.310108721076782e-06,
"loss": 0.8297,
"step": 5273
},
{
"epoch": 1.5311365945710553,
"grad_norm": 3.4327259063720703,
"learning_rate": 8.309389002544005e-06,
"loss": 0.665,
"step": 5274
},
{
"epoch": 1.5314269124691537,
"grad_norm": 3.5836644172668457,
"learning_rate": 8.308669161962275e-06,
"loss": 0.908,
"step": 5275
},
{
"epoch": 1.5317172303672522,
"grad_norm": 3.7336366176605225,
"learning_rate": 8.30794919935814e-06,
"loss": 0.9022,
"step": 5276
},
{
"epoch": 1.5320075482653506,
"grad_norm": 3.6512033939361572,
"learning_rate": 8.307229114758151e-06,
"loss": 0.8058,
"step": 5277
},
{
"epoch": 1.532297866163449,
"grad_norm": 3.5464484691619873,
"learning_rate": 8.306508908188866e-06,
"loss": 0.7925,
"step": 5278
},
{
"epoch": 1.5325881840615474,
"grad_norm": 3.1770904064178467,
"learning_rate": 8.305788579676843e-06,
"loss": 0.7042,
"step": 5279
},
{
"epoch": 1.5328785019596458,
"grad_norm": 3.586550712585449,
"learning_rate": 8.30506812924865e-06,
"loss": 0.8136,
"step": 5280
},
{
"epoch": 1.5331688198577442,
"grad_norm": 3.467254638671875,
"learning_rate": 8.304347556930856e-06,
"loss": 0.7584,
"step": 5281
},
{
"epoch": 1.5334591377558426,
"grad_norm": 2.9315671920776367,
"learning_rate": 8.303626862750034e-06,
"loss": 0.6456,
"step": 5282
},
{
"epoch": 1.533749455653941,
"grad_norm": 3.198570966720581,
"learning_rate": 8.302906046732766e-06,
"loss": 0.7304,
"step": 5283
},
{
"epoch": 1.5340397735520395,
"grad_norm": 3.693838596343994,
"learning_rate": 8.302185108905632e-06,
"loss": 0.8126,
"step": 5284
},
{
"epoch": 1.5343300914501379,
"grad_norm": 3.353278875350952,
"learning_rate": 8.301464049295224e-06,
"loss": 0.7881,
"step": 5285
},
{
"epoch": 1.5346204093482363,
"grad_norm": 3.5820326805114746,
"learning_rate": 8.300742867928128e-06,
"loss": 0.8091,
"step": 5286
},
{
"epoch": 1.5349107272463347,
"grad_norm": 3.771308422088623,
"learning_rate": 8.300021564830949e-06,
"loss": 0.7514,
"step": 5287
},
{
"epoch": 1.5352010451444331,
"grad_norm": 3.5976288318634033,
"learning_rate": 8.299300140030283e-06,
"loss": 0.718,
"step": 5288
},
{
"epoch": 1.5354913630425315,
"grad_norm": 3.889220714569092,
"learning_rate": 8.298578593552737e-06,
"loss": 1.033,
"step": 5289
},
{
"epoch": 1.53578168094063,
"grad_norm": 3.3892271518707275,
"learning_rate": 8.29785692542492e-06,
"loss": 0.766,
"step": 5290
},
{
"epoch": 1.5360719988387284,
"grad_norm": 3.569516658782959,
"learning_rate": 8.297135135673451e-06,
"loss": 0.8218,
"step": 5291
},
{
"epoch": 1.5363623167368268,
"grad_norm": 3.7338545322418213,
"learning_rate": 8.296413224324944e-06,
"loss": 0.8123,
"step": 5292
},
{
"epoch": 1.5366526346349252,
"grad_norm": 3.023319959640503,
"learning_rate": 8.295691191406029e-06,
"loss": 0.6148,
"step": 5293
},
{
"epoch": 1.5369429525330238,
"grad_norm": 4.053857803344727,
"learning_rate": 8.294969036943328e-06,
"loss": 0.8692,
"step": 5294
},
{
"epoch": 1.537233270431122,
"grad_norm": 3.6890289783477783,
"learning_rate": 8.294246760963477e-06,
"loss": 0.8347,
"step": 5295
},
{
"epoch": 1.5375235883292206,
"grad_norm": 3.724935531616211,
"learning_rate": 8.29352436349311e-06,
"loss": 0.7793,
"step": 5296
},
{
"epoch": 1.5378139062273188,
"grad_norm": 3.5507402420043945,
"learning_rate": 8.292801844558875e-06,
"loss": 0.7672,
"step": 5297
},
{
"epoch": 1.5381042241254175,
"grad_norm": 3.179330348968506,
"learning_rate": 8.292079204187415e-06,
"loss": 0.6646,
"step": 5298
},
{
"epoch": 1.5383945420235157,
"grad_norm": 3.5478248596191406,
"learning_rate": 8.291356442405379e-06,
"loss": 0.8077,
"step": 5299
},
{
"epoch": 1.5386848599216143,
"grad_norm": 3.900157928466797,
"learning_rate": 8.290633559239422e-06,
"loss": 0.8082,
"step": 5300
},
{
"epoch": 1.5389751778197125,
"grad_norm": 3.41748046875,
"learning_rate": 8.289910554716208e-06,
"loss": 0.8316,
"step": 5301
},
{
"epoch": 1.5392654957178111,
"grad_norm": 3.5558431148529053,
"learning_rate": 8.289187428862398e-06,
"loss": 0.7715,
"step": 5302
},
{
"epoch": 1.5395558136159093,
"grad_norm": 3.833019495010376,
"learning_rate": 8.28846418170466e-06,
"loss": 0.9359,
"step": 5303
},
{
"epoch": 1.539846131514008,
"grad_norm": 3.508436679840088,
"learning_rate": 8.287740813269666e-06,
"loss": 0.7736,
"step": 5304
},
{
"epoch": 1.5401364494121061,
"grad_norm": 3.5869967937469482,
"learning_rate": 8.2870173235841e-06,
"loss": 0.8337,
"step": 5305
},
{
"epoch": 1.5404267673102048,
"grad_norm": 3.26682448387146,
"learning_rate": 8.286293712674636e-06,
"loss": 0.854,
"step": 5306
},
{
"epoch": 1.540717085208303,
"grad_norm": 3.3529770374298096,
"learning_rate": 8.285569980567965e-06,
"loss": 0.6657,
"step": 5307
},
{
"epoch": 1.5410074031064016,
"grad_norm": 3.2348685264587402,
"learning_rate": 8.284846127290778e-06,
"loss": 0.7903,
"step": 5308
},
{
"epoch": 1.5412977210044998,
"grad_norm": 2.985450267791748,
"learning_rate": 8.284122152869766e-06,
"loss": 0.6562,
"step": 5309
},
{
"epoch": 1.5415880389025984,
"grad_norm": 3.228339433670044,
"learning_rate": 8.283398057331636e-06,
"loss": 0.7334,
"step": 5310
},
{
"epoch": 1.5418783568006966,
"grad_norm": 3.838925361633301,
"learning_rate": 8.282673840703088e-06,
"loss": 0.8747,
"step": 5311
},
{
"epoch": 1.5421686746987953,
"grad_norm": 3.7266595363616943,
"learning_rate": 8.28194950301083e-06,
"loss": 0.8117,
"step": 5312
},
{
"epoch": 1.5424589925968935,
"grad_norm": 3.6524641513824463,
"learning_rate": 8.281225044281578e-06,
"loss": 0.6544,
"step": 5313
},
{
"epoch": 1.542749310494992,
"grad_norm": 3.4742302894592285,
"learning_rate": 8.280500464542047e-06,
"loss": 0.7832,
"step": 5314
},
{
"epoch": 1.5430396283930903,
"grad_norm": 3.4193482398986816,
"learning_rate": 8.279775763818962e-06,
"loss": 0.7379,
"step": 5315
},
{
"epoch": 1.543329946291189,
"grad_norm": 3.552457571029663,
"learning_rate": 8.279050942139048e-06,
"loss": 0.7344,
"step": 5316
},
{
"epoch": 1.543620264189287,
"grad_norm": 3.5372767448425293,
"learning_rate": 8.278325999529037e-06,
"loss": 0.8419,
"step": 5317
},
{
"epoch": 1.5439105820873857,
"grad_norm": 3.67195725440979,
"learning_rate": 8.277600936015663e-06,
"loss": 0.8765,
"step": 5318
},
{
"epoch": 1.5442008999854842,
"grad_norm": 3.4521541595458984,
"learning_rate": 8.276875751625669e-06,
"loss": 0.775,
"step": 5319
},
{
"epoch": 1.5444912178835826,
"grad_norm": 2.988212823867798,
"learning_rate": 8.276150446385796e-06,
"loss": 0.6954,
"step": 5320
},
{
"epoch": 1.544781535781681,
"grad_norm": 3.3321187496185303,
"learning_rate": 8.275425020322794e-06,
"loss": 0.6975,
"step": 5321
},
{
"epoch": 1.5450718536797794,
"grad_norm": 3.7323224544525146,
"learning_rate": 8.274699473463417e-06,
"loss": 0.7937,
"step": 5322
},
{
"epoch": 1.5453621715778778,
"grad_norm": 3.2482450008392334,
"learning_rate": 8.273973805834425e-06,
"loss": 0.7083,
"step": 5323
},
{
"epoch": 1.5456524894759762,
"grad_norm": 3.227125883102417,
"learning_rate": 8.273248017462579e-06,
"loss": 0.7372,
"step": 5324
},
{
"epoch": 1.5459428073740746,
"grad_norm": 3.8536081314086914,
"learning_rate": 8.272522108374643e-06,
"loss": 0.7417,
"step": 5325
},
{
"epoch": 1.546233125272173,
"grad_norm": 3.9545321464538574,
"learning_rate": 8.27179607859739e-06,
"loss": 0.759,
"step": 5326
},
{
"epoch": 1.5465234431702715,
"grad_norm": 3.3392481803894043,
"learning_rate": 8.271069928157595e-06,
"loss": 0.7807,
"step": 5327
},
{
"epoch": 1.5468137610683699,
"grad_norm": 3.7387235164642334,
"learning_rate": 8.270343657082043e-06,
"loss": 0.7683,
"step": 5328
},
{
"epoch": 1.5471040789664683,
"grad_norm": 3.2074947357177734,
"learning_rate": 8.26961726539751e-06,
"loss": 0.7249,
"step": 5329
},
{
"epoch": 1.5473943968645667,
"grad_norm": 3.8873088359832764,
"learning_rate": 8.268890753130794e-06,
"loss": 0.8258,
"step": 5330
},
{
"epoch": 1.5476847147626651,
"grad_norm": 3.9215521812438965,
"learning_rate": 8.268164120308684e-06,
"loss": 0.836,
"step": 5331
},
{
"epoch": 1.5479750326607635,
"grad_norm": 3.316826581954956,
"learning_rate": 8.267437366957976e-06,
"loss": 0.7363,
"step": 5332
},
{
"epoch": 1.548265350558862,
"grad_norm": 3.273144245147705,
"learning_rate": 8.266710493105476e-06,
"loss": 0.7226,
"step": 5333
},
{
"epoch": 1.5485556684569604,
"grad_norm": 3.902099132537842,
"learning_rate": 8.265983498777987e-06,
"loss": 0.845,
"step": 5334
},
{
"epoch": 1.5488459863550588,
"grad_norm": 3.659940004348755,
"learning_rate": 8.265256384002326e-06,
"loss": 0.7165,
"step": 5335
},
{
"epoch": 1.5491363042531572,
"grad_norm": 3.8005053997039795,
"learning_rate": 8.264529148805303e-06,
"loss": 0.854,
"step": 5336
},
{
"epoch": 1.5494266221512556,
"grad_norm": 3.4792816638946533,
"learning_rate": 8.26380179321374e-06,
"loss": 0.8514,
"step": 5337
},
{
"epoch": 1.549716940049354,
"grad_norm": 3.3794267177581787,
"learning_rate": 8.263074317254465e-06,
"loss": 0.7644,
"step": 5338
},
{
"epoch": 1.5500072579474524,
"grad_norm": 3.5877439975738525,
"learning_rate": 8.262346720954302e-06,
"loss": 0.6902,
"step": 5339
},
{
"epoch": 1.5502975758455508,
"grad_norm": 3.7112104892730713,
"learning_rate": 8.261619004340086e-06,
"loss": 0.7891,
"step": 5340
},
{
"epoch": 1.5505878937436492,
"grad_norm": 3.597099542617798,
"learning_rate": 8.260891167438655e-06,
"loss": 0.8692,
"step": 5341
},
{
"epoch": 1.5508782116417477,
"grad_norm": 3.904702663421631,
"learning_rate": 8.260163210276856e-06,
"loss": 0.9059,
"step": 5342
},
{
"epoch": 1.5511685295398463,
"grad_norm": 3.292292833328247,
"learning_rate": 8.259435132881528e-06,
"loss": 0.6733,
"step": 5343
},
{
"epoch": 1.5514588474379445,
"grad_norm": 3.1722826957702637,
"learning_rate": 8.258706935279526e-06,
"loss": 0.7296,
"step": 5344
},
{
"epoch": 1.5517491653360431,
"grad_norm": 3.7739975452423096,
"learning_rate": 8.257978617497706e-06,
"loss": 0.8633,
"step": 5345
},
{
"epoch": 1.5520394832341413,
"grad_norm": 3.7184388637542725,
"learning_rate": 8.257250179562926e-06,
"loss": 0.8095,
"step": 5346
},
{
"epoch": 1.55232980113224,
"grad_norm": 3.367509603500366,
"learning_rate": 8.256521621502053e-06,
"loss": 0.7923,
"step": 5347
},
{
"epoch": 1.5526201190303381,
"grad_norm": 3.6302716732025146,
"learning_rate": 8.255792943341957e-06,
"loss": 0.7699,
"step": 5348
},
{
"epoch": 1.5529104369284368,
"grad_norm": 3.957557439804077,
"learning_rate": 8.255064145109507e-06,
"loss": 0.8685,
"step": 5349
},
{
"epoch": 1.553200754826535,
"grad_norm": 3.2462220191955566,
"learning_rate": 8.254335226831582e-06,
"loss": 0.7029,
"step": 5350
},
{
"epoch": 1.5534910727246336,
"grad_norm": 3.4993910789489746,
"learning_rate": 8.253606188535068e-06,
"loss": 0.8325,
"step": 5351
},
{
"epoch": 1.5537813906227318,
"grad_norm": 3.787658452987671,
"learning_rate": 8.252877030246848e-06,
"loss": 0.8423,
"step": 5352
},
{
"epoch": 1.5540717085208304,
"grad_norm": 3.5158355236053467,
"learning_rate": 8.252147751993813e-06,
"loss": 0.7064,
"step": 5353
},
{
"epoch": 1.5543620264189286,
"grad_norm": 3.4053237438201904,
"learning_rate": 8.25141835380286e-06,
"loss": 0.7082,
"step": 5354
},
{
"epoch": 1.5546523443170273,
"grad_norm": 3.5351498126983643,
"learning_rate": 8.25068883570089e-06,
"loss": 0.8094,
"step": 5355
},
{
"epoch": 1.5549426622151254,
"grad_norm": 4.113193988800049,
"learning_rate": 8.249959197714803e-06,
"loss": 0.871,
"step": 5356
},
{
"epoch": 1.555232980113224,
"grad_norm": 3.213313341140747,
"learning_rate": 8.249229439871513e-06,
"loss": 0.7838,
"step": 5357
},
{
"epoch": 1.5555232980113223,
"grad_norm": 3.948580503463745,
"learning_rate": 8.248499562197929e-06,
"loss": 0.8546,
"step": 5358
},
{
"epoch": 1.555813615909421,
"grad_norm": 3.2688424587249756,
"learning_rate": 8.24776956472097e-06,
"loss": 0.6867,
"step": 5359
},
{
"epoch": 1.556103933807519,
"grad_norm": 3.9987499713897705,
"learning_rate": 8.24703944746756e-06,
"loss": 0.9382,
"step": 5360
},
{
"epoch": 1.5563942517056177,
"grad_norm": 3.3232181072235107,
"learning_rate": 8.246309210464623e-06,
"loss": 0.6795,
"step": 5361
},
{
"epoch": 1.556684569603716,
"grad_norm": 4.028323650360107,
"learning_rate": 8.24557885373909e-06,
"loss": 0.7453,
"step": 5362
},
{
"epoch": 1.5569748875018146,
"grad_norm": 3.309086561203003,
"learning_rate": 8.244848377317896e-06,
"loss": 0.7652,
"step": 5363
},
{
"epoch": 1.5572652053999128,
"grad_norm": 3.551588296890259,
"learning_rate": 8.244117781227982e-06,
"loss": 0.7157,
"step": 5364
},
{
"epoch": 1.5575555232980114,
"grad_norm": 3.302396535873413,
"learning_rate": 8.243387065496293e-06,
"loss": 0.7068,
"step": 5365
},
{
"epoch": 1.5578458411961096,
"grad_norm": 3.6382970809936523,
"learning_rate": 8.242656230149776e-06,
"loss": 0.7192,
"step": 5366
},
{
"epoch": 1.5581361590942082,
"grad_norm": 3.7732627391815186,
"learning_rate": 8.241925275215384e-06,
"loss": 0.8809,
"step": 5367
},
{
"epoch": 1.5584264769923066,
"grad_norm": 3.7419416904449463,
"learning_rate": 8.241194200720073e-06,
"loss": 0.9588,
"step": 5368
},
{
"epoch": 1.558716794890405,
"grad_norm": 3.50207257270813,
"learning_rate": 8.240463006690807e-06,
"loss": 0.7929,
"step": 5369
},
{
"epoch": 1.5590071127885035,
"grad_norm": 3.7464301586151123,
"learning_rate": 8.239731693154552e-06,
"loss": 0.7807,
"step": 5370
},
{
"epoch": 1.5592974306866019,
"grad_norm": 3.450807809829712,
"learning_rate": 8.239000260138277e-06,
"loss": 0.819,
"step": 5371
},
{
"epoch": 1.5595877485847003,
"grad_norm": 3.783979654312134,
"learning_rate": 8.238268707668957e-06,
"loss": 0.7797,
"step": 5372
},
{
"epoch": 1.5598780664827987,
"grad_norm": 3.410276174545288,
"learning_rate": 8.237537035773572e-06,
"loss": 0.7907,
"step": 5373
},
{
"epoch": 1.560168384380897,
"grad_norm": 3.077827215194702,
"learning_rate": 8.236805244479109e-06,
"loss": 0.7487,
"step": 5374
},
{
"epoch": 1.5604587022789955,
"grad_norm": 3.5620744228363037,
"learning_rate": 8.23607333381255e-06,
"loss": 0.7937,
"step": 5375
},
{
"epoch": 1.560749020177094,
"grad_norm": 3.3524978160858154,
"learning_rate": 8.235341303800892e-06,
"loss": 0.657,
"step": 5376
},
{
"epoch": 1.5610393380751924,
"grad_norm": 3.698017120361328,
"learning_rate": 8.234609154471129e-06,
"loss": 0.8229,
"step": 5377
},
{
"epoch": 1.5613296559732908,
"grad_norm": 3.363804340362549,
"learning_rate": 8.233876885850265e-06,
"loss": 0.7087,
"step": 5378
},
{
"epoch": 1.5616199738713892,
"grad_norm": 3.8434033393859863,
"learning_rate": 8.233144497965306e-06,
"loss": 0.822,
"step": 5379
},
{
"epoch": 1.5619102917694876,
"grad_norm": 3.6037120819091797,
"learning_rate": 8.23241199084326e-06,
"loss": 0.7207,
"step": 5380
},
{
"epoch": 1.562200609667586,
"grad_norm": 3.8489432334899902,
"learning_rate": 8.231679364511142e-06,
"loss": 0.8636,
"step": 5381
},
{
"epoch": 1.5624909275656844,
"grad_norm": 3.7548909187316895,
"learning_rate": 8.230946618995972e-06,
"loss": 0.7164,
"step": 5382
},
{
"epoch": 1.5627812454637828,
"grad_norm": 3.570434808731079,
"learning_rate": 8.230213754324773e-06,
"loss": 0.7482,
"step": 5383
},
{
"epoch": 1.5630715633618812,
"grad_norm": 3.7216358184814453,
"learning_rate": 8.229480770524571e-06,
"loss": 0.7673,
"step": 5384
},
{
"epoch": 1.5633618812599797,
"grad_norm": 3.5830092430114746,
"learning_rate": 8.228747667622402e-06,
"loss": 0.7737,
"step": 5385
},
{
"epoch": 1.563652199158078,
"grad_norm": 3.990433931350708,
"learning_rate": 8.228014445645299e-06,
"loss": 0.7824,
"step": 5386
},
{
"epoch": 1.5639425170561765,
"grad_norm": 3.3041436672210693,
"learning_rate": 8.227281104620307e-06,
"loss": 0.8376,
"step": 5387
},
{
"epoch": 1.564232834954275,
"grad_norm": 3.908924102783203,
"learning_rate": 8.226547644574465e-06,
"loss": 0.8597,
"step": 5388
},
{
"epoch": 1.5645231528523733,
"grad_norm": 3.409175157546997,
"learning_rate": 8.225814065534827e-06,
"loss": 0.8483,
"step": 5389
},
{
"epoch": 1.5648134707504717,
"grad_norm": 3.362900733947754,
"learning_rate": 8.225080367528447e-06,
"loss": 0.7746,
"step": 5390
},
{
"epoch": 1.5651037886485701,
"grad_norm": 4.050478458404541,
"learning_rate": 8.224346550582382e-06,
"loss": 0.8165,
"step": 5391
},
{
"epoch": 1.5653941065466686,
"grad_norm": 4.049386978149414,
"learning_rate": 8.223612614723697e-06,
"loss": 0.9072,
"step": 5392
},
{
"epoch": 1.565684424444767,
"grad_norm": 3.4654226303100586,
"learning_rate": 8.222878559979458e-06,
"loss": 0.8,
"step": 5393
},
{
"epoch": 1.5659747423428656,
"grad_norm": 3.154883861541748,
"learning_rate": 8.222144386376736e-06,
"loss": 0.7033,
"step": 5394
},
{
"epoch": 1.5662650602409638,
"grad_norm": 3.412895679473877,
"learning_rate": 8.221410093942608e-06,
"loss": 0.7621,
"step": 5395
},
{
"epoch": 1.5665553781390624,
"grad_norm": 4.3008928298950195,
"learning_rate": 8.220675682704153e-06,
"loss": 0.9183,
"step": 5396
},
{
"epoch": 1.5668456960371606,
"grad_norm": 3.676053047180176,
"learning_rate": 8.219941152688459e-06,
"loss": 0.8163,
"step": 5397
},
{
"epoch": 1.5671360139352593,
"grad_norm": 3.78293776512146,
"learning_rate": 8.219206503922612e-06,
"loss": 0.854,
"step": 5398
},
{
"epoch": 1.5674263318333574,
"grad_norm": 3.3566396236419678,
"learning_rate": 8.218471736433706e-06,
"loss": 0.8328,
"step": 5399
},
{
"epoch": 1.567716649731456,
"grad_norm": 4.530660629272461,
"learning_rate": 8.217736850248841e-06,
"loss": 0.7557,
"step": 5400
},
{
"epoch": 1.5680069676295543,
"grad_norm": 3.4996469020843506,
"learning_rate": 8.217001845395118e-06,
"loss": 0.7436,
"step": 5401
},
{
"epoch": 1.568297285527653,
"grad_norm": 3.5606935024261475,
"learning_rate": 8.216266721899642e-06,
"loss": 0.7685,
"step": 5402
},
{
"epoch": 1.568587603425751,
"grad_norm": 3.8874683380126953,
"learning_rate": 8.215531479789527e-06,
"loss": 0.8437,
"step": 5403
},
{
"epoch": 1.5688779213238497,
"grad_norm": 3.3992443084716797,
"learning_rate": 8.214796119091886e-06,
"loss": 0.8176,
"step": 5404
},
{
"epoch": 1.569168239221948,
"grad_norm": 3.601271867752075,
"learning_rate": 8.21406063983384e-06,
"loss": 0.8506,
"step": 5405
},
{
"epoch": 1.5694585571200466,
"grad_norm": 3.4691638946533203,
"learning_rate": 8.213325042042512e-06,
"loss": 0.7478,
"step": 5406
},
{
"epoch": 1.5697488750181448,
"grad_norm": 3.978273868560791,
"learning_rate": 8.212589325745036e-06,
"loss": 0.9128,
"step": 5407
},
{
"epoch": 1.5700391929162434,
"grad_norm": 3.4004123210906982,
"learning_rate": 8.211853490968536e-06,
"loss": 0.7231,
"step": 5408
},
{
"epoch": 1.5703295108143416,
"grad_norm": 3.540611982345581,
"learning_rate": 8.211117537740154e-06,
"loss": 0.758,
"step": 5409
},
{
"epoch": 1.5706198287124402,
"grad_norm": 3.3469531536102295,
"learning_rate": 8.210381466087035e-06,
"loss": 0.6891,
"step": 5410
},
{
"epoch": 1.5709101466105384,
"grad_norm": 3.611398935317993,
"learning_rate": 8.209645276036318e-06,
"loss": 0.8158,
"step": 5411
},
{
"epoch": 1.571200464508637,
"grad_norm": 3.818127155303955,
"learning_rate": 8.208908967615159e-06,
"loss": 0.762,
"step": 5412
},
{
"epoch": 1.5714907824067352,
"grad_norm": 3.4710285663604736,
"learning_rate": 8.20817254085071e-06,
"loss": 0.7297,
"step": 5413
},
{
"epoch": 1.5717811003048339,
"grad_norm": 3.6411702632904053,
"learning_rate": 8.20743599577013e-06,
"loss": 0.8483,
"step": 5414
},
{
"epoch": 1.572071418202932,
"grad_norm": 3.465782403945923,
"learning_rate": 8.206699332400585e-06,
"loss": 0.7407,
"step": 5415
},
{
"epoch": 1.5723617361010307,
"grad_norm": 3.5217745304107666,
"learning_rate": 8.20596255076924e-06,
"loss": 0.8056,
"step": 5416
},
{
"epoch": 1.5726520539991289,
"grad_norm": 3.9428863525390625,
"learning_rate": 8.205225650903269e-06,
"loss": 0.867,
"step": 5417
},
{
"epoch": 1.5729423718972275,
"grad_norm": 3.5911359786987305,
"learning_rate": 8.204488632829848e-06,
"loss": 0.8481,
"step": 5418
},
{
"epoch": 1.573232689795326,
"grad_norm": 3.63502836227417,
"learning_rate": 8.203751496576157e-06,
"loss": 0.8925,
"step": 5419
},
{
"epoch": 1.5735230076934243,
"grad_norm": 3.740027666091919,
"learning_rate": 8.203014242169382e-06,
"loss": 0.89,
"step": 5420
},
{
"epoch": 1.5738133255915228,
"grad_norm": 3.697819232940674,
"learning_rate": 8.202276869636713e-06,
"loss": 0.8272,
"step": 5421
},
{
"epoch": 1.5741036434896212,
"grad_norm": 3.058216094970703,
"learning_rate": 8.201539379005346e-06,
"loss": 0.8128,
"step": 5422
},
{
"epoch": 1.5743939613877196,
"grad_norm": 3.1359705924987793,
"learning_rate": 8.200801770302474e-06,
"loss": 0.6643,
"step": 5423
},
{
"epoch": 1.574684279285818,
"grad_norm": 3.386383533477783,
"learning_rate": 8.200064043555304e-06,
"loss": 0.6573,
"step": 5424
},
{
"epoch": 1.5749745971839164,
"grad_norm": 3.152573823928833,
"learning_rate": 8.199326198791044e-06,
"loss": 0.7556,
"step": 5425
},
{
"epoch": 1.5752649150820148,
"grad_norm": 3.3397903442382812,
"learning_rate": 8.198588236036902e-06,
"loss": 0.7253,
"step": 5426
},
{
"epoch": 1.5755552329801132,
"grad_norm": 3.6608428955078125,
"learning_rate": 8.197850155320094e-06,
"loss": 0.7888,
"step": 5427
},
{
"epoch": 1.5758455508782117,
"grad_norm": 3.4254817962646484,
"learning_rate": 8.197111956667842e-06,
"loss": 0.7963,
"step": 5428
},
{
"epoch": 1.57613586877631,
"grad_norm": 3.2243576049804688,
"learning_rate": 8.196373640107372e-06,
"loss": 0.6132,
"step": 5429
},
{
"epoch": 1.5764261866744085,
"grad_norm": 3.9535470008850098,
"learning_rate": 8.195635205665909e-06,
"loss": 0.8969,
"step": 5430
},
{
"epoch": 1.576716504572507,
"grad_norm": 3.825469970703125,
"learning_rate": 8.194896653370686e-06,
"loss": 0.7282,
"step": 5431
},
{
"epoch": 1.5770068224706053,
"grad_norm": 4.59237003326416,
"learning_rate": 8.194157983248943e-06,
"loss": 0.9332,
"step": 5432
},
{
"epoch": 1.5772971403687037,
"grad_norm": 3.5294547080993652,
"learning_rate": 8.193419195327923e-06,
"loss": 0.7861,
"step": 5433
},
{
"epoch": 1.5775874582668021,
"grad_norm": 3.565861701965332,
"learning_rate": 8.192680289634868e-06,
"loss": 0.7375,
"step": 5434
},
{
"epoch": 1.5778777761649005,
"grad_norm": 4.323357582092285,
"learning_rate": 8.191941266197032e-06,
"loss": 0.8921,
"step": 5435
},
{
"epoch": 1.578168094062999,
"grad_norm": 3.646151065826416,
"learning_rate": 8.19120212504167e-06,
"loss": 0.8274,
"step": 5436
},
{
"epoch": 1.5784584119610974,
"grad_norm": 3.351614236831665,
"learning_rate": 8.190462866196038e-06,
"loss": 0.8299,
"step": 5437
},
{
"epoch": 1.5787487298591958,
"grad_norm": 3.4705700874328613,
"learning_rate": 8.189723489687404e-06,
"loss": 0.6837,
"step": 5438
},
{
"epoch": 1.5790390477572942,
"grad_norm": 4.0358991622924805,
"learning_rate": 8.188983995543031e-06,
"loss": 0.9315,
"step": 5439
},
{
"epoch": 1.5793293656553926,
"grad_norm": 3.4540350437164307,
"learning_rate": 8.188244383790196e-06,
"loss": 0.7148,
"step": 5440
},
{
"epoch": 1.579619683553491,
"grad_norm": 3.703850507736206,
"learning_rate": 8.187504654456171e-06,
"loss": 0.7906,
"step": 5441
},
{
"epoch": 1.5799100014515894,
"grad_norm": 3.2540676593780518,
"learning_rate": 8.18676480756824e-06,
"loss": 0.8288,
"step": 5442
},
{
"epoch": 1.5802003193496879,
"grad_norm": 3.3832411766052246,
"learning_rate": 8.186024843153689e-06,
"loss": 0.7643,
"step": 5443
},
{
"epoch": 1.5804906372477863,
"grad_norm": 3.6068215370178223,
"learning_rate": 8.185284761239805e-06,
"loss": 0.8474,
"step": 5444
},
{
"epoch": 1.580780955145885,
"grad_norm": 3.1546831130981445,
"learning_rate": 8.184544561853882e-06,
"loss": 0.703,
"step": 5445
},
{
"epoch": 1.581071273043983,
"grad_norm": 3.4897522926330566,
"learning_rate": 8.18380424502322e-06,
"loss": 0.8384,
"step": 5446
},
{
"epoch": 1.5813615909420817,
"grad_norm": 3.1088387966156006,
"learning_rate": 8.183063810775121e-06,
"loss": 0.7216,
"step": 5447
},
{
"epoch": 1.58165190884018,
"grad_norm": 3.126387596130371,
"learning_rate": 8.182323259136893e-06,
"loss": 0.7299,
"step": 5448
},
{
"epoch": 1.5819422267382786,
"grad_norm": 3.984802484512329,
"learning_rate": 8.181582590135846e-06,
"loss": 0.6985,
"step": 5449
},
{
"epoch": 1.5822325446363767,
"grad_norm": 3.2511186599731445,
"learning_rate": 8.180841803799293e-06,
"loss": 0.6762,
"step": 5450
},
{
"epoch": 1.5825228625344754,
"grad_norm": 3.4527862071990967,
"learning_rate": 8.180100900154559e-06,
"loss": 0.7734,
"step": 5451
},
{
"epoch": 1.5828131804325736,
"grad_norm": 3.6589744091033936,
"learning_rate": 8.179359879228966e-06,
"loss": 0.8921,
"step": 5452
},
{
"epoch": 1.5831034983306722,
"grad_norm": 3.0081589221954346,
"learning_rate": 8.178618741049841e-06,
"loss": 0.6017,
"step": 5453
},
{
"epoch": 1.5833938162287704,
"grad_norm": 3.805534839630127,
"learning_rate": 8.177877485644518e-06,
"loss": 0.8037,
"step": 5454
},
{
"epoch": 1.583684134126869,
"grad_norm": 3.2553961277008057,
"learning_rate": 8.177136113040337e-06,
"loss": 0.6002,
"step": 5455
},
{
"epoch": 1.5839744520249672,
"grad_norm": 3.6897778511047363,
"learning_rate": 8.176394623264634e-06,
"loss": 0.7646,
"step": 5456
},
{
"epoch": 1.5842647699230659,
"grad_norm": 4.139689922332764,
"learning_rate": 8.17565301634476e-06,
"loss": 0.9523,
"step": 5457
},
{
"epoch": 1.584555087821164,
"grad_norm": 3.3002512454986572,
"learning_rate": 8.17491129230806e-06,
"loss": 0.7418,
"step": 5458
},
{
"epoch": 1.5848454057192627,
"grad_norm": 3.755394220352173,
"learning_rate": 8.174169451181893e-06,
"loss": 0.8796,
"step": 5459
},
{
"epoch": 1.5851357236173609,
"grad_norm": 3.5037105083465576,
"learning_rate": 8.173427492993617e-06,
"loss": 0.7438,
"step": 5460
},
{
"epoch": 1.5854260415154595,
"grad_norm": 3.9173336029052734,
"learning_rate": 8.172685417770595e-06,
"loss": 0.9091,
"step": 5461
},
{
"epoch": 1.5857163594135577,
"grad_norm": 3.251797676086426,
"learning_rate": 8.171943225540193e-06,
"loss": 0.7687,
"step": 5462
},
{
"epoch": 1.5860066773116563,
"grad_norm": 3.7072701454162598,
"learning_rate": 8.171200916329782e-06,
"loss": 0.8204,
"step": 5463
},
{
"epoch": 1.5862969952097545,
"grad_norm": 3.598876476287842,
"learning_rate": 8.170458490166741e-06,
"loss": 0.8249,
"step": 5464
},
{
"epoch": 1.5865873131078532,
"grad_norm": 3.932330846786499,
"learning_rate": 8.16971594707845e-06,
"loss": 0.8425,
"step": 5465
},
{
"epoch": 1.5868776310059514,
"grad_norm": 4.134816646575928,
"learning_rate": 8.168973287092292e-06,
"loss": 0.925,
"step": 5466
},
{
"epoch": 1.58716794890405,
"grad_norm": 3.6095468997955322,
"learning_rate": 8.168230510235655e-06,
"loss": 0.8141,
"step": 5467
},
{
"epoch": 1.5874582668021482,
"grad_norm": 3.84780216217041,
"learning_rate": 8.167487616535937e-06,
"loss": 0.9084,
"step": 5468
},
{
"epoch": 1.5877485847002468,
"grad_norm": 3.4866528511047363,
"learning_rate": 8.166744606020532e-06,
"loss": 0.8294,
"step": 5469
},
{
"epoch": 1.5880389025983452,
"grad_norm": 3.47239089012146,
"learning_rate": 8.166001478716842e-06,
"loss": 0.7165,
"step": 5470
},
{
"epoch": 1.5883292204964437,
"grad_norm": 3.2797508239746094,
"learning_rate": 8.165258234652273e-06,
"loss": 0.7534,
"step": 5471
},
{
"epoch": 1.588619538394542,
"grad_norm": 3.6644527912139893,
"learning_rate": 8.164514873854238e-06,
"loss": 0.7998,
"step": 5472
},
{
"epoch": 1.5889098562926405,
"grad_norm": 4.518185138702393,
"learning_rate": 8.163771396350149e-06,
"loss": 0.9153,
"step": 5473
},
{
"epoch": 1.589200174190739,
"grad_norm": 3.9391283988952637,
"learning_rate": 8.163027802167427e-06,
"loss": 0.7404,
"step": 5474
},
{
"epoch": 1.5894904920888373,
"grad_norm": 3.672680616378784,
"learning_rate": 8.162284091333495e-06,
"loss": 0.7028,
"step": 5475
},
{
"epoch": 1.5897808099869357,
"grad_norm": 3.7391512393951416,
"learning_rate": 8.16154026387578e-06,
"loss": 0.8395,
"step": 5476
},
{
"epoch": 1.5900711278850341,
"grad_norm": 3.7817800045013428,
"learning_rate": 8.160796319821715e-06,
"loss": 0.7917,
"step": 5477
},
{
"epoch": 1.5903614457831325,
"grad_norm": 3.9017398357391357,
"learning_rate": 8.160052259198737e-06,
"loss": 0.7596,
"step": 5478
},
{
"epoch": 1.590651763681231,
"grad_norm": 3.59230375289917,
"learning_rate": 8.159308082034284e-06,
"loss": 0.8597,
"step": 5479
},
{
"epoch": 1.5909420815793294,
"grad_norm": 3.1670892238616943,
"learning_rate": 8.158563788355803e-06,
"loss": 0.7628,
"step": 5480
},
{
"epoch": 1.5912323994774278,
"grad_norm": 3.757706880569458,
"learning_rate": 8.157819378190743e-06,
"loss": 0.7866,
"step": 5481
},
{
"epoch": 1.5915227173755262,
"grad_norm": 3.193671703338623,
"learning_rate": 8.157074851566558e-06,
"loss": 0.698,
"step": 5482
},
{
"epoch": 1.5918130352736246,
"grad_norm": 3.6582417488098145,
"learning_rate": 8.156330208510706e-06,
"loss": 0.7991,
"step": 5483
},
{
"epoch": 1.592103353171723,
"grad_norm": 4.1811089515686035,
"learning_rate": 8.155585449050647e-06,
"loss": 0.9821,
"step": 5484
},
{
"epoch": 1.5923936710698214,
"grad_norm": 3.4012670516967773,
"learning_rate": 8.15484057321385e-06,
"loss": 0.8423,
"step": 5485
},
{
"epoch": 1.5926839889679199,
"grad_norm": 3.3922324180603027,
"learning_rate": 8.154095581027783e-06,
"loss": 0.8446,
"step": 5486
},
{
"epoch": 1.5929743068660183,
"grad_norm": 3.582942008972168,
"learning_rate": 8.153350472519925e-06,
"loss": 0.7196,
"step": 5487
},
{
"epoch": 1.5932646247641167,
"grad_norm": 3.835096836090088,
"learning_rate": 8.152605247717753e-06,
"loss": 0.8157,
"step": 5488
},
{
"epoch": 1.593554942662215,
"grad_norm": 3.4639639854431152,
"learning_rate": 8.151859906648747e-06,
"loss": 0.7725,
"step": 5489
},
{
"epoch": 1.5938452605603135,
"grad_norm": 3.6137194633483887,
"learning_rate": 8.151114449340403e-06,
"loss": 0.8316,
"step": 5490
},
{
"epoch": 1.594135578458412,
"grad_norm": 3.6025030612945557,
"learning_rate": 8.150368875820206e-06,
"loss": 0.7249,
"step": 5491
},
{
"epoch": 1.5944258963565103,
"grad_norm": 3.8320367336273193,
"learning_rate": 8.149623186115655e-06,
"loss": 0.958,
"step": 5492
},
{
"epoch": 1.5947162142546087,
"grad_norm": 3.5915944576263428,
"learning_rate": 8.14887738025425e-06,
"loss": 0.8933,
"step": 5493
},
{
"epoch": 1.5950065321527074,
"grad_norm": 3.5409955978393555,
"learning_rate": 8.148131458263499e-06,
"loss": 0.7437,
"step": 5494
},
{
"epoch": 1.5952968500508056,
"grad_norm": 3.5840892791748047,
"learning_rate": 8.147385420170907e-06,
"loss": 0.731,
"step": 5495
},
{
"epoch": 1.5955871679489042,
"grad_norm": 2.954227924346924,
"learning_rate": 8.146639266003991e-06,
"loss": 0.611,
"step": 5496
},
{
"epoch": 1.5958774858470024,
"grad_norm": 3.372689723968506,
"learning_rate": 8.145892995790269e-06,
"loss": 0.7692,
"step": 5497
},
{
"epoch": 1.596167803745101,
"grad_norm": 3.156162738800049,
"learning_rate": 8.145146609557259e-06,
"loss": 0.7034,
"step": 5498
},
{
"epoch": 1.5964581216431992,
"grad_norm": 3.1837658882141113,
"learning_rate": 8.144400107332491e-06,
"loss": 0.7963,
"step": 5499
},
{
"epoch": 1.5967484395412979,
"grad_norm": 3.6337132453918457,
"learning_rate": 8.143653489143495e-06,
"loss": 0.8182,
"step": 5500
},
{
"epoch": 1.5967484395412979,
"eval_loss": 1.1764451265335083,
"eval_runtime": 13.4597,
"eval_samples_per_second": 29.718,
"eval_steps_per_second": 3.715,
"step": 5500
},
{
"epoch": 1.597038757439396,
"grad_norm": 3.751736879348755,
"learning_rate": 8.142906755017806e-06,
"loss": 0.8149,
"step": 5501
},
{
"epoch": 1.5973290753374947,
"grad_norm": 3.2839596271514893,
"learning_rate": 8.142159904982963e-06,
"loss": 0.6112,
"step": 5502
},
{
"epoch": 1.5976193932355929,
"grad_norm": 3.4218335151672363,
"learning_rate": 8.14141293906651e-06,
"loss": 0.8055,
"step": 5503
},
{
"epoch": 1.5979097111336915,
"grad_norm": 3.7377045154571533,
"learning_rate": 8.140665857295994e-06,
"loss": 0.8185,
"step": 5504
},
{
"epoch": 1.5982000290317897,
"grad_norm": 3.6234383583068848,
"learning_rate": 8.139918659698967e-06,
"loss": 0.9353,
"step": 5505
},
{
"epoch": 1.5984903469298883,
"grad_norm": 3.7796764373779297,
"learning_rate": 8.139171346302987e-06,
"loss": 0.8076,
"step": 5506
},
{
"epoch": 1.5987806648279865,
"grad_norm": 3.846904993057251,
"learning_rate": 8.138423917135613e-06,
"loss": 0.7598,
"step": 5507
},
{
"epoch": 1.5990709827260852,
"grad_norm": 3.7689170837402344,
"learning_rate": 8.13767637222441e-06,
"loss": 0.8609,
"step": 5508
},
{
"epoch": 1.5993613006241834,
"grad_norm": 3.787233352661133,
"learning_rate": 8.136928711596948e-06,
"loss": 0.7595,
"step": 5509
},
{
"epoch": 1.599651618522282,
"grad_norm": 3.4965553283691406,
"learning_rate": 8.1361809352808e-06,
"loss": 0.7314,
"step": 5510
},
{
"epoch": 1.5999419364203802,
"grad_norm": 3.4074811935424805,
"learning_rate": 8.135433043303543e-06,
"loss": 0.7915,
"step": 5511
},
{
"epoch": 1.6002322543184788,
"grad_norm": 3.774893283843994,
"learning_rate": 8.134685035692761e-06,
"loss": 0.7789,
"step": 5512
},
{
"epoch": 1.600522572216577,
"grad_norm": 3.5672433376312256,
"learning_rate": 8.133936912476038e-06,
"loss": 0.7728,
"step": 5513
},
{
"epoch": 1.6008128901146756,
"grad_norm": 3.479285717010498,
"learning_rate": 8.133188673680966e-06,
"loss": 0.7429,
"step": 5514
},
{
"epoch": 1.6011032080127738,
"grad_norm": 3.180401563644409,
"learning_rate": 8.132440319335138e-06,
"loss": 0.6545,
"step": 5515
},
{
"epoch": 1.6013935259108725,
"grad_norm": 3.3858981132507324,
"learning_rate": 8.131691849466154e-06,
"loss": 0.7118,
"step": 5516
},
{
"epoch": 1.6016838438089707,
"grad_norm": 3.231828212738037,
"learning_rate": 8.130943264101618e-06,
"loss": 0.7514,
"step": 5517
},
{
"epoch": 1.6019741617070693,
"grad_norm": 3.7033121585845947,
"learning_rate": 8.130194563269137e-06,
"loss": 0.7819,
"step": 5518
},
{
"epoch": 1.6022644796051677,
"grad_norm": 3.5103394985198975,
"learning_rate": 8.129445746996322e-06,
"loss": 0.8944,
"step": 5519
},
{
"epoch": 1.6025547975032661,
"grad_norm": 3.523192882537842,
"learning_rate": 8.12869681531079e-06,
"loss": 0.7582,
"step": 5520
},
{
"epoch": 1.6028451154013645,
"grad_norm": 3.773475408554077,
"learning_rate": 8.127947768240161e-06,
"loss": 0.7963,
"step": 5521
},
{
"epoch": 1.603135433299463,
"grad_norm": 3.4685418605804443,
"learning_rate": 8.12719860581206e-06,
"loss": 0.8421,
"step": 5522
},
{
"epoch": 1.6034257511975614,
"grad_norm": 3.8262131214141846,
"learning_rate": 8.126449328054115e-06,
"loss": 0.7972,
"step": 5523
},
{
"epoch": 1.6037160690956598,
"grad_norm": 3.396672487258911,
"learning_rate": 8.125699934993961e-06,
"loss": 0.724,
"step": 5524
},
{
"epoch": 1.6040063869937582,
"grad_norm": 3.644125461578369,
"learning_rate": 8.124950426659231e-06,
"loss": 0.818,
"step": 5525
},
{
"epoch": 1.6042967048918566,
"grad_norm": 3.7308244705200195,
"learning_rate": 8.124200803077571e-06,
"loss": 0.7834,
"step": 5526
},
{
"epoch": 1.604587022789955,
"grad_norm": 3.544517755508423,
"learning_rate": 8.123451064276625e-06,
"loss": 0.7286,
"step": 5527
},
{
"epoch": 1.6048773406880534,
"grad_norm": 3.779484272003174,
"learning_rate": 8.122701210284042e-06,
"loss": 0.879,
"step": 5528
},
{
"epoch": 1.6051676585861518,
"grad_norm": 3.2026147842407227,
"learning_rate": 8.12195124112748e-06,
"loss": 0.6605,
"step": 5529
},
{
"epoch": 1.6054579764842503,
"grad_norm": 3.595618486404419,
"learning_rate": 8.121201156834595e-06,
"loss": 0.7681,
"step": 5530
},
{
"epoch": 1.6057482943823487,
"grad_norm": 3.6730122566223145,
"learning_rate": 8.120450957433048e-06,
"loss": 0.8714,
"step": 5531
},
{
"epoch": 1.606038612280447,
"grad_norm": 3.6328916549682617,
"learning_rate": 8.11970064295051e-06,
"loss": 0.9266,
"step": 5532
},
{
"epoch": 1.6063289301785455,
"grad_norm": 3.8567254543304443,
"learning_rate": 8.11895021341465e-06,
"loss": 0.8275,
"step": 5533
},
{
"epoch": 1.606619248076644,
"grad_norm": 3.534677505493164,
"learning_rate": 8.118199668853141e-06,
"loss": 0.8414,
"step": 5534
},
{
"epoch": 1.6069095659747423,
"grad_norm": 3.3644704818725586,
"learning_rate": 8.117449009293668e-06,
"loss": 0.7034,
"step": 5535
},
{
"epoch": 1.6071998838728407,
"grad_norm": 3.779590129852295,
"learning_rate": 8.116698234763913e-06,
"loss": 0.6894,
"step": 5536
},
{
"epoch": 1.6074902017709392,
"grad_norm": 3.705146312713623,
"learning_rate": 8.115947345291565e-06,
"loss": 0.8024,
"step": 5537
},
{
"epoch": 1.6077805196690376,
"grad_norm": 3.603299856185913,
"learning_rate": 8.115196340904312e-06,
"loss": 0.8889,
"step": 5538
},
{
"epoch": 1.608070837567136,
"grad_norm": 3.7928433418273926,
"learning_rate": 8.114445221629856e-06,
"loss": 0.987,
"step": 5539
},
{
"epoch": 1.6083611554652344,
"grad_norm": 3.6779932975769043,
"learning_rate": 8.113693987495897e-06,
"loss": 0.7934,
"step": 5540
},
{
"epoch": 1.6086514733633328,
"grad_norm": 3.47401762008667,
"learning_rate": 8.112942638530137e-06,
"loss": 0.8087,
"step": 5541
},
{
"epoch": 1.6089417912614312,
"grad_norm": 3.580387830734253,
"learning_rate": 8.112191174760289e-06,
"loss": 0.8183,
"step": 5542
},
{
"epoch": 1.6092321091595296,
"grad_norm": 3.6536662578582764,
"learning_rate": 8.111439596214066e-06,
"loss": 0.8197,
"step": 5543
},
{
"epoch": 1.609522427057628,
"grad_norm": 3.348607301712036,
"learning_rate": 8.110687902919185e-06,
"loss": 0.7254,
"step": 5544
},
{
"epoch": 1.6098127449557267,
"grad_norm": 3.2281482219696045,
"learning_rate": 8.10993609490337e-06,
"loss": 0.7144,
"step": 5545
},
{
"epoch": 1.6101030628538249,
"grad_norm": 3.217322826385498,
"learning_rate": 8.109184172194344e-06,
"loss": 0.7845,
"step": 5546
},
{
"epoch": 1.6103933807519235,
"grad_norm": 3.4739761352539062,
"learning_rate": 8.10843213481984e-06,
"loss": 0.7105,
"step": 5547
},
{
"epoch": 1.6106836986500217,
"grad_norm": 3.3393218517303467,
"learning_rate": 8.107679982807593e-06,
"loss": 0.7621,
"step": 5548
},
{
"epoch": 1.6109740165481203,
"grad_norm": 3.1378118991851807,
"learning_rate": 8.106927716185341e-06,
"loss": 0.7798,
"step": 5549
},
{
"epoch": 1.6112643344462185,
"grad_norm": 3.5275983810424805,
"learning_rate": 8.106175334980828e-06,
"loss": 0.7628,
"step": 5550
},
{
"epoch": 1.6115546523443172,
"grad_norm": 4.053371906280518,
"learning_rate": 8.105422839221801e-06,
"loss": 0.849,
"step": 5551
},
{
"epoch": 1.6118449702424154,
"grad_norm": 3.6364362239837646,
"learning_rate": 8.104670228936014e-06,
"loss": 0.76,
"step": 5552
},
{
"epoch": 1.612135288140514,
"grad_norm": 2.8951313495635986,
"learning_rate": 8.103917504151219e-06,
"loss": 0.7134,
"step": 5553
},
{
"epoch": 1.6124256060386122,
"grad_norm": 3.7895846366882324,
"learning_rate": 8.103164664895179e-06,
"loss": 0.8141,
"step": 5554
},
{
"epoch": 1.6127159239367108,
"grad_norm": 3.412078619003296,
"learning_rate": 8.102411711195657e-06,
"loss": 0.7362,
"step": 5555
},
{
"epoch": 1.613006241834809,
"grad_norm": 3.6012485027313232,
"learning_rate": 8.101658643080421e-06,
"loss": 0.8171,
"step": 5556
},
{
"epoch": 1.6132965597329076,
"grad_norm": 3.7033040523529053,
"learning_rate": 8.100905460577246e-06,
"loss": 0.8706,
"step": 5557
},
{
"epoch": 1.6135868776310058,
"grad_norm": 3.526740074157715,
"learning_rate": 8.100152163713911e-06,
"loss": 0.7134,
"step": 5558
},
{
"epoch": 1.6138771955291045,
"grad_norm": 3.473214864730835,
"learning_rate": 8.09939875251819e-06,
"loss": 0.8147,
"step": 5559
},
{
"epoch": 1.6141675134272027,
"grad_norm": 3.854447603225708,
"learning_rate": 8.098645227017876e-06,
"loss": 0.8453,
"step": 5560
},
{
"epoch": 1.6144578313253013,
"grad_norm": 3.334552049636841,
"learning_rate": 8.097891587240754e-06,
"loss": 0.7638,
"step": 5561
},
{
"epoch": 1.6147481492233995,
"grad_norm": 3.6212611198425293,
"learning_rate": 8.097137833214621e-06,
"loss": 0.8392,
"step": 5562
},
{
"epoch": 1.6150384671214981,
"grad_norm": 3.836317300796509,
"learning_rate": 8.096383964967273e-06,
"loss": 0.8645,
"step": 5563
},
{
"epoch": 1.6153287850195963,
"grad_norm": 3.2368345260620117,
"learning_rate": 8.095629982526513e-06,
"loss": 0.7104,
"step": 5564
},
{
"epoch": 1.615619102917695,
"grad_norm": 3.441826105117798,
"learning_rate": 8.094875885920148e-06,
"loss": 0.7553,
"step": 5565
},
{
"epoch": 1.6159094208157931,
"grad_norm": 3.322342872619629,
"learning_rate": 8.094121675175988e-06,
"loss": 0.7563,
"step": 5566
},
{
"epoch": 1.6161997387138918,
"grad_norm": 3.713310956954956,
"learning_rate": 8.09336735032185e-06,
"loss": 0.7707,
"step": 5567
},
{
"epoch": 1.61649005661199,
"grad_norm": 3.6072230339050293,
"learning_rate": 8.092612911385551e-06,
"loss": 0.6832,
"step": 5568
},
{
"epoch": 1.6167803745100886,
"grad_norm": 3.0848445892333984,
"learning_rate": 8.091858358394915e-06,
"loss": 0.7505,
"step": 5569
},
{
"epoch": 1.617070692408187,
"grad_norm": 3.962153673171997,
"learning_rate": 8.09110369137777e-06,
"loss": 0.8181,
"step": 5570
},
{
"epoch": 1.6173610103062854,
"grad_norm": 3.5778603553771973,
"learning_rate": 8.090348910361946e-06,
"loss": 0.8057,
"step": 5571
},
{
"epoch": 1.6176513282043838,
"grad_norm": 3.639045238494873,
"learning_rate": 8.089594015375281e-06,
"loss": 0.8074,
"step": 5572
},
{
"epoch": 1.6179416461024823,
"grad_norm": 3.190915584564209,
"learning_rate": 8.088839006445615e-06,
"loss": 0.6914,
"step": 5573
},
{
"epoch": 1.6182319640005807,
"grad_norm": 3.173288345336914,
"learning_rate": 8.088083883600793e-06,
"loss": 0.7042,
"step": 5574
},
{
"epoch": 1.618522281898679,
"grad_norm": 3.3784337043762207,
"learning_rate": 8.087328646868663e-06,
"loss": 0.7792,
"step": 5575
},
{
"epoch": 1.6188125997967775,
"grad_norm": 3.4538354873657227,
"learning_rate": 8.086573296277078e-06,
"loss": 0.7685,
"step": 5576
},
{
"epoch": 1.619102917694876,
"grad_norm": 3.314093828201294,
"learning_rate": 8.085817831853893e-06,
"loss": 0.8075,
"step": 5577
},
{
"epoch": 1.6193932355929743,
"grad_norm": 3.4923017024993896,
"learning_rate": 8.085062253626971e-06,
"loss": 0.8034,
"step": 5578
},
{
"epoch": 1.6196835534910727,
"grad_norm": 3.724478244781494,
"learning_rate": 8.084306561624177e-06,
"loss": 0.7567,
"step": 5579
},
{
"epoch": 1.6199738713891711,
"grad_norm": 3.650859832763672,
"learning_rate": 8.083550755873384e-06,
"loss": 0.7958,
"step": 5580
},
{
"epoch": 1.6202641892872696,
"grad_norm": 3.2904725074768066,
"learning_rate": 8.08279483640246e-06,
"loss": 0.7741,
"step": 5581
},
{
"epoch": 1.620554507185368,
"grad_norm": 3.688880205154419,
"learning_rate": 8.082038803239288e-06,
"loss": 0.7899,
"step": 5582
},
{
"epoch": 1.6208448250834664,
"grad_norm": 3.716184139251709,
"learning_rate": 8.081282656411746e-06,
"loss": 0.7081,
"step": 5583
},
{
"epoch": 1.6211351429815648,
"grad_norm": 3.6786234378814697,
"learning_rate": 8.080526395947722e-06,
"loss": 0.9142,
"step": 5584
},
{
"epoch": 1.6214254608796632,
"grad_norm": 3.396521806716919,
"learning_rate": 8.079770021875108e-06,
"loss": 0.7703,
"step": 5585
},
{
"epoch": 1.6217157787777616,
"grad_norm": 3.3132734298706055,
"learning_rate": 8.079013534221798e-06,
"loss": 0.7606,
"step": 5586
},
{
"epoch": 1.62200609667586,
"grad_norm": 3.5055415630340576,
"learning_rate": 8.078256933015692e-06,
"loss": 0.8032,
"step": 5587
},
{
"epoch": 1.6222964145739585,
"grad_norm": 3.584742307662964,
"learning_rate": 8.077500218284689e-06,
"loss": 0.7928,
"step": 5588
},
{
"epoch": 1.6225867324720569,
"grad_norm": 3.809736490249634,
"learning_rate": 8.0767433900567e-06,
"loss": 0.8064,
"step": 5589
},
{
"epoch": 1.6228770503701553,
"grad_norm": 3.6083149909973145,
"learning_rate": 8.075986448359637e-06,
"loss": 0.7596,
"step": 5590
},
{
"epoch": 1.6231673682682537,
"grad_norm": 3.65105938911438,
"learning_rate": 8.075229393221413e-06,
"loss": 0.8699,
"step": 5591
},
{
"epoch": 1.623457686166352,
"grad_norm": 3.3243539333343506,
"learning_rate": 8.074472224669952e-06,
"loss": 0.7765,
"step": 5592
},
{
"epoch": 1.6237480040644505,
"grad_norm": 3.975712537765503,
"learning_rate": 8.073714942733173e-06,
"loss": 0.9207,
"step": 5593
},
{
"epoch": 1.624038321962549,
"grad_norm": 3.689615488052368,
"learning_rate": 8.072957547439006e-06,
"loss": 0.9121,
"step": 5594
},
{
"epoch": 1.6243286398606473,
"grad_norm": 3.562192440032959,
"learning_rate": 8.072200038815387e-06,
"loss": 0.7415,
"step": 5595
},
{
"epoch": 1.624618957758746,
"grad_norm": 3.7881624698638916,
"learning_rate": 8.071442416890247e-06,
"loss": 0.7459,
"step": 5596
},
{
"epoch": 1.6249092756568442,
"grad_norm": 3.2582058906555176,
"learning_rate": 8.070684681691532e-06,
"loss": 0.7617,
"step": 5597
},
{
"epoch": 1.6251995935549428,
"grad_norm": 3.686997175216675,
"learning_rate": 8.069926833247181e-06,
"loss": 0.8463,
"step": 5598
},
{
"epoch": 1.625489911453041,
"grad_norm": 4.284474849700928,
"learning_rate": 8.06916887158515e-06,
"loss": 0.9876,
"step": 5599
},
{
"epoch": 1.6257802293511396,
"grad_norm": 3.551377058029175,
"learning_rate": 8.068410796733388e-06,
"loss": 0.8189,
"step": 5600
},
{
"epoch": 1.6260705472492378,
"grad_norm": 3.5549912452697754,
"learning_rate": 8.067652608719854e-06,
"loss": 0.7113,
"step": 5601
},
{
"epoch": 1.6263608651473365,
"grad_norm": 3.1862168312072754,
"learning_rate": 8.066894307572507e-06,
"loss": 0.7421,
"step": 5602
},
{
"epoch": 1.6266511830454347,
"grad_norm": 3.8687539100646973,
"learning_rate": 8.066135893319316e-06,
"loss": 0.9149,
"step": 5603
},
{
"epoch": 1.6269415009435333,
"grad_norm": 3.6645760536193848,
"learning_rate": 8.065377365988252e-06,
"loss": 0.7268,
"step": 5604
},
{
"epoch": 1.6272318188416315,
"grad_norm": 3.643216609954834,
"learning_rate": 8.064618725607284e-06,
"loss": 0.7743,
"step": 5605
},
{
"epoch": 1.6275221367397301,
"grad_norm": 3.8267617225646973,
"learning_rate": 8.063859972204395e-06,
"loss": 0.7137,
"step": 5606
},
{
"epoch": 1.6278124546378283,
"grad_norm": 3.9164083003997803,
"learning_rate": 8.063101105807566e-06,
"loss": 0.8744,
"step": 5607
},
{
"epoch": 1.628102772535927,
"grad_norm": 3.626497507095337,
"learning_rate": 8.062342126444786e-06,
"loss": 0.7174,
"step": 5608
},
{
"epoch": 1.6283930904340251,
"grad_norm": 3.792872428894043,
"learning_rate": 8.06158303414404e-06,
"loss": 0.7724,
"step": 5609
},
{
"epoch": 1.6286834083321238,
"grad_norm": 4.004924297332764,
"learning_rate": 8.060823828933329e-06,
"loss": 0.8403,
"step": 5610
},
{
"epoch": 1.628973726230222,
"grad_norm": 4.27058219909668,
"learning_rate": 8.060064510840648e-06,
"loss": 0.7813,
"step": 5611
},
{
"epoch": 1.6292640441283206,
"grad_norm": 3.6475908756256104,
"learning_rate": 8.059305079894004e-06,
"loss": 0.8612,
"step": 5612
},
{
"epoch": 1.6295543620264188,
"grad_norm": 3.181816339492798,
"learning_rate": 8.058545536121402e-06,
"loss": 0.659,
"step": 5613
},
{
"epoch": 1.6298446799245174,
"grad_norm": 3.768768072128296,
"learning_rate": 8.057785879550854e-06,
"loss": 0.7758,
"step": 5614
},
{
"epoch": 1.6301349978226156,
"grad_norm": 4.072582244873047,
"learning_rate": 8.057026110210378e-06,
"loss": 0.8186,
"step": 5615
},
{
"epoch": 1.6304253157207143,
"grad_norm": 3.40413498878479,
"learning_rate": 8.05626622812799e-06,
"loss": 0.7683,
"step": 5616
},
{
"epoch": 1.6307156336188124,
"grad_norm": 3.935901403427124,
"learning_rate": 8.055506233331718e-06,
"loss": 0.773,
"step": 5617
},
{
"epoch": 1.631005951516911,
"grad_norm": 3.696681499481201,
"learning_rate": 8.054746125849587e-06,
"loss": 0.8155,
"step": 5618
},
{
"epoch": 1.6312962694150093,
"grad_norm": 3.344435691833496,
"learning_rate": 8.053985905709632e-06,
"loss": 0.7765,
"step": 5619
},
{
"epoch": 1.631586587313108,
"grad_norm": 3.0115749835968018,
"learning_rate": 8.053225572939888e-06,
"loss": 0.6434,
"step": 5620
},
{
"epoch": 1.6318769052112063,
"grad_norm": 3.36995005607605,
"learning_rate": 8.052465127568399e-06,
"loss": 0.7216,
"step": 5621
},
{
"epoch": 1.6321672231093047,
"grad_norm": 3.2760109901428223,
"learning_rate": 8.051704569623205e-06,
"loss": 0.6746,
"step": 5622
},
{
"epoch": 1.6324575410074031,
"grad_norm": 3.5613889694213867,
"learning_rate": 8.050943899132357e-06,
"loss": 0.7582,
"step": 5623
},
{
"epoch": 1.6327478589055016,
"grad_norm": 3.7071661949157715,
"learning_rate": 8.05018311612391e-06,
"loss": 0.85,
"step": 5624
},
{
"epoch": 1.6330381768036,
"grad_norm": 3.911090135574341,
"learning_rate": 8.049422220625921e-06,
"loss": 0.9153,
"step": 5625
},
{
"epoch": 1.6333284947016984,
"grad_norm": 3.132866621017456,
"learning_rate": 8.048661212666449e-06,
"loss": 0.7028,
"step": 5626
},
{
"epoch": 1.6336188125997968,
"grad_norm": 3.5012645721435547,
"learning_rate": 8.047900092273562e-06,
"loss": 0.797,
"step": 5627
},
{
"epoch": 1.6339091304978952,
"grad_norm": 3.3185794353485107,
"learning_rate": 8.047138859475328e-06,
"loss": 0.6393,
"step": 5628
},
{
"epoch": 1.6341994483959936,
"grad_norm": 3.0536088943481445,
"learning_rate": 8.046377514299824e-06,
"loss": 0.76,
"step": 5629
},
{
"epoch": 1.634489766294092,
"grad_norm": 3.2665023803710938,
"learning_rate": 8.045616056775124e-06,
"loss": 0.7035,
"step": 5630
},
{
"epoch": 1.6347800841921905,
"grad_norm": 3.6513378620147705,
"learning_rate": 8.044854486929315e-06,
"loss": 0.7328,
"step": 5631
},
{
"epoch": 1.6350704020902889,
"grad_norm": 4.083636283874512,
"learning_rate": 8.04409280479048e-06,
"loss": 0.8992,
"step": 5632
},
{
"epoch": 1.6353607199883873,
"grad_norm": 3.52335524559021,
"learning_rate": 8.043331010386709e-06,
"loss": 0.8255,
"step": 5633
},
{
"epoch": 1.6356510378864857,
"grad_norm": 3.233189582824707,
"learning_rate": 8.0425691037461e-06,
"loss": 0.6768,
"step": 5634
},
{
"epoch": 1.635941355784584,
"grad_norm": 3.613593578338623,
"learning_rate": 8.04180708489675e-06,
"loss": 0.8201,
"step": 5635
},
{
"epoch": 1.6362316736826825,
"grad_norm": 3.1805691719055176,
"learning_rate": 8.041044953866758e-06,
"loss": 0.6954,
"step": 5636
},
{
"epoch": 1.636521991580781,
"grad_norm": 3.4872689247131348,
"learning_rate": 8.040282710684238e-06,
"loss": 0.8031,
"step": 5637
},
{
"epoch": 1.6368123094788793,
"grad_norm": 3.5049612522125244,
"learning_rate": 8.039520355377299e-06,
"loss": 0.7646,
"step": 5638
},
{
"epoch": 1.6371026273769778,
"grad_norm": 4.077413558959961,
"learning_rate": 8.038757887974053e-06,
"loss": 0.8644,
"step": 5639
},
{
"epoch": 1.6373929452750762,
"grad_norm": 3.759481430053711,
"learning_rate": 8.037995308502625e-06,
"loss": 0.9328,
"step": 5640
},
{
"epoch": 1.6376832631731746,
"grad_norm": 3.288496971130371,
"learning_rate": 8.037232616991132e-06,
"loss": 0.7038,
"step": 5641
},
{
"epoch": 1.637973581071273,
"grad_norm": 3.1828713417053223,
"learning_rate": 8.036469813467707e-06,
"loss": 0.7033,
"step": 5642
},
{
"epoch": 1.6382638989693714,
"grad_norm": 3.8580126762390137,
"learning_rate": 8.03570689796048e-06,
"loss": 0.84,
"step": 5643
},
{
"epoch": 1.6385542168674698,
"grad_norm": 3.1978330612182617,
"learning_rate": 8.034943870497589e-06,
"loss": 0.7056,
"step": 5644
},
{
"epoch": 1.6388445347655685,
"grad_norm": 3.5237538814544678,
"learning_rate": 8.034180731107171e-06,
"loss": 0.8868,
"step": 5645
},
{
"epoch": 1.6391348526636667,
"grad_norm": 3.573692798614502,
"learning_rate": 8.033417479817371e-06,
"loss": 0.6922,
"step": 5646
},
{
"epoch": 1.6394251705617653,
"grad_norm": 3.6821346282958984,
"learning_rate": 8.03265411665634e-06,
"loss": 0.8068,
"step": 5647
},
{
"epoch": 1.6397154884598635,
"grad_norm": 3.5693955421447754,
"learning_rate": 8.031890641652228e-06,
"loss": 0.7738,
"step": 5648
},
{
"epoch": 1.6400058063579621,
"grad_norm": 3.874678134918213,
"learning_rate": 8.031127054833192e-06,
"loss": 0.7949,
"step": 5649
},
{
"epoch": 1.6402961242560603,
"grad_norm": 3.197110414505005,
"learning_rate": 8.030363356227393e-06,
"loss": 0.8176,
"step": 5650
},
{
"epoch": 1.640586442154159,
"grad_norm": 3.5319745540618896,
"learning_rate": 8.029599545862994e-06,
"loss": 0.8178,
"step": 5651
},
{
"epoch": 1.6408767600522571,
"grad_norm": 3.6435129642486572,
"learning_rate": 8.02883562376817e-06,
"loss": 0.8178,
"step": 5652
},
{
"epoch": 1.6411670779503558,
"grad_norm": 3.5644171237945557,
"learning_rate": 8.028071589971086e-06,
"loss": 0.7177,
"step": 5653
},
{
"epoch": 1.641457395848454,
"grad_norm": 3.39943528175354,
"learning_rate": 8.027307444499927e-06,
"loss": 0.745,
"step": 5654
},
{
"epoch": 1.6417477137465526,
"grad_norm": 4.047821521759033,
"learning_rate": 8.02654318738287e-06,
"loss": 0.8497,
"step": 5655
},
{
"epoch": 1.6420380316446508,
"grad_norm": 3.406195640563965,
"learning_rate": 8.0257788186481e-06,
"loss": 0.7035,
"step": 5656
},
{
"epoch": 1.6423283495427494,
"grad_norm": 3.4617931842803955,
"learning_rate": 8.02501433832381e-06,
"loss": 0.7898,
"step": 5657
},
{
"epoch": 1.6426186674408476,
"grad_norm": 3.187101364135742,
"learning_rate": 8.024249746438189e-06,
"loss": 0.6932,
"step": 5658
},
{
"epoch": 1.6429089853389462,
"grad_norm": 3.927638053894043,
"learning_rate": 8.023485043019437e-06,
"loss": 0.7909,
"step": 5659
},
{
"epoch": 1.6431993032370444,
"grad_norm": 4.530860424041748,
"learning_rate": 8.02272022809576e-06,
"loss": 0.8495,
"step": 5660
},
{
"epoch": 1.643489621135143,
"grad_norm": 3.5661168098449707,
"learning_rate": 8.021955301695357e-06,
"loss": 0.8213,
"step": 5661
},
{
"epoch": 1.6437799390332413,
"grad_norm": 4.098810195922852,
"learning_rate": 8.021190263846445e-06,
"loss": 0.9182,
"step": 5662
},
{
"epoch": 1.64407025693134,
"grad_norm": 3.5405988693237305,
"learning_rate": 8.020425114577232e-06,
"loss": 0.7886,
"step": 5663
},
{
"epoch": 1.644360574829438,
"grad_norm": 3.576150894165039,
"learning_rate": 8.01965985391594e-06,
"loss": 0.7849,
"step": 5664
},
{
"epoch": 1.6446508927275367,
"grad_norm": 3.515380620956421,
"learning_rate": 8.018894481890793e-06,
"loss": 0.7205,
"step": 5665
},
{
"epoch": 1.644941210625635,
"grad_norm": 3.65975284576416,
"learning_rate": 8.018128998530013e-06,
"loss": 0.7721,
"step": 5666
},
{
"epoch": 1.6452315285237336,
"grad_norm": 3.3606433868408203,
"learning_rate": 8.017363403861836e-06,
"loss": 0.7938,
"step": 5667
},
{
"epoch": 1.6455218464218317,
"grad_norm": 3.917895793914795,
"learning_rate": 8.016597697914492e-06,
"loss": 0.7639,
"step": 5668
},
{
"epoch": 1.6458121643199304,
"grad_norm": 3.221787452697754,
"learning_rate": 8.015831880716222e-06,
"loss": 0.6328,
"step": 5669
},
{
"epoch": 1.6461024822180288,
"grad_norm": 3.6997926235198975,
"learning_rate": 8.01506595229527e-06,
"loss": 0.7413,
"step": 5670
},
{
"epoch": 1.6463928001161272,
"grad_norm": 3.6128926277160645,
"learning_rate": 8.014299912679882e-06,
"loss": 0.82,
"step": 5671
},
{
"epoch": 1.6466831180142256,
"grad_norm": 3.536489963531494,
"learning_rate": 8.013533761898308e-06,
"loss": 0.7352,
"step": 5672
},
{
"epoch": 1.646973435912324,
"grad_norm": 3.36811900138855,
"learning_rate": 8.012767499978806e-06,
"loss": 0.7863,
"step": 5673
},
{
"epoch": 1.6472637538104224,
"grad_norm": 3.6035544872283936,
"learning_rate": 8.012001126949634e-06,
"loss": 0.7394,
"step": 5674
},
{
"epoch": 1.6475540717085209,
"grad_norm": 3.541083574295044,
"learning_rate": 8.011234642839057e-06,
"loss": 0.7212,
"step": 5675
},
{
"epoch": 1.6478443896066193,
"grad_norm": 3.6132876873016357,
"learning_rate": 8.010468047675339e-06,
"loss": 0.7709,
"step": 5676
},
{
"epoch": 1.6481347075047177,
"grad_norm": 3.5941474437713623,
"learning_rate": 8.009701341486755e-06,
"loss": 0.7479,
"step": 5677
},
{
"epoch": 1.648425025402816,
"grad_norm": 3.7650184631347656,
"learning_rate": 8.00893452430158e-06,
"loss": 0.8398,
"step": 5678
},
{
"epoch": 1.6487153433009145,
"grad_norm": 3.681375503540039,
"learning_rate": 8.008167596148094e-06,
"loss": 0.7961,
"step": 5679
},
{
"epoch": 1.649005661199013,
"grad_norm": 3.072575330734253,
"learning_rate": 8.007400557054581e-06,
"loss": 0.6448,
"step": 5680
},
{
"epoch": 1.6492959790971113,
"grad_norm": 3.290656566619873,
"learning_rate": 8.006633407049329e-06,
"loss": 0.7265,
"step": 5681
},
{
"epoch": 1.6495862969952098,
"grad_norm": 3.1598901748657227,
"learning_rate": 8.005866146160628e-06,
"loss": 0.6802,
"step": 5682
},
{
"epoch": 1.6498766148933082,
"grad_norm": 3.601827383041382,
"learning_rate": 8.005098774416779e-06,
"loss": 0.7517,
"step": 5683
},
{
"epoch": 1.6501669327914066,
"grad_norm": 4.136563777923584,
"learning_rate": 8.00433129184608e-06,
"loss": 0.8474,
"step": 5684
},
{
"epoch": 1.650457250689505,
"grad_norm": 3.642002582550049,
"learning_rate": 8.003563698476832e-06,
"loss": 0.7993,
"step": 5685
},
{
"epoch": 1.6507475685876034,
"grad_norm": 3.2611653804779053,
"learning_rate": 8.00279599433735e-06,
"loss": 0.9312,
"step": 5686
},
{
"epoch": 1.6510378864857018,
"grad_norm": 3.619309186935425,
"learning_rate": 8.002028179455941e-06,
"loss": 0.7925,
"step": 5687
},
{
"epoch": 1.6513282043838002,
"grad_norm": 2.8980655670166016,
"learning_rate": 8.001260253860926e-06,
"loss": 0.6433,
"step": 5688
},
{
"epoch": 1.6516185222818986,
"grad_norm": 3.9477474689483643,
"learning_rate": 8.000492217580623e-06,
"loss": 0.853,
"step": 5689
},
{
"epoch": 1.651908840179997,
"grad_norm": 3.8791873455047607,
"learning_rate": 7.999724070643357e-06,
"loss": 0.8406,
"step": 5690
},
{
"epoch": 1.6521991580780955,
"grad_norm": 4.422399044036865,
"learning_rate": 7.998955813077457e-06,
"loss": 1.0581,
"step": 5691
},
{
"epoch": 1.6524894759761939,
"grad_norm": 3.169612169265747,
"learning_rate": 7.998187444911259e-06,
"loss": 0.7056,
"step": 5692
},
{
"epoch": 1.6527797938742923,
"grad_norm": 3.5287580490112305,
"learning_rate": 7.997418966173098e-06,
"loss": 0.7648,
"step": 5693
},
{
"epoch": 1.6530701117723907,
"grad_norm": 3.7084598541259766,
"learning_rate": 7.996650376891314e-06,
"loss": 0.7283,
"step": 5694
},
{
"epoch": 1.6533604296704891,
"grad_norm": 3.714036226272583,
"learning_rate": 7.995881677094252e-06,
"loss": 0.8884,
"step": 5695
},
{
"epoch": 1.6536507475685878,
"grad_norm": 3.511685371398926,
"learning_rate": 7.995112866810264e-06,
"loss": 0.7522,
"step": 5696
},
{
"epoch": 1.653941065466686,
"grad_norm": 3.6127731800079346,
"learning_rate": 7.994343946067702e-06,
"loss": 0.7927,
"step": 5697
},
{
"epoch": 1.6542313833647846,
"grad_norm": 3.3412842750549316,
"learning_rate": 7.993574914894924e-06,
"loss": 0.8249,
"step": 5698
},
{
"epoch": 1.6545217012628828,
"grad_norm": 3.3941237926483154,
"learning_rate": 7.99280577332029e-06,
"loss": 0.8165,
"step": 5699
},
{
"epoch": 1.6548120191609814,
"grad_norm": 3.492751121520996,
"learning_rate": 7.992036521372168e-06,
"loss": 0.9081,
"step": 5700
},
{
"epoch": 1.6551023370590796,
"grad_norm": 3.8079779148101807,
"learning_rate": 7.991267159078926e-06,
"loss": 0.9421,
"step": 5701
},
{
"epoch": 1.6553926549571782,
"grad_norm": 3.5926706790924072,
"learning_rate": 7.990497686468937e-06,
"loss": 0.821,
"step": 5702
},
{
"epoch": 1.6556829728552764,
"grad_norm": 3.417275905609131,
"learning_rate": 7.989728103570582e-06,
"loss": 0.7135,
"step": 5703
},
{
"epoch": 1.655973290753375,
"grad_norm": 3.2997395992279053,
"learning_rate": 7.98895841041224e-06,
"loss": 0.8215,
"step": 5704
},
{
"epoch": 1.6562636086514733,
"grad_norm": 3.397256374359131,
"learning_rate": 7.988188607022297e-06,
"loss": 0.7057,
"step": 5705
},
{
"epoch": 1.656553926549572,
"grad_norm": 4.578166961669922,
"learning_rate": 7.987418693429145e-06,
"loss": 0.8764,
"step": 5706
},
{
"epoch": 1.65684424444767,
"grad_norm": 3.8914785385131836,
"learning_rate": 7.986648669661177e-06,
"loss": 0.9121,
"step": 5707
},
{
"epoch": 1.6571345623457687,
"grad_norm": 3.776986598968506,
"learning_rate": 7.985878535746791e-06,
"loss": 0.7753,
"step": 5708
},
{
"epoch": 1.657424880243867,
"grad_norm": 3.3599276542663574,
"learning_rate": 7.98510829171439e-06,
"loss": 0.818,
"step": 5709
},
{
"epoch": 1.6577151981419656,
"grad_norm": 3.421091318130493,
"learning_rate": 7.984337937592379e-06,
"loss": 0.7669,
"step": 5710
},
{
"epoch": 1.6580055160400637,
"grad_norm": 3.1329991817474365,
"learning_rate": 7.983567473409171e-06,
"loss": 0.7219,
"step": 5711
},
{
"epoch": 1.6582958339381624,
"grad_norm": 3.530151844024658,
"learning_rate": 7.982796899193177e-06,
"loss": 0.6851,
"step": 5712
},
{
"epoch": 1.6585861518362606,
"grad_norm": 3.30942702293396,
"learning_rate": 7.982026214972819e-06,
"loss": 0.7465,
"step": 5713
},
{
"epoch": 1.6588764697343592,
"grad_norm": 3.1759490966796875,
"learning_rate": 7.981255420776513e-06,
"loss": 0.6359,
"step": 5714
},
{
"epoch": 1.6591667876324574,
"grad_norm": 3.953688621520996,
"learning_rate": 7.980484516632693e-06,
"loss": 0.8722,
"step": 5715
},
{
"epoch": 1.659457105530556,
"grad_norm": 3.810572862625122,
"learning_rate": 7.979713502569787e-06,
"loss": 0.863,
"step": 5716
},
{
"epoch": 1.6597474234286542,
"grad_norm": 3.367386817932129,
"learning_rate": 7.97894237861623e-06,
"loss": 0.7079,
"step": 5717
},
{
"epoch": 1.6600377413267529,
"grad_norm": 3.8818647861480713,
"learning_rate": 7.97817114480046e-06,
"loss": 0.9224,
"step": 5718
},
{
"epoch": 1.660328059224851,
"grad_norm": 3.8525187969207764,
"learning_rate": 7.97739980115092e-06,
"loss": 0.8674,
"step": 5719
},
{
"epoch": 1.6606183771229497,
"grad_norm": 3.035203695297241,
"learning_rate": 7.976628347696057e-06,
"loss": 0.6624,
"step": 5720
},
{
"epoch": 1.660908695021048,
"grad_norm": 3.5461297035217285,
"learning_rate": 7.975856784464322e-06,
"loss": 0.7999,
"step": 5721
},
{
"epoch": 1.6611990129191465,
"grad_norm": 3.743105888366699,
"learning_rate": 7.975085111484169e-06,
"loss": 0.852,
"step": 5722
},
{
"epoch": 1.661489330817245,
"grad_norm": 3.713768482208252,
"learning_rate": 7.974313328784056e-06,
"loss": 0.8012,
"step": 5723
},
{
"epoch": 1.6617796487153433,
"grad_norm": 3.6123037338256836,
"learning_rate": 7.97354143639245e-06,
"loss": 0.8306,
"step": 5724
},
{
"epoch": 1.6620699666134418,
"grad_norm": 2.929441213607788,
"learning_rate": 7.972769434337815e-06,
"loss": 0.6391,
"step": 5725
},
{
"epoch": 1.6623602845115402,
"grad_norm": 3.306553602218628,
"learning_rate": 7.971997322648623e-06,
"loss": 0.7801,
"step": 5726
},
{
"epoch": 1.6626506024096386,
"grad_norm": 3.7010748386383057,
"learning_rate": 7.971225101353351e-06,
"loss": 0.8044,
"step": 5727
},
{
"epoch": 1.662940920307737,
"grad_norm": 3.7694780826568604,
"learning_rate": 7.970452770480474e-06,
"loss": 0.8357,
"step": 5728
},
{
"epoch": 1.6632312382058354,
"grad_norm": 3.188607692718506,
"learning_rate": 7.969680330058478e-06,
"loss": 0.8356,
"step": 5729
},
{
"epoch": 1.6635215561039338,
"grad_norm": 3.9263787269592285,
"learning_rate": 7.96890778011585e-06,
"loss": 0.8787,
"step": 5730
},
{
"epoch": 1.6638118740020322,
"grad_norm": 3.170591115951538,
"learning_rate": 7.968135120681082e-06,
"loss": 0.707,
"step": 5731
},
{
"epoch": 1.6641021919001306,
"grad_norm": 3.5900368690490723,
"learning_rate": 7.967362351782668e-06,
"loss": 0.7168,
"step": 5732
},
{
"epoch": 1.664392509798229,
"grad_norm": 3.5648303031921387,
"learning_rate": 7.966589473449109e-06,
"loss": 0.8968,
"step": 5733
},
{
"epoch": 1.6646828276963275,
"grad_norm": 3.489239454269409,
"learning_rate": 7.965816485708905e-06,
"loss": 0.7251,
"step": 5734
},
{
"epoch": 1.6649731455944259,
"grad_norm": 3.441934585571289,
"learning_rate": 7.96504338859057e-06,
"loss": 0.7385,
"step": 5735
},
{
"epoch": 1.6652634634925243,
"grad_norm": 3.39975905418396,
"learning_rate": 7.96427018212261e-06,
"loss": 0.7562,
"step": 5736
},
{
"epoch": 1.6655537813906227,
"grad_norm": 3.6406619548797607,
"learning_rate": 7.96349686633354e-06,
"loss": 0.8788,
"step": 5737
},
{
"epoch": 1.6658440992887211,
"grad_norm": 3.939983606338501,
"learning_rate": 7.962723441251882e-06,
"loss": 0.8964,
"step": 5738
},
{
"epoch": 1.6661344171868195,
"grad_norm": 3.801276445388794,
"learning_rate": 7.96194990690616e-06,
"loss": 0.884,
"step": 5739
},
{
"epoch": 1.666424735084918,
"grad_norm": 3.2761764526367188,
"learning_rate": 7.961176263324902e-06,
"loss": 0.7168,
"step": 5740
},
{
"epoch": 1.6667150529830164,
"grad_norm": 3.361765146255493,
"learning_rate": 7.960402510536635e-06,
"loss": 0.687,
"step": 5741
},
{
"epoch": 1.6670053708811148,
"grad_norm": 3.3800249099731445,
"learning_rate": 7.959628648569901e-06,
"loss": 0.8002,
"step": 5742
},
{
"epoch": 1.6672956887792132,
"grad_norm": 3.9530911445617676,
"learning_rate": 7.958854677453238e-06,
"loss": 0.8342,
"step": 5743
},
{
"epoch": 1.6675860066773116,
"grad_norm": 3.6186470985412598,
"learning_rate": 7.958080597215187e-06,
"loss": 0.7748,
"step": 5744
},
{
"epoch": 1.66787632457541,
"grad_norm": 3.4672844409942627,
"learning_rate": 7.957306407884298e-06,
"loss": 0.7663,
"step": 5745
},
{
"epoch": 1.6681666424735084,
"grad_norm": 4.060912609100342,
"learning_rate": 7.95653210948912e-06,
"loss": 0.7882,
"step": 5746
},
{
"epoch": 1.668456960371607,
"grad_norm": 3.9443535804748535,
"learning_rate": 7.955757702058213e-06,
"loss": 0.9355,
"step": 5747
},
{
"epoch": 1.6687472782697053,
"grad_norm": 3.683994770050049,
"learning_rate": 7.954983185620136e-06,
"loss": 0.6635,
"step": 5748
},
{
"epoch": 1.669037596167804,
"grad_norm": 3.9671192169189453,
"learning_rate": 7.95420856020345e-06,
"loss": 0.8924,
"step": 5749
},
{
"epoch": 1.669327914065902,
"grad_norm": 3.1241607666015625,
"learning_rate": 7.953433825836725e-06,
"loss": 0.65,
"step": 5750
},
{
"epoch": 1.6696182319640007,
"grad_norm": 3.5456268787384033,
"learning_rate": 7.952658982548533e-06,
"loss": 0.7186,
"step": 5751
},
{
"epoch": 1.669908549862099,
"grad_norm": 3.425567388534546,
"learning_rate": 7.95188403036745e-06,
"loss": 0.6548,
"step": 5752
},
{
"epoch": 1.6701988677601975,
"grad_norm": 3.700671672821045,
"learning_rate": 7.951108969322054e-06,
"loss": 0.8279,
"step": 5753
},
{
"epoch": 1.6704891856582957,
"grad_norm": 3.732058525085449,
"learning_rate": 7.95033379944093e-06,
"loss": 0.7564,
"step": 5754
},
{
"epoch": 1.6707795035563944,
"grad_norm": 3.4859352111816406,
"learning_rate": 7.949558520752667e-06,
"loss": 0.7317,
"step": 5755
},
{
"epoch": 1.6710698214544926,
"grad_norm": 3.258023738861084,
"learning_rate": 7.948783133285858e-06,
"loss": 0.7544,
"step": 5756
},
{
"epoch": 1.6713601393525912,
"grad_norm": 3.767179012298584,
"learning_rate": 7.948007637069095e-06,
"loss": 0.8025,
"step": 5757
},
{
"epoch": 1.6716504572506894,
"grad_norm": 3.410964012145996,
"learning_rate": 7.947232032130982e-06,
"loss": 0.6954,
"step": 5758
},
{
"epoch": 1.671940775148788,
"grad_norm": 3.8064308166503906,
"learning_rate": 7.94645631850012e-06,
"loss": 0.9072,
"step": 5759
},
{
"epoch": 1.6722310930468862,
"grad_norm": 3.1589906215667725,
"learning_rate": 7.945680496205117e-06,
"loss": 0.7262,
"step": 5760
},
{
"epoch": 1.6725214109449849,
"grad_norm": 3.672649621963501,
"learning_rate": 7.944904565274588e-06,
"loss": 0.8108,
"step": 5761
},
{
"epoch": 1.672811728843083,
"grad_norm": 3.2626302242279053,
"learning_rate": 7.944128525737147e-06,
"loss": 0.7403,
"step": 5762
},
{
"epoch": 1.6731020467411817,
"grad_norm": 3.6295340061187744,
"learning_rate": 7.943352377621414e-06,
"loss": 0.7882,
"step": 5763
},
{
"epoch": 1.6733923646392799,
"grad_norm": 4.048469543457031,
"learning_rate": 7.942576120956014e-06,
"loss": 0.8053,
"step": 5764
},
{
"epoch": 1.6736826825373785,
"grad_norm": 3.6602652072906494,
"learning_rate": 7.941799755769573e-06,
"loss": 0.7699,
"step": 5765
},
{
"epoch": 1.6739730004354767,
"grad_norm": 3.469912528991699,
"learning_rate": 7.941023282090727e-06,
"loss": 0.6628,
"step": 5766
},
{
"epoch": 1.6742633183335753,
"grad_norm": 3.6404995918273926,
"learning_rate": 7.940246699948107e-06,
"loss": 0.8513,
"step": 5767
},
{
"epoch": 1.6745536362316735,
"grad_norm": 4.017561435699463,
"learning_rate": 7.939470009370357e-06,
"loss": 0.918,
"step": 5768
},
{
"epoch": 1.6748439541297722,
"grad_norm": 3.269432544708252,
"learning_rate": 7.938693210386118e-06,
"loss": 0.7086,
"step": 5769
},
{
"epoch": 1.6751342720278704,
"grad_norm": 3.6618704795837402,
"learning_rate": 7.93791630302404e-06,
"loss": 0.8762,
"step": 5770
},
{
"epoch": 1.675424589925969,
"grad_norm": 3.3765363693237305,
"learning_rate": 7.937139287312777e-06,
"loss": 0.7739,
"step": 5771
},
{
"epoch": 1.6757149078240674,
"grad_norm": 3.6694111824035645,
"learning_rate": 7.93636216328098e-06,
"loss": 0.7754,
"step": 5772
},
{
"epoch": 1.6760052257221658,
"grad_norm": 3.989017963409424,
"learning_rate": 7.935584930957312e-06,
"loss": 0.8737,
"step": 5773
},
{
"epoch": 1.6762955436202642,
"grad_norm": 3.580270528793335,
"learning_rate": 7.934807590370438e-06,
"loss": 0.7978,
"step": 5774
},
{
"epoch": 1.6765858615183626,
"grad_norm": 3.720231771469116,
"learning_rate": 7.934030141549024e-06,
"loss": 0.851,
"step": 5775
},
{
"epoch": 1.676876179416461,
"grad_norm": 3.835939407348633,
"learning_rate": 7.933252584521743e-06,
"loss": 0.8481,
"step": 5776
},
{
"epoch": 1.6771664973145595,
"grad_norm": 3.7228312492370605,
"learning_rate": 7.93247491931727e-06,
"loss": 0.957,
"step": 5777
},
{
"epoch": 1.6774568152126579,
"grad_norm": 3.7690441608428955,
"learning_rate": 7.931697145964284e-06,
"loss": 0.8309,
"step": 5778
},
{
"epoch": 1.6777471331107563,
"grad_norm": 3.3121449947357178,
"learning_rate": 7.930919264491473e-06,
"loss": 0.7899,
"step": 5779
},
{
"epoch": 1.6780374510088547,
"grad_norm": 3.385662794113159,
"learning_rate": 7.930141274927522e-06,
"loss": 0.7839,
"step": 5780
},
{
"epoch": 1.6783277689069531,
"grad_norm": 3.8395118713378906,
"learning_rate": 7.929363177301124e-06,
"loss": 0.9903,
"step": 5781
},
{
"epoch": 1.6786180868050515,
"grad_norm": 3.8420722484588623,
"learning_rate": 7.928584971640974e-06,
"loss": 0.8054,
"step": 5782
},
{
"epoch": 1.67890840470315,
"grad_norm": 3.230956554412842,
"learning_rate": 7.927806657975775e-06,
"loss": 0.7696,
"step": 5783
},
{
"epoch": 1.6791987226012484,
"grad_norm": 3.2777044773101807,
"learning_rate": 7.927028236334224e-06,
"loss": 0.694,
"step": 5784
},
{
"epoch": 1.6794890404993468,
"grad_norm": 3.614997625350952,
"learning_rate": 7.926249706745036e-06,
"loss": 0.839,
"step": 5785
},
{
"epoch": 1.6797793583974452,
"grad_norm": 3.2601284980773926,
"learning_rate": 7.92547106923692e-06,
"loss": 0.7297,
"step": 5786
},
{
"epoch": 1.6800696762955436,
"grad_norm": 3.0316452980041504,
"learning_rate": 7.92469232383859e-06,
"loss": 0.768,
"step": 5787
},
{
"epoch": 1.680359994193642,
"grad_norm": 3.3039333820343018,
"learning_rate": 7.92391347057877e-06,
"loss": 0.7392,
"step": 5788
},
{
"epoch": 1.6806503120917404,
"grad_norm": 3.2324368953704834,
"learning_rate": 7.92313450948618e-06,
"loss": 0.655,
"step": 5789
},
{
"epoch": 1.6809406299898388,
"grad_norm": 3.5473809242248535,
"learning_rate": 7.92235544058955e-06,
"loss": 0.7821,
"step": 5790
},
{
"epoch": 1.6812309478879373,
"grad_norm": 3.683997392654419,
"learning_rate": 7.921576263917612e-06,
"loss": 0.7927,
"step": 5791
},
{
"epoch": 1.6815212657860357,
"grad_norm": 3.726501703262329,
"learning_rate": 7.920796979499098e-06,
"loss": 0.7179,
"step": 5792
},
{
"epoch": 1.681811583684134,
"grad_norm": 3.5410258769989014,
"learning_rate": 7.920017587362751e-06,
"loss": 0.6961,
"step": 5793
},
{
"epoch": 1.6821019015822325,
"grad_norm": 3.490867853164673,
"learning_rate": 7.919238087537317e-06,
"loss": 0.8215,
"step": 5794
},
{
"epoch": 1.682392219480331,
"grad_norm": 3.436814308166504,
"learning_rate": 7.91845848005154e-06,
"loss": 0.7662,
"step": 5795
},
{
"epoch": 1.6826825373784295,
"grad_norm": 2.991690158843994,
"learning_rate": 7.917678764934169e-06,
"loss": 0.7011,
"step": 5796
},
{
"epoch": 1.6829728552765277,
"grad_norm": 3.5436899662017822,
"learning_rate": 7.916898942213967e-06,
"loss": 0.6851,
"step": 5797
},
{
"epoch": 1.6832631731746264,
"grad_norm": 3.9489386081695557,
"learning_rate": 7.916119011919687e-06,
"loss": 0.9344,
"step": 5798
},
{
"epoch": 1.6835534910727246,
"grad_norm": 3.72562313079834,
"learning_rate": 7.915338974080098e-06,
"loss": 0.7195,
"step": 5799
},
{
"epoch": 1.6838438089708232,
"grad_norm": 3.375615358352661,
"learning_rate": 7.914558828723961e-06,
"loss": 0.6861,
"step": 5800
},
{
"epoch": 1.6841341268689214,
"grad_norm": 3.519691228866577,
"learning_rate": 7.913778575880054e-06,
"loss": 0.8167,
"step": 5801
},
{
"epoch": 1.68442444476702,
"grad_norm": 3.521460771560669,
"learning_rate": 7.912998215577147e-06,
"loss": 0.8164,
"step": 5802
},
{
"epoch": 1.6847147626651182,
"grad_norm": 3.0612387657165527,
"learning_rate": 7.912217747844022e-06,
"loss": 0.6171,
"step": 5803
},
{
"epoch": 1.6850050805632169,
"grad_norm": 3.277848243713379,
"learning_rate": 7.911437172709464e-06,
"loss": 0.6403,
"step": 5804
},
{
"epoch": 1.685295398461315,
"grad_norm": 3.4739081859588623,
"learning_rate": 7.910656490202258e-06,
"loss": 0.7629,
"step": 5805
},
{
"epoch": 1.6855857163594137,
"grad_norm": 3.8861570358276367,
"learning_rate": 7.909875700351193e-06,
"loss": 0.8584,
"step": 5806
},
{
"epoch": 1.6858760342575119,
"grad_norm": 3.6719019412994385,
"learning_rate": 7.909094803185071e-06,
"loss": 0.7888,
"step": 5807
},
{
"epoch": 1.6861663521556105,
"grad_norm": 3.4244160652160645,
"learning_rate": 7.908313798732685e-06,
"loss": 0.6949,
"step": 5808
},
{
"epoch": 1.6864566700537087,
"grad_norm": 3.7639153003692627,
"learning_rate": 7.907532687022841e-06,
"loss": 0.814,
"step": 5809
},
{
"epoch": 1.6867469879518073,
"grad_norm": 3.6842236518859863,
"learning_rate": 7.906751468084343e-06,
"loss": 0.7004,
"step": 5810
},
{
"epoch": 1.6870373058499055,
"grad_norm": 3.259575366973877,
"learning_rate": 7.905970141946006e-06,
"loss": 0.6729,
"step": 5811
},
{
"epoch": 1.6873276237480042,
"grad_norm": 3.651085138320923,
"learning_rate": 7.905188708636645e-06,
"loss": 0.7953,
"step": 5812
},
{
"epoch": 1.6876179416461023,
"grad_norm": 3.5897328853607178,
"learning_rate": 7.904407168185076e-06,
"loss": 0.773,
"step": 5813
},
{
"epoch": 1.687908259544201,
"grad_norm": 3.297179937362671,
"learning_rate": 7.903625520620122e-06,
"loss": 0.7771,
"step": 5814
},
{
"epoch": 1.6881985774422992,
"grad_norm": 3.8753912448883057,
"learning_rate": 7.902843765970611e-06,
"loss": 0.7852,
"step": 5815
},
{
"epoch": 1.6884888953403978,
"grad_norm": 3.782907247543335,
"learning_rate": 7.902061904265375e-06,
"loss": 0.7274,
"step": 5816
},
{
"epoch": 1.688779213238496,
"grad_norm": 3.083601713180542,
"learning_rate": 7.901279935533248e-06,
"loss": 0.7227,
"step": 5817
},
{
"epoch": 1.6890695311365946,
"grad_norm": 3.3517086505889893,
"learning_rate": 7.900497859803069e-06,
"loss": 0.6743,
"step": 5818
},
{
"epoch": 1.6893598490346928,
"grad_norm": 3.5704421997070312,
"learning_rate": 7.899715677103677e-06,
"loss": 0.7981,
"step": 5819
},
{
"epoch": 1.6896501669327915,
"grad_norm": 2.869518280029297,
"learning_rate": 7.898933387463924e-06,
"loss": 0.5827,
"step": 5820
},
{
"epoch": 1.6899404848308899,
"grad_norm": 3.6910226345062256,
"learning_rate": 7.898150990912657e-06,
"loss": 0.8739,
"step": 5821
},
{
"epoch": 1.6902308027289883,
"grad_norm": 3.580432415008545,
"learning_rate": 7.897368487478733e-06,
"loss": 0.8449,
"step": 5822
},
{
"epoch": 1.6905211206270867,
"grad_norm": 3.478239059448242,
"learning_rate": 7.896585877191007e-06,
"loss": 0.7331,
"step": 5823
},
{
"epoch": 1.6908114385251851,
"grad_norm": 3.5383105278015137,
"learning_rate": 7.895803160078344e-06,
"loss": 0.7373,
"step": 5824
},
{
"epoch": 1.6911017564232835,
"grad_norm": 3.06196928024292,
"learning_rate": 7.89502033616961e-06,
"loss": 0.7516,
"step": 5825
},
{
"epoch": 1.691392074321382,
"grad_norm": 3.9107048511505127,
"learning_rate": 7.894237405493675e-06,
"loss": 0.8451,
"step": 5826
},
{
"epoch": 1.6916823922194804,
"grad_norm": 3.3762965202331543,
"learning_rate": 7.893454368079413e-06,
"loss": 0.7507,
"step": 5827
},
{
"epoch": 1.6919727101175788,
"grad_norm": 3.90720534324646,
"learning_rate": 7.892671223955702e-06,
"loss": 0.8307,
"step": 5828
},
{
"epoch": 1.6922630280156772,
"grad_norm": 3.7784852981567383,
"learning_rate": 7.891887973151424e-06,
"loss": 0.8638,
"step": 5829
},
{
"epoch": 1.6925533459137756,
"grad_norm": 3.581059455871582,
"learning_rate": 7.891104615695463e-06,
"loss": 0.7242,
"step": 5830
},
{
"epoch": 1.692843663811874,
"grad_norm": 3.4262542724609375,
"learning_rate": 7.890321151616716e-06,
"loss": 0.7449,
"step": 5831
},
{
"epoch": 1.6931339817099724,
"grad_norm": 3.4586639404296875,
"learning_rate": 7.889537580944068e-06,
"loss": 0.7635,
"step": 5832
},
{
"epoch": 1.6934242996080708,
"grad_norm": 4.539180755615234,
"learning_rate": 7.888753903706422e-06,
"loss": 0.8506,
"step": 5833
},
{
"epoch": 1.6937146175061693,
"grad_norm": 3.6680805683135986,
"learning_rate": 7.887970119932678e-06,
"loss": 0.707,
"step": 5834
},
{
"epoch": 1.6940049354042677,
"grad_norm": 3.8652119636535645,
"learning_rate": 7.887186229651741e-06,
"loss": 0.8594,
"step": 5835
},
{
"epoch": 1.694295253302366,
"grad_norm": 3.5802907943725586,
"learning_rate": 7.886402232892525e-06,
"loss": 0.77,
"step": 5836
},
{
"epoch": 1.6945855712004645,
"grad_norm": 3.5474374294281006,
"learning_rate": 7.885618129683938e-06,
"loss": 0.806,
"step": 5837
},
{
"epoch": 1.694875889098563,
"grad_norm": 3.7476847171783447,
"learning_rate": 7.8848339200549e-06,
"loss": 0.8719,
"step": 5838
},
{
"epoch": 1.6951662069966613,
"grad_norm": 3.4608943462371826,
"learning_rate": 7.884049604034331e-06,
"loss": 0.8042,
"step": 5839
},
{
"epoch": 1.6954565248947597,
"grad_norm": 3.389352798461914,
"learning_rate": 7.883265181651158e-06,
"loss": 0.7396,
"step": 5840
},
{
"epoch": 1.6957468427928581,
"grad_norm": 3.1610846519470215,
"learning_rate": 7.882480652934307e-06,
"loss": 0.7559,
"step": 5841
},
{
"epoch": 1.6960371606909566,
"grad_norm": 3.6229166984558105,
"learning_rate": 7.881696017912716e-06,
"loss": 0.7203,
"step": 5842
},
{
"epoch": 1.696327478589055,
"grad_norm": 3.709913492202759,
"learning_rate": 7.880911276615319e-06,
"loss": 0.8945,
"step": 5843
},
{
"epoch": 1.6966177964871534,
"grad_norm": 3.5395514965057373,
"learning_rate": 7.880126429071057e-06,
"loss": 0.7933,
"step": 5844
},
{
"epoch": 1.6969081143852518,
"grad_norm": 3.6049327850341797,
"learning_rate": 7.879341475308876e-06,
"loss": 0.7339,
"step": 5845
},
{
"epoch": 1.6971984322833502,
"grad_norm": 3.444969415664673,
"learning_rate": 7.878556415357721e-06,
"loss": 0.8457,
"step": 5846
},
{
"epoch": 1.6974887501814488,
"grad_norm": 3.630948781967163,
"learning_rate": 7.877771249246551e-06,
"loss": 0.7315,
"step": 5847
},
{
"epoch": 1.697779068079547,
"grad_norm": 3.8517673015594482,
"learning_rate": 7.876985977004319e-06,
"loss": 0.8216,
"step": 5848
},
{
"epoch": 1.6980693859776457,
"grad_norm": 3.088366985321045,
"learning_rate": 7.876200598659984e-06,
"loss": 0.6817,
"step": 5849
},
{
"epoch": 1.6983597038757439,
"grad_norm": 3.610283374786377,
"learning_rate": 7.875415114242514e-06,
"loss": 0.7258,
"step": 5850
},
{
"epoch": 1.6986500217738425,
"grad_norm": 3.6045877933502197,
"learning_rate": 7.874629523780875e-06,
"loss": 0.7373,
"step": 5851
},
{
"epoch": 1.6989403396719407,
"grad_norm": 3.6890554428100586,
"learning_rate": 7.873843827304039e-06,
"loss": 0.9028,
"step": 5852
},
{
"epoch": 1.6992306575700393,
"grad_norm": 3.803805112838745,
"learning_rate": 7.873058024840985e-06,
"loss": 0.9551,
"step": 5853
},
{
"epoch": 1.6995209754681375,
"grad_norm": 3.7046024799346924,
"learning_rate": 7.87227211642069e-06,
"loss": 0.8835,
"step": 5854
},
{
"epoch": 1.6998112933662362,
"grad_norm": 3.598008155822754,
"learning_rate": 7.871486102072138e-06,
"loss": 0.81,
"step": 5855
},
{
"epoch": 1.7001016112643343,
"grad_norm": 3.314302921295166,
"learning_rate": 7.870699981824322e-06,
"loss": 0.8002,
"step": 5856
},
{
"epoch": 1.700391929162433,
"grad_norm": 3.438389301300049,
"learning_rate": 7.869913755706227e-06,
"loss": 0.697,
"step": 5857
},
{
"epoch": 1.7006822470605312,
"grad_norm": 3.140916585922241,
"learning_rate": 7.869127423746852e-06,
"loss": 0.7491,
"step": 5858
},
{
"epoch": 1.7009725649586298,
"grad_norm": 3.362424612045288,
"learning_rate": 7.868340985975195e-06,
"loss": 0.8557,
"step": 5859
},
{
"epoch": 1.701262882856728,
"grad_norm": 3.793604850769043,
"learning_rate": 7.867554442420262e-06,
"loss": 0.6942,
"step": 5860
},
{
"epoch": 1.7015532007548266,
"grad_norm": 3.624799966812134,
"learning_rate": 7.86676779311106e-06,
"loss": 0.7548,
"step": 5861
},
{
"epoch": 1.7018435186529248,
"grad_norm": 4.076056957244873,
"learning_rate": 7.865981038076598e-06,
"loss": 0.8502,
"step": 5862
},
{
"epoch": 1.7021338365510235,
"grad_norm": 3.5222671031951904,
"learning_rate": 7.865194177345894e-06,
"loss": 0.6433,
"step": 5863
},
{
"epoch": 1.7024241544491217,
"grad_norm": 3.4212605953216553,
"learning_rate": 7.864407210947965e-06,
"loss": 0.7633,
"step": 5864
},
{
"epoch": 1.7027144723472203,
"grad_norm": 3.3345491886138916,
"learning_rate": 7.863620138911833e-06,
"loss": 0.7564,
"step": 5865
},
{
"epoch": 1.7030047902453185,
"grad_norm": 3.045092821121216,
"learning_rate": 7.862832961266529e-06,
"loss": 0.7526,
"step": 5866
},
{
"epoch": 1.7032951081434171,
"grad_norm": 3.5737078189849854,
"learning_rate": 7.862045678041082e-06,
"loss": 0.7683,
"step": 5867
},
{
"epoch": 1.7035854260415153,
"grad_norm": 3.4689781665802,
"learning_rate": 7.861258289264524e-06,
"loss": 0.716,
"step": 5868
},
{
"epoch": 1.703875743939614,
"grad_norm": 3.78070068359375,
"learning_rate": 7.860470794965896e-06,
"loss": 0.7166,
"step": 5869
},
{
"epoch": 1.7041660618377121,
"grad_norm": 3.7463736534118652,
"learning_rate": 7.859683195174242e-06,
"loss": 0.8338,
"step": 5870
},
{
"epoch": 1.7044563797358108,
"grad_norm": 3.7229490280151367,
"learning_rate": 7.858895489918605e-06,
"loss": 0.8716,
"step": 5871
},
{
"epoch": 1.7047466976339092,
"grad_norm": 3.8799808025360107,
"learning_rate": 7.858107679228037e-06,
"loss": 0.7594,
"step": 5872
},
{
"epoch": 1.7050370155320076,
"grad_norm": 3.2937357425689697,
"learning_rate": 7.857319763131592e-06,
"loss": 0.6893,
"step": 5873
},
{
"epoch": 1.705327333430106,
"grad_norm": 3.463261127471924,
"learning_rate": 7.856531741658328e-06,
"loss": 0.7997,
"step": 5874
},
{
"epoch": 1.7056176513282044,
"grad_norm": 3.727832317352295,
"learning_rate": 7.855743614837307e-06,
"loss": 0.7482,
"step": 5875
},
{
"epoch": 1.7059079692263028,
"grad_norm": 3.596024990081787,
"learning_rate": 7.854955382697597e-06,
"loss": 0.6919,
"step": 5876
},
{
"epoch": 1.7061982871244012,
"grad_norm": 3.800488233566284,
"learning_rate": 7.854167045268265e-06,
"loss": 0.9058,
"step": 5877
},
{
"epoch": 1.7064886050224997,
"grad_norm": 3.0563924312591553,
"learning_rate": 7.853378602578381e-06,
"loss": 0.6268,
"step": 5878
},
{
"epoch": 1.706778922920598,
"grad_norm": 4.0375494956970215,
"learning_rate": 7.85259005465703e-06,
"loss": 0.8667,
"step": 5879
},
{
"epoch": 1.7070692408186965,
"grad_norm": 3.549715995788574,
"learning_rate": 7.851801401533288e-06,
"loss": 0.6337,
"step": 5880
},
{
"epoch": 1.707359558716795,
"grad_norm": 3.2920758724212646,
"learning_rate": 7.851012643236244e-06,
"loss": 0.6598,
"step": 5881
},
{
"epoch": 1.7076498766148933,
"grad_norm": 2.9315907955169678,
"learning_rate": 7.850223779794983e-06,
"loss": 0.6499,
"step": 5882
},
{
"epoch": 1.7079401945129917,
"grad_norm": 3.107271432876587,
"learning_rate": 7.849434811238601e-06,
"loss": 0.6202,
"step": 5883
},
{
"epoch": 1.7082305124110901,
"grad_norm": 3.9191412925720215,
"learning_rate": 7.848645737596193e-06,
"loss": 0.887,
"step": 5884
},
{
"epoch": 1.7085208303091886,
"grad_norm": 3.584061861038208,
"learning_rate": 7.847856558896863e-06,
"loss": 0.8037,
"step": 5885
},
{
"epoch": 1.708811148207287,
"grad_norm": 3.5416791439056396,
"learning_rate": 7.847067275169711e-06,
"loss": 0.8083,
"step": 5886
},
{
"epoch": 1.7091014661053854,
"grad_norm": 3.7633187770843506,
"learning_rate": 7.846277886443849e-06,
"loss": 0.7173,
"step": 5887
},
{
"epoch": 1.7093917840034838,
"grad_norm": 3.4615838527679443,
"learning_rate": 7.845488392748387e-06,
"loss": 0.7684,
"step": 5888
},
{
"epoch": 1.7096821019015822,
"grad_norm": 3.8253400325775146,
"learning_rate": 7.844698794112444e-06,
"loss": 0.7963,
"step": 5889
},
{
"epoch": 1.7099724197996806,
"grad_norm": 3.686365842819214,
"learning_rate": 7.843909090565136e-06,
"loss": 0.7613,
"step": 5890
},
{
"epoch": 1.710262737697779,
"grad_norm": 3.3100762367248535,
"learning_rate": 7.843119282135592e-06,
"loss": 0.743,
"step": 5891
},
{
"epoch": 1.7105530555958774,
"grad_norm": 3.4483158588409424,
"learning_rate": 7.842329368852935e-06,
"loss": 0.7322,
"step": 5892
},
{
"epoch": 1.7108433734939759,
"grad_norm": 3.625225305557251,
"learning_rate": 7.841539350746299e-06,
"loss": 0.7968,
"step": 5893
},
{
"epoch": 1.7111336913920743,
"grad_norm": 3.4722776412963867,
"learning_rate": 7.840749227844819e-06,
"loss": 0.7476,
"step": 5894
},
{
"epoch": 1.7114240092901727,
"grad_norm": 3.5864033699035645,
"learning_rate": 7.839959000177637e-06,
"loss": 0.7872,
"step": 5895
},
{
"epoch": 1.7117143271882713,
"grad_norm": 3.2345564365386963,
"learning_rate": 7.839168667773891e-06,
"loss": 0.7775,
"step": 5896
},
{
"epoch": 1.7120046450863695,
"grad_norm": 3.407197952270508,
"learning_rate": 7.838378230662732e-06,
"loss": 0.7034,
"step": 5897
},
{
"epoch": 1.7122949629844681,
"grad_norm": 3.791569948196411,
"learning_rate": 7.837587688873314e-06,
"loss": 0.782,
"step": 5898
},
{
"epoch": 1.7125852808825663,
"grad_norm": 4.092060089111328,
"learning_rate": 7.836797042434785e-06,
"loss": 0.8197,
"step": 5899
},
{
"epoch": 1.712875598780665,
"grad_norm": 3.3512213230133057,
"learning_rate": 7.836006291376307e-06,
"loss": 0.6995,
"step": 5900
},
{
"epoch": 1.7131659166787632,
"grad_norm": 3.657559394836426,
"learning_rate": 7.835215435727042e-06,
"loss": 0.7018,
"step": 5901
},
{
"epoch": 1.7134562345768618,
"grad_norm": 3.197721481323242,
"learning_rate": 7.834424475516158e-06,
"loss": 0.755,
"step": 5902
},
{
"epoch": 1.71374655247496,
"grad_norm": 3.3309671878814697,
"learning_rate": 7.833633410772823e-06,
"loss": 0.7921,
"step": 5903
},
{
"epoch": 1.7140368703730586,
"grad_norm": 3.4525208473205566,
"learning_rate": 7.832842241526212e-06,
"loss": 0.7811,
"step": 5904
},
{
"epoch": 1.7143271882711568,
"grad_norm": 3.945049285888672,
"learning_rate": 7.832050967805504e-06,
"loss": 0.702,
"step": 5905
},
{
"epoch": 1.7146175061692555,
"grad_norm": 3.4726674556732178,
"learning_rate": 7.83125958963988e-06,
"loss": 0.7474,
"step": 5906
},
{
"epoch": 1.7149078240673536,
"grad_norm": 3.5951087474823,
"learning_rate": 7.830468107058527e-06,
"loss": 0.7378,
"step": 5907
},
{
"epoch": 1.7151981419654523,
"grad_norm": 3.877894401550293,
"learning_rate": 7.829676520090632e-06,
"loss": 0.855,
"step": 5908
},
{
"epoch": 1.7154884598635505,
"grad_norm": 3.470466375350952,
"learning_rate": 7.828884828765391e-06,
"loss": 0.7057,
"step": 5909
},
{
"epoch": 1.715778777761649,
"grad_norm": 3.618359088897705,
"learning_rate": 7.828093033112e-06,
"loss": 0.8365,
"step": 5910
},
{
"epoch": 1.7160690956597473,
"grad_norm": 3.4028820991516113,
"learning_rate": 7.827301133159659e-06,
"loss": 0.8622,
"step": 5911
},
{
"epoch": 1.716359413557846,
"grad_norm": 3.890469789505005,
"learning_rate": 7.826509128937576e-06,
"loss": 0.7958,
"step": 5912
},
{
"epoch": 1.7166497314559441,
"grad_norm": 3.6213538646698,
"learning_rate": 7.825717020474957e-06,
"loss": 0.8028,
"step": 5913
},
{
"epoch": 1.7169400493540428,
"grad_norm": 3.528296709060669,
"learning_rate": 7.824924807801015e-06,
"loss": 0.8284,
"step": 5914
},
{
"epoch": 1.717230367252141,
"grad_norm": 3.321072816848755,
"learning_rate": 7.824132490944968e-06,
"loss": 0.7871,
"step": 5915
},
{
"epoch": 1.7175206851502396,
"grad_norm": 3.2413792610168457,
"learning_rate": 7.823340069936035e-06,
"loss": 0.7666,
"step": 5916
},
{
"epoch": 1.7178110030483378,
"grad_norm": 4.080096244812012,
"learning_rate": 7.82254754480344e-06,
"loss": 0.7143,
"step": 5917
},
{
"epoch": 1.7181013209464364,
"grad_norm": 3.3351078033447266,
"learning_rate": 7.821754915576415e-06,
"loss": 0.8247,
"step": 5918
},
{
"epoch": 1.7183916388445346,
"grad_norm": 3.2570137977600098,
"learning_rate": 7.820962182284183e-06,
"loss": 0.6952,
"step": 5919
},
{
"epoch": 1.7186819567426332,
"grad_norm": 3.4597902297973633,
"learning_rate": 7.820169344955991e-06,
"loss": 0.6665,
"step": 5920
},
{
"epoch": 1.7189722746407314,
"grad_norm": 3.462433099746704,
"learning_rate": 7.819376403621068e-06,
"loss": 0.7972,
"step": 5921
},
{
"epoch": 1.71926259253883,
"grad_norm": 3.6604247093200684,
"learning_rate": 7.818583358308664e-06,
"loss": 0.747,
"step": 5922
},
{
"epoch": 1.7195529104369285,
"grad_norm": 3.404092311859131,
"learning_rate": 7.817790209048025e-06,
"loss": 0.7847,
"step": 5923
},
{
"epoch": 1.719843228335027,
"grad_norm": 3.8753247261047363,
"learning_rate": 7.8169969558684e-06,
"loss": 0.7468,
"step": 5924
},
{
"epoch": 1.7201335462331253,
"grad_norm": 3.532658338546753,
"learning_rate": 7.816203598799046e-06,
"loss": 0.7734,
"step": 5925
},
{
"epoch": 1.7204238641312237,
"grad_norm": 3.13362193107605,
"learning_rate": 7.815410137869222e-06,
"loss": 0.6992,
"step": 5926
},
{
"epoch": 1.7207141820293221,
"grad_norm": 3.5808610916137695,
"learning_rate": 7.814616573108188e-06,
"loss": 0.7753,
"step": 5927
},
{
"epoch": 1.7210044999274206,
"grad_norm": 3.5286667346954346,
"learning_rate": 7.81382290454521e-06,
"loss": 0.6765,
"step": 5928
},
{
"epoch": 1.721294817825519,
"grad_norm": 3.8136179447174072,
"learning_rate": 7.813029132209562e-06,
"loss": 0.8816,
"step": 5929
},
{
"epoch": 1.7215851357236174,
"grad_norm": 3.408217668533325,
"learning_rate": 7.812235256130515e-06,
"loss": 0.7641,
"step": 5930
},
{
"epoch": 1.7218754536217158,
"grad_norm": 3.4473049640655518,
"learning_rate": 7.811441276337348e-06,
"loss": 0.7553,
"step": 5931
},
{
"epoch": 1.7221657715198142,
"grad_norm": 3.727487087249756,
"learning_rate": 7.810647192859344e-06,
"loss": 0.8163,
"step": 5932
},
{
"epoch": 1.7224560894179126,
"grad_norm": 3.421032667160034,
"learning_rate": 7.809853005725784e-06,
"loss": 0.9554,
"step": 5933
},
{
"epoch": 1.722746407316011,
"grad_norm": 3.630430221557617,
"learning_rate": 7.809058714965962e-06,
"loss": 0.719,
"step": 5934
},
{
"epoch": 1.7230367252141094,
"grad_norm": 3.216792583465576,
"learning_rate": 7.80826432060917e-06,
"loss": 0.7135,
"step": 5935
},
{
"epoch": 1.7233270431122079,
"grad_norm": 3.312319278717041,
"learning_rate": 7.807469822684704e-06,
"loss": 0.7871,
"step": 5936
},
{
"epoch": 1.7236173610103063,
"grad_norm": 3.6939849853515625,
"learning_rate": 7.806675221221862e-06,
"loss": 0.7946,
"step": 5937
},
{
"epoch": 1.7239076789084047,
"grad_norm": 3.2859673500061035,
"learning_rate": 7.805880516249955e-06,
"loss": 0.742,
"step": 5938
},
{
"epoch": 1.724197996806503,
"grad_norm": 3.6563122272491455,
"learning_rate": 7.805085707798288e-06,
"loss": 0.7939,
"step": 5939
},
{
"epoch": 1.7244883147046015,
"grad_norm": 3.717435598373413,
"learning_rate": 7.804290795896172e-06,
"loss": 0.7775,
"step": 5940
},
{
"epoch": 1.7247786326027,
"grad_norm": 3.4693424701690674,
"learning_rate": 7.803495780572925e-06,
"loss": 0.7695,
"step": 5941
},
{
"epoch": 1.7250689505007983,
"grad_norm": 3.7334964275360107,
"learning_rate": 7.802700661857864e-06,
"loss": 0.853,
"step": 5942
},
{
"epoch": 1.7253592683988968,
"grad_norm": 3.2945621013641357,
"learning_rate": 7.801905439780317e-06,
"loss": 0.8119,
"step": 5943
},
{
"epoch": 1.7256495862969952,
"grad_norm": 3.5244734287261963,
"learning_rate": 7.80111011436961e-06,
"loss": 0.7805,
"step": 5944
},
{
"epoch": 1.7259399041950936,
"grad_norm": 3.339840888977051,
"learning_rate": 7.800314685655072e-06,
"loss": 0.7999,
"step": 5945
},
{
"epoch": 1.726230222093192,
"grad_norm": 3.149946928024292,
"learning_rate": 7.79951915366604e-06,
"loss": 0.7761,
"step": 5946
},
{
"epoch": 1.7265205399912906,
"grad_norm": 3.8940494060516357,
"learning_rate": 7.798723518431852e-06,
"loss": 0.803,
"step": 5947
},
{
"epoch": 1.7268108578893888,
"grad_norm": 3.4763505458831787,
"learning_rate": 7.797927779981854e-06,
"loss": 0.7353,
"step": 5948
},
{
"epoch": 1.7271011757874875,
"grad_norm": 3.4645235538482666,
"learning_rate": 7.797131938345386e-06,
"loss": 0.6931,
"step": 5949
},
{
"epoch": 1.7273914936855856,
"grad_norm": 3.8292295932769775,
"learning_rate": 7.796335993551805e-06,
"loss": 0.806,
"step": 5950
},
{
"epoch": 1.7276818115836843,
"grad_norm": 3.6954762935638428,
"learning_rate": 7.79553994563046e-06,
"loss": 0.8108,
"step": 5951
},
{
"epoch": 1.7279721294817825,
"grad_norm": 3.1089465618133545,
"learning_rate": 7.794743794610713e-06,
"loss": 0.668,
"step": 5952
},
{
"epoch": 1.728262447379881,
"grad_norm": 3.7287204265594482,
"learning_rate": 7.793947540521922e-06,
"loss": 0.7968,
"step": 5953
},
{
"epoch": 1.7285527652779793,
"grad_norm": 3.2793920040130615,
"learning_rate": 7.793151183393458e-06,
"loss": 0.7453,
"step": 5954
},
{
"epoch": 1.728843083176078,
"grad_norm": 3.862212896347046,
"learning_rate": 7.792354723254682e-06,
"loss": 0.8377,
"step": 5955
},
{
"epoch": 1.7291334010741761,
"grad_norm": 3.502390146255493,
"learning_rate": 7.791558160134975e-06,
"loss": 0.7483,
"step": 5956
},
{
"epoch": 1.7294237189722748,
"grad_norm": 3.9124982357025146,
"learning_rate": 7.790761494063712e-06,
"loss": 0.7549,
"step": 5957
},
{
"epoch": 1.729714036870373,
"grad_norm": 3.570953845977783,
"learning_rate": 7.789964725070269e-06,
"loss": 0.8017,
"step": 5958
},
{
"epoch": 1.7300043547684716,
"grad_norm": 3.851487874984741,
"learning_rate": 7.789167853184036e-06,
"loss": 0.8175,
"step": 5959
},
{
"epoch": 1.7302946726665698,
"grad_norm": 3.938213348388672,
"learning_rate": 7.7883708784344e-06,
"loss": 0.7864,
"step": 5960
},
{
"epoch": 1.7305849905646684,
"grad_norm": 3.95170521736145,
"learning_rate": 7.787573800850752e-06,
"loss": 0.8373,
"step": 5961
},
{
"epoch": 1.7308753084627666,
"grad_norm": 3.3376810550689697,
"learning_rate": 7.786776620462488e-06,
"loss": 0.7517,
"step": 5962
},
{
"epoch": 1.7311656263608652,
"grad_norm": 3.5237679481506348,
"learning_rate": 7.785979337299008e-06,
"loss": 0.8221,
"step": 5963
},
{
"epoch": 1.7314559442589634,
"grad_norm": 3.8222129344940186,
"learning_rate": 7.785181951389718e-06,
"loss": 0.8373,
"step": 5964
},
{
"epoch": 1.731746262157062,
"grad_norm": 3.338149070739746,
"learning_rate": 7.784384462764019e-06,
"loss": 0.7124,
"step": 5965
},
{
"epoch": 1.7320365800551603,
"grad_norm": 3.3781659603118896,
"learning_rate": 7.783586871451328e-06,
"loss": 0.7377,
"step": 5966
},
{
"epoch": 1.732326897953259,
"grad_norm": 3.5843288898468018,
"learning_rate": 7.782789177481057e-06,
"loss": 0.7315,
"step": 5967
},
{
"epoch": 1.732617215851357,
"grad_norm": 3.395334005355835,
"learning_rate": 7.781991380882627e-06,
"loss": 0.8184,
"step": 5968
},
{
"epoch": 1.7329075337494557,
"grad_norm": 3.441681385040283,
"learning_rate": 7.781193481685459e-06,
"loss": 0.8113,
"step": 5969
},
{
"epoch": 1.733197851647554,
"grad_norm": 3.6689629554748535,
"learning_rate": 7.780395479918979e-06,
"loss": 0.7977,
"step": 5970
},
{
"epoch": 1.7334881695456525,
"grad_norm": 3.465517520904541,
"learning_rate": 7.779597375612616e-06,
"loss": 0.8234,
"step": 5971
},
{
"epoch": 1.733778487443751,
"grad_norm": 3.51955246925354,
"learning_rate": 7.778799168795804e-06,
"loss": 0.7416,
"step": 5972
},
{
"epoch": 1.7340688053418494,
"grad_norm": 3.4402823448181152,
"learning_rate": 7.778000859497983e-06,
"loss": 0.7273,
"step": 5973
},
{
"epoch": 1.7343591232399478,
"grad_norm": 3.8265280723571777,
"learning_rate": 7.777202447748592e-06,
"loss": 0.8453,
"step": 5974
},
{
"epoch": 1.7346494411380462,
"grad_norm": 3.3544716835021973,
"learning_rate": 7.776403933577077e-06,
"loss": 0.6991,
"step": 5975
},
{
"epoch": 1.7349397590361446,
"grad_norm": 3.417309045791626,
"learning_rate": 7.775605317012886e-06,
"loss": 0.7992,
"step": 5976
},
{
"epoch": 1.735230076934243,
"grad_norm": 3.171778678894043,
"learning_rate": 7.774806598085473e-06,
"loss": 0.6875,
"step": 5977
},
{
"epoch": 1.7355203948323414,
"grad_norm": 3.8337888717651367,
"learning_rate": 7.774007776824293e-06,
"loss": 0.8176,
"step": 5978
},
{
"epoch": 1.7358107127304399,
"grad_norm": 3.4257326126098633,
"learning_rate": 7.77320885325881e-06,
"loss": 0.7383,
"step": 5979
},
{
"epoch": 1.7361010306285383,
"grad_norm": 3.621321201324463,
"learning_rate": 7.772409827418481e-06,
"loss": 0.8088,
"step": 5980
},
{
"epoch": 1.7363913485266367,
"grad_norm": 3.9669549465179443,
"learning_rate": 7.77161069933278e-06,
"loss": 0.8997,
"step": 5981
},
{
"epoch": 1.736681666424735,
"grad_norm": 3.9241344928741455,
"learning_rate": 7.770811469031176e-06,
"loss": 0.9407,
"step": 5982
},
{
"epoch": 1.7369719843228335,
"grad_norm": 3.7991113662719727,
"learning_rate": 7.770012136543144e-06,
"loss": 0.6812,
"step": 5983
},
{
"epoch": 1.737262302220932,
"grad_norm": 3.605419158935547,
"learning_rate": 7.769212701898166e-06,
"loss": 0.7869,
"step": 5984
},
{
"epoch": 1.7375526201190303,
"grad_norm": 3.2687923908233643,
"learning_rate": 7.76841316512572e-06,
"loss": 0.7058,
"step": 5985
},
{
"epoch": 1.7378429380171287,
"grad_norm": 3.817347288131714,
"learning_rate": 7.767613526255296e-06,
"loss": 0.8495,
"step": 5986
},
{
"epoch": 1.7381332559152272,
"grad_norm": 3.1826589107513428,
"learning_rate": 7.766813785316382e-06,
"loss": 0.792,
"step": 5987
},
{
"epoch": 1.7384235738133256,
"grad_norm": 3.6973764896392822,
"learning_rate": 7.766013942338476e-06,
"loss": 0.7691,
"step": 5988
},
{
"epoch": 1.738713891711424,
"grad_norm": 3.428189992904663,
"learning_rate": 7.765213997351072e-06,
"loss": 0.8026,
"step": 5989
},
{
"epoch": 1.7390042096095224,
"grad_norm": 3.3443777561187744,
"learning_rate": 7.764413950383674e-06,
"loss": 0.7425,
"step": 5990
},
{
"epoch": 1.7392945275076208,
"grad_norm": 2.8721110820770264,
"learning_rate": 7.763613801465785e-06,
"loss": 0.6768,
"step": 5991
},
{
"epoch": 1.7395848454057192,
"grad_norm": 3.564232587814331,
"learning_rate": 7.762813550626917e-06,
"loss": 0.6933,
"step": 5992
},
{
"epoch": 1.7398751633038176,
"grad_norm": 3.7007267475128174,
"learning_rate": 7.76201319789658e-06,
"loss": 0.8681,
"step": 5993
},
{
"epoch": 1.740165481201916,
"grad_norm": 3.5045223236083984,
"learning_rate": 7.761212743304294e-06,
"loss": 0.7965,
"step": 5994
},
{
"epoch": 1.7404557991000145,
"grad_norm": 3.9651434421539307,
"learning_rate": 7.760412186879579e-06,
"loss": 0.8799,
"step": 5995
},
{
"epoch": 1.7407461169981129,
"grad_norm": 3.1684725284576416,
"learning_rate": 7.759611528651954e-06,
"loss": 0.7174,
"step": 5996
},
{
"epoch": 1.7410364348962113,
"grad_norm": 3.4137959480285645,
"learning_rate": 7.758810768650954e-06,
"loss": 0.781,
"step": 5997
},
{
"epoch": 1.74132675279431,
"grad_norm": 3.7508652210235596,
"learning_rate": 7.758009906906107e-06,
"loss": 0.8172,
"step": 5998
},
{
"epoch": 1.7416170706924081,
"grad_norm": 3.002896308898926,
"learning_rate": 7.75720894344695e-06,
"loss": 0.7221,
"step": 5999
},
{
"epoch": 1.7419073885905068,
"grad_norm": 3.63832426071167,
"learning_rate": 7.75640787830302e-06,
"loss": 0.7877,
"step": 6000
},
{
"epoch": 1.7419073885905068,
"eval_loss": 1.1696195602416992,
"eval_runtime": 13.3242,
"eval_samples_per_second": 30.02,
"eval_steps_per_second": 3.753,
"step": 6000
},
{
"epoch": 1.742197706488605,
"grad_norm": 3.424290657043457,
"learning_rate": 7.755606711503861e-06,
"loss": 0.8493,
"step": 6001
},
{
"epoch": 1.7424880243867036,
"grad_norm": 3.4848201274871826,
"learning_rate": 7.75480544307902e-06,
"loss": 0.7085,
"step": 6002
},
{
"epoch": 1.7427783422848018,
"grad_norm": 3.4856338500976562,
"learning_rate": 7.754004073058048e-06,
"loss": 0.8014,
"step": 6003
},
{
"epoch": 1.7430686601829004,
"grad_norm": 3.9823102951049805,
"learning_rate": 7.753202601470499e-06,
"loss": 0.9238,
"step": 6004
},
{
"epoch": 1.7433589780809986,
"grad_norm": 3.394909620285034,
"learning_rate": 7.752401028345932e-06,
"loss": 0.8048,
"step": 6005
},
{
"epoch": 1.7436492959790972,
"grad_norm": 3.9474101066589355,
"learning_rate": 7.751599353713906e-06,
"loss": 0.8962,
"step": 6006
},
{
"epoch": 1.7439396138771954,
"grad_norm": 3.826502799987793,
"learning_rate": 7.750797577603988e-06,
"loss": 0.8611,
"step": 6007
},
{
"epoch": 1.744229931775294,
"grad_norm": 3.7918648719787598,
"learning_rate": 7.749995700045746e-06,
"loss": 0.781,
"step": 6008
},
{
"epoch": 1.7445202496733923,
"grad_norm": 3.3785643577575684,
"learning_rate": 7.749193721068754e-06,
"loss": 0.7255,
"step": 6009
},
{
"epoch": 1.744810567571491,
"grad_norm": 2.9595866203308105,
"learning_rate": 7.748391640702588e-06,
"loss": 0.752,
"step": 6010
},
{
"epoch": 1.745100885469589,
"grad_norm": 3.2847795486450195,
"learning_rate": 7.74758945897683e-06,
"loss": 0.6969,
"step": 6011
},
{
"epoch": 1.7453912033676877,
"grad_norm": 3.771801233291626,
"learning_rate": 7.746787175921065e-06,
"loss": 0.7866,
"step": 6012
},
{
"epoch": 1.745681521265786,
"grad_norm": 3.230302333831787,
"learning_rate": 7.745984791564876e-06,
"loss": 0.7506,
"step": 6013
},
{
"epoch": 1.7459718391638845,
"grad_norm": 4.036153316497803,
"learning_rate": 7.745182305937859e-06,
"loss": 1.0717,
"step": 6014
},
{
"epoch": 1.7462621570619827,
"grad_norm": 3.5328006744384766,
"learning_rate": 7.744379719069607e-06,
"loss": 0.791,
"step": 6015
},
{
"epoch": 1.7465524749600814,
"grad_norm": 3.5628857612609863,
"learning_rate": 7.74357703098972e-06,
"loss": 0.7866,
"step": 6016
},
{
"epoch": 1.7468427928581796,
"grad_norm": 3.3404319286346436,
"learning_rate": 7.742774241727801e-06,
"loss": 0.7193,
"step": 6017
},
{
"epoch": 1.7471331107562782,
"grad_norm": 3.2553791999816895,
"learning_rate": 7.741971351313458e-06,
"loss": 0.7112,
"step": 6018
},
{
"epoch": 1.7474234286543764,
"grad_norm": 3.824651002883911,
"learning_rate": 7.7411683597763e-06,
"loss": 0.7889,
"step": 6019
},
{
"epoch": 1.747713746552475,
"grad_norm": 2.963634967803955,
"learning_rate": 7.740365267145937e-06,
"loss": 0.6034,
"step": 6020
},
{
"epoch": 1.7480040644505732,
"grad_norm": 3.501497268676758,
"learning_rate": 7.739562073451994e-06,
"loss": 0.7022,
"step": 6021
},
{
"epoch": 1.7482943823486718,
"grad_norm": 3.259615898132324,
"learning_rate": 7.738758778724087e-06,
"loss": 0.7075,
"step": 6022
},
{
"epoch": 1.7485847002467703,
"grad_norm": 3.740983009338379,
"learning_rate": 7.737955382991844e-06,
"loss": 0.8299,
"step": 6023
},
{
"epoch": 1.7488750181448687,
"grad_norm": 3.5070557594299316,
"learning_rate": 7.737151886284893e-06,
"loss": 0.7363,
"step": 6024
},
{
"epoch": 1.749165336042967,
"grad_norm": 3.7931597232818604,
"learning_rate": 7.736348288632866e-06,
"loss": 0.8515,
"step": 6025
},
{
"epoch": 1.7494556539410655,
"grad_norm": 3.109853744506836,
"learning_rate": 7.7355445900654e-06,
"loss": 0.669,
"step": 6026
},
{
"epoch": 1.749745971839164,
"grad_norm": 3.4060046672821045,
"learning_rate": 7.734740790612137e-06,
"loss": 0.8745,
"step": 6027
},
{
"epoch": 1.7500362897372623,
"grad_norm": 3.7956717014312744,
"learning_rate": 7.733936890302716e-06,
"loss": 0.8567,
"step": 6028
},
{
"epoch": 1.7503266076353607,
"grad_norm": 3.112710475921631,
"learning_rate": 7.733132889166788e-06,
"loss": 0.7221,
"step": 6029
},
{
"epoch": 1.7506169255334592,
"grad_norm": 3.7839791774749756,
"learning_rate": 7.732328787234006e-06,
"loss": 0.8762,
"step": 6030
},
{
"epoch": 1.7509072434315576,
"grad_norm": 3.9805736541748047,
"learning_rate": 7.73152458453402e-06,
"loss": 0.8325,
"step": 6031
},
{
"epoch": 1.751197561329656,
"grad_norm": 3.4485583305358887,
"learning_rate": 7.730720281096493e-06,
"loss": 0.7338,
"step": 6032
},
{
"epoch": 1.7514878792277544,
"grad_norm": 3.645721912384033,
"learning_rate": 7.729915876951082e-06,
"loss": 0.7995,
"step": 6033
},
{
"epoch": 1.7517781971258528,
"grad_norm": 3.793673515319824,
"learning_rate": 7.72911137212746e-06,
"loss": 0.8108,
"step": 6034
},
{
"epoch": 1.7520685150239512,
"grad_norm": 3.6693036556243896,
"learning_rate": 7.728306766655294e-06,
"loss": 0.7696,
"step": 6035
},
{
"epoch": 1.7523588329220496,
"grad_norm": 3.7668471336364746,
"learning_rate": 7.727502060564257e-06,
"loss": 0.8003,
"step": 6036
},
{
"epoch": 1.752649150820148,
"grad_norm": 3.386531352996826,
"learning_rate": 7.726697253884026e-06,
"loss": 0.8003,
"step": 6037
},
{
"epoch": 1.7529394687182465,
"grad_norm": 3.680187940597534,
"learning_rate": 7.725892346644281e-06,
"loss": 0.876,
"step": 6038
},
{
"epoch": 1.7532297866163449,
"grad_norm": 2.98075795173645,
"learning_rate": 7.72508733887471e-06,
"loss": 0.7267,
"step": 6039
},
{
"epoch": 1.7535201045144433,
"grad_norm": 3.63118314743042,
"learning_rate": 7.724282230604998e-06,
"loss": 0.7591,
"step": 6040
},
{
"epoch": 1.7538104224125417,
"grad_norm": 3.2664151191711426,
"learning_rate": 7.72347702186484e-06,
"loss": 0.7249,
"step": 6041
},
{
"epoch": 1.7541007403106401,
"grad_norm": 3.529172897338867,
"learning_rate": 7.722671712683929e-06,
"loss": 0.7926,
"step": 6042
},
{
"epoch": 1.7543910582087385,
"grad_norm": 3.5128173828125,
"learning_rate": 7.721866303091965e-06,
"loss": 0.7381,
"step": 6043
},
{
"epoch": 1.754681376106837,
"grad_norm": 3.793933153152466,
"learning_rate": 7.721060793118653e-06,
"loss": 0.8778,
"step": 6044
},
{
"epoch": 1.7549716940049354,
"grad_norm": 3.560621500015259,
"learning_rate": 7.7202551827937e-06,
"loss": 0.7361,
"step": 6045
},
{
"epoch": 1.7552620119030338,
"grad_norm": 3.519472360610962,
"learning_rate": 7.719449472146814e-06,
"loss": 0.726,
"step": 6046
},
{
"epoch": 1.7555523298011324,
"grad_norm": 3.8505566120147705,
"learning_rate": 7.71864366120771e-06,
"loss": 0.9294,
"step": 6047
},
{
"epoch": 1.7558426476992306,
"grad_norm": 3.6858813762664795,
"learning_rate": 7.717837750006106e-06,
"loss": 0.7188,
"step": 6048
},
{
"epoch": 1.7561329655973292,
"grad_norm": 3.213684320449829,
"learning_rate": 7.717031738571726e-06,
"loss": 0.8008,
"step": 6049
},
{
"epoch": 1.7564232834954274,
"grad_norm": 3.483856678009033,
"learning_rate": 7.716225626934293e-06,
"loss": 0.7414,
"step": 6050
},
{
"epoch": 1.756713601393526,
"grad_norm": 3.566657781600952,
"learning_rate": 7.715419415123537e-06,
"loss": 0.8229,
"step": 6051
},
{
"epoch": 1.7570039192916242,
"grad_norm": 3.8110122680664062,
"learning_rate": 7.71461310316919e-06,
"loss": 0.8532,
"step": 6052
},
{
"epoch": 1.7572942371897229,
"grad_norm": 3.7343101501464844,
"learning_rate": 7.71380669110099e-06,
"loss": 0.8744,
"step": 6053
},
{
"epoch": 1.757584555087821,
"grad_norm": 3.0625345706939697,
"learning_rate": 7.713000178948675e-06,
"loss": 0.7301,
"step": 6054
},
{
"epoch": 1.7578748729859197,
"grad_norm": 3.1641945838928223,
"learning_rate": 7.712193566741993e-06,
"loss": 0.6697,
"step": 6055
},
{
"epoch": 1.758165190884018,
"grad_norm": 3.662405014038086,
"learning_rate": 7.711386854510685e-06,
"loss": 0.8059,
"step": 6056
},
{
"epoch": 1.7584555087821165,
"grad_norm": 3.7662250995635986,
"learning_rate": 7.710580042284507e-06,
"loss": 0.7312,
"step": 6057
},
{
"epoch": 1.7587458266802147,
"grad_norm": 3.9004745483398438,
"learning_rate": 7.709773130093213e-06,
"loss": 0.7461,
"step": 6058
},
{
"epoch": 1.7590361445783134,
"grad_norm": 3.377485513687134,
"learning_rate": 7.70896611796656e-06,
"loss": 0.8538,
"step": 6059
},
{
"epoch": 1.7593264624764116,
"grad_norm": 3.235250473022461,
"learning_rate": 7.708159005934312e-06,
"loss": 0.7092,
"step": 6060
},
{
"epoch": 1.7596167803745102,
"grad_norm": 3.500490665435791,
"learning_rate": 7.707351794026236e-06,
"loss": 0.7842,
"step": 6061
},
{
"epoch": 1.7599070982726084,
"grad_norm": 3.8645684719085693,
"learning_rate": 7.7065444822721e-06,
"loss": 0.7956,
"step": 6062
},
{
"epoch": 1.760197416170707,
"grad_norm": 3.4011542797088623,
"learning_rate": 7.705737070701678e-06,
"loss": 0.8391,
"step": 6063
},
{
"epoch": 1.7604877340688052,
"grad_norm": 3.686098337173462,
"learning_rate": 7.704929559344745e-06,
"loss": 0.943,
"step": 6064
},
{
"epoch": 1.7607780519669038,
"grad_norm": 3.844574451446533,
"learning_rate": 7.704121948231083e-06,
"loss": 0.9983,
"step": 6065
},
{
"epoch": 1.761068369865002,
"grad_norm": 3.554001808166504,
"learning_rate": 7.703314237390478e-06,
"loss": 0.8524,
"step": 6066
},
{
"epoch": 1.7613586877631007,
"grad_norm": 3.8397789001464844,
"learning_rate": 7.702506426852715e-06,
"loss": 0.8776,
"step": 6067
},
{
"epoch": 1.7616490056611989,
"grad_norm": 3.77868914604187,
"learning_rate": 7.70169851664759e-06,
"loss": 0.9187,
"step": 6068
},
{
"epoch": 1.7619393235592975,
"grad_norm": 3.272463321685791,
"learning_rate": 7.700890506804895e-06,
"loss": 0.6733,
"step": 6069
},
{
"epoch": 1.7622296414573957,
"grad_norm": 3.852590322494507,
"learning_rate": 7.70008239735443e-06,
"loss": 0.9901,
"step": 6070
},
{
"epoch": 1.7625199593554943,
"grad_norm": 3.8171653747558594,
"learning_rate": 7.699274188325995e-06,
"loss": 0.9094,
"step": 6071
},
{
"epoch": 1.7628102772535927,
"grad_norm": 3.6177287101745605,
"learning_rate": 7.698465879749404e-06,
"loss": 0.8565,
"step": 6072
},
{
"epoch": 1.7631005951516912,
"grad_norm": 3.4823312759399414,
"learning_rate": 7.697657471654459e-06,
"loss": 0.8491,
"step": 6073
},
{
"epoch": 1.7633909130497896,
"grad_norm": 3.9708127975463867,
"learning_rate": 7.696848964070976e-06,
"loss": 0.9884,
"step": 6074
},
{
"epoch": 1.763681230947888,
"grad_norm": 3.4418365955352783,
"learning_rate": 7.696040357028775e-06,
"loss": 0.7678,
"step": 6075
},
{
"epoch": 1.7639715488459864,
"grad_norm": 3.3301215171813965,
"learning_rate": 7.695231650557675e-06,
"loss": 0.7267,
"step": 6076
},
{
"epoch": 1.7642618667440848,
"grad_norm": 3.1033713817596436,
"learning_rate": 7.694422844687502e-06,
"loss": 0.6836,
"step": 6077
},
{
"epoch": 1.7645521846421832,
"grad_norm": 4.057397365570068,
"learning_rate": 7.693613939448083e-06,
"loss": 0.7511,
"step": 6078
},
{
"epoch": 1.7648425025402816,
"grad_norm": 3.352520227432251,
"learning_rate": 7.692804934869252e-06,
"loss": 0.7612,
"step": 6079
},
{
"epoch": 1.76513282043838,
"grad_norm": 3.3684096336364746,
"learning_rate": 7.691995830980841e-06,
"loss": 0.7262,
"step": 6080
},
{
"epoch": 1.7654231383364785,
"grad_norm": 3.3228354454040527,
"learning_rate": 7.691186627812696e-06,
"loss": 0.7095,
"step": 6081
},
{
"epoch": 1.7657134562345769,
"grad_norm": 3.406299591064453,
"learning_rate": 7.690377325394653e-06,
"loss": 0.7504,
"step": 6082
},
{
"epoch": 1.7660037741326753,
"grad_norm": 3.5867552757263184,
"learning_rate": 7.689567923756563e-06,
"loss": 0.7775,
"step": 6083
},
{
"epoch": 1.7662940920307737,
"grad_norm": 3.1561825275421143,
"learning_rate": 7.688758422928275e-06,
"loss": 0.707,
"step": 6084
},
{
"epoch": 1.7665844099288721,
"grad_norm": 2.969261646270752,
"learning_rate": 7.687948822939643e-06,
"loss": 0.7095,
"step": 6085
},
{
"epoch": 1.7668747278269705,
"grad_norm": 3.4857072830200195,
"learning_rate": 7.687139123820526e-06,
"loss": 0.705,
"step": 6086
},
{
"epoch": 1.767165045725069,
"grad_norm": 3.617248296737671,
"learning_rate": 7.686329325600785e-06,
"loss": 0.7477,
"step": 6087
},
{
"epoch": 1.7674553636231674,
"grad_norm": 3.9258131980895996,
"learning_rate": 7.685519428310282e-06,
"loss": 0.8036,
"step": 6088
},
{
"epoch": 1.7677456815212658,
"grad_norm": 3.5120155811309814,
"learning_rate": 7.684709431978891e-06,
"loss": 0.6849,
"step": 6089
},
{
"epoch": 1.7680359994193642,
"grad_norm": 3.392848491668701,
"learning_rate": 7.68389933663648e-06,
"loss": 0.7749,
"step": 6090
},
{
"epoch": 1.7683263173174626,
"grad_norm": 4.192860126495361,
"learning_rate": 7.683089142312927e-06,
"loss": 0.8256,
"step": 6091
},
{
"epoch": 1.768616635215561,
"grad_norm": 4.079232215881348,
"learning_rate": 7.682278849038109e-06,
"loss": 0.9657,
"step": 6092
},
{
"epoch": 1.7689069531136594,
"grad_norm": 3.493929386138916,
"learning_rate": 7.681468456841914e-06,
"loss": 0.7045,
"step": 6093
},
{
"epoch": 1.7691972710117578,
"grad_norm": 3.630089044570923,
"learning_rate": 7.680657965754227e-06,
"loss": 0.8063,
"step": 6094
},
{
"epoch": 1.7694875889098562,
"grad_norm": 3.227755546569824,
"learning_rate": 7.679847375804938e-06,
"loss": 0.6261,
"step": 6095
},
{
"epoch": 1.7697779068079547,
"grad_norm": 3.3954944610595703,
"learning_rate": 7.67903668702394e-06,
"loss": 0.6809,
"step": 6096
},
{
"epoch": 1.770068224706053,
"grad_norm": 3.9170215129852295,
"learning_rate": 7.678225899441131e-06,
"loss": 0.8088,
"step": 6097
},
{
"epoch": 1.7703585426041517,
"grad_norm": 3.5438239574432373,
"learning_rate": 7.677415013086415e-06,
"loss": 0.7075,
"step": 6098
},
{
"epoch": 1.77064886050225,
"grad_norm": 3.731586456298828,
"learning_rate": 7.676604027989695e-06,
"loss": 0.7176,
"step": 6099
},
{
"epoch": 1.7709391784003485,
"grad_norm": 3.9872632026672363,
"learning_rate": 7.675792944180884e-06,
"loss": 0.7342,
"step": 6100
},
{
"epoch": 1.7712294962984467,
"grad_norm": 3.564387083053589,
"learning_rate": 7.674981761689885e-06,
"loss": 0.8111,
"step": 6101
},
{
"epoch": 1.7715198141965454,
"grad_norm": 3.6033754348754883,
"learning_rate": 7.674170480546626e-06,
"loss": 0.6986,
"step": 6102
},
{
"epoch": 1.7718101320946436,
"grad_norm": 3.794177532196045,
"learning_rate": 7.673359100781018e-06,
"loss": 0.8078,
"step": 6103
},
{
"epoch": 1.7721004499927422,
"grad_norm": 3.224788188934326,
"learning_rate": 7.67254762242299e-06,
"loss": 0.7318,
"step": 6104
},
{
"epoch": 1.7723907678908404,
"grad_norm": 3.258075714111328,
"learning_rate": 7.671736045502462e-06,
"loss": 0.7327,
"step": 6105
},
{
"epoch": 1.772681085788939,
"grad_norm": 3.753732919692993,
"learning_rate": 7.67092437004937e-06,
"loss": 0.7999,
"step": 6106
},
{
"epoch": 1.7729714036870372,
"grad_norm": 3.635417938232422,
"learning_rate": 7.670112596093649e-06,
"loss": 0.7014,
"step": 6107
},
{
"epoch": 1.7732617215851358,
"grad_norm": 4.326013565063477,
"learning_rate": 7.669300723665234e-06,
"loss": 0.9172,
"step": 6108
},
{
"epoch": 1.773552039483234,
"grad_norm": 3.5447564125061035,
"learning_rate": 7.668488752794067e-06,
"loss": 0.7672,
"step": 6109
},
{
"epoch": 1.7738423573813327,
"grad_norm": 3.6314609050750732,
"learning_rate": 7.667676683510095e-06,
"loss": 0.8618,
"step": 6110
},
{
"epoch": 1.7741326752794309,
"grad_norm": 3.521106004714966,
"learning_rate": 7.666864515843266e-06,
"loss": 0.7867,
"step": 6111
},
{
"epoch": 1.7744229931775295,
"grad_norm": 3.3227083683013916,
"learning_rate": 7.66605224982353e-06,
"loss": 0.7712,
"step": 6112
},
{
"epoch": 1.7747133110756277,
"grad_norm": 3.570622682571411,
"learning_rate": 7.665239885480846e-06,
"loss": 0.7956,
"step": 6113
},
{
"epoch": 1.7750036289737263,
"grad_norm": 3.695883274078369,
"learning_rate": 7.664427422845172e-06,
"loss": 0.8755,
"step": 6114
},
{
"epoch": 1.7752939468718245,
"grad_norm": 3.544062376022339,
"learning_rate": 7.663614861946474e-06,
"loss": 0.8408,
"step": 6115
},
{
"epoch": 1.7755842647699231,
"grad_norm": 3.7533979415893555,
"learning_rate": 7.662802202814717e-06,
"loss": 0.8039,
"step": 6116
},
{
"epoch": 1.7758745826680213,
"grad_norm": 3.271301031112671,
"learning_rate": 7.661989445479869e-06,
"loss": 0.7642,
"step": 6117
},
{
"epoch": 1.77616490056612,
"grad_norm": 3.6111979484558105,
"learning_rate": 7.661176589971909e-06,
"loss": 0.7683,
"step": 6118
},
{
"epoch": 1.7764552184642182,
"grad_norm": 3.15321683883667,
"learning_rate": 7.660363636320809e-06,
"loss": 0.7051,
"step": 6119
},
{
"epoch": 1.7767455363623168,
"grad_norm": 3.64837646484375,
"learning_rate": 7.659550584556556e-06,
"loss": 0.716,
"step": 6120
},
{
"epoch": 1.777035854260415,
"grad_norm": 3.7064368724823,
"learning_rate": 7.658737434709134e-06,
"loss": 0.7225,
"step": 6121
},
{
"epoch": 1.7773261721585136,
"grad_norm": 3.836670160293579,
"learning_rate": 7.657924186808528e-06,
"loss": 0.7857,
"step": 6122
},
{
"epoch": 1.777616490056612,
"grad_norm": 3.381930351257324,
"learning_rate": 7.657110840884736e-06,
"loss": 0.7435,
"step": 6123
},
{
"epoch": 1.7779068079547105,
"grad_norm": 3.776498317718506,
"learning_rate": 7.656297396967747e-06,
"loss": 0.8766,
"step": 6124
},
{
"epoch": 1.7781971258528089,
"grad_norm": 4.0997419357299805,
"learning_rate": 7.655483855087566e-06,
"loss": 0.8466,
"step": 6125
},
{
"epoch": 1.7784874437509073,
"grad_norm": 3.578490734100342,
"learning_rate": 7.654670215274194e-06,
"loss": 0.8105,
"step": 6126
},
{
"epoch": 1.7787777616490057,
"grad_norm": 3.371166229248047,
"learning_rate": 7.653856477557639e-06,
"loss": 0.7181,
"step": 6127
},
{
"epoch": 1.779068079547104,
"grad_norm": 3.529717206954956,
"learning_rate": 7.65304264196791e-06,
"loss": 0.7721,
"step": 6128
},
{
"epoch": 1.7793583974452025,
"grad_norm": 3.6220967769622803,
"learning_rate": 7.65222870853502e-06,
"loss": 0.793,
"step": 6129
},
{
"epoch": 1.779648715343301,
"grad_norm": 3.6344494819641113,
"learning_rate": 7.651414677288987e-06,
"loss": 0.6975,
"step": 6130
},
{
"epoch": 1.7799390332413993,
"grad_norm": 3.3892741203308105,
"learning_rate": 7.650600548259835e-06,
"loss": 0.7217,
"step": 6131
},
{
"epoch": 1.7802293511394978,
"grad_norm": 2.9629781246185303,
"learning_rate": 7.649786321477585e-06,
"loss": 0.7099,
"step": 6132
},
{
"epoch": 1.7805196690375962,
"grad_norm": 3.578287124633789,
"learning_rate": 7.648971996972268e-06,
"loss": 0.772,
"step": 6133
},
{
"epoch": 1.7808099869356946,
"grad_norm": 3.6381213665008545,
"learning_rate": 7.648157574773915e-06,
"loss": 0.712,
"step": 6134
},
{
"epoch": 1.781100304833793,
"grad_norm": 3.346418619155884,
"learning_rate": 7.647343054912561e-06,
"loss": 0.7385,
"step": 6135
},
{
"epoch": 1.7813906227318914,
"grad_norm": 3.614990472793579,
"learning_rate": 7.646528437418246e-06,
"loss": 0.783,
"step": 6136
},
{
"epoch": 1.7816809406299898,
"grad_norm": 3.3961567878723145,
"learning_rate": 7.645713722321013e-06,
"loss": 0.7439,
"step": 6137
},
{
"epoch": 1.7819712585280882,
"grad_norm": 3.5309431552886963,
"learning_rate": 7.644898909650906e-06,
"loss": 0.7021,
"step": 6138
},
{
"epoch": 1.7822615764261867,
"grad_norm": 3.698122262954712,
"learning_rate": 7.644083999437976e-06,
"loss": 0.7764,
"step": 6139
},
{
"epoch": 1.782551894324285,
"grad_norm": 3.429757595062256,
"learning_rate": 7.643268991712281e-06,
"loss": 0.6601,
"step": 6140
},
{
"epoch": 1.7828422122223835,
"grad_norm": 3.651519775390625,
"learning_rate": 7.642453886503873e-06,
"loss": 0.7773,
"step": 6141
},
{
"epoch": 1.783132530120482,
"grad_norm": 3.704296112060547,
"learning_rate": 7.641638683842814e-06,
"loss": 0.7685,
"step": 6142
},
{
"epoch": 1.7834228480185803,
"grad_norm": 3.3031558990478516,
"learning_rate": 7.640823383759169e-06,
"loss": 0.7214,
"step": 6143
},
{
"epoch": 1.7837131659166787,
"grad_norm": 3.5565595626831055,
"learning_rate": 7.640007986283006e-06,
"loss": 0.7482,
"step": 6144
},
{
"epoch": 1.7840034838147771,
"grad_norm": 4.059230327606201,
"learning_rate": 7.639192491444395e-06,
"loss": 0.848,
"step": 6145
},
{
"epoch": 1.7842938017128755,
"grad_norm": 3.8568592071533203,
"learning_rate": 7.638376899273414e-06,
"loss": 0.7522,
"step": 6146
},
{
"epoch": 1.784584119610974,
"grad_norm": 3.5061683654785156,
"learning_rate": 7.637561209800137e-06,
"loss": 0.7799,
"step": 6147
},
{
"epoch": 1.7848744375090724,
"grad_norm": 3.739004135131836,
"learning_rate": 7.636745423054652e-06,
"loss": 0.8028,
"step": 6148
},
{
"epoch": 1.785164755407171,
"grad_norm": 3.494581699371338,
"learning_rate": 7.635929539067042e-06,
"loss": 0.8013,
"step": 6149
},
{
"epoch": 1.7854550733052692,
"grad_norm": 3.7833151817321777,
"learning_rate": 7.635113557867395e-06,
"loss": 0.8237,
"step": 6150
},
{
"epoch": 1.7857453912033678,
"grad_norm": 3.478761911392212,
"learning_rate": 7.634297479485806e-06,
"loss": 0.7016,
"step": 6151
},
{
"epoch": 1.786035709101466,
"grad_norm": 3.378567934036255,
"learning_rate": 7.633481303952373e-06,
"loss": 0.8555,
"step": 6152
},
{
"epoch": 1.7863260269995647,
"grad_norm": 3.6236679553985596,
"learning_rate": 7.632665031297193e-06,
"loss": 0.8543,
"step": 6153
},
{
"epoch": 1.7866163448976629,
"grad_norm": 3.544419050216675,
"learning_rate": 7.631848661550372e-06,
"loss": 0.7616,
"step": 6154
},
{
"epoch": 1.7869066627957615,
"grad_norm": 3.239393472671509,
"learning_rate": 7.631032194742017e-06,
"loss": 0.7845,
"step": 6155
},
{
"epoch": 1.7871969806938597,
"grad_norm": 3.15290904045105,
"learning_rate": 7.630215630902236e-06,
"loss": 0.7698,
"step": 6156
},
{
"epoch": 1.7874872985919583,
"grad_norm": 3.545022964477539,
"learning_rate": 7.62939897006115e-06,
"loss": 0.7294,
"step": 6157
},
{
"epoch": 1.7877776164900565,
"grad_norm": 2.9995696544647217,
"learning_rate": 7.628582212248871e-06,
"loss": 0.6932,
"step": 6158
},
{
"epoch": 1.7880679343881551,
"grad_norm": 3.410565137863159,
"learning_rate": 7.627765357495526e-06,
"loss": 0.6982,
"step": 6159
},
{
"epoch": 1.7883582522862533,
"grad_norm": 3.6005923748016357,
"learning_rate": 7.626948405831235e-06,
"loss": 0.757,
"step": 6160
},
{
"epoch": 1.788648570184352,
"grad_norm": 3.7826449871063232,
"learning_rate": 7.626131357286129e-06,
"loss": 0.8267,
"step": 6161
},
{
"epoch": 1.7889388880824502,
"grad_norm": 3.534515619277954,
"learning_rate": 7.625314211890342e-06,
"loss": 0.6781,
"step": 6162
},
{
"epoch": 1.7892292059805488,
"grad_norm": 3.6266918182373047,
"learning_rate": 7.624496969674009e-06,
"loss": 0.6734,
"step": 6163
},
{
"epoch": 1.789519523878647,
"grad_norm": 3.3739120960235596,
"learning_rate": 7.623679630667269e-06,
"loss": 0.6884,
"step": 6164
},
{
"epoch": 1.7898098417767456,
"grad_norm": 3.380641222000122,
"learning_rate": 7.622862194900263e-06,
"loss": 0.6936,
"step": 6165
},
{
"epoch": 1.7901001596748438,
"grad_norm": 3.769023895263672,
"learning_rate": 7.622044662403143e-06,
"loss": 0.7827,
"step": 6166
},
{
"epoch": 1.7903904775729425,
"grad_norm": 3.9562571048736572,
"learning_rate": 7.621227033206055e-06,
"loss": 0.9208,
"step": 6167
},
{
"epoch": 1.7906807954710406,
"grad_norm": 3.863774299621582,
"learning_rate": 7.620409307339156e-06,
"loss": 0.8076,
"step": 6168
},
{
"epoch": 1.7909711133691393,
"grad_norm": 3.953861951828003,
"learning_rate": 7.6195914848326e-06,
"loss": 0.8365,
"step": 6169
},
{
"epoch": 1.7912614312672375,
"grad_norm": 3.024517059326172,
"learning_rate": 7.61877356571655e-06,
"loss": 0.6976,
"step": 6170
},
{
"epoch": 1.791551749165336,
"grad_norm": 3.4500885009765625,
"learning_rate": 7.617955550021169e-06,
"loss": 0.7894,
"step": 6171
},
{
"epoch": 1.7918420670634343,
"grad_norm": 3.453752040863037,
"learning_rate": 7.617137437776627e-06,
"loss": 0.8166,
"step": 6172
},
{
"epoch": 1.792132384961533,
"grad_norm": 3.911886215209961,
"learning_rate": 7.616319229013096e-06,
"loss": 0.9803,
"step": 6173
},
{
"epoch": 1.7924227028596313,
"grad_norm": 3.8347620964050293,
"learning_rate": 7.615500923760748e-06,
"loss": 0.7538,
"step": 6174
},
{
"epoch": 1.7927130207577298,
"grad_norm": 3.304626226425171,
"learning_rate": 7.614682522049766e-06,
"loss": 0.747,
"step": 6175
},
{
"epoch": 1.7930033386558282,
"grad_norm": 3.2706761360168457,
"learning_rate": 7.613864023910329e-06,
"loss": 0.7474,
"step": 6176
},
{
"epoch": 1.7932936565539266,
"grad_norm": 3.834886312484741,
"learning_rate": 7.613045429372624e-06,
"loss": 0.8663,
"step": 6177
},
{
"epoch": 1.793583974452025,
"grad_norm": 3.344585418701172,
"learning_rate": 7.612226738466841e-06,
"loss": 0.62,
"step": 6178
},
{
"epoch": 1.7938742923501234,
"grad_norm": 3.5737040042877197,
"learning_rate": 7.611407951223173e-06,
"loss": 0.7471,
"step": 6179
},
{
"epoch": 1.7941646102482218,
"grad_norm": 3.5841925144195557,
"learning_rate": 7.610589067671814e-06,
"loss": 0.8081,
"step": 6180
},
{
"epoch": 1.7944549281463202,
"grad_norm": 3.6530447006225586,
"learning_rate": 7.609770087842969e-06,
"loss": 0.7242,
"step": 6181
},
{
"epoch": 1.7947452460444187,
"grad_norm": 3.2289116382598877,
"learning_rate": 7.6089510117668365e-06,
"loss": 0.7093,
"step": 6182
},
{
"epoch": 1.795035563942517,
"grad_norm": 3.61566424369812,
"learning_rate": 7.608131839473627e-06,
"loss": 0.7938,
"step": 6183
},
{
"epoch": 1.7953258818406155,
"grad_norm": 3.7904341220855713,
"learning_rate": 7.607312570993551e-06,
"loss": 0.821,
"step": 6184
},
{
"epoch": 1.795616199738714,
"grad_norm": 3.485880136489868,
"learning_rate": 7.606493206356821e-06,
"loss": 0.7012,
"step": 6185
},
{
"epoch": 1.7959065176368123,
"grad_norm": 3.770455837249756,
"learning_rate": 7.6056737455936556e-06,
"loss": 0.7758,
"step": 6186
},
{
"epoch": 1.7961968355349107,
"grad_norm": 3.34679913520813,
"learning_rate": 7.604854188734278e-06,
"loss": 0.7696,
"step": 6187
},
{
"epoch": 1.7964871534330091,
"grad_norm": 3.1228458881378174,
"learning_rate": 7.604034535808909e-06,
"loss": 0.6932,
"step": 6188
},
{
"epoch": 1.7967774713311075,
"grad_norm": 3.367436408996582,
"learning_rate": 7.603214786847781e-06,
"loss": 0.8846,
"step": 6189
},
{
"epoch": 1.797067789229206,
"grad_norm": 3.469499349594116,
"learning_rate": 7.602394941881126e-06,
"loss": 0.7274,
"step": 6190
},
{
"epoch": 1.7973581071273044,
"grad_norm": 3.600771903991699,
"learning_rate": 7.6015750009391776e-06,
"loss": 0.7988,
"step": 6191
},
{
"epoch": 1.7976484250254028,
"grad_norm": 3.430292844772339,
"learning_rate": 7.600754964052174e-06,
"loss": 0.8242,
"step": 6192
},
{
"epoch": 1.7979387429235012,
"grad_norm": 3.573873281478882,
"learning_rate": 7.5999348312503614e-06,
"loss": 0.859,
"step": 6193
},
{
"epoch": 1.7982290608215996,
"grad_norm": 3.5837037563323975,
"learning_rate": 7.5991146025639825e-06,
"loss": 0.7537,
"step": 6194
},
{
"epoch": 1.798519378719698,
"grad_norm": 3.798265218734741,
"learning_rate": 7.59829427802329e-06,
"loss": 0.8035,
"step": 6195
},
{
"epoch": 1.7988096966177964,
"grad_norm": 3.419114112854004,
"learning_rate": 7.597473857658535e-06,
"loss": 0.6888,
"step": 6196
},
{
"epoch": 1.7991000145158949,
"grad_norm": 3.157182216644287,
"learning_rate": 7.596653341499974e-06,
"loss": 0.7266,
"step": 6197
},
{
"epoch": 1.7993903324139935,
"grad_norm": 3.9746930599212646,
"learning_rate": 7.59583272957787e-06,
"loss": 0.9873,
"step": 6198
},
{
"epoch": 1.7996806503120917,
"grad_norm": 3.456258535385132,
"learning_rate": 7.595012021922483e-06,
"loss": 0.8182,
"step": 6199
},
{
"epoch": 1.7999709682101903,
"grad_norm": 3.296928882598877,
"learning_rate": 7.594191218564084e-06,
"loss": 0.7492,
"step": 6200
},
{
"epoch": 1.8002612861082885,
"grad_norm": 3.6365811824798584,
"learning_rate": 7.5933703195329426e-06,
"loss": 0.8622,
"step": 6201
},
{
"epoch": 1.8005516040063871,
"grad_norm": 3.2589075565338135,
"learning_rate": 7.592549324859332e-06,
"loss": 0.673,
"step": 6202
},
{
"epoch": 1.8008419219044853,
"grad_norm": 4.169826507568359,
"learning_rate": 7.591728234573531e-06,
"loss": 0.8656,
"step": 6203
},
{
"epoch": 1.801132239802584,
"grad_norm": 3.259309768676758,
"learning_rate": 7.590907048705822e-06,
"loss": 0.7238,
"step": 6204
},
{
"epoch": 1.8014225577006822,
"grad_norm": 4.122686862945557,
"learning_rate": 7.590085767286488e-06,
"loss": 1.0135,
"step": 6205
},
{
"epoch": 1.8017128755987808,
"grad_norm": 3.3853394985198975,
"learning_rate": 7.58926439034582e-06,
"loss": 0.7526,
"step": 6206
},
{
"epoch": 1.802003193496879,
"grad_norm": 3.3177542686462402,
"learning_rate": 7.5884429179141076e-06,
"loss": 0.7382,
"step": 6207
},
{
"epoch": 1.8022935113949776,
"grad_norm": 3.5391876697540283,
"learning_rate": 7.587621350021649e-06,
"loss": 0.8011,
"step": 6208
},
{
"epoch": 1.8025838292930758,
"grad_norm": 3.7560062408447266,
"learning_rate": 7.58679968669874e-06,
"loss": 0.8786,
"step": 6209
},
{
"epoch": 1.8028741471911744,
"grad_norm": 3.5351386070251465,
"learning_rate": 7.585977927975687e-06,
"loss": 0.726,
"step": 6210
},
{
"epoch": 1.8031644650892726,
"grad_norm": 3.548893451690674,
"learning_rate": 7.585156073882793e-06,
"loss": 0.7565,
"step": 6211
},
{
"epoch": 1.8034547829873713,
"grad_norm": 3.7670400142669678,
"learning_rate": 7.58433412445037e-06,
"loss": 0.8406,
"step": 6212
},
{
"epoch": 1.8037451008854695,
"grad_norm": 3.432896375656128,
"learning_rate": 7.583512079708729e-06,
"loss": 0.7089,
"step": 6213
},
{
"epoch": 1.804035418783568,
"grad_norm": 3.5606884956359863,
"learning_rate": 7.582689939688188e-06,
"loss": 0.8647,
"step": 6214
},
{
"epoch": 1.8043257366816663,
"grad_norm": 3.3018386363983154,
"learning_rate": 7.581867704419068e-06,
"loss": 0.7557,
"step": 6215
},
{
"epoch": 1.804616054579765,
"grad_norm": 3.351177215576172,
"learning_rate": 7.581045373931691e-06,
"loss": 0.8048,
"step": 6216
},
{
"epoch": 1.8049063724778631,
"grad_norm": 3.514824151992798,
"learning_rate": 7.580222948256384e-06,
"loss": 0.7764,
"step": 6217
},
{
"epoch": 1.8051966903759618,
"grad_norm": 3.573287010192871,
"learning_rate": 7.579400427423479e-06,
"loss": 0.8168,
"step": 6218
},
{
"epoch": 1.80548700827406,
"grad_norm": 3.355710506439209,
"learning_rate": 7.57857781146331e-06,
"loss": 0.7323,
"step": 6219
},
{
"epoch": 1.8057773261721586,
"grad_norm": 3.2817916870117188,
"learning_rate": 7.577755100406215e-06,
"loss": 0.7215,
"step": 6220
},
{
"epoch": 1.8060676440702568,
"grad_norm": 3.442941665649414,
"learning_rate": 7.5769322942825345e-06,
"loss": 0.7334,
"step": 6221
},
{
"epoch": 1.8063579619683554,
"grad_norm": 3.865924596786499,
"learning_rate": 7.576109393122613e-06,
"loss": 0.8406,
"step": 6222
},
{
"epoch": 1.8066482798664538,
"grad_norm": 3.839789628982544,
"learning_rate": 7.5752863969568e-06,
"loss": 0.8302,
"step": 6223
},
{
"epoch": 1.8069385977645522,
"grad_norm": 3.4474151134490967,
"learning_rate": 7.574463305815446e-06,
"loss": 0.8842,
"step": 6224
},
{
"epoch": 1.8072289156626506,
"grad_norm": 3.137389659881592,
"learning_rate": 7.573640119728909e-06,
"loss": 0.8209,
"step": 6225
},
{
"epoch": 1.807519233560749,
"grad_norm": 3.777895212173462,
"learning_rate": 7.572816838727544e-06,
"loss": 0.8116,
"step": 6226
},
{
"epoch": 1.8078095514588475,
"grad_norm": 3.000427484512329,
"learning_rate": 7.571993462841714e-06,
"loss": 0.6237,
"step": 6227
},
{
"epoch": 1.8080998693569459,
"grad_norm": 3.8934295177459717,
"learning_rate": 7.571169992101788e-06,
"loss": 0.9309,
"step": 6228
},
{
"epoch": 1.8083901872550443,
"grad_norm": 3.262486457824707,
"learning_rate": 7.570346426538131e-06,
"loss": 0.6841,
"step": 6229
},
{
"epoch": 1.8086805051531427,
"grad_norm": 3.2486703395843506,
"learning_rate": 7.56952276618112e-06,
"loss": 0.8261,
"step": 6230
},
{
"epoch": 1.8089708230512411,
"grad_norm": 3.4097964763641357,
"learning_rate": 7.568699011061127e-06,
"loss": 0.7107,
"step": 6231
},
{
"epoch": 1.8092611409493395,
"grad_norm": 3.5118725299835205,
"learning_rate": 7.5678751612085344e-06,
"loss": 0.7122,
"step": 6232
},
{
"epoch": 1.809551458847438,
"grad_norm": 3.1857311725616455,
"learning_rate": 7.567051216653725e-06,
"loss": 0.697,
"step": 6233
},
{
"epoch": 1.8098417767455364,
"grad_norm": 4.186178207397461,
"learning_rate": 7.566227177427085e-06,
"loss": 0.8029,
"step": 6234
},
{
"epoch": 1.8101320946436348,
"grad_norm": 3.4743754863739014,
"learning_rate": 7.565403043559007e-06,
"loss": 0.7779,
"step": 6235
},
{
"epoch": 1.8104224125417332,
"grad_norm": 3.412288188934326,
"learning_rate": 7.5645788150798814e-06,
"loss": 0.7435,
"step": 6236
},
{
"epoch": 1.8107127304398316,
"grad_norm": 3.591625690460205,
"learning_rate": 7.563754492020108e-06,
"loss": 0.9457,
"step": 6237
},
{
"epoch": 1.81100304833793,
"grad_norm": 3.9877660274505615,
"learning_rate": 7.562930074410084e-06,
"loss": 0.8225,
"step": 6238
},
{
"epoch": 1.8112933662360284,
"grad_norm": 3.482994556427002,
"learning_rate": 7.562105562280218e-06,
"loss": 0.8183,
"step": 6239
},
{
"epoch": 1.8115836841341268,
"grad_norm": 3.938270330429077,
"learning_rate": 7.561280955660915e-06,
"loss": 0.8329,
"step": 6240
},
{
"epoch": 1.8118740020322253,
"grad_norm": 3.121049404144287,
"learning_rate": 7.560456254582586e-06,
"loss": 0.6843,
"step": 6241
},
{
"epoch": 1.8121643199303237,
"grad_norm": 3.8467633724212646,
"learning_rate": 7.559631459075646e-06,
"loss": 0.9058,
"step": 6242
},
{
"epoch": 1.812454637828422,
"grad_norm": 3.8543753623962402,
"learning_rate": 7.558806569170514e-06,
"loss": 0.8795,
"step": 6243
},
{
"epoch": 1.8127449557265205,
"grad_norm": 3.738771438598633,
"learning_rate": 7.557981584897612e-06,
"loss": 0.7087,
"step": 6244
},
{
"epoch": 1.813035273624619,
"grad_norm": 3.7522284984588623,
"learning_rate": 7.557156506287364e-06,
"loss": 0.7569,
"step": 6245
},
{
"epoch": 1.8133255915227173,
"grad_norm": 3.697587251663208,
"learning_rate": 7.556331333370199e-06,
"loss": 0.8145,
"step": 6246
},
{
"epoch": 1.8136159094208157,
"grad_norm": 3.8390111923217773,
"learning_rate": 7.555506066176549e-06,
"loss": 0.833,
"step": 6247
},
{
"epoch": 1.8139062273189142,
"grad_norm": 3.501277208328247,
"learning_rate": 7.5546807047368485e-06,
"loss": 0.717,
"step": 6248
},
{
"epoch": 1.8141965452170128,
"grad_norm": 3.8523659706115723,
"learning_rate": 7.553855249081538e-06,
"loss": 0.8559,
"step": 6249
},
{
"epoch": 1.814486863115111,
"grad_norm": 3.714585781097412,
"learning_rate": 7.553029699241059e-06,
"loss": 0.7097,
"step": 6250
},
{
"epoch": 1.8147771810132096,
"grad_norm": 3.495954751968384,
"learning_rate": 7.552204055245858e-06,
"loss": 0.7008,
"step": 6251
},
{
"epoch": 1.8150674989113078,
"grad_norm": 3.6363167762756348,
"learning_rate": 7.551378317126384e-06,
"loss": 0.7602,
"step": 6252
},
{
"epoch": 1.8153578168094064,
"grad_norm": 3.7626495361328125,
"learning_rate": 7.5505524849130915e-06,
"loss": 0.8059,
"step": 6253
},
{
"epoch": 1.8156481347075046,
"grad_norm": 3.3501880168914795,
"learning_rate": 7.549726558636434e-06,
"loss": 0.7476,
"step": 6254
},
{
"epoch": 1.8159384526056033,
"grad_norm": 3.376075267791748,
"learning_rate": 7.548900538326874e-06,
"loss": 0.7685,
"step": 6255
},
{
"epoch": 1.8162287705037015,
"grad_norm": 3.886094570159912,
"learning_rate": 7.548074424014873e-06,
"loss": 0.8429,
"step": 6256
},
{
"epoch": 1.8165190884018,
"grad_norm": 3.8451836109161377,
"learning_rate": 7.5472482157308975e-06,
"loss": 0.8856,
"step": 6257
},
{
"epoch": 1.8168094062998983,
"grad_norm": 3.035158395767212,
"learning_rate": 7.54642191350542e-06,
"loss": 0.6661,
"step": 6258
},
{
"epoch": 1.817099724197997,
"grad_norm": 3.0387699604034424,
"learning_rate": 7.545595517368913e-06,
"loss": 0.669,
"step": 6259
},
{
"epoch": 1.8173900420960951,
"grad_norm": 3.523467540740967,
"learning_rate": 7.544769027351853e-06,
"loss": 0.7385,
"step": 6260
},
{
"epoch": 1.8176803599941938,
"grad_norm": 3.1985654830932617,
"learning_rate": 7.543942443484721e-06,
"loss": 0.7173,
"step": 6261
},
{
"epoch": 1.817970677892292,
"grad_norm": 3.688586473464966,
"learning_rate": 7.543115765798002e-06,
"loss": 0.7391,
"step": 6262
},
{
"epoch": 1.8182609957903906,
"grad_norm": 3.3867619037628174,
"learning_rate": 7.542288994322181e-06,
"loss": 0.7213,
"step": 6263
},
{
"epoch": 1.8185513136884888,
"grad_norm": 3.24111008644104,
"learning_rate": 7.5414621290877525e-06,
"loss": 0.744,
"step": 6264
},
{
"epoch": 1.8188416315865874,
"grad_norm": 3.452265739440918,
"learning_rate": 7.540635170125208e-06,
"loss": 0.6929,
"step": 6265
},
{
"epoch": 1.8191319494846856,
"grad_norm": 3.555257558822632,
"learning_rate": 7.539808117465047e-06,
"loss": 0.8184,
"step": 6266
},
{
"epoch": 1.8194222673827842,
"grad_norm": 3.979184865951538,
"learning_rate": 7.538980971137771e-06,
"loss": 0.85,
"step": 6267
},
{
"epoch": 1.8197125852808824,
"grad_norm": 3.006906747817993,
"learning_rate": 7.538153731173885e-06,
"loss": 0.6521,
"step": 6268
},
{
"epoch": 1.820002903178981,
"grad_norm": 3.9368133544921875,
"learning_rate": 7.5373263976038944e-06,
"loss": 0.9165,
"step": 6269
},
{
"epoch": 1.8202932210770792,
"grad_norm": 3.690107583999634,
"learning_rate": 7.536498970458314e-06,
"loss": 0.7681,
"step": 6270
},
{
"epoch": 1.8205835389751779,
"grad_norm": 3.7240521907806396,
"learning_rate": 7.535671449767659e-06,
"loss": 0.7563,
"step": 6271
},
{
"epoch": 1.820873856873276,
"grad_norm": 3.656486988067627,
"learning_rate": 7.534843835562448e-06,
"loss": 0.7902,
"step": 6272
},
{
"epoch": 1.8211641747713747,
"grad_norm": 3.4625377655029297,
"learning_rate": 7.5340161278732e-06,
"loss": 0.7638,
"step": 6273
},
{
"epoch": 1.8214544926694731,
"grad_norm": 3.750249147415161,
"learning_rate": 7.533188326730444e-06,
"loss": 0.8196,
"step": 6274
},
{
"epoch": 1.8217448105675715,
"grad_norm": 3.308974266052246,
"learning_rate": 7.532360432164707e-06,
"loss": 0.7057,
"step": 6275
},
{
"epoch": 1.82203512846567,
"grad_norm": 3.5016844272613525,
"learning_rate": 7.531532444206524e-06,
"loss": 0.8291,
"step": 6276
},
{
"epoch": 1.8223254463637684,
"grad_norm": 3.492377758026123,
"learning_rate": 7.530704362886428e-06,
"loss": 0.7162,
"step": 6277
},
{
"epoch": 1.8226157642618668,
"grad_norm": 3.7556068897247314,
"learning_rate": 7.5298761882349594e-06,
"loss": 0.7858,
"step": 6278
},
{
"epoch": 1.8229060821599652,
"grad_norm": 3.9492125511169434,
"learning_rate": 7.5290479202826596e-06,
"loss": 0.8273,
"step": 6279
},
{
"epoch": 1.8231964000580636,
"grad_norm": 4.034613609313965,
"learning_rate": 7.528219559060077e-06,
"loss": 0.8135,
"step": 6280
},
{
"epoch": 1.823486717956162,
"grad_norm": 3.474411725997925,
"learning_rate": 7.527391104597761e-06,
"loss": 0.8682,
"step": 6281
},
{
"epoch": 1.8237770358542604,
"grad_norm": 3.4744694232940674,
"learning_rate": 7.526562556926265e-06,
"loss": 0.7112,
"step": 6282
},
{
"epoch": 1.8240673537523588,
"grad_norm": 3.711562395095825,
"learning_rate": 7.525733916076142e-06,
"loss": 0.76,
"step": 6283
},
{
"epoch": 1.8243576716504573,
"grad_norm": 3.230764150619507,
"learning_rate": 7.524905182077955e-06,
"loss": 0.6565,
"step": 6284
},
{
"epoch": 1.8246479895485557,
"grad_norm": 3.4089322090148926,
"learning_rate": 7.5240763549622685e-06,
"loss": 0.6973,
"step": 6285
},
{
"epoch": 1.824938307446654,
"grad_norm": 3.709282636642456,
"learning_rate": 7.523247434759646e-06,
"loss": 0.8532,
"step": 6286
},
{
"epoch": 1.8252286253447525,
"grad_norm": 3.3632187843322754,
"learning_rate": 7.522418421500662e-06,
"loss": 0.8516,
"step": 6287
},
{
"epoch": 1.825518943242851,
"grad_norm": 3.4261248111724854,
"learning_rate": 7.5215893152158846e-06,
"loss": 0.8845,
"step": 6288
},
{
"epoch": 1.8258092611409493,
"grad_norm": 3.668027400970459,
"learning_rate": 7.5207601159358955e-06,
"loss": 0.7571,
"step": 6289
},
{
"epoch": 1.8260995790390477,
"grad_norm": 3.609893321990967,
"learning_rate": 7.519930823691272e-06,
"loss": 0.847,
"step": 6290
},
{
"epoch": 1.8263898969371462,
"grad_norm": 3.379772186279297,
"learning_rate": 7.519101438512602e-06,
"loss": 0.734,
"step": 6291
},
{
"epoch": 1.8266802148352446,
"grad_norm": 3.4122653007507324,
"learning_rate": 7.5182719604304685e-06,
"loss": 0.7448,
"step": 6292
},
{
"epoch": 1.826970532733343,
"grad_norm": 3.6492867469787598,
"learning_rate": 7.5174423894754664e-06,
"loss": 0.763,
"step": 6293
},
{
"epoch": 1.8272608506314414,
"grad_norm": 3.439892292022705,
"learning_rate": 7.5166127256781876e-06,
"loss": 0.7315,
"step": 6294
},
{
"epoch": 1.8275511685295398,
"grad_norm": 3.6350364685058594,
"learning_rate": 7.515782969069229e-06,
"loss": 0.7174,
"step": 6295
},
{
"epoch": 1.8278414864276382,
"grad_norm": 3.2767841815948486,
"learning_rate": 7.514953119679193e-06,
"loss": 0.714,
"step": 6296
},
{
"epoch": 1.8281318043257366,
"grad_norm": 3.690453052520752,
"learning_rate": 7.514123177538686e-06,
"loss": 0.6819,
"step": 6297
},
{
"epoch": 1.828422122223835,
"grad_norm": 3.7709054946899414,
"learning_rate": 7.513293142678313e-06,
"loss": 0.7278,
"step": 6298
},
{
"epoch": 1.8287124401219335,
"grad_norm": 3.1825685501098633,
"learning_rate": 7.5124630151286845e-06,
"loss": 0.7173,
"step": 6299
},
{
"epoch": 1.829002758020032,
"grad_norm": 3.712411880493164,
"learning_rate": 7.511632794920419e-06,
"loss": 0.7861,
"step": 6300
},
{
"epoch": 1.8292930759181303,
"grad_norm": 3.5475590229034424,
"learning_rate": 7.510802482084132e-06,
"loss": 0.678,
"step": 6301
},
{
"epoch": 1.829583393816229,
"grad_norm": 4.581618309020996,
"learning_rate": 7.509972076650446e-06,
"loss": 0.8925,
"step": 6302
},
{
"epoch": 1.8298737117143271,
"grad_norm": 3.616469383239746,
"learning_rate": 7.509141578649986e-06,
"loss": 0.7198,
"step": 6303
},
{
"epoch": 1.8301640296124257,
"grad_norm": 3.2408971786499023,
"learning_rate": 7.50831098811338e-06,
"loss": 0.7338,
"step": 6304
},
{
"epoch": 1.830454347510524,
"grad_norm": 3.7839319705963135,
"learning_rate": 7.50748030507126e-06,
"loss": 0.8358,
"step": 6305
},
{
"epoch": 1.8307446654086226,
"grad_norm": 3.9839742183685303,
"learning_rate": 7.506649529554261e-06,
"loss": 0.8758,
"step": 6306
},
{
"epoch": 1.8310349833067208,
"grad_norm": 4.165936470031738,
"learning_rate": 7.505818661593023e-06,
"loss": 0.8142,
"step": 6307
},
{
"epoch": 1.8313253012048194,
"grad_norm": 3.3792271614074707,
"learning_rate": 7.504987701218187e-06,
"loss": 0.8431,
"step": 6308
},
{
"epoch": 1.8316156191029176,
"grad_norm": 3.979881525039673,
"learning_rate": 7.5041566484603975e-06,
"loss": 0.9142,
"step": 6309
},
{
"epoch": 1.8319059370010162,
"grad_norm": 3.540987253189087,
"learning_rate": 7.503325503350307e-06,
"loss": 0.8675,
"step": 6310
},
{
"epoch": 1.8321962548991144,
"grad_norm": 3.5563859939575195,
"learning_rate": 7.502494265918563e-06,
"loss": 0.779,
"step": 6311
},
{
"epoch": 1.832486572797213,
"grad_norm": 3.8116211891174316,
"learning_rate": 7.501662936195824e-06,
"loss": 0.8108,
"step": 6312
},
{
"epoch": 1.8327768906953112,
"grad_norm": 3.5146663188934326,
"learning_rate": 7.500831514212749e-06,
"loss": 0.7253,
"step": 6313
},
{
"epoch": 1.8330672085934099,
"grad_norm": 3.380580425262451,
"learning_rate": 7.500000000000001e-06,
"loss": 0.7129,
"step": 6314
},
{
"epoch": 1.833357526491508,
"grad_norm": 3.595702886581421,
"learning_rate": 7.499168393588244e-06,
"loss": 0.7543,
"step": 6315
},
{
"epoch": 1.8336478443896067,
"grad_norm": 3.2393553256988525,
"learning_rate": 7.498336695008148e-06,
"loss": 0.773,
"step": 6316
},
{
"epoch": 1.833938162287705,
"grad_norm": 4.056413650512695,
"learning_rate": 7.497504904290388e-06,
"loss": 0.8839,
"step": 6317
},
{
"epoch": 1.8342284801858035,
"grad_norm": 3.646803617477417,
"learning_rate": 7.496673021465637e-06,
"loss": 0.8599,
"step": 6318
},
{
"epoch": 1.8345187980839017,
"grad_norm": 3.5614094734191895,
"learning_rate": 7.495841046564577e-06,
"loss": 0.8281,
"step": 6319
},
{
"epoch": 1.8348091159820004,
"grad_norm": 3.5541832447052,
"learning_rate": 7.495008979617887e-06,
"loss": 0.7304,
"step": 6320
},
{
"epoch": 1.8350994338800986,
"grad_norm": 3.597524404525757,
"learning_rate": 7.494176820656258e-06,
"loss": 0.757,
"step": 6321
},
{
"epoch": 1.8353897517781972,
"grad_norm": 3.2266886234283447,
"learning_rate": 7.493344569710377e-06,
"loss": 0.7391,
"step": 6322
},
{
"epoch": 1.8356800696762954,
"grad_norm": 3.777841329574585,
"learning_rate": 7.492512226810938e-06,
"loss": 0.7076,
"step": 6323
},
{
"epoch": 1.835970387574394,
"grad_norm": 3.5459866523742676,
"learning_rate": 7.491679791988636e-06,
"loss": 0.7855,
"step": 6324
},
{
"epoch": 1.8362607054724924,
"grad_norm": 3.8192386627197266,
"learning_rate": 7.490847265274174e-06,
"loss": 0.7813,
"step": 6325
},
{
"epoch": 1.8365510233705908,
"grad_norm": 3.7294278144836426,
"learning_rate": 7.490014646698252e-06,
"loss": 0.7653,
"step": 6326
},
{
"epoch": 1.8368413412686893,
"grad_norm": 3.3755605220794678,
"learning_rate": 7.489181936291578e-06,
"loss": 0.7804,
"step": 6327
},
{
"epoch": 1.8371316591667877,
"grad_norm": 3.258549928665161,
"learning_rate": 7.488349134084864e-06,
"loss": 0.6664,
"step": 6328
},
{
"epoch": 1.837421977064886,
"grad_norm": 3.3586201667785645,
"learning_rate": 7.487516240108819e-06,
"loss": 0.7859,
"step": 6329
},
{
"epoch": 1.8377122949629845,
"grad_norm": 3.6065549850463867,
"learning_rate": 7.486683254394164e-06,
"loss": 0.7288,
"step": 6330
},
{
"epoch": 1.838002612861083,
"grad_norm": 3.9054665565490723,
"learning_rate": 7.485850176971615e-06,
"loss": 0.7768,
"step": 6331
},
{
"epoch": 1.8382929307591813,
"grad_norm": 3.5716748237609863,
"learning_rate": 7.4850170078719e-06,
"loss": 0.7479,
"step": 6332
},
{
"epoch": 1.8385832486572797,
"grad_norm": 3.473572254180908,
"learning_rate": 7.484183747125743e-06,
"loss": 0.8524,
"step": 6333
},
{
"epoch": 1.8388735665553781,
"grad_norm": 3.5693931579589844,
"learning_rate": 7.483350394763875e-06,
"loss": 0.8059,
"step": 6334
},
{
"epoch": 1.8391638844534766,
"grad_norm": 3.8996100425720215,
"learning_rate": 7.48251695081703e-06,
"loss": 0.8808,
"step": 6335
},
{
"epoch": 1.839454202351575,
"grad_norm": 3.6452038288116455,
"learning_rate": 7.481683415315947e-06,
"loss": 0.7321,
"step": 6336
},
{
"epoch": 1.8397445202496734,
"grad_norm": 3.863975763320923,
"learning_rate": 7.480849788291363e-06,
"loss": 0.8304,
"step": 6337
},
{
"epoch": 1.8400348381477718,
"grad_norm": 3.3858823776245117,
"learning_rate": 7.480016069774022e-06,
"loss": 0.7193,
"step": 6338
},
{
"epoch": 1.8403251560458702,
"grad_norm": 3.359248161315918,
"learning_rate": 7.479182259794673e-06,
"loss": 0.804,
"step": 6339
},
{
"epoch": 1.8406154739439686,
"grad_norm": 3.686079740524292,
"learning_rate": 7.478348358384068e-06,
"loss": 0.8708,
"step": 6340
},
{
"epoch": 1.840905791842067,
"grad_norm": 3.9238340854644775,
"learning_rate": 7.477514365572958e-06,
"loss": 0.8281,
"step": 6341
},
{
"epoch": 1.8411961097401655,
"grad_norm": 3.203186273574829,
"learning_rate": 7.4766802813921016e-06,
"loss": 0.7698,
"step": 6342
},
{
"epoch": 1.8414864276382639,
"grad_norm": 3.614574432373047,
"learning_rate": 7.475846105872258e-06,
"loss": 0.7622,
"step": 6343
},
{
"epoch": 1.8417767455363623,
"grad_norm": 3.4722697734832764,
"learning_rate": 7.475011839044193e-06,
"loss": 0.7134,
"step": 6344
},
{
"epoch": 1.8420670634344607,
"grad_norm": 3.442232608795166,
"learning_rate": 7.4741774809386734e-06,
"loss": 0.7563,
"step": 6345
},
{
"epoch": 1.842357381332559,
"grad_norm": 4.323866844177246,
"learning_rate": 7.473343031586472e-06,
"loss": 0.8256,
"step": 6346
},
{
"epoch": 1.8426476992306575,
"grad_norm": 3.4767138957977295,
"learning_rate": 7.47250849101836e-06,
"loss": 0.6983,
"step": 6347
},
{
"epoch": 1.842938017128756,
"grad_norm": 3.646294593811035,
"learning_rate": 7.471673859265115e-06,
"loss": 0.8051,
"step": 6348
},
{
"epoch": 1.8432283350268546,
"grad_norm": 3.3605406284332275,
"learning_rate": 7.470839136357521e-06,
"loss": 0.7647,
"step": 6349
},
{
"epoch": 1.8435186529249528,
"grad_norm": 3.6406664848327637,
"learning_rate": 7.470004322326358e-06,
"loss": 0.844,
"step": 6350
},
{
"epoch": 1.8438089708230514,
"grad_norm": 3.698698043823242,
"learning_rate": 7.469169417202418e-06,
"loss": 0.7931,
"step": 6351
},
{
"epoch": 1.8440992887211496,
"grad_norm": 4.0768280029296875,
"learning_rate": 7.468334421016486e-06,
"loss": 0.8189,
"step": 6352
},
{
"epoch": 1.8443896066192482,
"grad_norm": 3.440924644470215,
"learning_rate": 7.467499333799364e-06,
"loss": 0.6892,
"step": 6353
},
{
"epoch": 1.8446799245173464,
"grad_norm": 3.8425514698028564,
"learning_rate": 7.466664155581844e-06,
"loss": 0.817,
"step": 6354
},
{
"epoch": 1.844970242415445,
"grad_norm": 3.595719337463379,
"learning_rate": 7.465828886394729e-06,
"loss": 0.7626,
"step": 6355
},
{
"epoch": 1.8452605603135432,
"grad_norm": 3.3320703506469727,
"learning_rate": 7.464993526268822e-06,
"loss": 0.6524,
"step": 6356
},
{
"epoch": 1.8455508782116419,
"grad_norm": 3.798980951309204,
"learning_rate": 7.464158075234934e-06,
"loss": 0.7571,
"step": 6357
},
{
"epoch": 1.84584119610974,
"grad_norm": 3.508420944213867,
"learning_rate": 7.463322533323874e-06,
"loss": 0.7707,
"step": 6358
},
{
"epoch": 1.8461315140078387,
"grad_norm": 3.330502986907959,
"learning_rate": 7.4624869005664554e-06,
"loss": 0.6898,
"step": 6359
},
{
"epoch": 1.846421831905937,
"grad_norm": 3.756951332092285,
"learning_rate": 7.4616511769934985e-06,
"loss": 0.8923,
"step": 6360
},
{
"epoch": 1.8467121498040355,
"grad_norm": 3.696202516555786,
"learning_rate": 7.460815362635821e-06,
"loss": 0.851,
"step": 6361
},
{
"epoch": 1.8470024677021337,
"grad_norm": 3.410972833633423,
"learning_rate": 7.45997945752425e-06,
"loss": 0.7278,
"step": 6362
},
{
"epoch": 1.8472927856002324,
"grad_norm": 3.7810752391815186,
"learning_rate": 7.4591434616896156e-06,
"loss": 0.8884,
"step": 6363
},
{
"epoch": 1.8475831034983305,
"grad_norm": 3.368793487548828,
"learning_rate": 7.458307375162743e-06,
"loss": 0.6754,
"step": 6364
},
{
"epoch": 1.8478734213964292,
"grad_norm": 3.527655839920044,
"learning_rate": 7.4574711979744705e-06,
"loss": 0.8358,
"step": 6365
},
{
"epoch": 1.8481637392945274,
"grad_norm": 3.6964645385742188,
"learning_rate": 7.4566349301556366e-06,
"loss": 0.776,
"step": 6366
},
{
"epoch": 1.848454057192626,
"grad_norm": 3.480604410171509,
"learning_rate": 7.45579857173708e-06,
"loss": 0.813,
"step": 6367
},
{
"epoch": 1.8487443750907242,
"grad_norm": 3.0932321548461914,
"learning_rate": 7.454962122749648e-06,
"loss": 0.6029,
"step": 6368
},
{
"epoch": 1.8490346929888228,
"grad_norm": 3.5673985481262207,
"learning_rate": 7.454125583224186e-06,
"loss": 0.8752,
"step": 6369
},
{
"epoch": 1.849325010886921,
"grad_norm": 3.8833866119384766,
"learning_rate": 7.453288953191547e-06,
"loss": 0.8049,
"step": 6370
},
{
"epoch": 1.8496153287850197,
"grad_norm": 3.3621320724487305,
"learning_rate": 7.452452232682585e-06,
"loss": 0.778,
"step": 6371
},
{
"epoch": 1.8499056466831179,
"grad_norm": 3.439912796020508,
"learning_rate": 7.451615421728158e-06,
"loss": 0.7637,
"step": 6372
},
{
"epoch": 1.8501959645812165,
"grad_norm": 3.4569733142852783,
"learning_rate": 7.450778520359127e-06,
"loss": 0.757,
"step": 6373
},
{
"epoch": 1.850486282479315,
"grad_norm": 3.3859477043151855,
"learning_rate": 7.449941528606356e-06,
"loss": 0.7486,
"step": 6374
},
{
"epoch": 1.8507766003774133,
"grad_norm": 4.253404140472412,
"learning_rate": 7.449104446500713e-06,
"loss": 0.9496,
"step": 6375
},
{
"epoch": 1.8510669182755117,
"grad_norm": 3.733933448791504,
"learning_rate": 7.448267274073072e-06,
"loss": 0.8169,
"step": 6376
},
{
"epoch": 1.8513572361736101,
"grad_norm": 3.200833320617676,
"learning_rate": 7.447430011354304e-06,
"loss": 0.6549,
"step": 6377
},
{
"epoch": 1.8516475540717086,
"grad_norm": 3.777592658996582,
"learning_rate": 7.44659265837529e-06,
"loss": 0.7889,
"step": 6378
},
{
"epoch": 1.851937871969807,
"grad_norm": 3.5749125480651855,
"learning_rate": 7.4457552151669085e-06,
"loss": 0.8438,
"step": 6379
},
{
"epoch": 1.8522281898679054,
"grad_norm": 3.531050205230713,
"learning_rate": 7.444917681760046e-06,
"loss": 0.8027,
"step": 6380
},
{
"epoch": 1.8525185077660038,
"grad_norm": 3.0747227668762207,
"learning_rate": 7.444080058185587e-06,
"loss": 0.6814,
"step": 6381
},
{
"epoch": 1.8528088256641022,
"grad_norm": 3.703937530517578,
"learning_rate": 7.443242344474429e-06,
"loss": 0.8243,
"step": 6382
},
{
"epoch": 1.8530991435622006,
"grad_norm": 3.3314077854156494,
"learning_rate": 7.442404540657461e-06,
"loss": 0.7393,
"step": 6383
},
{
"epoch": 1.853389461460299,
"grad_norm": 3.324211835861206,
"learning_rate": 7.4415666467655835e-06,
"loss": 0.7398,
"step": 6384
},
{
"epoch": 1.8536797793583975,
"grad_norm": 3.0877864360809326,
"learning_rate": 7.440728662829697e-06,
"loss": 0.7265,
"step": 6385
},
{
"epoch": 1.8539700972564959,
"grad_norm": 3.642578363418579,
"learning_rate": 7.439890588880705e-06,
"loss": 0.7797,
"step": 6386
},
{
"epoch": 1.8542604151545943,
"grad_norm": 3.4550280570983887,
"learning_rate": 7.439052424949518e-06,
"loss": 0.7592,
"step": 6387
},
{
"epoch": 1.8545507330526927,
"grad_norm": 3.4730403423309326,
"learning_rate": 7.438214171067042e-06,
"loss": 0.7711,
"step": 6388
},
{
"epoch": 1.854841050950791,
"grad_norm": 3.5537898540496826,
"learning_rate": 7.437375827264198e-06,
"loss": 0.9184,
"step": 6389
},
{
"epoch": 1.8551313688488895,
"grad_norm": 3.556471586227417,
"learning_rate": 7.4365373935719e-06,
"loss": 0.7449,
"step": 6390
},
{
"epoch": 1.855421686746988,
"grad_norm": 3.9682884216308594,
"learning_rate": 7.435698870021071e-06,
"loss": 0.8094,
"step": 6391
},
{
"epoch": 1.8557120046450863,
"grad_norm": 3.6690304279327393,
"learning_rate": 7.434860256642633e-06,
"loss": 0.8124,
"step": 6392
},
{
"epoch": 1.8560023225431848,
"grad_norm": 3.4016544818878174,
"learning_rate": 7.434021553467514e-06,
"loss": 0.8016,
"step": 6393
},
{
"epoch": 1.8562926404412832,
"grad_norm": 3.5285894870758057,
"learning_rate": 7.433182760526647e-06,
"loss": 0.802,
"step": 6394
},
{
"epoch": 1.8565829583393816,
"grad_norm": 3.331476926803589,
"learning_rate": 7.432343877850966e-06,
"loss": 0.6942,
"step": 6395
},
{
"epoch": 1.85687327623748,
"grad_norm": 3.557368516921997,
"learning_rate": 7.431504905471407e-06,
"loss": 0.696,
"step": 6396
},
{
"epoch": 1.8571635941355784,
"grad_norm": 3.8558270931243896,
"learning_rate": 7.4306658434189126e-06,
"loss": 0.8857,
"step": 6397
},
{
"epoch": 1.8574539120336768,
"grad_norm": 3.4773919582366943,
"learning_rate": 7.4298266917244266e-06,
"loss": 0.6939,
"step": 6398
},
{
"epoch": 1.8577442299317752,
"grad_norm": 3.5946531295776367,
"learning_rate": 7.428987450418896e-06,
"loss": 0.8188,
"step": 6399
},
{
"epoch": 1.8580345478298739,
"grad_norm": 3.5143725872039795,
"learning_rate": 7.428148119533274e-06,
"loss": 0.8558,
"step": 6400
},
{
"epoch": 1.858324865727972,
"grad_norm": 3.770815372467041,
"learning_rate": 7.427308699098511e-06,
"loss": 0.7335,
"step": 6401
},
{
"epoch": 1.8586151836260707,
"grad_norm": 3.5556554794311523,
"learning_rate": 7.426469189145567e-06,
"loss": 0.7183,
"step": 6402
},
{
"epoch": 1.858905501524169,
"grad_norm": 3.102630138397217,
"learning_rate": 7.425629589705401e-06,
"loss": 0.8115,
"step": 6403
},
{
"epoch": 1.8591958194222675,
"grad_norm": 3.410172700881958,
"learning_rate": 7.42478990080898e-06,
"loss": 0.7377,
"step": 6404
},
{
"epoch": 1.8594861373203657,
"grad_norm": 3.825101613998413,
"learning_rate": 7.423950122487269e-06,
"loss": 0.8198,
"step": 6405
},
{
"epoch": 1.8597764552184644,
"grad_norm": 3.740804672241211,
"learning_rate": 7.423110254771238e-06,
"loss": 0.724,
"step": 6406
},
{
"epoch": 1.8600667731165625,
"grad_norm": 4.087116718292236,
"learning_rate": 7.4222702976918635e-06,
"loss": 0.8019,
"step": 6407
},
{
"epoch": 1.8603570910146612,
"grad_norm": 3.577281951904297,
"learning_rate": 7.421430251280123e-06,
"loss": 0.7734,
"step": 6408
},
{
"epoch": 1.8606474089127594,
"grad_norm": 3.1149165630340576,
"learning_rate": 7.420590115566995e-06,
"loss": 0.6023,
"step": 6409
},
{
"epoch": 1.860937726810858,
"grad_norm": 3.652672052383423,
"learning_rate": 7.419749890583464e-06,
"loss": 0.8898,
"step": 6410
},
{
"epoch": 1.8612280447089562,
"grad_norm": 3.6932666301727295,
"learning_rate": 7.418909576360515e-06,
"loss": 0.8296,
"step": 6411
},
{
"epoch": 1.8615183626070548,
"grad_norm": 3.1710166931152344,
"learning_rate": 7.418069172929144e-06,
"loss": 0.6779,
"step": 6412
},
{
"epoch": 1.861808680505153,
"grad_norm": 3.5479466915130615,
"learning_rate": 7.417228680320341e-06,
"loss": 0.7505,
"step": 6413
},
{
"epoch": 1.8620989984032517,
"grad_norm": 3.41398549079895,
"learning_rate": 7.416388098565103e-06,
"loss": 0.8062,
"step": 6414
},
{
"epoch": 1.8623893163013499,
"grad_norm": 3.561964511871338,
"learning_rate": 7.41554742769443e-06,
"loss": 0.7902,
"step": 6415
},
{
"epoch": 1.8626796341994485,
"grad_norm": 3.3961057662963867,
"learning_rate": 7.414706667739327e-06,
"loss": 0.7915,
"step": 6416
},
{
"epoch": 1.8629699520975467,
"grad_norm": 3.501466751098633,
"learning_rate": 7.413865818730801e-06,
"loss": 0.829,
"step": 6417
},
{
"epoch": 1.8632602699956453,
"grad_norm": 3.181313991546631,
"learning_rate": 7.413024880699861e-06,
"loss": 0.6991,
"step": 6418
},
{
"epoch": 1.8635505878937435,
"grad_norm": 3.7406692504882812,
"learning_rate": 7.412183853677522e-06,
"loss": 0.999,
"step": 6419
},
{
"epoch": 1.8638409057918421,
"grad_norm": 3.098989248275757,
"learning_rate": 7.4113427376947966e-06,
"loss": 0.7114,
"step": 6420
},
{
"epoch": 1.8641312236899403,
"grad_norm": 3.511604070663452,
"learning_rate": 7.4105015327827115e-06,
"loss": 0.6936,
"step": 6421
},
{
"epoch": 1.864421541588039,
"grad_norm": 3.8496603965759277,
"learning_rate": 7.409660238972285e-06,
"loss": 0.9334,
"step": 6422
},
{
"epoch": 1.8647118594861372,
"grad_norm": 3.1544764041900635,
"learning_rate": 7.4088188562945454e-06,
"loss": 0.7209,
"step": 6423
},
{
"epoch": 1.8650021773842358,
"grad_norm": 4.877438068389893,
"learning_rate": 7.4079773847805216e-06,
"loss": 0.9736,
"step": 6424
},
{
"epoch": 1.8652924952823342,
"grad_norm": 3.5776352882385254,
"learning_rate": 7.407135824461247e-06,
"loss": 0.7248,
"step": 6425
},
{
"epoch": 1.8655828131804326,
"grad_norm": 3.487882375717163,
"learning_rate": 7.406294175367758e-06,
"loss": 0.7247,
"step": 6426
},
{
"epoch": 1.865873131078531,
"grad_norm": 3.9391684532165527,
"learning_rate": 7.405452437531098e-06,
"loss": 0.8622,
"step": 6427
},
{
"epoch": 1.8661634489766294,
"grad_norm": 3.6147098541259766,
"learning_rate": 7.4046106109823045e-06,
"loss": 0.7524,
"step": 6428
},
{
"epoch": 1.8664537668747279,
"grad_norm": 4.174846649169922,
"learning_rate": 7.403768695752426e-06,
"loss": 0.842,
"step": 6429
},
{
"epoch": 1.8667440847728263,
"grad_norm": 3.839925527572632,
"learning_rate": 7.402926691872512e-06,
"loss": 0.853,
"step": 6430
},
{
"epoch": 1.8670344026709247,
"grad_norm": 3.8808486461639404,
"learning_rate": 7.402084599373616e-06,
"loss": 0.7748,
"step": 6431
},
{
"epoch": 1.867324720569023,
"grad_norm": 3.5012404918670654,
"learning_rate": 7.401242418286792e-06,
"loss": 0.8308,
"step": 6432
},
{
"epoch": 1.8676150384671215,
"grad_norm": 3.0792105197906494,
"learning_rate": 7.400400148643101e-06,
"loss": 0.6845,
"step": 6433
},
{
"epoch": 1.86790535636522,
"grad_norm": 3.1592519283294678,
"learning_rate": 7.399557790473604e-06,
"loss": 0.7151,
"step": 6434
},
{
"epoch": 1.8681956742633183,
"grad_norm": 3.6104846000671387,
"learning_rate": 7.398715343809368e-06,
"loss": 0.7171,
"step": 6435
},
{
"epoch": 1.8684859921614168,
"grad_norm": 3.654996633529663,
"learning_rate": 7.397872808681465e-06,
"loss": 0.8835,
"step": 6436
},
{
"epoch": 1.8687763100595152,
"grad_norm": 3.450308322906494,
"learning_rate": 7.397030185120962e-06,
"loss": 0.7241,
"step": 6437
},
{
"epoch": 1.8690666279576136,
"grad_norm": 4.059999465942383,
"learning_rate": 7.396187473158937e-06,
"loss": 0.8683,
"step": 6438
},
{
"epoch": 1.869356945855712,
"grad_norm": 3.439053773880005,
"learning_rate": 7.395344672826469e-06,
"loss": 0.6581,
"step": 6439
},
{
"epoch": 1.8696472637538104,
"grad_norm": 3.5375428199768066,
"learning_rate": 7.394501784154641e-06,
"loss": 0.7848,
"step": 6440
},
{
"epoch": 1.8699375816519088,
"grad_norm": 3.373065710067749,
"learning_rate": 7.393658807174536e-06,
"loss": 0.6419,
"step": 6441
},
{
"epoch": 1.8702278995500072,
"grad_norm": 3.765425682067871,
"learning_rate": 7.392815741917245e-06,
"loss": 0.8696,
"step": 6442
},
{
"epoch": 1.8705182174481056,
"grad_norm": 3.7273731231689453,
"learning_rate": 7.391972588413858e-06,
"loss": 0.6883,
"step": 6443
},
{
"epoch": 1.870808535346204,
"grad_norm": 3.4617130756378174,
"learning_rate": 7.391129346695472e-06,
"loss": 0.8119,
"step": 6444
},
{
"epoch": 1.8710988532443025,
"grad_norm": 3.6720211505889893,
"learning_rate": 7.390286016793185e-06,
"loss": 0.7574,
"step": 6445
},
{
"epoch": 1.8713891711424009,
"grad_norm": 3.469089984893799,
"learning_rate": 7.389442598738098e-06,
"loss": 0.8107,
"step": 6446
},
{
"epoch": 1.8716794890404993,
"grad_norm": 3.012542963027954,
"learning_rate": 7.388599092561315e-06,
"loss": 0.7098,
"step": 6447
},
{
"epoch": 1.8719698069385977,
"grad_norm": 3.592057943344116,
"learning_rate": 7.387755498293947e-06,
"loss": 0.6834,
"step": 6448
},
{
"epoch": 1.8722601248366961,
"grad_norm": 3.2716832160949707,
"learning_rate": 7.386911815967104e-06,
"loss": 0.6979,
"step": 6449
},
{
"epoch": 1.8725504427347945,
"grad_norm": 3.7392630577087402,
"learning_rate": 7.386068045611899e-06,
"loss": 0.7324,
"step": 6450
},
{
"epoch": 1.8728407606328932,
"grad_norm": 3.501025676727295,
"learning_rate": 7.385224187259451e-06,
"loss": 0.8299,
"step": 6451
},
{
"epoch": 1.8731310785309914,
"grad_norm": 3.846646547317505,
"learning_rate": 7.384380240940883e-06,
"loss": 0.7621,
"step": 6452
},
{
"epoch": 1.87342139642909,
"grad_norm": 3.536499261856079,
"learning_rate": 7.383536206687317e-06,
"loss": 0.7554,
"step": 6453
},
{
"epoch": 1.8737117143271882,
"grad_norm": 3.91064715385437,
"learning_rate": 7.382692084529881e-06,
"loss": 0.7909,
"step": 6454
},
{
"epoch": 1.8740020322252868,
"grad_norm": 3.2774910926818848,
"learning_rate": 7.381847874499708e-06,
"loss": 0.7301,
"step": 6455
},
{
"epoch": 1.874292350123385,
"grad_norm": 4.022462368011475,
"learning_rate": 7.38100357662793e-06,
"loss": 0.7458,
"step": 6456
},
{
"epoch": 1.8745826680214837,
"grad_norm": 4.091184139251709,
"learning_rate": 7.380159190945685e-06,
"loss": 0.7613,
"step": 6457
},
{
"epoch": 1.8748729859195818,
"grad_norm": 3.5496578216552734,
"learning_rate": 7.379314717484113e-06,
"loss": 0.7163,
"step": 6458
},
{
"epoch": 1.8751633038176805,
"grad_norm": 3.375134229660034,
"learning_rate": 7.37847015627436e-06,
"loss": 0.7181,
"step": 6459
},
{
"epoch": 1.8754536217157787,
"grad_norm": 3.6883907318115234,
"learning_rate": 7.3776255073475696e-06,
"loss": 0.7514,
"step": 6460
},
{
"epoch": 1.8757439396138773,
"grad_norm": 3.7220544815063477,
"learning_rate": 7.376780770734895e-06,
"loss": 0.8063,
"step": 6461
},
{
"epoch": 1.8760342575119755,
"grad_norm": 3.9749653339385986,
"learning_rate": 7.375935946467487e-06,
"loss": 0.8315,
"step": 6462
},
{
"epoch": 1.8763245754100741,
"grad_norm": 3.658550500869751,
"learning_rate": 7.375091034576507e-06,
"loss": 0.8187,
"step": 6463
},
{
"epoch": 1.8766148933081723,
"grad_norm": 3.2026803493499756,
"learning_rate": 7.374246035093111e-06,
"loss": 0.7014,
"step": 6464
},
{
"epoch": 1.876905211206271,
"grad_norm": 3.760976791381836,
"learning_rate": 7.373400948048464e-06,
"loss": 0.8147,
"step": 6465
},
{
"epoch": 1.8771955291043692,
"grad_norm": 3.686145544052124,
"learning_rate": 7.372555773473731e-06,
"loss": 0.7361,
"step": 6466
},
{
"epoch": 1.8774858470024678,
"grad_norm": 3.6365010738372803,
"learning_rate": 7.371710511400083e-06,
"loss": 0.7642,
"step": 6467
},
{
"epoch": 1.877776164900566,
"grad_norm": 3.697004795074463,
"learning_rate": 7.3708651618586925e-06,
"loss": 0.8165,
"step": 6468
},
{
"epoch": 1.8780664827986646,
"grad_norm": 3.7043352127075195,
"learning_rate": 7.370019724880734e-06,
"loss": 0.7413,
"step": 6469
},
{
"epoch": 1.8783568006967628,
"grad_norm": 3.635573148727417,
"learning_rate": 7.3691742004973906e-06,
"loss": 0.7286,
"step": 6470
},
{
"epoch": 1.8786471185948614,
"grad_norm": 3.533658742904663,
"learning_rate": 7.368328588739843e-06,
"loss": 0.7747,
"step": 6471
},
{
"epoch": 1.8789374364929596,
"grad_norm": 3.5193533897399902,
"learning_rate": 7.367482889639277e-06,
"loss": 0.7303,
"step": 6472
},
{
"epoch": 1.8792277543910583,
"grad_norm": 3.6575841903686523,
"learning_rate": 7.36663710322688e-06,
"loss": 0.8409,
"step": 6473
},
{
"epoch": 1.8795180722891565,
"grad_norm": 4.039218425750732,
"learning_rate": 7.365791229533848e-06,
"loss": 0.8452,
"step": 6474
},
{
"epoch": 1.879808390187255,
"grad_norm": 3.2911484241485596,
"learning_rate": 7.36494526859137e-06,
"loss": 0.7872,
"step": 6475
},
{
"epoch": 1.8800987080853535,
"grad_norm": 3.6404707431793213,
"learning_rate": 7.364099220430654e-06,
"loss": 0.8814,
"step": 6476
},
{
"epoch": 1.880389025983452,
"grad_norm": 3.8109161853790283,
"learning_rate": 7.3632530850828934e-06,
"loss": 0.6996,
"step": 6477
},
{
"epoch": 1.8806793438815503,
"grad_norm": 3.478952169418335,
"learning_rate": 7.362406862579299e-06,
"loss": 0.745,
"step": 6478
},
{
"epoch": 1.8809696617796487,
"grad_norm": 3.923051118850708,
"learning_rate": 7.3615605529510766e-06,
"loss": 0.8903,
"step": 6479
},
{
"epoch": 1.8812599796777472,
"grad_norm": 3.3513667583465576,
"learning_rate": 7.360714156229437e-06,
"loss": 0.8369,
"step": 6480
},
{
"epoch": 1.8815502975758456,
"grad_norm": 3.3167412281036377,
"learning_rate": 7.359867672445598e-06,
"loss": 0.8021,
"step": 6481
},
{
"epoch": 1.881840615473944,
"grad_norm": 3.9195165634155273,
"learning_rate": 7.359021101630775e-06,
"loss": 0.8945,
"step": 6482
},
{
"epoch": 1.8821309333720424,
"grad_norm": 3.156968116760254,
"learning_rate": 7.358174443816188e-06,
"loss": 0.7998,
"step": 6483
},
{
"epoch": 1.8824212512701408,
"grad_norm": 3.577028512954712,
"learning_rate": 7.357327699033065e-06,
"loss": 0.7762,
"step": 6484
},
{
"epoch": 1.8827115691682392,
"grad_norm": 3.363496780395508,
"learning_rate": 7.356480867312632e-06,
"loss": 0.7806,
"step": 6485
},
{
"epoch": 1.8830018870663376,
"grad_norm": 3.6327083110809326,
"learning_rate": 7.355633948686121e-06,
"loss": 0.8288,
"step": 6486
},
{
"epoch": 1.883292204964436,
"grad_norm": 3.394564628601074,
"learning_rate": 7.354786943184763e-06,
"loss": 0.7802,
"step": 6487
},
{
"epoch": 1.8835825228625345,
"grad_norm": 3.100290298461914,
"learning_rate": 7.353939850839796e-06,
"loss": 0.7393,
"step": 6488
},
{
"epoch": 1.8838728407606329,
"grad_norm": 3.4168612957000732,
"learning_rate": 7.353092671682464e-06,
"loss": 0.7864,
"step": 6489
},
{
"epoch": 1.8841631586587313,
"grad_norm": 3.401819944381714,
"learning_rate": 7.352245405744007e-06,
"loss": 0.7972,
"step": 6490
},
{
"epoch": 1.8844534765568297,
"grad_norm": 3.8674604892730713,
"learning_rate": 7.351398053055673e-06,
"loss": 0.7671,
"step": 6491
},
{
"epoch": 1.8847437944549281,
"grad_norm": 3.5375800132751465,
"learning_rate": 7.35055061364871e-06,
"loss": 0.7949,
"step": 6492
},
{
"epoch": 1.8850341123530265,
"grad_norm": 3.1606504917144775,
"learning_rate": 7.349703087554376e-06,
"loss": 0.6934,
"step": 6493
},
{
"epoch": 1.885324430251125,
"grad_norm": 3.7292003631591797,
"learning_rate": 7.348855474803923e-06,
"loss": 0.8148,
"step": 6494
},
{
"epoch": 1.8856147481492234,
"grad_norm": 3.975048542022705,
"learning_rate": 7.348007775428613e-06,
"loss": 0.7449,
"step": 6495
},
{
"epoch": 1.8859050660473218,
"grad_norm": 3.215825319290161,
"learning_rate": 7.347159989459707e-06,
"loss": 0.6939,
"step": 6496
},
{
"epoch": 1.8861953839454202,
"grad_norm": 3.75365948677063,
"learning_rate": 7.346312116928473e-06,
"loss": 0.7789,
"step": 6497
},
{
"epoch": 1.8864857018435186,
"grad_norm": 3.8031654357910156,
"learning_rate": 7.34546415786618e-06,
"loss": 0.7878,
"step": 6498
},
{
"epoch": 1.886776019741617,
"grad_norm": 3.699834108352661,
"learning_rate": 7.3446161123040975e-06,
"loss": 0.7436,
"step": 6499
},
{
"epoch": 1.8870663376397157,
"grad_norm": 3.516376256942749,
"learning_rate": 7.3437679802735054e-06,
"loss": 0.7246,
"step": 6500
},
{
"epoch": 1.8870663376397157,
"eval_loss": 1.1672762632369995,
"eval_runtime": 13.3449,
"eval_samples_per_second": 29.974,
"eval_steps_per_second": 3.747,
"step": 6500
},
{
"epoch": 1.8873566555378138,
"grad_norm": 2.8746337890625,
"learning_rate": 7.342919761805678e-06,
"loss": 0.6085,
"step": 6501
},
{
"epoch": 1.8876469734359125,
"grad_norm": 3.8056139945983887,
"learning_rate": 7.342071456931901e-06,
"loss": 0.8326,
"step": 6502
},
{
"epoch": 1.8879372913340107,
"grad_norm": 3.5527572631835938,
"learning_rate": 7.3412230656834584e-06,
"loss": 0.7709,
"step": 6503
},
{
"epoch": 1.8882276092321093,
"grad_norm": 3.6476054191589355,
"learning_rate": 7.340374588091638e-06,
"loss": 0.7901,
"step": 6504
},
{
"epoch": 1.8885179271302075,
"grad_norm": 3.307996988296509,
"learning_rate": 7.339526024187731e-06,
"loss": 0.738,
"step": 6505
},
{
"epoch": 1.8888082450283061,
"grad_norm": 3.871455192565918,
"learning_rate": 7.338677374003032e-06,
"loss": 0.8552,
"step": 6506
},
{
"epoch": 1.8890985629264043,
"grad_norm": 3.560155153274536,
"learning_rate": 7.33782863756884e-06,
"loss": 0.8033,
"step": 6507
},
{
"epoch": 1.889388880824503,
"grad_norm": 3.363393783569336,
"learning_rate": 7.336979814916456e-06,
"loss": 0.7238,
"step": 6508
},
{
"epoch": 1.8896791987226011,
"grad_norm": 3.2523813247680664,
"learning_rate": 7.336130906077183e-06,
"loss": 0.7462,
"step": 6509
},
{
"epoch": 1.8899695166206998,
"grad_norm": 3.4237465858459473,
"learning_rate": 7.335281911082332e-06,
"loss": 0.7069,
"step": 6510
},
{
"epoch": 1.890259834518798,
"grad_norm": 3.43580961227417,
"learning_rate": 7.334432829963207e-06,
"loss": 0.7886,
"step": 6511
},
{
"epoch": 1.8905501524168966,
"grad_norm": 3.4298160076141357,
"learning_rate": 7.333583662751128e-06,
"loss": 0.7729,
"step": 6512
},
{
"epoch": 1.8908404703149948,
"grad_norm": 3.8691980838775635,
"learning_rate": 7.332734409477409e-06,
"loss": 0.9029,
"step": 6513
},
{
"epoch": 1.8911307882130934,
"grad_norm": 3.4163694381713867,
"learning_rate": 7.331885070173371e-06,
"loss": 0.8358,
"step": 6514
},
{
"epoch": 1.8914211061111916,
"grad_norm": 3.1868526935577393,
"learning_rate": 7.331035644870336e-06,
"loss": 0.7406,
"step": 6515
},
{
"epoch": 1.8917114240092903,
"grad_norm": 3.5221593379974365,
"learning_rate": 7.3301861335996325e-06,
"loss": 0.7748,
"step": 6516
},
{
"epoch": 1.8920017419073885,
"grad_norm": 3.6796584129333496,
"learning_rate": 7.3293365363925894e-06,
"loss": 0.7916,
"step": 6517
},
{
"epoch": 1.892292059805487,
"grad_norm": 3.560765266418457,
"learning_rate": 7.328486853280539e-06,
"loss": 0.7967,
"step": 6518
},
{
"epoch": 1.8925823777035853,
"grad_norm": 3.809666633605957,
"learning_rate": 7.327637084294818e-06,
"loss": 0.818,
"step": 6519
},
{
"epoch": 1.892872695601684,
"grad_norm": 3.327310085296631,
"learning_rate": 7.326787229466762e-06,
"loss": 0.7358,
"step": 6520
},
{
"epoch": 1.893163013499782,
"grad_norm": 3.5378589630126953,
"learning_rate": 7.325937288827719e-06,
"loss": 0.7298,
"step": 6521
},
{
"epoch": 1.8934533313978807,
"grad_norm": 3.669187068939209,
"learning_rate": 7.325087262409031e-06,
"loss": 0.8244,
"step": 6522
},
{
"epoch": 1.893743649295979,
"grad_norm": 3.2379751205444336,
"learning_rate": 7.3242371502420485e-06,
"loss": 0.7645,
"step": 6523
},
{
"epoch": 1.8940339671940776,
"grad_norm": 4.166474342346191,
"learning_rate": 7.3233869523581214e-06,
"loss": 0.9135,
"step": 6524
},
{
"epoch": 1.894324285092176,
"grad_norm": 3.6318092346191406,
"learning_rate": 7.322536668788605e-06,
"loss": 0.777,
"step": 6525
},
{
"epoch": 1.8946146029902744,
"grad_norm": 3.7004711627960205,
"learning_rate": 7.321686299564858e-06,
"loss": 0.6785,
"step": 6526
},
{
"epoch": 1.8949049208883728,
"grad_norm": 3.1796281337738037,
"learning_rate": 7.320835844718243e-06,
"loss": 0.7538,
"step": 6527
},
{
"epoch": 1.8951952387864712,
"grad_norm": 3.454525947570801,
"learning_rate": 7.319985304280122e-06,
"loss": 0.748,
"step": 6528
},
{
"epoch": 1.8954855566845696,
"grad_norm": 3.8724629878997803,
"learning_rate": 7.319134678281863e-06,
"loss": 0.8925,
"step": 6529
},
{
"epoch": 1.895775874582668,
"grad_norm": 3.6467819213867188,
"learning_rate": 7.318283966754838e-06,
"loss": 0.7681,
"step": 6530
},
{
"epoch": 1.8960661924807665,
"grad_norm": 3.970150947570801,
"learning_rate": 7.317433169730421e-06,
"loss": 0.815,
"step": 6531
},
{
"epoch": 1.8963565103788649,
"grad_norm": 3.366507053375244,
"learning_rate": 7.3165822872399875e-06,
"loss": 0.7705,
"step": 6532
},
{
"epoch": 1.8966468282769633,
"grad_norm": 3.836026668548584,
"learning_rate": 7.315731319314919e-06,
"loss": 0.8512,
"step": 6533
},
{
"epoch": 1.8969371461750617,
"grad_norm": 3.7245543003082275,
"learning_rate": 7.314880265986598e-06,
"loss": 0.8078,
"step": 6534
},
{
"epoch": 1.8972274640731601,
"grad_norm": 3.417665481567383,
"learning_rate": 7.3140291272864116e-06,
"loss": 0.6402,
"step": 6535
},
{
"epoch": 1.8975177819712585,
"grad_norm": 3.5677568912506104,
"learning_rate": 7.313177903245749e-06,
"loss": 0.8362,
"step": 6536
},
{
"epoch": 1.897808099869357,
"grad_norm": 4.0231218338012695,
"learning_rate": 7.312326593896004e-06,
"loss": 0.9341,
"step": 6537
},
{
"epoch": 1.8980984177674554,
"grad_norm": 3.707977056503296,
"learning_rate": 7.311475199268572e-06,
"loss": 0.7686,
"step": 6538
},
{
"epoch": 1.8983887356655538,
"grad_norm": 3.406618595123291,
"learning_rate": 7.3106237193948504e-06,
"loss": 0.7152,
"step": 6539
},
{
"epoch": 1.8986790535636522,
"grad_norm": 3.426307439804077,
"learning_rate": 7.309772154306245e-06,
"loss": 0.7234,
"step": 6540
},
{
"epoch": 1.8989693714617506,
"grad_norm": 3.2683463096618652,
"learning_rate": 7.308920504034157e-06,
"loss": 0.7997,
"step": 6541
},
{
"epoch": 1.899259689359849,
"grad_norm": 3.643825054168701,
"learning_rate": 7.308068768609999e-06,
"loss": 0.8139,
"step": 6542
},
{
"epoch": 1.8995500072579474,
"grad_norm": 3.777906656265259,
"learning_rate": 7.3072169480651785e-06,
"loss": 0.8931,
"step": 6543
},
{
"epoch": 1.8998403251560458,
"grad_norm": 3.40627121925354,
"learning_rate": 7.306365042431115e-06,
"loss": 0.8319,
"step": 6544
},
{
"epoch": 1.9001306430541443,
"grad_norm": 3.8914313316345215,
"learning_rate": 7.305513051739222e-06,
"loss": 0.8638,
"step": 6545
},
{
"epoch": 1.9004209609522427,
"grad_norm": 4.062667369842529,
"learning_rate": 7.3046609760209255e-06,
"loss": 0.8284,
"step": 6546
},
{
"epoch": 1.900711278850341,
"grad_norm": 3.093411684036255,
"learning_rate": 7.303808815307644e-06,
"loss": 0.69,
"step": 6547
},
{
"epoch": 1.9010015967484395,
"grad_norm": 3.077059745788574,
"learning_rate": 7.302956569630808e-06,
"loss": 0.7037,
"step": 6548
},
{
"epoch": 1.901291914646538,
"grad_norm": 3.481987714767456,
"learning_rate": 7.302104239021849e-06,
"loss": 0.8128,
"step": 6549
},
{
"epoch": 1.9015822325446363,
"grad_norm": 3.439530372619629,
"learning_rate": 7.3012518235121976e-06,
"loss": 0.7401,
"step": 6550
},
{
"epoch": 1.901872550442735,
"grad_norm": 3.3708889484405518,
"learning_rate": 7.300399323133292e-06,
"loss": 0.7138,
"step": 6551
},
{
"epoch": 1.9021628683408331,
"grad_norm": 3.8107917308807373,
"learning_rate": 7.299546737916574e-06,
"loss": 0.8779,
"step": 6552
},
{
"epoch": 1.9024531862389318,
"grad_norm": 3.5310473442077637,
"learning_rate": 7.298694067893483e-06,
"loss": 0.7679,
"step": 6553
},
{
"epoch": 1.90274350413703,
"grad_norm": 3.196654796600342,
"learning_rate": 7.297841313095468e-06,
"loss": 0.7009,
"step": 6554
},
{
"epoch": 1.9030338220351286,
"grad_norm": 3.6681015491485596,
"learning_rate": 7.296988473553979e-06,
"loss": 0.7745,
"step": 6555
},
{
"epoch": 1.9033241399332268,
"grad_norm": 3.4849812984466553,
"learning_rate": 7.296135549300465e-06,
"loss": 0.7308,
"step": 6556
},
{
"epoch": 1.9036144578313254,
"grad_norm": 3.7782578468322754,
"learning_rate": 7.295282540366382e-06,
"loss": 0.8262,
"step": 6557
},
{
"epoch": 1.9039047757294236,
"grad_norm": 3.266765832901001,
"learning_rate": 7.29442944678319e-06,
"loss": 0.7228,
"step": 6558
},
{
"epoch": 1.9041950936275223,
"grad_norm": 3.374159336090088,
"learning_rate": 7.293576268582352e-06,
"loss": 0.7207,
"step": 6559
},
{
"epoch": 1.9044854115256205,
"grad_norm": 3.4658048152923584,
"learning_rate": 7.29272300579533e-06,
"loss": 0.7921,
"step": 6560
},
{
"epoch": 1.904775729423719,
"grad_norm": 3.6144564151763916,
"learning_rate": 7.291869658453594e-06,
"loss": 0.7876,
"step": 6561
},
{
"epoch": 1.9050660473218173,
"grad_norm": 3.865516424179077,
"learning_rate": 7.2910162265886146e-06,
"loss": 0.8732,
"step": 6562
},
{
"epoch": 1.905356365219916,
"grad_norm": 3.7226791381835938,
"learning_rate": 7.2901627102318665e-06,
"loss": 0.9022,
"step": 6563
},
{
"epoch": 1.905646683118014,
"grad_norm": 3.6240618228912354,
"learning_rate": 7.289309109414825e-06,
"loss": 0.8165,
"step": 6564
},
{
"epoch": 1.9059370010161127,
"grad_norm": 3.4062204360961914,
"learning_rate": 7.2884554241689744e-06,
"loss": 0.7112,
"step": 6565
},
{
"epoch": 1.906227318914211,
"grad_norm": 3.518115520477295,
"learning_rate": 7.287601654525793e-06,
"loss": 0.7026,
"step": 6566
},
{
"epoch": 1.9065176368123096,
"grad_norm": 3.3960046768188477,
"learning_rate": 7.286747800516771e-06,
"loss": 0.7845,
"step": 6567
},
{
"epoch": 1.9068079547104078,
"grad_norm": 3.9768590927124023,
"learning_rate": 7.2858938621734e-06,
"loss": 0.835,
"step": 6568
},
{
"epoch": 1.9070982726085064,
"grad_norm": 4.007421493530273,
"learning_rate": 7.285039839527168e-06,
"loss": 0.8687,
"step": 6569
},
{
"epoch": 1.9073885905066046,
"grad_norm": 3.7359652519226074,
"learning_rate": 7.284185732609574e-06,
"loss": 0.8011,
"step": 6570
},
{
"epoch": 1.9076789084047032,
"grad_norm": 3.613187313079834,
"learning_rate": 7.283331541452117e-06,
"loss": 0.6303,
"step": 6571
},
{
"epoch": 1.9079692263028014,
"grad_norm": 3.4708168506622314,
"learning_rate": 7.2824772660863e-06,
"loss": 0.6899,
"step": 6572
},
{
"epoch": 1.9082595442009,
"grad_norm": 3.855139970779419,
"learning_rate": 7.281622906543625e-06,
"loss": 0.843,
"step": 6573
},
{
"epoch": 1.9085498620989982,
"grad_norm": 3.631195068359375,
"learning_rate": 7.280768462855605e-06,
"loss": 0.8049,
"step": 6574
},
{
"epoch": 1.9088401799970969,
"grad_norm": 2.9242310523986816,
"learning_rate": 7.2799139350537466e-06,
"loss": 0.7044,
"step": 6575
},
{
"epoch": 1.9091304978951953,
"grad_norm": 3.3103771209716797,
"learning_rate": 7.279059323169569e-06,
"loss": 0.6607,
"step": 6576
},
{
"epoch": 1.9094208157932937,
"grad_norm": 3.7364091873168945,
"learning_rate": 7.278204627234587e-06,
"loss": 0.824,
"step": 6577
},
{
"epoch": 1.9097111336913921,
"grad_norm": 4.07366418838501,
"learning_rate": 7.277349847280323e-06,
"loss": 0.8653,
"step": 6578
},
{
"epoch": 1.9100014515894905,
"grad_norm": 3.9594852924346924,
"learning_rate": 7.276494983338298e-06,
"loss": 0.8074,
"step": 6579
},
{
"epoch": 1.910291769487589,
"grad_norm": 3.387207269668579,
"learning_rate": 7.2756400354400445e-06,
"loss": 0.7408,
"step": 6580
},
{
"epoch": 1.9105820873856874,
"grad_norm": 3.4568238258361816,
"learning_rate": 7.274785003617088e-06,
"loss": 0.7503,
"step": 6581
},
{
"epoch": 1.9108724052837858,
"grad_norm": 3.83988881111145,
"learning_rate": 7.273929887900965e-06,
"loss": 0.9153,
"step": 6582
},
{
"epoch": 1.9111627231818842,
"grad_norm": 3.5666446685791016,
"learning_rate": 7.273074688323209e-06,
"loss": 0.6675,
"step": 6583
},
{
"epoch": 1.9114530410799826,
"grad_norm": 3.3551602363586426,
"learning_rate": 7.272219404915359e-06,
"loss": 0.7733,
"step": 6584
},
{
"epoch": 1.911743358978081,
"grad_norm": 3.7108700275421143,
"learning_rate": 7.271364037708961e-06,
"loss": 0.765,
"step": 6585
},
{
"epoch": 1.9120336768761794,
"grad_norm": 4.1356916427612305,
"learning_rate": 7.270508586735559e-06,
"loss": 0.8728,
"step": 6586
},
{
"epoch": 1.9123239947742778,
"grad_norm": 3.6741342544555664,
"learning_rate": 7.269653052026701e-06,
"loss": 0.7273,
"step": 6587
},
{
"epoch": 1.9126143126723762,
"grad_norm": 3.7104272842407227,
"learning_rate": 7.268797433613938e-06,
"loss": 0.785,
"step": 6588
},
{
"epoch": 1.9129046305704747,
"grad_norm": 3.8318393230438232,
"learning_rate": 7.267941731528827e-06,
"loss": 0.8279,
"step": 6589
},
{
"epoch": 1.913194948468573,
"grad_norm": 3.612663507461548,
"learning_rate": 7.267085945802923e-06,
"loss": 0.7359,
"step": 6590
},
{
"epoch": 1.9134852663666715,
"grad_norm": 3.567901611328125,
"learning_rate": 7.266230076467792e-06,
"loss": 0.7328,
"step": 6591
},
{
"epoch": 1.91377558426477,
"grad_norm": 3.3783185482025146,
"learning_rate": 7.265374123554993e-06,
"loss": 0.7242,
"step": 6592
},
{
"epoch": 1.9140659021628683,
"grad_norm": 3.4487850666046143,
"learning_rate": 7.264518087096095e-06,
"loss": 0.7309,
"step": 6593
},
{
"epoch": 1.9143562200609667,
"grad_norm": 2.840123176574707,
"learning_rate": 7.26366196712267e-06,
"loss": 0.697,
"step": 6594
},
{
"epoch": 1.9146465379590651,
"grad_norm": 3.352851152420044,
"learning_rate": 7.26280576366629e-06,
"loss": 0.747,
"step": 6595
},
{
"epoch": 1.9149368558571636,
"grad_norm": 3.16660213470459,
"learning_rate": 7.261949476758531e-06,
"loss": 0.7444,
"step": 6596
},
{
"epoch": 1.915227173755262,
"grad_norm": 3.6520681381225586,
"learning_rate": 7.261093106430973e-06,
"loss": 0.7809,
"step": 6597
},
{
"epoch": 1.9155174916533604,
"grad_norm": 3.453809976577759,
"learning_rate": 7.260236652715198e-06,
"loss": 0.7439,
"step": 6598
},
{
"epoch": 1.9158078095514588,
"grad_norm": 3.48639178276062,
"learning_rate": 7.2593801156427924e-06,
"loss": 0.7891,
"step": 6599
},
{
"epoch": 1.9160981274495574,
"grad_norm": 3.535409927368164,
"learning_rate": 7.258523495245344e-06,
"loss": 0.6814,
"step": 6600
},
{
"epoch": 1.9163884453476556,
"grad_norm": 3.8680124282836914,
"learning_rate": 7.257666791554448e-06,
"loss": 0.806,
"step": 6601
},
{
"epoch": 1.9166787632457543,
"grad_norm": 3.6400327682495117,
"learning_rate": 7.256810004601694e-06,
"loss": 0.8711,
"step": 6602
},
{
"epoch": 1.9169690811438524,
"grad_norm": 3.5471885204315186,
"learning_rate": 7.255953134418684e-06,
"loss": 0.8371,
"step": 6603
},
{
"epoch": 1.917259399041951,
"grad_norm": 3.0958104133605957,
"learning_rate": 7.255096181037018e-06,
"loss": 0.6935,
"step": 6604
},
{
"epoch": 1.9175497169400493,
"grad_norm": 3.3974525928497314,
"learning_rate": 7.254239144488297e-06,
"loss": 0.7654,
"step": 6605
},
{
"epoch": 1.917840034838148,
"grad_norm": 3.58324933052063,
"learning_rate": 7.253382024804134e-06,
"loss": 0.7546,
"step": 6606
},
{
"epoch": 1.918130352736246,
"grad_norm": 3.7460758686065674,
"learning_rate": 7.252524822016135e-06,
"loss": 0.7191,
"step": 6607
},
{
"epoch": 1.9184206706343447,
"grad_norm": 3.605059862136841,
"learning_rate": 7.251667536155915e-06,
"loss": 0.8426,
"step": 6608
},
{
"epoch": 1.918710988532443,
"grad_norm": 3.271284580230713,
"learning_rate": 7.250810167255089e-06,
"loss": 0.6733,
"step": 6609
},
{
"epoch": 1.9190013064305416,
"grad_norm": 3.9770383834838867,
"learning_rate": 7.2499527153452775e-06,
"loss": 0.9251,
"step": 6610
},
{
"epoch": 1.9192916243286398,
"grad_norm": 3.7332961559295654,
"learning_rate": 7.249095180458101e-06,
"loss": 0.8789,
"step": 6611
},
{
"epoch": 1.9195819422267384,
"grad_norm": 3.8814618587493896,
"learning_rate": 7.24823756262519e-06,
"loss": 0.8266,
"step": 6612
},
{
"epoch": 1.9198722601248366,
"grad_norm": 3.7765979766845703,
"learning_rate": 7.247379861878167e-06,
"loss": 0.7793,
"step": 6613
},
{
"epoch": 1.9201625780229352,
"grad_norm": 3.925607442855835,
"learning_rate": 7.24652207824867e-06,
"loss": 0.8038,
"step": 6614
},
{
"epoch": 1.9204528959210334,
"grad_norm": 3.446561098098755,
"learning_rate": 7.245664211768327e-06,
"loss": 0.7647,
"step": 6615
},
{
"epoch": 1.920743213819132,
"grad_norm": 4.287924289703369,
"learning_rate": 7.24480626246878e-06,
"loss": 0.7625,
"step": 6616
},
{
"epoch": 1.9210335317172302,
"grad_norm": 3.6572999954223633,
"learning_rate": 7.24394823038167e-06,
"loss": 0.7498,
"step": 6617
},
{
"epoch": 1.9213238496153289,
"grad_norm": 4.16467809677124,
"learning_rate": 7.243090115538639e-06,
"loss": 0.8243,
"step": 6618
},
{
"epoch": 1.921614167513427,
"grad_norm": 3.5425045490264893,
"learning_rate": 7.242231917971335e-06,
"loss": 0.7329,
"step": 6619
},
{
"epoch": 1.9219044854115257,
"grad_norm": 3.067556858062744,
"learning_rate": 7.241373637711407e-06,
"loss": 0.6621,
"step": 6620
},
{
"epoch": 1.922194803309624,
"grad_norm": 3.9090375900268555,
"learning_rate": 7.240515274790508e-06,
"loss": 0.8719,
"step": 6621
},
{
"epoch": 1.9224851212077225,
"grad_norm": 3.3259966373443604,
"learning_rate": 7.239656829240296e-06,
"loss": 0.7411,
"step": 6622
},
{
"epoch": 1.9227754391058207,
"grad_norm": 3.5183987617492676,
"learning_rate": 7.238798301092429e-06,
"loss": 0.8731,
"step": 6623
},
{
"epoch": 1.9230657570039194,
"grad_norm": 3.416977882385254,
"learning_rate": 7.237939690378568e-06,
"loss": 0.7071,
"step": 6624
},
{
"epoch": 1.9233560749020175,
"grad_norm": 3.410515069961548,
"learning_rate": 7.2370809971303805e-06,
"loss": 0.7712,
"step": 6625
},
{
"epoch": 1.9236463928001162,
"grad_norm": 3.2718276977539062,
"learning_rate": 7.236222221379532e-06,
"loss": 0.6932,
"step": 6626
},
{
"epoch": 1.9239367106982146,
"grad_norm": 3.1431262493133545,
"learning_rate": 7.235363363157697e-06,
"loss": 0.6635,
"step": 6627
},
{
"epoch": 1.924227028596313,
"grad_norm": 3.419757843017578,
"learning_rate": 7.234504422496548e-06,
"loss": 0.8539,
"step": 6628
},
{
"epoch": 1.9245173464944114,
"grad_norm": 3.7469241619110107,
"learning_rate": 7.233645399427762e-06,
"loss": 0.8466,
"step": 6629
},
{
"epoch": 1.9248076643925098,
"grad_norm": 3.4239110946655273,
"learning_rate": 7.2327862939830204e-06,
"loss": 0.7049,
"step": 6630
},
{
"epoch": 1.9250979822906082,
"grad_norm": 3.939842462539673,
"learning_rate": 7.231927106194007e-06,
"loss": 0.8585,
"step": 6631
},
{
"epoch": 1.9253883001887067,
"grad_norm": 3.831742286682129,
"learning_rate": 7.231067836092407e-06,
"loss": 0.8349,
"step": 6632
},
{
"epoch": 1.925678618086805,
"grad_norm": 3.6673312187194824,
"learning_rate": 7.23020848370991e-06,
"loss": 0.8182,
"step": 6633
},
{
"epoch": 1.9259689359849035,
"grad_norm": 3.4564390182495117,
"learning_rate": 7.229349049078211e-06,
"loss": 0.7056,
"step": 6634
},
{
"epoch": 1.926259253883002,
"grad_norm": 3.5960192680358887,
"learning_rate": 7.228489532229001e-06,
"loss": 0.8343,
"step": 6635
},
{
"epoch": 1.9265495717811003,
"grad_norm": 3.550015926361084,
"learning_rate": 7.227629933193983e-06,
"loss": 0.7848,
"step": 6636
},
{
"epoch": 1.9268398896791987,
"grad_norm": 3.623354196548462,
"learning_rate": 7.226770252004858e-06,
"loss": 0.831,
"step": 6637
},
{
"epoch": 1.9271302075772971,
"grad_norm": 3.6000454425811768,
"learning_rate": 7.225910488693328e-06,
"loss": 0.8775,
"step": 6638
},
{
"epoch": 1.9274205254753956,
"grad_norm": 3.9402761459350586,
"learning_rate": 7.225050643291103e-06,
"loss": 0.786,
"step": 6639
},
{
"epoch": 1.927710843373494,
"grad_norm": 3.940194845199585,
"learning_rate": 7.224190715829894e-06,
"loss": 0.9916,
"step": 6640
},
{
"epoch": 1.9280011612715924,
"grad_norm": 3.295717239379883,
"learning_rate": 7.223330706341414e-06,
"loss": 0.7205,
"step": 6641
},
{
"epoch": 1.9282914791696908,
"grad_norm": 3.6699130535125732,
"learning_rate": 7.22247061485738e-06,
"loss": 0.8265,
"step": 6642
},
{
"epoch": 1.9285817970677892,
"grad_norm": 3.288679361343384,
"learning_rate": 7.221610441409509e-06,
"loss": 0.826,
"step": 6643
},
{
"epoch": 1.9288721149658876,
"grad_norm": 3.609783411026001,
"learning_rate": 7.220750186029529e-06,
"loss": 0.7258,
"step": 6644
},
{
"epoch": 1.929162432863986,
"grad_norm": 3.97063946723938,
"learning_rate": 7.219889848749163e-06,
"loss": 0.8644,
"step": 6645
},
{
"epoch": 1.9294527507620844,
"grad_norm": 3.5488922595977783,
"learning_rate": 7.21902942960014e-06,
"loss": 0.8222,
"step": 6646
},
{
"epoch": 1.9297430686601829,
"grad_norm": 3.388948678970337,
"learning_rate": 7.2181689286141935e-06,
"loss": 0.782,
"step": 6647
},
{
"epoch": 1.9300333865582813,
"grad_norm": 3.740267515182495,
"learning_rate": 7.2173083458230556e-06,
"loss": 0.8105,
"step": 6648
},
{
"epoch": 1.9303237044563797,
"grad_norm": 3.400404214859009,
"learning_rate": 7.2164476812584676e-06,
"loss": 0.6943,
"step": 6649
},
{
"epoch": 1.930614022354478,
"grad_norm": 3.628769874572754,
"learning_rate": 7.215586934952167e-06,
"loss": 0.7671,
"step": 6650
},
{
"epoch": 1.9309043402525767,
"grad_norm": 3.8510613441467285,
"learning_rate": 7.2147261069359e-06,
"loss": 0.7778,
"step": 6651
},
{
"epoch": 1.931194658150675,
"grad_norm": 3.6918275356292725,
"learning_rate": 7.213865197241412e-06,
"loss": 0.91,
"step": 6652
},
{
"epoch": 1.9314849760487736,
"grad_norm": 3.9917702674865723,
"learning_rate": 7.2130042059004554e-06,
"loss": 0.7967,
"step": 6653
},
{
"epoch": 1.9317752939468718,
"grad_norm": 3.8993303775787354,
"learning_rate": 7.212143132944782e-06,
"loss": 0.694,
"step": 6654
},
{
"epoch": 1.9320656118449704,
"grad_norm": 4.138810634613037,
"learning_rate": 7.2112819784061484e-06,
"loss": 0.8451,
"step": 6655
},
{
"epoch": 1.9323559297430686,
"grad_norm": 3.826202392578125,
"learning_rate": 7.210420742316311e-06,
"loss": 0.7908,
"step": 6656
},
{
"epoch": 1.9326462476411672,
"grad_norm": 3.8772799968719482,
"learning_rate": 7.209559424707034e-06,
"loss": 0.8552,
"step": 6657
},
{
"epoch": 1.9329365655392654,
"grad_norm": 3.5785393714904785,
"learning_rate": 7.208698025610084e-06,
"loss": 0.7256,
"step": 6658
},
{
"epoch": 1.933226883437364,
"grad_norm": 3.79007887840271,
"learning_rate": 7.207836545057226e-06,
"loss": 0.8709,
"step": 6659
},
{
"epoch": 1.9335172013354622,
"grad_norm": 3.2313954830169678,
"learning_rate": 7.206974983080233e-06,
"loss": 0.735,
"step": 6660
},
{
"epoch": 1.9338075192335609,
"grad_norm": 3.2719595432281494,
"learning_rate": 7.206113339710877e-06,
"loss": 0.7153,
"step": 6661
},
{
"epoch": 1.934097837131659,
"grad_norm": 3.3899588584899902,
"learning_rate": 7.205251614980938e-06,
"loss": 0.776,
"step": 6662
},
{
"epoch": 1.9343881550297577,
"grad_norm": 3.854118824005127,
"learning_rate": 7.204389808922194e-06,
"loss": 0.8208,
"step": 6663
},
{
"epoch": 1.9346784729278559,
"grad_norm": 3.564875841140747,
"learning_rate": 7.203527921566429e-06,
"loss": 0.8051,
"step": 6664
},
{
"epoch": 1.9349687908259545,
"grad_norm": 3.328470468521118,
"learning_rate": 7.202665952945429e-06,
"loss": 0.7459,
"step": 6665
},
{
"epoch": 1.9352591087240527,
"grad_norm": 3.722935438156128,
"learning_rate": 7.201803903090983e-06,
"loss": 0.8839,
"step": 6666
},
{
"epoch": 1.9355494266221513,
"grad_norm": 3.237356662750244,
"learning_rate": 7.20094177203488e-06,
"loss": 0.7183,
"step": 6667
},
{
"epoch": 1.9358397445202495,
"grad_norm": 3.388763666152954,
"learning_rate": 7.2000795598089215e-06,
"loss": 0.7665,
"step": 6668
},
{
"epoch": 1.9361300624183482,
"grad_norm": 3.321420907974243,
"learning_rate": 7.1992172664449e-06,
"loss": 0.75,
"step": 6669
},
{
"epoch": 1.9364203803164464,
"grad_norm": 3.4232888221740723,
"learning_rate": 7.1983548919746185e-06,
"loss": 0.7208,
"step": 6670
},
{
"epoch": 1.936710698214545,
"grad_norm": 3.926154375076294,
"learning_rate": 7.1974924364298804e-06,
"loss": 0.9375,
"step": 6671
},
{
"epoch": 1.9370010161126432,
"grad_norm": 3.0157711505889893,
"learning_rate": 7.196629899842495e-06,
"loss": 0.6688,
"step": 6672
},
{
"epoch": 1.9372913340107418,
"grad_norm": 3.3146955966949463,
"learning_rate": 7.19576728224427e-06,
"loss": 0.7946,
"step": 6673
},
{
"epoch": 1.93758165190884,
"grad_norm": 3.8927111625671387,
"learning_rate": 7.1949045836670195e-06,
"loss": 0.8109,
"step": 6674
},
{
"epoch": 1.9378719698069387,
"grad_norm": 3.6933443546295166,
"learning_rate": 7.194041804142556e-06,
"loss": 0.8922,
"step": 6675
},
{
"epoch": 1.938162287705037,
"grad_norm": 3.167834758758545,
"learning_rate": 7.193178943702706e-06,
"loss": 0.6685,
"step": 6676
},
{
"epoch": 1.9384526056031355,
"grad_norm": 3.4566543102264404,
"learning_rate": 7.192316002379283e-06,
"loss": 0.8034,
"step": 6677
},
{
"epoch": 1.938742923501234,
"grad_norm": 3.62845778465271,
"learning_rate": 7.191452980204119e-06,
"loss": 0.7736,
"step": 6678
},
{
"epoch": 1.9390332413993323,
"grad_norm": 3.444850206375122,
"learning_rate": 7.190589877209036e-06,
"loss": 0.8366,
"step": 6679
},
{
"epoch": 1.9393235592974307,
"grad_norm": 3.749293565750122,
"learning_rate": 7.189726693425869e-06,
"loss": 0.8621,
"step": 6680
},
{
"epoch": 1.9396138771955291,
"grad_norm": 3.366168737411499,
"learning_rate": 7.18886342888645e-06,
"loss": 0.7352,
"step": 6681
},
{
"epoch": 1.9399041950936275,
"grad_norm": 3.095916509628296,
"learning_rate": 7.1880000836226175e-06,
"loss": 0.7065,
"step": 6682
},
{
"epoch": 1.940194512991726,
"grad_norm": 3.3080127239227295,
"learning_rate": 7.187136657666208e-06,
"loss": 0.7082,
"step": 6683
},
{
"epoch": 1.9404848308898244,
"grad_norm": 3.6062538623809814,
"learning_rate": 7.186273151049068e-06,
"loss": 0.8088,
"step": 6684
},
{
"epoch": 1.9407751487879228,
"grad_norm": 3.4961612224578857,
"learning_rate": 7.185409563803042e-06,
"loss": 0.7568,
"step": 6685
},
{
"epoch": 1.9410654666860212,
"grad_norm": 3.556150436401367,
"learning_rate": 7.184545895959978e-06,
"loss": 0.8106,
"step": 6686
},
{
"epoch": 1.9413557845841196,
"grad_norm": 3.6129298210144043,
"learning_rate": 7.183682147551729e-06,
"loss": 0.8475,
"step": 6687
},
{
"epoch": 1.941646102482218,
"grad_norm": 3.856016159057617,
"learning_rate": 7.182818318610148e-06,
"loss": 0.7978,
"step": 6688
},
{
"epoch": 1.9419364203803164,
"grad_norm": 3.685530424118042,
"learning_rate": 7.1819544091670935e-06,
"loss": 0.7201,
"step": 6689
},
{
"epoch": 1.9422267382784149,
"grad_norm": 3.1159701347351074,
"learning_rate": 7.1810904192544265e-06,
"loss": 0.713,
"step": 6690
},
{
"epoch": 1.9425170561765133,
"grad_norm": 3.259775161743164,
"learning_rate": 7.180226348904012e-06,
"loss": 0.803,
"step": 6691
},
{
"epoch": 1.9428073740746117,
"grad_norm": 3.3046481609344482,
"learning_rate": 7.179362198147712e-06,
"loss": 0.7637,
"step": 6692
},
{
"epoch": 1.94309769197271,
"grad_norm": 3.818223476409912,
"learning_rate": 7.178497967017401e-06,
"loss": 0.7639,
"step": 6693
},
{
"epoch": 1.9433880098708085,
"grad_norm": 3.595642328262329,
"learning_rate": 7.177633655544949e-06,
"loss": 0.7035,
"step": 6694
},
{
"epoch": 1.943678327768907,
"grad_norm": 3.9954261779785156,
"learning_rate": 7.1767692637622336e-06,
"loss": 0.9131,
"step": 6695
},
{
"epoch": 1.9439686456670053,
"grad_norm": 3.853381872177124,
"learning_rate": 7.175904791701129e-06,
"loss": 0.7387,
"step": 6696
},
{
"epoch": 1.9442589635651037,
"grad_norm": 3.729569911956787,
"learning_rate": 7.17504023939352e-06,
"loss": 0.8771,
"step": 6697
},
{
"epoch": 1.9445492814632022,
"grad_norm": 3.354240894317627,
"learning_rate": 7.174175606871291e-06,
"loss": 0.7374,
"step": 6698
},
{
"epoch": 1.9448395993613006,
"grad_norm": 3.4094414710998535,
"learning_rate": 7.173310894166328e-06,
"loss": 0.7392,
"step": 6699
},
{
"epoch": 1.945129917259399,
"grad_norm": 3.737236976623535,
"learning_rate": 7.172446101310521e-06,
"loss": 0.8732,
"step": 6700
},
{
"epoch": 1.9454202351574974,
"grad_norm": 3.317800521850586,
"learning_rate": 7.171581228335764e-06,
"loss": 0.6949,
"step": 6701
},
{
"epoch": 1.945710553055596,
"grad_norm": 3.552617311477661,
"learning_rate": 7.170716275273954e-06,
"loss": 0.7557,
"step": 6702
},
{
"epoch": 1.9460008709536942,
"grad_norm": 3.36234712600708,
"learning_rate": 7.169851242156988e-06,
"loss": 0.7444,
"step": 6703
},
{
"epoch": 1.9462911888517929,
"grad_norm": 3.4189670085906982,
"learning_rate": 7.168986129016771e-06,
"loss": 0.7771,
"step": 6704
},
{
"epoch": 1.946581506749891,
"grad_norm": 3.6109812259674072,
"learning_rate": 7.168120935885203e-06,
"loss": 0.6837,
"step": 6705
},
{
"epoch": 1.9468718246479897,
"grad_norm": 4.015439510345459,
"learning_rate": 7.1672556627941995e-06,
"loss": 0.7297,
"step": 6706
},
{
"epoch": 1.9471621425460879,
"grad_norm": 3.6183969974517822,
"learning_rate": 7.166390309775664e-06,
"loss": 0.7379,
"step": 6707
},
{
"epoch": 1.9474524604441865,
"grad_norm": 3.7580604553222656,
"learning_rate": 7.165524876861515e-06,
"loss": 0.7974,
"step": 6708
},
{
"epoch": 1.9477427783422847,
"grad_norm": 2.972172975540161,
"learning_rate": 7.164659364083667e-06,
"loss": 0.5959,
"step": 6709
},
{
"epoch": 1.9480330962403833,
"grad_norm": 3.764477252960205,
"learning_rate": 7.1637937714740414e-06,
"loss": 0.8297,
"step": 6710
},
{
"epoch": 1.9483234141384815,
"grad_norm": 3.675285816192627,
"learning_rate": 7.162928099064559e-06,
"loss": 0.7536,
"step": 6711
},
{
"epoch": 1.9486137320365802,
"grad_norm": 3.6830227375030518,
"learning_rate": 7.1620623468871484e-06,
"loss": 0.6829,
"step": 6712
},
{
"epoch": 1.9489040499346784,
"grad_norm": 3.798758029937744,
"learning_rate": 7.161196514973735e-06,
"loss": 0.7489,
"step": 6713
},
{
"epoch": 1.949194367832777,
"grad_norm": 3.6859729290008545,
"learning_rate": 7.160330603356254e-06,
"loss": 0.7206,
"step": 6714
},
{
"epoch": 1.9494846857308752,
"grad_norm": 4.245936393737793,
"learning_rate": 7.159464612066636e-06,
"loss": 0.922,
"step": 6715
},
{
"epoch": 1.9497750036289738,
"grad_norm": 3.321417808532715,
"learning_rate": 7.158598541136819e-06,
"loss": 0.7266,
"step": 6716
},
{
"epoch": 1.950065321527072,
"grad_norm": 3.910299301147461,
"learning_rate": 7.1577323905987465e-06,
"loss": 0.8134,
"step": 6717
},
{
"epoch": 1.9503556394251707,
"grad_norm": 3.536652088165283,
"learning_rate": 7.156866160484358e-06,
"loss": 0.7933,
"step": 6718
},
{
"epoch": 1.9506459573232688,
"grad_norm": 3.7989182472229004,
"learning_rate": 7.155999850825604e-06,
"loss": 0.7904,
"step": 6719
},
{
"epoch": 1.9509362752213675,
"grad_norm": 3.8353662490844727,
"learning_rate": 7.155133461654429e-06,
"loss": 0.8632,
"step": 6720
},
{
"epoch": 1.9512265931194657,
"grad_norm": 3.9606752395629883,
"learning_rate": 7.154266993002786e-06,
"loss": 0.8703,
"step": 6721
},
{
"epoch": 1.9515169110175643,
"grad_norm": 3.5751256942749023,
"learning_rate": 7.1534004449026325e-06,
"loss": 0.6907,
"step": 6722
},
{
"epoch": 1.9518072289156625,
"grad_norm": 3.333437442779541,
"learning_rate": 7.152533817385927e-06,
"loss": 0.698,
"step": 6723
},
{
"epoch": 1.9520975468137611,
"grad_norm": 3.3084776401519775,
"learning_rate": 7.151667110484626e-06,
"loss": 0.8091,
"step": 6724
},
{
"epoch": 1.9523878647118593,
"grad_norm": 3.4940638542175293,
"learning_rate": 7.150800324230696e-06,
"loss": 0.8429,
"step": 6725
},
{
"epoch": 1.952678182609958,
"grad_norm": 3.774066209793091,
"learning_rate": 7.149933458656104e-06,
"loss": 0.8869,
"step": 6726
},
{
"epoch": 1.9529685005080564,
"grad_norm": 3.1335461139678955,
"learning_rate": 7.149066513792821e-06,
"loss": 0.7892,
"step": 6727
},
{
"epoch": 1.9532588184061548,
"grad_norm": 3.268209934234619,
"learning_rate": 7.148199489672816e-06,
"loss": 0.7628,
"step": 6728
},
{
"epoch": 1.9535491363042532,
"grad_norm": 3.1325523853302,
"learning_rate": 7.1473323863280666e-06,
"loss": 0.6749,
"step": 6729
},
{
"epoch": 1.9538394542023516,
"grad_norm": 4.100725173950195,
"learning_rate": 7.146465203790549e-06,
"loss": 0.8469,
"step": 6730
},
{
"epoch": 1.95412977210045,
"grad_norm": 3.513888359069824,
"learning_rate": 7.14559794209225e-06,
"loss": 0.754,
"step": 6731
},
{
"epoch": 1.9544200899985484,
"grad_norm": 3.4546597003936768,
"learning_rate": 7.144730601265148e-06,
"loss": 0.6669,
"step": 6732
},
{
"epoch": 1.9547104078966469,
"grad_norm": 3.6429920196533203,
"learning_rate": 7.143863181341234e-06,
"loss": 0.7706,
"step": 6733
},
{
"epoch": 1.9550007257947453,
"grad_norm": 3.8409531116485596,
"learning_rate": 7.1429956823524956e-06,
"loss": 0.8653,
"step": 6734
},
{
"epoch": 1.9552910436928437,
"grad_norm": 3.6878139972686768,
"learning_rate": 7.1421281043309265e-06,
"loss": 0.9262,
"step": 6735
},
{
"epoch": 1.955581361590942,
"grad_norm": 3.273050546646118,
"learning_rate": 7.141260447308525e-06,
"loss": 0.7529,
"step": 6736
},
{
"epoch": 1.9558716794890405,
"grad_norm": 3.7971384525299072,
"learning_rate": 7.140392711317286e-06,
"loss": 0.8767,
"step": 6737
},
{
"epoch": 1.956161997387139,
"grad_norm": 3.7323203086853027,
"learning_rate": 7.139524896389214e-06,
"loss": 0.7121,
"step": 6738
},
{
"epoch": 1.9564523152852373,
"grad_norm": 3.6993770599365234,
"learning_rate": 7.138657002556311e-06,
"loss": 0.7983,
"step": 6739
},
{
"epoch": 1.9567426331833357,
"grad_norm": 3.204155206680298,
"learning_rate": 7.13778902985059e-06,
"loss": 0.7088,
"step": 6740
},
{
"epoch": 1.9570329510814342,
"grad_norm": 3.982203483581543,
"learning_rate": 7.136920978304056e-06,
"loss": 0.9362,
"step": 6741
},
{
"epoch": 1.9573232689795326,
"grad_norm": 3.766463279724121,
"learning_rate": 7.136052847948724e-06,
"loss": 0.7985,
"step": 6742
},
{
"epoch": 1.957613586877631,
"grad_norm": 3.5600168704986572,
"learning_rate": 7.13518463881661e-06,
"loss": 0.7356,
"step": 6743
},
{
"epoch": 1.9579039047757294,
"grad_norm": 3.440335750579834,
"learning_rate": 7.134316350939736e-06,
"loss": 0.7941,
"step": 6744
},
{
"epoch": 1.9581942226738278,
"grad_norm": 3.4422836303710938,
"learning_rate": 7.13344798435012e-06,
"loss": 0.7227,
"step": 6745
},
{
"epoch": 1.9584845405719262,
"grad_norm": 3.442683219909668,
"learning_rate": 7.13257953907979e-06,
"loss": 0.7831,
"step": 6746
},
{
"epoch": 1.9587748584700246,
"grad_norm": 3.355893611907959,
"learning_rate": 7.1317110151607724e-06,
"loss": 0.7271,
"step": 6747
},
{
"epoch": 1.959065176368123,
"grad_norm": 3.4449734687805176,
"learning_rate": 7.130842412625099e-06,
"loss": 0.7658,
"step": 6748
},
{
"epoch": 1.9593554942662215,
"grad_norm": 4.014678478240967,
"learning_rate": 7.129973731504802e-06,
"loss": 0.8837,
"step": 6749
},
{
"epoch": 1.9596458121643199,
"grad_norm": 3.140547513961792,
"learning_rate": 7.1291049718319214e-06,
"loss": 0.7722,
"step": 6750
},
{
"epoch": 1.9599361300624185,
"grad_norm": 3.45985746383667,
"learning_rate": 7.128236133638492e-06,
"loss": 0.8081,
"step": 6751
},
{
"epoch": 1.9602264479605167,
"grad_norm": 3.613837718963623,
"learning_rate": 7.127367216956559e-06,
"loss": 0.8547,
"step": 6752
},
{
"epoch": 1.9605167658586153,
"grad_norm": 3.648763418197632,
"learning_rate": 7.126498221818167e-06,
"loss": 0.8113,
"step": 6753
},
{
"epoch": 1.9608070837567135,
"grad_norm": 2.954113483428955,
"learning_rate": 7.125629148255366e-06,
"loss": 0.6359,
"step": 6754
},
{
"epoch": 1.9610974016548122,
"grad_norm": 3.690190076828003,
"learning_rate": 7.1247599963002055e-06,
"loss": 0.7334,
"step": 6755
},
{
"epoch": 1.9613877195529104,
"grad_norm": 4.204606533050537,
"learning_rate": 7.123890765984738e-06,
"loss": 0.885,
"step": 6756
},
{
"epoch": 1.961678037451009,
"grad_norm": 3.6308844089508057,
"learning_rate": 7.123021457341022e-06,
"loss": 0.8379,
"step": 6757
},
{
"epoch": 1.9619683553491072,
"grad_norm": 3.752915620803833,
"learning_rate": 7.1221520704011186e-06,
"loss": 0.7165,
"step": 6758
},
{
"epoch": 1.9622586732472058,
"grad_norm": 3.76926326751709,
"learning_rate": 7.121282605197087e-06,
"loss": 0.7306,
"step": 6759
},
{
"epoch": 1.962548991145304,
"grad_norm": 3.9330079555511475,
"learning_rate": 7.120413061760996e-06,
"loss": 0.8327,
"step": 6760
},
{
"epoch": 1.9628393090434026,
"grad_norm": 3.476900339126587,
"learning_rate": 7.119543440124913e-06,
"loss": 0.8174,
"step": 6761
},
{
"epoch": 1.9631296269415008,
"grad_norm": 3.7719175815582275,
"learning_rate": 7.118673740320907e-06,
"loss": 0.6952,
"step": 6762
},
{
"epoch": 1.9634199448395995,
"grad_norm": 3.6090521812438965,
"learning_rate": 7.117803962381057e-06,
"loss": 0.7363,
"step": 6763
},
{
"epoch": 1.9637102627376977,
"grad_norm": 3.7342145442962646,
"learning_rate": 7.116934106337436e-06,
"loss": 0.7811,
"step": 6764
},
{
"epoch": 1.9640005806357963,
"grad_norm": 3.467252731323242,
"learning_rate": 7.1160641722221255e-06,
"loss": 0.7612,
"step": 6765
},
{
"epoch": 1.9642908985338945,
"grad_norm": 3.8008577823638916,
"learning_rate": 7.115194160067208e-06,
"loss": 0.8841,
"step": 6766
},
{
"epoch": 1.9645812164319931,
"grad_norm": 3.648664951324463,
"learning_rate": 7.114324069904769e-06,
"loss": 0.7991,
"step": 6767
},
{
"epoch": 1.9648715343300913,
"grad_norm": 3.3115179538726807,
"learning_rate": 7.113453901766898e-06,
"loss": 0.7317,
"step": 6768
},
{
"epoch": 1.96516185222819,
"grad_norm": 4.139917373657227,
"learning_rate": 7.112583655685685e-06,
"loss": 0.8714,
"step": 6769
},
{
"epoch": 1.9654521701262881,
"grad_norm": 3.2607545852661133,
"learning_rate": 7.1117133316932255e-06,
"loss": 0.6839,
"step": 6770
},
{
"epoch": 1.9657424880243868,
"grad_norm": 3.5741126537323,
"learning_rate": 7.110842929821615e-06,
"loss": 0.795,
"step": 6771
},
{
"epoch": 1.966032805922485,
"grad_norm": 3.477534294128418,
"learning_rate": 7.109972450102958e-06,
"loss": 0.7614,
"step": 6772
},
{
"epoch": 1.9663231238205836,
"grad_norm": 3.491774797439575,
"learning_rate": 7.109101892569351e-06,
"loss": 0.6599,
"step": 6773
},
{
"epoch": 1.9666134417186818,
"grad_norm": 3.976912021636963,
"learning_rate": 7.108231257252906e-06,
"loss": 0.9449,
"step": 6774
},
{
"epoch": 1.9669037596167804,
"grad_norm": 3.2056448459625244,
"learning_rate": 7.107360544185726e-06,
"loss": 0.8332,
"step": 6775
},
{
"epoch": 1.9671940775148788,
"grad_norm": 4.093783855438232,
"learning_rate": 7.1064897533999275e-06,
"loss": 0.849,
"step": 6776
},
{
"epoch": 1.9674843954129773,
"grad_norm": 3.3977859020233154,
"learning_rate": 7.105618884927622e-06,
"loss": 0.7746,
"step": 6777
},
{
"epoch": 1.9677747133110757,
"grad_norm": 3.2878258228302,
"learning_rate": 7.104747938800929e-06,
"loss": 0.7264,
"step": 6778
},
{
"epoch": 1.968065031209174,
"grad_norm": 3.859818696975708,
"learning_rate": 7.1038769150519656e-06,
"loss": 0.8852,
"step": 6779
},
{
"epoch": 1.9683553491072725,
"grad_norm": 3.4528026580810547,
"learning_rate": 7.103005813712856e-06,
"loss": 0.7505,
"step": 6780
},
{
"epoch": 1.968645667005371,
"grad_norm": 3.6107583045959473,
"learning_rate": 7.1021346348157285e-06,
"loss": 0.7107,
"step": 6781
},
{
"epoch": 1.9689359849034693,
"grad_norm": 3.1933040618896484,
"learning_rate": 7.101263378392709e-06,
"loss": 0.6672,
"step": 6782
},
{
"epoch": 1.9692263028015677,
"grad_norm": 3.2831835746765137,
"learning_rate": 7.10039204447593e-06,
"loss": 0.7403,
"step": 6783
},
{
"epoch": 1.9695166206996662,
"grad_norm": 3.4789631366729736,
"learning_rate": 7.099520633097525e-06,
"loss": 0.8518,
"step": 6784
},
{
"epoch": 1.9698069385977646,
"grad_norm": 3.596649646759033,
"learning_rate": 7.098649144289633e-06,
"loss": 0.7417,
"step": 6785
},
{
"epoch": 1.970097256495863,
"grad_norm": 3.3953075408935547,
"learning_rate": 7.097777578084394e-06,
"loss": 0.7524,
"step": 6786
},
{
"epoch": 1.9703875743939614,
"grad_norm": 3.428148031234741,
"learning_rate": 7.09690593451395e-06,
"loss": 0.738,
"step": 6787
},
{
"epoch": 1.9706778922920598,
"grad_norm": 3.6509974002838135,
"learning_rate": 7.096034213610448e-06,
"loss": 0.7525,
"step": 6788
},
{
"epoch": 1.9709682101901582,
"grad_norm": 3.5561928749084473,
"learning_rate": 7.095162415406034e-06,
"loss": 0.8845,
"step": 6789
},
{
"epoch": 1.9712585280882566,
"grad_norm": 3.5671966075897217,
"learning_rate": 7.0942905399328625e-06,
"loss": 0.6514,
"step": 6790
},
{
"epoch": 1.971548845986355,
"grad_norm": 3.40765118598938,
"learning_rate": 7.093418587223088e-06,
"loss": 0.7776,
"step": 6791
},
{
"epoch": 1.9718391638844535,
"grad_norm": 3.513580560684204,
"learning_rate": 7.092546557308866e-06,
"loss": 0.6769,
"step": 6792
},
{
"epoch": 1.9721294817825519,
"grad_norm": 3.5958590507507324,
"learning_rate": 7.091674450222357e-06,
"loss": 0.7664,
"step": 6793
},
{
"epoch": 1.9724197996806503,
"grad_norm": 3.6124091148376465,
"learning_rate": 7.090802265995723e-06,
"loss": 0.7266,
"step": 6794
},
{
"epoch": 1.9727101175787487,
"grad_norm": 3.596867322921753,
"learning_rate": 7.089930004661134e-06,
"loss": 0.7801,
"step": 6795
},
{
"epoch": 1.9730004354768471,
"grad_norm": 3.997195243835449,
"learning_rate": 7.089057666250754e-06,
"loss": 0.8244,
"step": 6796
},
{
"epoch": 1.9732907533749455,
"grad_norm": 3.457582712173462,
"learning_rate": 7.088185250796757e-06,
"loss": 0.7506,
"step": 6797
},
{
"epoch": 1.973581071273044,
"grad_norm": 3.2290596961975098,
"learning_rate": 7.087312758331318e-06,
"loss": 0.8002,
"step": 6798
},
{
"epoch": 1.9738713891711424,
"grad_norm": 3.566000461578369,
"learning_rate": 7.08644018888661e-06,
"loss": 0.8488,
"step": 6799
},
{
"epoch": 1.9741617070692408,
"grad_norm": 3.3688695430755615,
"learning_rate": 7.085567542494815e-06,
"loss": 0.6546,
"step": 6800
},
{
"epoch": 1.9744520249673392,
"grad_norm": 3.4332211017608643,
"learning_rate": 7.08469481918812e-06,
"loss": 0.7747,
"step": 6801
},
{
"epoch": 1.9747423428654378,
"grad_norm": 3.266073703765869,
"learning_rate": 7.083822018998706e-06,
"loss": 0.6387,
"step": 6802
},
{
"epoch": 1.975032660763536,
"grad_norm": 3.644442558288574,
"learning_rate": 7.082949141958762e-06,
"loss": 0.9104,
"step": 6803
},
{
"epoch": 1.9753229786616346,
"grad_norm": 3.220064878463745,
"learning_rate": 7.082076188100483e-06,
"loss": 0.709,
"step": 6804
},
{
"epoch": 1.9756132965597328,
"grad_norm": 3.7324562072753906,
"learning_rate": 7.081203157456058e-06,
"loss": 0.7557,
"step": 6805
},
{
"epoch": 1.9759036144578315,
"grad_norm": 3.2915639877319336,
"learning_rate": 7.080330050057687e-06,
"loss": 0.7483,
"step": 6806
},
{
"epoch": 1.9761939323559297,
"grad_norm": 3.8188564777374268,
"learning_rate": 7.079456865937568e-06,
"loss": 0.8745,
"step": 6807
},
{
"epoch": 1.9764842502540283,
"grad_norm": 3.867581844329834,
"learning_rate": 7.078583605127908e-06,
"loss": 0.7953,
"step": 6808
},
{
"epoch": 1.9767745681521265,
"grad_norm": 3.83316969871521,
"learning_rate": 7.077710267660908e-06,
"loss": 0.8975,
"step": 6809
},
{
"epoch": 1.9770648860502251,
"grad_norm": 3.6134462356567383,
"learning_rate": 7.076836853568778e-06,
"loss": 0.8214,
"step": 6810
},
{
"epoch": 1.9773552039483233,
"grad_norm": 3.6381266117095947,
"learning_rate": 7.0759633628837285e-06,
"loss": 0.6846,
"step": 6811
},
{
"epoch": 1.977645521846422,
"grad_norm": 3.7517611980438232,
"learning_rate": 7.075089795637974e-06,
"loss": 0.7253,
"step": 6812
},
{
"epoch": 1.9779358397445201,
"grad_norm": 3.577470302581787,
"learning_rate": 7.074216151863731e-06,
"loss": 0.7477,
"step": 6813
},
{
"epoch": 1.9782261576426188,
"grad_norm": 3.7703053951263428,
"learning_rate": 7.0733424315932195e-06,
"loss": 0.7689,
"step": 6814
},
{
"epoch": 1.978516475540717,
"grad_norm": 3.7044544219970703,
"learning_rate": 7.072468634858663e-06,
"loss": 0.886,
"step": 6815
},
{
"epoch": 1.9788067934388156,
"grad_norm": 3.4169695377349854,
"learning_rate": 7.071594761692284e-06,
"loss": 0.7732,
"step": 6816
},
{
"epoch": 1.9790971113369138,
"grad_norm": 3.8502378463745117,
"learning_rate": 7.070720812126315e-06,
"loss": 0.8438,
"step": 6817
},
{
"epoch": 1.9793874292350124,
"grad_norm": 3.873922348022461,
"learning_rate": 7.069846786192982e-06,
"loss": 0.8482,
"step": 6818
},
{
"epoch": 1.9796777471331106,
"grad_norm": 3.5439321994781494,
"learning_rate": 7.068972683924522e-06,
"loss": 0.7929,
"step": 6819
},
{
"epoch": 1.9799680650312093,
"grad_norm": 3.0595645904541016,
"learning_rate": 7.068098505353169e-06,
"loss": 0.6958,
"step": 6820
},
{
"epoch": 1.9802583829293074,
"grad_norm": 3.681124210357666,
"learning_rate": 7.0672242505111644e-06,
"loss": 0.7487,
"step": 6821
},
{
"epoch": 1.980548700827406,
"grad_norm": 3.742825508117676,
"learning_rate": 7.066349919430751e-06,
"loss": 0.7309,
"step": 6822
},
{
"epoch": 1.9808390187255043,
"grad_norm": 3.4205269813537598,
"learning_rate": 7.065475512144172e-06,
"loss": 0.7474,
"step": 6823
},
{
"epoch": 1.981129336623603,
"grad_norm": 3.3415684700012207,
"learning_rate": 7.064601028683675e-06,
"loss": 0.6876,
"step": 6824
},
{
"epoch": 1.981419654521701,
"grad_norm": 3.7281363010406494,
"learning_rate": 7.063726469081511e-06,
"loss": 0.8471,
"step": 6825
},
{
"epoch": 1.9817099724197997,
"grad_norm": 3.569338798522949,
"learning_rate": 7.062851833369935e-06,
"loss": 0.8374,
"step": 6826
},
{
"epoch": 1.9820002903178982,
"grad_norm": 3.6577813625335693,
"learning_rate": 7.061977121581202e-06,
"loss": 0.7678,
"step": 6827
},
{
"epoch": 1.9822906082159966,
"grad_norm": 3.6792924404144287,
"learning_rate": 7.06110233374757e-06,
"loss": 0.7505,
"step": 6828
},
{
"epoch": 1.982580926114095,
"grad_norm": 3.227928400039673,
"learning_rate": 7.060227469901304e-06,
"loss": 0.7637,
"step": 6829
},
{
"epoch": 1.9828712440121934,
"grad_norm": 3.342305898666382,
"learning_rate": 7.0593525300746635e-06,
"loss": 0.6598,
"step": 6830
},
{
"epoch": 1.9831615619102918,
"grad_norm": 3.869431734085083,
"learning_rate": 7.058477514299921e-06,
"loss": 0.7006,
"step": 6831
},
{
"epoch": 1.9834518798083902,
"grad_norm": 3.2897863388061523,
"learning_rate": 7.057602422609343e-06,
"loss": 0.821,
"step": 6832
},
{
"epoch": 1.9837421977064886,
"grad_norm": 3.5811805725097656,
"learning_rate": 7.056727255035206e-06,
"loss": 0.793,
"step": 6833
},
{
"epoch": 1.984032515604587,
"grad_norm": 4.032071113586426,
"learning_rate": 7.0558520116097826e-06,
"loss": 0.8207,
"step": 6834
},
{
"epoch": 1.9843228335026855,
"grad_norm": 4.270670413970947,
"learning_rate": 7.054976692365354e-06,
"loss": 0.9153,
"step": 6835
},
{
"epoch": 1.9846131514007839,
"grad_norm": 3.341407537460327,
"learning_rate": 7.0541012973342e-06,
"loss": 0.7869,
"step": 6836
},
{
"epoch": 1.9849034692988823,
"grad_norm": 3.6755237579345703,
"learning_rate": 7.053225826548605e-06,
"loss": 0.8061,
"step": 6837
},
{
"epoch": 1.9851937871969807,
"grad_norm": 3.738955497741699,
"learning_rate": 7.052350280040858e-06,
"loss": 0.7908,
"step": 6838
},
{
"epoch": 1.985484105095079,
"grad_norm": 3.7172625064849854,
"learning_rate": 7.051474657843245e-06,
"loss": 0.812,
"step": 6839
},
{
"epoch": 1.9857744229931775,
"grad_norm": 3.776444435119629,
"learning_rate": 7.050598959988062e-06,
"loss": 0.9028,
"step": 6840
},
{
"epoch": 1.986064740891276,
"grad_norm": 3.6935839653015137,
"learning_rate": 7.049723186507602e-06,
"loss": 0.8667,
"step": 6841
},
{
"epoch": 1.9863550587893744,
"grad_norm": 3.6881377696990967,
"learning_rate": 7.048847337434166e-06,
"loss": 0.8647,
"step": 6842
},
{
"epoch": 1.9866453766874728,
"grad_norm": 3.4528255462646484,
"learning_rate": 7.047971412800051e-06,
"loss": 0.775,
"step": 6843
},
{
"epoch": 1.9869356945855712,
"grad_norm": 3.9001612663269043,
"learning_rate": 7.047095412637563e-06,
"loss": 0.8675,
"step": 6844
},
{
"epoch": 1.9872260124836696,
"grad_norm": 3.6792030334472656,
"learning_rate": 7.04621933697901e-06,
"loss": 0.7322,
"step": 6845
},
{
"epoch": 1.987516330381768,
"grad_norm": 3.6226887702941895,
"learning_rate": 7.045343185856701e-06,
"loss": 0.7921,
"step": 6846
},
{
"epoch": 1.9878066482798664,
"grad_norm": 3.9914066791534424,
"learning_rate": 7.044466959302945e-06,
"loss": 0.8576,
"step": 6847
},
{
"epoch": 1.9880969661779648,
"grad_norm": 3.397376537322998,
"learning_rate": 7.043590657350059e-06,
"loss": 0.6744,
"step": 6848
},
{
"epoch": 1.9883872840760632,
"grad_norm": 3.3360671997070312,
"learning_rate": 7.042714280030361e-06,
"loss": 0.6491,
"step": 6849
},
{
"epoch": 1.9886776019741617,
"grad_norm": 3.4122045040130615,
"learning_rate": 7.041837827376171e-06,
"loss": 0.8094,
"step": 6850
},
{
"epoch": 1.98896791987226,
"grad_norm": 3.3993654251098633,
"learning_rate": 7.040961299419812e-06,
"loss": 0.7477,
"step": 6851
},
{
"epoch": 1.9892582377703585,
"grad_norm": 3.57908296585083,
"learning_rate": 7.040084696193611e-06,
"loss": 0.8479,
"step": 6852
},
{
"epoch": 1.9895485556684571,
"grad_norm": 3.6195056438446045,
"learning_rate": 7.039208017729895e-06,
"loss": 0.6888,
"step": 6853
},
{
"epoch": 1.9898388735665553,
"grad_norm": 3.9272801876068115,
"learning_rate": 7.038331264060996e-06,
"loss": 0.7325,
"step": 6854
},
{
"epoch": 1.990129191464654,
"grad_norm": 4.077366352081299,
"learning_rate": 7.037454435219251e-06,
"loss": 0.7975,
"step": 6855
},
{
"epoch": 1.9904195093627521,
"grad_norm": 3.617011547088623,
"learning_rate": 7.0365775312369935e-06,
"loss": 0.7656,
"step": 6856
},
{
"epoch": 1.9907098272608508,
"grad_norm": 3.4023525714874268,
"learning_rate": 7.0357005521465635e-06,
"loss": 0.6409,
"step": 6857
},
{
"epoch": 1.991000145158949,
"grad_norm": 3.8578407764434814,
"learning_rate": 7.034823497980307e-06,
"loss": 0.9175,
"step": 6858
},
{
"epoch": 1.9912904630570476,
"grad_norm": 4.258701801300049,
"learning_rate": 7.033946368770568e-06,
"loss": 0.7781,
"step": 6859
},
{
"epoch": 1.9915807809551458,
"grad_norm": 3.1676242351531982,
"learning_rate": 7.033069164549692e-06,
"loss": 0.6299,
"step": 6860
},
{
"epoch": 1.9918710988532444,
"grad_norm": 3.4303393363952637,
"learning_rate": 7.032191885350034e-06,
"loss": 0.7877,
"step": 6861
},
{
"epoch": 1.9921614167513426,
"grad_norm": 3.757079601287842,
"learning_rate": 7.031314531203943e-06,
"loss": 0.8279,
"step": 6862
},
{
"epoch": 1.9924517346494413,
"grad_norm": 3.5876965522766113,
"learning_rate": 7.030437102143781e-06,
"loss": 0.7769,
"step": 6863
},
{
"epoch": 1.9927420525475394,
"grad_norm": 3.210477352142334,
"learning_rate": 7.029559598201903e-06,
"loss": 0.7287,
"step": 6864
},
{
"epoch": 1.993032370445638,
"grad_norm": 3.55476713180542,
"learning_rate": 7.028682019410673e-06,
"loss": 0.7846,
"step": 6865
},
{
"epoch": 1.9933226883437363,
"grad_norm": 3.201202630996704,
"learning_rate": 7.027804365802454e-06,
"loss": 0.6625,
"step": 6866
},
{
"epoch": 1.993613006241835,
"grad_norm": 3.8153321743011475,
"learning_rate": 7.026926637409615e-06,
"loss": 0.8795,
"step": 6867
},
{
"epoch": 1.993903324139933,
"grad_norm": 3.239248275756836,
"learning_rate": 7.0260488342645284e-06,
"loss": 0.7628,
"step": 6868
},
{
"epoch": 1.9941936420380317,
"grad_norm": 3.5351696014404297,
"learning_rate": 7.0251709563995626e-06,
"loss": 0.7015,
"step": 6869
},
{
"epoch": 1.99448395993613,
"grad_norm": 3.6981968879699707,
"learning_rate": 7.024293003847096e-06,
"loss": 0.9076,
"step": 6870
},
{
"epoch": 1.9947742778342286,
"grad_norm": 3.6667771339416504,
"learning_rate": 7.023414976639505e-06,
"loss": 0.7591,
"step": 6871
},
{
"epoch": 1.9950645957323268,
"grad_norm": 3.423527956008911,
"learning_rate": 7.022536874809176e-06,
"loss": 0.7876,
"step": 6872
},
{
"epoch": 1.9953549136304254,
"grad_norm": 3.921292304992676,
"learning_rate": 7.021658698388487e-06,
"loss": 0.8568,
"step": 6873
},
{
"epoch": 1.9956452315285236,
"grad_norm": 3.7030389308929443,
"learning_rate": 7.02078044740983e-06,
"loss": 0.7251,
"step": 6874
},
{
"epoch": 1.9959355494266222,
"grad_norm": 3.2520456314086914,
"learning_rate": 7.019902121905588e-06,
"loss": 0.7709,
"step": 6875
},
{
"epoch": 1.9962258673247204,
"grad_norm": 3.617114543914795,
"learning_rate": 7.019023721908162e-06,
"loss": 0.7272,
"step": 6876
},
{
"epoch": 1.996516185222819,
"grad_norm": 3.458791732788086,
"learning_rate": 7.018145247449939e-06,
"loss": 0.7036,
"step": 6877
},
{
"epoch": 1.9968065031209175,
"grad_norm": 3.542085886001587,
"learning_rate": 7.017266698563322e-06,
"loss": 0.7234,
"step": 6878
},
{
"epoch": 1.9970968210190159,
"grad_norm": 3.0014126300811768,
"learning_rate": 7.016388075280709e-06,
"loss": 0.7739,
"step": 6879
},
{
"epoch": 1.9973871389171143,
"grad_norm": 3.6589744091033936,
"learning_rate": 7.015509377634504e-06,
"loss": 0.8309,
"step": 6880
},
{
"epoch": 1.9976774568152127,
"grad_norm": 3.2439112663269043,
"learning_rate": 7.014630605657113e-06,
"loss": 0.7759,
"step": 6881
},
{
"epoch": 1.997967774713311,
"grad_norm": 3.3802876472473145,
"learning_rate": 7.013751759380944e-06,
"loss": 0.6549,
"step": 6882
},
{
"epoch": 1.9982580926114095,
"grad_norm": 3.9446818828582764,
"learning_rate": 7.01287283883841e-06,
"loss": 0.8399,
"step": 6883
},
{
"epoch": 1.998548410509508,
"grad_norm": 3.197936773300171,
"learning_rate": 7.011993844061925e-06,
"loss": 0.7113,
"step": 6884
},
{
"epoch": 1.9988387284076063,
"grad_norm": 3.189903974533081,
"learning_rate": 7.011114775083905e-06,
"loss": 0.8267,
"step": 6885
},
{
"epoch": 1.9991290463057048,
"grad_norm": 3.5381767749786377,
"learning_rate": 7.010235631936771e-06,
"loss": 0.835,
"step": 6886
},
{
"epoch": 1.9994193642038032,
"grad_norm": 3.2049825191497803,
"learning_rate": 7.009356414652944e-06,
"loss": 0.7166,
"step": 6887
},
{
"epoch": 1.9997096821019016,
"grad_norm": 3.489812135696411,
"learning_rate": 7.008477123264849e-06,
"loss": 0.8515,
"step": 6888
},
{
"epoch": 2.0,
"grad_norm": 3.69612717628479,
"learning_rate": 7.007597757804914e-06,
"loss": 0.785,
"step": 6889
}
],
"logging_steps": 1.0,
"max_steps": 17220,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2241570820120904e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}