Files
Tranport-llama3_1_8B_instruct/trainer_state.json
ModelHub XC 913157dc72 初始化项目,由ModelHub XC社区提供模型
Model: BAAI/Tranport-llama3_1_8B_instruct
Source: Original Platform
2026-05-18 18:49:38 +08:00

24590 lines
593 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7787942316244203,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000508226923321263,
"grad_norm": 11.132791519165039,
"learning_rate": 2.0325203252032523e-08,
"loss": 1.5488,
"step": 1
},
{
"epoch": 0.001016453846642526,
"grad_norm": 14.91269588470459,
"learning_rate": 4.0650406504065046e-08,
"loss": 1.7579,
"step": 2
},
{
"epoch": 0.0015246807699637887,
"grad_norm": 14.403929710388184,
"learning_rate": 6.097560975609757e-08,
"loss": 1.5603,
"step": 3
},
{
"epoch": 0.002032907693285052,
"grad_norm": 11.182633399963379,
"learning_rate": 8.130081300813009e-08,
"loss": 1.6072,
"step": 4
},
{
"epoch": 0.0025411346166063146,
"grad_norm": 13.164581298828125,
"learning_rate": 1.0162601626016261e-07,
"loss": 1.5791,
"step": 5
},
{
"epoch": 0.0030493615399275775,
"grad_norm": 16.108863830566406,
"learning_rate": 1.2195121951219514e-07,
"loss": 1.5734,
"step": 6
},
{
"epoch": 0.0035575884632488407,
"grad_norm": 12.01136302947998,
"learning_rate": 1.4227642276422766e-07,
"loss": 1.5399,
"step": 7
},
{
"epoch": 0.004065815386570104,
"grad_norm": 11.824952125549316,
"learning_rate": 1.6260162601626018e-07,
"loss": 1.5702,
"step": 8
},
{
"epoch": 0.004574042309891366,
"grad_norm": 11.490732192993164,
"learning_rate": 1.8292682926829268e-07,
"loss": 1.4625,
"step": 9
},
{
"epoch": 0.005082269233212629,
"grad_norm": 14.02952766418457,
"learning_rate": 2.0325203252032523e-07,
"loss": 1.6021,
"step": 10
},
{
"epoch": 0.005590496156533892,
"grad_norm": 13.60211181640625,
"learning_rate": 2.2357723577235775e-07,
"loss": 1.5947,
"step": 11
},
{
"epoch": 0.006098723079855155,
"grad_norm": 10.582362174987793,
"learning_rate": 2.439024390243903e-07,
"loss": 1.4733,
"step": 12
},
{
"epoch": 0.006606950003176419,
"grad_norm": 11.117897987365723,
"learning_rate": 2.642276422764228e-07,
"loss": 1.6174,
"step": 13
},
{
"epoch": 0.0071151769264976815,
"grad_norm": 10.191961288452148,
"learning_rate": 2.845528455284553e-07,
"loss": 1.4654,
"step": 14
},
{
"epoch": 0.007623403849818944,
"grad_norm": 14.525583267211914,
"learning_rate": 3.0487804878048784e-07,
"loss": 1.6612,
"step": 15
},
{
"epoch": 0.008131630773140207,
"grad_norm": 12.473858833312988,
"learning_rate": 3.2520325203252037e-07,
"loss": 1.6033,
"step": 16
},
{
"epoch": 0.008639857696461471,
"grad_norm": 11.088489532470703,
"learning_rate": 3.455284552845529e-07,
"loss": 1.4625,
"step": 17
},
{
"epoch": 0.009148084619782733,
"grad_norm": 13.150553703308105,
"learning_rate": 3.6585365853658536e-07,
"loss": 1.6221,
"step": 18
},
{
"epoch": 0.009656311543103997,
"grad_norm": 10.35750675201416,
"learning_rate": 3.8617886178861793e-07,
"loss": 1.4358,
"step": 19
},
{
"epoch": 0.010164538466425259,
"grad_norm": 11.396235466003418,
"learning_rate": 4.0650406504065046e-07,
"loss": 1.491,
"step": 20
},
{
"epoch": 0.010672765389746522,
"grad_norm": 10.519694328308105,
"learning_rate": 4.26829268292683e-07,
"loss": 1.5748,
"step": 21
},
{
"epoch": 0.011180992313067784,
"grad_norm": 12.369754791259766,
"learning_rate": 4.471544715447155e-07,
"loss": 1.5489,
"step": 22
},
{
"epoch": 0.011689219236389048,
"grad_norm": 10.128881454467773,
"learning_rate": 4.6747967479674797e-07,
"loss": 1.5057,
"step": 23
},
{
"epoch": 0.01219744615971031,
"grad_norm": 8.99166488647461,
"learning_rate": 4.878048780487805e-07,
"loss": 1.5258,
"step": 24
},
{
"epoch": 0.012705673083031574,
"grad_norm": 12.331857681274414,
"learning_rate": 5.081300813008131e-07,
"loss": 1.3793,
"step": 25
},
{
"epoch": 0.013213900006352837,
"grad_norm": 7.486877918243408,
"learning_rate": 5.284552845528456e-07,
"loss": 1.4606,
"step": 26
},
{
"epoch": 0.0137221269296741,
"grad_norm": 9.731522560119629,
"learning_rate": 5.487804878048781e-07,
"loss": 1.4973,
"step": 27
},
{
"epoch": 0.014230353852995363,
"grad_norm": 6.014042854309082,
"learning_rate": 5.691056910569106e-07,
"loss": 1.487,
"step": 28
},
{
"epoch": 0.014738580776316625,
"grad_norm": 6.246473789215088,
"learning_rate": 5.894308943089432e-07,
"loss": 1.4415,
"step": 29
},
{
"epoch": 0.015246807699637889,
"grad_norm": 5.654910087585449,
"learning_rate": 6.097560975609757e-07,
"loss": 1.506,
"step": 30
},
{
"epoch": 0.01575503462295915,
"grad_norm": 5.190532684326172,
"learning_rate": 6.300813008130081e-07,
"loss": 1.4196,
"step": 31
},
{
"epoch": 0.016263261546280414,
"grad_norm": 5.3967461585998535,
"learning_rate": 6.504065040650407e-07,
"loss": 1.4139,
"step": 32
},
{
"epoch": 0.016771488469601678,
"grad_norm": 5.363631725311279,
"learning_rate": 6.707317073170733e-07,
"loss": 1.4304,
"step": 33
},
{
"epoch": 0.017279715392922942,
"grad_norm": 4.950409889221191,
"learning_rate": 6.910569105691058e-07,
"loss": 1.3548,
"step": 34
},
{
"epoch": 0.017787942316244202,
"grad_norm": 5.297672271728516,
"learning_rate": 7.113821138211383e-07,
"loss": 1.4669,
"step": 35
},
{
"epoch": 0.018296169239565466,
"grad_norm": 5.159802436828613,
"learning_rate": 7.317073170731707e-07,
"loss": 1.4151,
"step": 36
},
{
"epoch": 0.01880439616288673,
"grad_norm": 4.77419900894165,
"learning_rate": 7.520325203252033e-07,
"loss": 1.379,
"step": 37
},
{
"epoch": 0.019312623086207993,
"grad_norm": 4.516266822814941,
"learning_rate": 7.723577235772359e-07,
"loss": 1.3126,
"step": 38
},
{
"epoch": 0.019820850009529253,
"grad_norm": 4.660902976989746,
"learning_rate": 7.926829268292684e-07,
"loss": 1.4777,
"step": 39
},
{
"epoch": 0.020329076932850517,
"grad_norm": 4.3722968101501465,
"learning_rate": 8.130081300813009e-07,
"loss": 1.4056,
"step": 40
},
{
"epoch": 0.02083730385617178,
"grad_norm": 4.381669521331787,
"learning_rate": 8.333333333333333e-07,
"loss": 1.38,
"step": 41
},
{
"epoch": 0.021345530779493044,
"grad_norm": 4.524435520172119,
"learning_rate": 8.53658536585366e-07,
"loss": 1.4145,
"step": 42
},
{
"epoch": 0.021853757702814308,
"grad_norm": 6.599025726318359,
"learning_rate": 8.739837398373985e-07,
"loss": 1.3931,
"step": 43
},
{
"epoch": 0.02236198462613557,
"grad_norm": 4.480719566345215,
"learning_rate": 8.94308943089431e-07,
"loss": 1.3041,
"step": 44
},
{
"epoch": 0.022870211549456832,
"grad_norm": 4.4983906745910645,
"learning_rate": 9.146341463414634e-07,
"loss": 1.3611,
"step": 45
},
{
"epoch": 0.023378438472778096,
"grad_norm": 4.583948612213135,
"learning_rate": 9.349593495934959e-07,
"loss": 1.3255,
"step": 46
},
{
"epoch": 0.02388666539609936,
"grad_norm": 4.392378807067871,
"learning_rate": 9.552845528455287e-07,
"loss": 1.4201,
"step": 47
},
{
"epoch": 0.02439489231942062,
"grad_norm": 4.692641258239746,
"learning_rate": 9.75609756097561e-07,
"loss": 1.3912,
"step": 48
},
{
"epoch": 0.024903119242741883,
"grad_norm": 4.219020843505859,
"learning_rate": 9.959349593495935e-07,
"loss": 1.4172,
"step": 49
},
{
"epoch": 0.025411346166063147,
"grad_norm": 3.9937944412231445,
"learning_rate": 1.0162601626016261e-06,
"loss": 1.4778,
"step": 50
},
{
"epoch": 0.02591957308938441,
"grad_norm": 4.721486568450928,
"learning_rate": 1.0365853658536586e-06,
"loss": 1.3501,
"step": 51
},
{
"epoch": 0.026427800012705675,
"grad_norm": 4.057364463806152,
"learning_rate": 1.0569105691056912e-06,
"loss": 1.4107,
"step": 52
},
{
"epoch": 0.026936026936026935,
"grad_norm": 4.496649742126465,
"learning_rate": 1.0772357723577236e-06,
"loss": 1.398,
"step": 53
},
{
"epoch": 0.0274442538593482,
"grad_norm": 4.019273281097412,
"learning_rate": 1.0975609756097562e-06,
"loss": 1.2613,
"step": 54
},
{
"epoch": 0.027952480782669462,
"grad_norm": 4.136529922485352,
"learning_rate": 1.1178861788617887e-06,
"loss": 1.3537,
"step": 55
},
{
"epoch": 0.028460707705990726,
"grad_norm": 4.095795631408691,
"learning_rate": 1.1382113821138213e-06,
"loss": 1.4782,
"step": 56
},
{
"epoch": 0.028968934629311986,
"grad_norm": 3.8188765048980713,
"learning_rate": 1.158536585365854e-06,
"loss": 1.3162,
"step": 57
},
{
"epoch": 0.02947716155263325,
"grad_norm": 3.8744707107543945,
"learning_rate": 1.1788617886178863e-06,
"loss": 1.2827,
"step": 58
},
{
"epoch": 0.029985388475954514,
"grad_norm": 4.022250652313232,
"learning_rate": 1.1991869918699187e-06,
"loss": 1.3503,
"step": 59
},
{
"epoch": 0.030493615399275777,
"grad_norm": 4.049084186553955,
"learning_rate": 1.2195121951219514e-06,
"loss": 1.252,
"step": 60
},
{
"epoch": 0.03100184232259704,
"grad_norm": 3.750056028366089,
"learning_rate": 1.2398373983739838e-06,
"loss": 1.3227,
"step": 61
},
{
"epoch": 0.0315100692459183,
"grad_norm": 4.167194366455078,
"learning_rate": 1.2601626016260162e-06,
"loss": 1.3036,
"step": 62
},
{
"epoch": 0.03201829616923957,
"grad_norm": 3.954740285873413,
"learning_rate": 1.2804878048780488e-06,
"loss": 1.2946,
"step": 63
},
{
"epoch": 0.03252652309256083,
"grad_norm": 4.393954753875732,
"learning_rate": 1.3008130081300815e-06,
"loss": 1.3785,
"step": 64
},
{
"epoch": 0.03303475001588209,
"grad_norm": 3.7162604331970215,
"learning_rate": 1.3211382113821139e-06,
"loss": 1.3086,
"step": 65
},
{
"epoch": 0.033542976939203356,
"grad_norm": 3.7479500770568848,
"learning_rate": 1.3414634146341465e-06,
"loss": 1.3727,
"step": 66
},
{
"epoch": 0.034051203862524616,
"grad_norm": 3.585484504699707,
"learning_rate": 1.361788617886179e-06,
"loss": 1.3153,
"step": 67
},
{
"epoch": 0.034559430785845884,
"grad_norm": 3.7799341678619385,
"learning_rate": 1.3821138211382116e-06,
"loss": 1.2355,
"step": 68
},
{
"epoch": 0.035067657709167144,
"grad_norm": 4.035519123077393,
"learning_rate": 1.4024390243902442e-06,
"loss": 1.3052,
"step": 69
},
{
"epoch": 0.035575884632488404,
"grad_norm": 3.966735363006592,
"learning_rate": 1.4227642276422766e-06,
"loss": 1.3895,
"step": 70
},
{
"epoch": 0.03608411155580967,
"grad_norm": 3.9452250003814697,
"learning_rate": 1.4430894308943092e-06,
"loss": 1.3275,
"step": 71
},
{
"epoch": 0.03659233847913093,
"grad_norm": 4.105930328369141,
"learning_rate": 1.4634146341463414e-06,
"loss": 1.4562,
"step": 72
},
{
"epoch": 0.03710056540245219,
"grad_norm": 3.8830127716064453,
"learning_rate": 1.483739837398374e-06,
"loss": 1.252,
"step": 73
},
{
"epoch": 0.03760879232577346,
"grad_norm": 4.440551280975342,
"learning_rate": 1.5040650406504067e-06,
"loss": 1.3924,
"step": 74
},
{
"epoch": 0.03811701924909472,
"grad_norm": 3.8785653114318848,
"learning_rate": 1.5243902439024391e-06,
"loss": 1.3019,
"step": 75
},
{
"epoch": 0.038625246172415986,
"grad_norm": 3.895341396331787,
"learning_rate": 1.5447154471544717e-06,
"loss": 1.2417,
"step": 76
},
{
"epoch": 0.039133473095737246,
"grad_norm": 3.4419727325439453,
"learning_rate": 1.5650406504065042e-06,
"loss": 1.2863,
"step": 77
},
{
"epoch": 0.03964170001905851,
"grad_norm": 3.9680559635162354,
"learning_rate": 1.5853658536585368e-06,
"loss": 1.3943,
"step": 78
},
{
"epoch": 0.040149926942379774,
"grad_norm": 3.7686707973480225,
"learning_rate": 1.6056910569105694e-06,
"loss": 1.3998,
"step": 79
},
{
"epoch": 0.040658153865701034,
"grad_norm": 4.245886325836182,
"learning_rate": 1.6260162601626018e-06,
"loss": 1.4582,
"step": 80
},
{
"epoch": 0.0411663807890223,
"grad_norm": 3.924715518951416,
"learning_rate": 1.6463414634146345e-06,
"loss": 1.3373,
"step": 81
},
{
"epoch": 0.04167460771234356,
"grad_norm": 4.548923969268799,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.2625,
"step": 82
},
{
"epoch": 0.04218283463566482,
"grad_norm": 4.1088714599609375,
"learning_rate": 1.6869918699186993e-06,
"loss": 1.3832,
"step": 83
},
{
"epoch": 0.04269106155898609,
"grad_norm": 3.9086315631866455,
"learning_rate": 1.707317073170732e-06,
"loss": 1.3633,
"step": 84
},
{
"epoch": 0.04319928848230735,
"grad_norm": 4.148958683013916,
"learning_rate": 1.7276422764227643e-06,
"loss": 1.2266,
"step": 85
},
{
"epoch": 0.043707515405628616,
"grad_norm": 3.861931562423706,
"learning_rate": 1.747967479674797e-06,
"loss": 1.4014,
"step": 86
},
{
"epoch": 0.04421574232894988,
"grad_norm": 4.312771320343018,
"learning_rate": 1.7682926829268294e-06,
"loss": 1.3073,
"step": 87
},
{
"epoch": 0.04472396925227114,
"grad_norm": 3.94911789894104,
"learning_rate": 1.788617886178862e-06,
"loss": 1.4017,
"step": 88
},
{
"epoch": 0.045232196175592404,
"grad_norm": 3.828352212905884,
"learning_rate": 1.8089430894308946e-06,
"loss": 1.238,
"step": 89
},
{
"epoch": 0.045740423098913664,
"grad_norm": 3.622032403945923,
"learning_rate": 1.8292682926829268e-06,
"loss": 1.275,
"step": 90
},
{
"epoch": 0.046248650022234924,
"grad_norm": 3.982901096343994,
"learning_rate": 1.8495934959349595e-06,
"loss": 1.247,
"step": 91
},
{
"epoch": 0.04675687694555619,
"grad_norm": 3.9050590991973877,
"learning_rate": 1.8699186991869919e-06,
"loss": 1.2841,
"step": 92
},
{
"epoch": 0.04726510386887745,
"grad_norm": 3.8051700592041016,
"learning_rate": 1.8902439024390245e-06,
"loss": 1.3774,
"step": 93
},
{
"epoch": 0.04777333079219872,
"grad_norm": 3.988053798675537,
"learning_rate": 1.9105691056910574e-06,
"loss": 1.3044,
"step": 94
},
{
"epoch": 0.04828155771551998,
"grad_norm": 4.018758296966553,
"learning_rate": 1.9308943089430896e-06,
"loss": 1.2674,
"step": 95
},
{
"epoch": 0.04878978463884124,
"grad_norm": 3.703763723373413,
"learning_rate": 1.951219512195122e-06,
"loss": 1.4012,
"step": 96
},
{
"epoch": 0.04929801156216251,
"grad_norm": 4.037637710571289,
"learning_rate": 1.9715447154471544e-06,
"loss": 1.3216,
"step": 97
},
{
"epoch": 0.04980623848548377,
"grad_norm": 3.6200430393218994,
"learning_rate": 1.991869918699187e-06,
"loss": 1.1986,
"step": 98
},
{
"epoch": 0.050314465408805034,
"grad_norm": 5.854780673980713,
"learning_rate": 2.0121951219512197e-06,
"loss": 1.4021,
"step": 99
},
{
"epoch": 0.050822692332126294,
"grad_norm": 4.096163272857666,
"learning_rate": 2.0325203252032523e-06,
"loss": 1.3754,
"step": 100
},
{
"epoch": 0.051330919255447555,
"grad_norm": 3.9238216876983643,
"learning_rate": 2.052845528455285e-06,
"loss": 1.3719,
"step": 101
},
{
"epoch": 0.05183914617876882,
"grad_norm": 3.885479211807251,
"learning_rate": 2.073170731707317e-06,
"loss": 1.3589,
"step": 102
},
{
"epoch": 0.05234737310209008,
"grad_norm": 3.7331907749176025,
"learning_rate": 2.0934959349593497e-06,
"loss": 1.3464,
"step": 103
},
{
"epoch": 0.05285560002541135,
"grad_norm": 3.8253138065338135,
"learning_rate": 2.1138211382113824e-06,
"loss": 1.4048,
"step": 104
},
{
"epoch": 0.05336382694873261,
"grad_norm": 4.024075984954834,
"learning_rate": 2.1341463414634146e-06,
"loss": 1.3333,
"step": 105
},
{
"epoch": 0.05387205387205387,
"grad_norm": 4.16942834854126,
"learning_rate": 2.154471544715447e-06,
"loss": 1.3049,
"step": 106
},
{
"epoch": 0.05438028079537514,
"grad_norm": 3.7079477310180664,
"learning_rate": 2.17479674796748e-06,
"loss": 1.2983,
"step": 107
},
{
"epoch": 0.0548885077186964,
"grad_norm": 4.08198881149292,
"learning_rate": 2.1951219512195125e-06,
"loss": 1.2067,
"step": 108
},
{
"epoch": 0.055396734642017664,
"grad_norm": 4.052254676818848,
"learning_rate": 2.215447154471545e-06,
"loss": 1.3061,
"step": 109
},
{
"epoch": 0.055904961565338925,
"grad_norm": 4.361356735229492,
"learning_rate": 2.2357723577235773e-06,
"loss": 1.3899,
"step": 110
},
{
"epoch": 0.056413188488660185,
"grad_norm": 8.015365600585938,
"learning_rate": 2.25609756097561e-06,
"loss": 1.3209,
"step": 111
},
{
"epoch": 0.05692141541198145,
"grad_norm": 3.764535665512085,
"learning_rate": 2.2764227642276426e-06,
"loss": 1.287,
"step": 112
},
{
"epoch": 0.05742964233530271,
"grad_norm": 5.49539852142334,
"learning_rate": 2.296747967479675e-06,
"loss": 1.3783,
"step": 113
},
{
"epoch": 0.05793786925862397,
"grad_norm": 3.8290023803710938,
"learning_rate": 2.317073170731708e-06,
"loss": 1.234,
"step": 114
},
{
"epoch": 0.05844609618194524,
"grad_norm": 4.1116228103637695,
"learning_rate": 2.33739837398374e-06,
"loss": 1.3752,
"step": 115
},
{
"epoch": 0.0589543231052665,
"grad_norm": 4.267752170562744,
"learning_rate": 2.3577235772357727e-06,
"loss": 1.3222,
"step": 116
},
{
"epoch": 0.05946255002858777,
"grad_norm": 3.951112985610962,
"learning_rate": 2.378048780487805e-06,
"loss": 1.3798,
"step": 117
},
{
"epoch": 0.05997077695190903,
"grad_norm": 3.748058319091797,
"learning_rate": 2.3983739837398375e-06,
"loss": 1.2211,
"step": 118
},
{
"epoch": 0.06047900387523029,
"grad_norm": 3.887105941772461,
"learning_rate": 2.41869918699187e-06,
"loss": 1.2549,
"step": 119
},
{
"epoch": 0.060987230798551555,
"grad_norm": 3.793177843093872,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.3849,
"step": 120
},
{
"epoch": 0.061495457721872815,
"grad_norm": 4.098204612731934,
"learning_rate": 2.4593495934959354e-06,
"loss": 1.3509,
"step": 121
},
{
"epoch": 0.06200368464519408,
"grad_norm": 3.8322818279266357,
"learning_rate": 2.4796747967479676e-06,
"loss": 1.1903,
"step": 122
},
{
"epoch": 0.06251191156851534,
"grad_norm": 4.026457786560059,
"learning_rate": 2.5e-06,
"loss": 1.2147,
"step": 123
},
{
"epoch": 0.0630201384918366,
"grad_norm": 3.7052459716796875,
"learning_rate": 2.5203252032520324e-06,
"loss": 1.398,
"step": 124
},
{
"epoch": 0.06352836541515787,
"grad_norm": 3.5341570377349854,
"learning_rate": 2.5406504065040655e-06,
"loss": 1.2919,
"step": 125
},
{
"epoch": 0.06403659233847914,
"grad_norm": 4.211786270141602,
"learning_rate": 2.5609756097560977e-06,
"loss": 1.1977,
"step": 126
},
{
"epoch": 0.06454481926180039,
"grad_norm": 3.801708221435547,
"learning_rate": 2.5813008130081303e-06,
"loss": 1.2276,
"step": 127
},
{
"epoch": 0.06505304618512166,
"grad_norm": 4.580326557159424,
"learning_rate": 2.601626016260163e-06,
"loss": 1.3152,
"step": 128
},
{
"epoch": 0.06556127310844292,
"grad_norm": 3.78059720993042,
"learning_rate": 2.6219512195121956e-06,
"loss": 1.2336,
"step": 129
},
{
"epoch": 0.06606950003176418,
"grad_norm": 4.220641136169434,
"learning_rate": 2.6422764227642278e-06,
"loss": 1.3903,
"step": 130
},
{
"epoch": 0.06657772695508545,
"grad_norm": 3.944988965988159,
"learning_rate": 2.66260162601626e-06,
"loss": 1.319,
"step": 131
},
{
"epoch": 0.06708595387840671,
"grad_norm": 4.109734535217285,
"learning_rate": 2.682926829268293e-06,
"loss": 1.2436,
"step": 132
},
{
"epoch": 0.06759418080172797,
"grad_norm": 3.725135326385498,
"learning_rate": 2.7032520325203252e-06,
"loss": 1.3013,
"step": 133
},
{
"epoch": 0.06810240772504923,
"grad_norm": 4.149574279785156,
"learning_rate": 2.723577235772358e-06,
"loss": 1.3835,
"step": 134
},
{
"epoch": 0.0686106346483705,
"grad_norm": 3.8214473724365234,
"learning_rate": 2.7439024390243905e-06,
"loss": 1.3422,
"step": 135
},
{
"epoch": 0.06911886157169177,
"grad_norm": 3.678873300552368,
"learning_rate": 2.764227642276423e-06,
"loss": 1.1785,
"step": 136
},
{
"epoch": 0.06962708849501302,
"grad_norm": 4.062511444091797,
"learning_rate": 2.7845528455284553e-06,
"loss": 1.2874,
"step": 137
},
{
"epoch": 0.07013531541833429,
"grad_norm": 3.8361012935638428,
"learning_rate": 2.8048780487804884e-06,
"loss": 1.3022,
"step": 138
},
{
"epoch": 0.07064354234165555,
"grad_norm": 4.04416561126709,
"learning_rate": 2.8252032520325206e-06,
"loss": 1.3684,
"step": 139
},
{
"epoch": 0.07115176926497681,
"grad_norm": 4.1772894859313965,
"learning_rate": 2.845528455284553e-06,
"loss": 1.3542,
"step": 140
},
{
"epoch": 0.07165999618829808,
"grad_norm": 3.7365682125091553,
"learning_rate": 2.8658536585365854e-06,
"loss": 1.3469,
"step": 141
},
{
"epoch": 0.07216822311161934,
"grad_norm": 3.7443156242370605,
"learning_rate": 2.8861788617886185e-06,
"loss": 1.3453,
"step": 142
},
{
"epoch": 0.0726764500349406,
"grad_norm": 3.999711513519287,
"learning_rate": 2.9065040650406507e-06,
"loss": 1.4442,
"step": 143
},
{
"epoch": 0.07318467695826186,
"grad_norm": 3.5781519412994385,
"learning_rate": 2.926829268292683e-06,
"loss": 1.2533,
"step": 144
},
{
"epoch": 0.07369290388158313,
"grad_norm": 3.80576491355896,
"learning_rate": 2.947154471544716e-06,
"loss": 1.2788,
"step": 145
},
{
"epoch": 0.07420113080490438,
"grad_norm": 4.316473960876465,
"learning_rate": 2.967479674796748e-06,
"loss": 1.2272,
"step": 146
},
{
"epoch": 0.07470935772822565,
"grad_norm": 4.160771369934082,
"learning_rate": 2.9878048780487808e-06,
"loss": 1.2916,
"step": 147
},
{
"epoch": 0.07521758465154692,
"grad_norm": 3.7304327487945557,
"learning_rate": 3.0081300813008134e-06,
"loss": 1.2154,
"step": 148
},
{
"epoch": 0.07572581157486818,
"grad_norm": 5.959589958190918,
"learning_rate": 3.028455284552846e-06,
"loss": 1.4461,
"step": 149
},
{
"epoch": 0.07623403849818944,
"grad_norm": 3.827523708343506,
"learning_rate": 3.0487804878048782e-06,
"loss": 1.329,
"step": 150
},
{
"epoch": 0.0767422654215107,
"grad_norm": 3.866091728210449,
"learning_rate": 3.0691056910569104e-06,
"loss": 1.2627,
"step": 151
},
{
"epoch": 0.07725049234483197,
"grad_norm": 3.7172887325286865,
"learning_rate": 3.0894308943089435e-06,
"loss": 1.4103,
"step": 152
},
{
"epoch": 0.07775871926815323,
"grad_norm": 4.245830535888672,
"learning_rate": 3.1097560975609757e-06,
"loss": 1.3797,
"step": 153
},
{
"epoch": 0.07826694619147449,
"grad_norm": 4.362545490264893,
"learning_rate": 3.1300813008130083e-06,
"loss": 1.3229,
"step": 154
},
{
"epoch": 0.07877517311479576,
"grad_norm": 3.8218653202056885,
"learning_rate": 3.150406504065041e-06,
"loss": 1.1794,
"step": 155
},
{
"epoch": 0.07928340003811701,
"grad_norm": 3.770843267440796,
"learning_rate": 3.1707317073170736e-06,
"loss": 1.2591,
"step": 156
},
{
"epoch": 0.07979162696143828,
"grad_norm": 3.6830074787139893,
"learning_rate": 3.1910569105691058e-06,
"loss": 1.2592,
"step": 157
},
{
"epoch": 0.08029985388475955,
"grad_norm": 4.0969367027282715,
"learning_rate": 3.211382113821139e-06,
"loss": 1.2888,
"step": 158
},
{
"epoch": 0.08080808080808081,
"grad_norm": 4.271267890930176,
"learning_rate": 3.231707317073171e-06,
"loss": 1.3786,
"step": 159
},
{
"epoch": 0.08131630773140207,
"grad_norm": 3.965411424636841,
"learning_rate": 3.2520325203252037e-06,
"loss": 1.2607,
"step": 160
},
{
"epoch": 0.08182453465472334,
"grad_norm": 3.780172824859619,
"learning_rate": 3.272357723577236e-06,
"loss": 1.2708,
"step": 161
},
{
"epoch": 0.0823327615780446,
"grad_norm": 3.947627305984497,
"learning_rate": 3.292682926829269e-06,
"loss": 1.4423,
"step": 162
},
{
"epoch": 0.08284098850136586,
"grad_norm": 3.788705348968506,
"learning_rate": 3.313008130081301e-06,
"loss": 1.2629,
"step": 163
},
{
"epoch": 0.08334921542468712,
"grad_norm": 4.064167499542236,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.3003,
"step": 164
},
{
"epoch": 0.08385744234800839,
"grad_norm": 3.8234219551086426,
"learning_rate": 3.3536585365853664e-06,
"loss": 1.2796,
"step": 165
},
{
"epoch": 0.08436566927132964,
"grad_norm": 3.8122544288635254,
"learning_rate": 3.3739837398373986e-06,
"loss": 1.2614,
"step": 166
},
{
"epoch": 0.08487389619465091,
"grad_norm": 3.916015863418579,
"learning_rate": 3.394308943089431e-06,
"loss": 1.2777,
"step": 167
},
{
"epoch": 0.08538212311797218,
"grad_norm": 3.9047353267669678,
"learning_rate": 3.414634146341464e-06,
"loss": 1.251,
"step": 168
},
{
"epoch": 0.08589035004129343,
"grad_norm": 3.993406057357788,
"learning_rate": 3.4349593495934965e-06,
"loss": 1.3075,
"step": 169
},
{
"epoch": 0.0863985769646147,
"grad_norm": 3.906684160232544,
"learning_rate": 3.4552845528455287e-06,
"loss": 1.2627,
"step": 170
},
{
"epoch": 0.08690680388793597,
"grad_norm": 4.104040622711182,
"learning_rate": 3.475609756097561e-06,
"loss": 1.2762,
"step": 171
},
{
"epoch": 0.08741503081125723,
"grad_norm": 3.6508748531341553,
"learning_rate": 3.495934959349594e-06,
"loss": 1.1899,
"step": 172
},
{
"epoch": 0.08792325773457849,
"grad_norm": 3.970284938812256,
"learning_rate": 3.516260162601626e-06,
"loss": 1.2013,
"step": 173
},
{
"epoch": 0.08843148465789975,
"grad_norm": 3.715240001678467,
"learning_rate": 3.5365853658536588e-06,
"loss": 1.2735,
"step": 174
},
{
"epoch": 0.08893971158122102,
"grad_norm": 3.685577392578125,
"learning_rate": 3.5569105691056914e-06,
"loss": 1.21,
"step": 175
},
{
"epoch": 0.08944793850454227,
"grad_norm": 3.7775447368621826,
"learning_rate": 3.577235772357724e-06,
"loss": 1.2972,
"step": 176
},
{
"epoch": 0.08995616542786354,
"grad_norm": 3.7754499912261963,
"learning_rate": 3.5975609756097562e-06,
"loss": 1.1667,
"step": 177
},
{
"epoch": 0.09046439235118481,
"grad_norm": 11.866535186767578,
"learning_rate": 3.6178861788617893e-06,
"loss": 1.5132,
"step": 178
},
{
"epoch": 0.09097261927450606,
"grad_norm": 3.855421781539917,
"learning_rate": 3.6382113821138215e-06,
"loss": 1.3445,
"step": 179
},
{
"epoch": 0.09148084619782733,
"grad_norm": 4.019442558288574,
"learning_rate": 3.6585365853658537e-06,
"loss": 1.2539,
"step": 180
},
{
"epoch": 0.0919890731211486,
"grad_norm": 4.017965316772461,
"learning_rate": 3.6788617886178863e-06,
"loss": 1.2669,
"step": 181
},
{
"epoch": 0.09249730004446985,
"grad_norm": 3.872027635574341,
"learning_rate": 3.699186991869919e-06,
"loss": 1.2374,
"step": 182
},
{
"epoch": 0.09300552696779112,
"grad_norm": 4.099319934844971,
"learning_rate": 3.7195121951219516e-06,
"loss": 1.3732,
"step": 183
},
{
"epoch": 0.09351375389111238,
"grad_norm": 3.8168752193450928,
"learning_rate": 3.7398373983739838e-06,
"loss": 1.3192,
"step": 184
},
{
"epoch": 0.09402198081443365,
"grad_norm": 3.548044443130493,
"learning_rate": 3.760162601626017e-06,
"loss": 1.2726,
"step": 185
},
{
"epoch": 0.0945302077377549,
"grad_norm": 3.644498109817505,
"learning_rate": 3.780487804878049e-06,
"loss": 1.2598,
"step": 186
},
{
"epoch": 0.09503843466107617,
"grad_norm": 4.000254154205322,
"learning_rate": 3.8008130081300817e-06,
"loss": 1.3566,
"step": 187
},
{
"epoch": 0.09554666158439744,
"grad_norm": 3.4733471870422363,
"learning_rate": 3.821138211382115e-06,
"loss": 1.1885,
"step": 188
},
{
"epoch": 0.09605488850771869,
"grad_norm": 3.7947239875793457,
"learning_rate": 3.8414634146341465e-06,
"loss": 1.3288,
"step": 189
},
{
"epoch": 0.09656311543103996,
"grad_norm": 3.94771409034729,
"learning_rate": 3.861788617886179e-06,
"loss": 1.3124,
"step": 190
},
{
"epoch": 0.09707134235436123,
"grad_norm": 4.032608509063721,
"learning_rate": 3.882113821138212e-06,
"loss": 1.236,
"step": 191
},
{
"epoch": 0.09757956927768248,
"grad_norm": 3.6716253757476807,
"learning_rate": 3.902439024390244e-06,
"loss": 1.2821,
"step": 192
},
{
"epoch": 0.09808779620100375,
"grad_norm": 3.8969194889068604,
"learning_rate": 3.922764227642277e-06,
"loss": 1.3023,
"step": 193
},
{
"epoch": 0.09859602312432501,
"grad_norm": 4.0722975730896,
"learning_rate": 3.943089430894309e-06,
"loss": 1.3167,
"step": 194
},
{
"epoch": 0.09910425004764628,
"grad_norm": 3.9485273361206055,
"learning_rate": 3.963414634146342e-06,
"loss": 1.2637,
"step": 195
},
{
"epoch": 0.09961247697096753,
"grad_norm": 3.7706732749938965,
"learning_rate": 3.983739837398374e-06,
"loss": 1.2213,
"step": 196
},
{
"epoch": 0.1001207038942888,
"grad_norm": 3.6940486431121826,
"learning_rate": 4.004065040650407e-06,
"loss": 1.2903,
"step": 197
},
{
"epoch": 0.10062893081761007,
"grad_norm": 3.6795332431793213,
"learning_rate": 4.024390243902439e-06,
"loss": 1.2003,
"step": 198
},
{
"epoch": 0.10113715774093132,
"grad_norm": 3.8393092155456543,
"learning_rate": 4.044715447154472e-06,
"loss": 1.352,
"step": 199
},
{
"epoch": 0.10164538466425259,
"grad_norm": 3.8912806510925293,
"learning_rate": 4.0650406504065046e-06,
"loss": 1.2611,
"step": 200
},
{
"epoch": 0.10215361158757386,
"grad_norm": 3.9540915489196777,
"learning_rate": 4.085365853658536e-06,
"loss": 1.2613,
"step": 201
},
{
"epoch": 0.10266183851089511,
"grad_norm": 3.922166585922241,
"learning_rate": 4.10569105691057e-06,
"loss": 1.3061,
"step": 202
},
{
"epoch": 0.10317006543421638,
"grad_norm": 4.365126609802246,
"learning_rate": 4.126016260162602e-06,
"loss": 1.3791,
"step": 203
},
{
"epoch": 0.10367829235753764,
"grad_norm": 3.6724672317504883,
"learning_rate": 4.146341463414634e-06,
"loss": 1.1408,
"step": 204
},
{
"epoch": 0.1041865192808589,
"grad_norm": 3.7531189918518066,
"learning_rate": 4.166666666666667e-06,
"loss": 1.276,
"step": 205
},
{
"epoch": 0.10469474620418016,
"grad_norm": 3.5939886569976807,
"learning_rate": 4.1869918699186995e-06,
"loss": 1.1531,
"step": 206
},
{
"epoch": 0.10520297312750143,
"grad_norm": 3.8948142528533936,
"learning_rate": 4.207317073170732e-06,
"loss": 1.2804,
"step": 207
},
{
"epoch": 0.1057112000508227,
"grad_norm": 3.7475123405456543,
"learning_rate": 4.227642276422765e-06,
"loss": 1.2897,
"step": 208
},
{
"epoch": 0.10621942697414395,
"grad_norm": 4.131088733673096,
"learning_rate": 4.247967479674797e-06,
"loss": 1.2971,
"step": 209
},
{
"epoch": 0.10672765389746522,
"grad_norm": 3.6580843925476074,
"learning_rate": 4.268292682926829e-06,
"loss": 1.1813,
"step": 210
},
{
"epoch": 0.10723588082078649,
"grad_norm": 12.907022476196289,
"learning_rate": 4.288617886178862e-06,
"loss": 1.434,
"step": 211
},
{
"epoch": 0.10774410774410774,
"grad_norm": 4.026226043701172,
"learning_rate": 4.308943089430894e-06,
"loss": 1.3102,
"step": 212
},
{
"epoch": 0.108252334667429,
"grad_norm": 3.583810567855835,
"learning_rate": 4.329268292682927e-06,
"loss": 1.2323,
"step": 213
},
{
"epoch": 0.10876056159075027,
"grad_norm": 3.931403636932373,
"learning_rate": 4.34959349593496e-06,
"loss": 1.2023,
"step": 214
},
{
"epoch": 0.10926878851407153,
"grad_norm": 3.6533145904541016,
"learning_rate": 4.369918699186992e-06,
"loss": 1.1653,
"step": 215
},
{
"epoch": 0.1097770154373928,
"grad_norm": 3.740746259689331,
"learning_rate": 4.390243902439025e-06,
"loss": 1.2121,
"step": 216
},
{
"epoch": 0.11028524236071406,
"grad_norm": 3.658018112182617,
"learning_rate": 4.410569105691057e-06,
"loss": 1.2733,
"step": 217
},
{
"epoch": 0.11079346928403533,
"grad_norm": 3.9621124267578125,
"learning_rate": 4.43089430894309e-06,
"loss": 1.1794,
"step": 218
},
{
"epoch": 0.11130169620735658,
"grad_norm": 3.379032850265503,
"learning_rate": 4.451219512195122e-06,
"loss": 1.2119,
"step": 219
},
{
"epoch": 0.11180992313067785,
"grad_norm": 3.9364140033721924,
"learning_rate": 4.471544715447155e-06,
"loss": 1.3891,
"step": 220
},
{
"epoch": 0.11231815005399912,
"grad_norm": 3.717283248901367,
"learning_rate": 4.491869918699187e-06,
"loss": 1.2106,
"step": 221
},
{
"epoch": 0.11282637697732037,
"grad_norm": 4.216766834259033,
"learning_rate": 4.51219512195122e-06,
"loss": 1.3475,
"step": 222
},
{
"epoch": 0.11333460390064164,
"grad_norm": 3.6524863243103027,
"learning_rate": 4.5325203252032525e-06,
"loss": 1.3016,
"step": 223
},
{
"epoch": 0.1138428308239629,
"grad_norm": 4.263420581817627,
"learning_rate": 4.552845528455285e-06,
"loss": 1.2905,
"step": 224
},
{
"epoch": 0.11435105774728416,
"grad_norm": 3.6008975505828857,
"learning_rate": 4.573170731707318e-06,
"loss": 1.2788,
"step": 225
},
{
"epoch": 0.11485928467060542,
"grad_norm": 3.713282823562622,
"learning_rate": 4.59349593495935e-06,
"loss": 1.277,
"step": 226
},
{
"epoch": 0.11536751159392669,
"grad_norm": 3.635056495666504,
"learning_rate": 4.613821138211382e-06,
"loss": 1.2814,
"step": 227
},
{
"epoch": 0.11587573851724794,
"grad_norm": 3.731588840484619,
"learning_rate": 4.634146341463416e-06,
"loss": 1.3636,
"step": 228
},
{
"epoch": 0.11638396544056921,
"grad_norm": 4.0097198486328125,
"learning_rate": 4.654471544715447e-06,
"loss": 1.2493,
"step": 229
},
{
"epoch": 0.11689219236389048,
"grad_norm": 4.035277843475342,
"learning_rate": 4.67479674796748e-06,
"loss": 1.2638,
"step": 230
},
{
"epoch": 0.11740041928721175,
"grad_norm": 3.686882972717285,
"learning_rate": 4.695121951219513e-06,
"loss": 1.2817,
"step": 231
},
{
"epoch": 0.117908646210533,
"grad_norm": 3.8758201599121094,
"learning_rate": 4.715447154471545e-06,
"loss": 1.2463,
"step": 232
},
{
"epoch": 0.11841687313385427,
"grad_norm": 4.043292045593262,
"learning_rate": 4.735772357723578e-06,
"loss": 1.2911,
"step": 233
},
{
"epoch": 0.11892510005717553,
"grad_norm": 3.9729626178741455,
"learning_rate": 4.75609756097561e-06,
"loss": 1.313,
"step": 234
},
{
"epoch": 0.11943332698049679,
"grad_norm": 3.574331521987915,
"learning_rate": 4.776422764227643e-06,
"loss": 1.3961,
"step": 235
},
{
"epoch": 0.11994155390381805,
"grad_norm": 4.03476619720459,
"learning_rate": 4.796747967479675e-06,
"loss": 1.2868,
"step": 236
},
{
"epoch": 0.12044978082713932,
"grad_norm": 3.672788381576538,
"learning_rate": 4.817073170731708e-06,
"loss": 1.3771,
"step": 237
},
{
"epoch": 0.12095800775046057,
"grad_norm": 4.011895179748535,
"learning_rate": 4.83739837398374e-06,
"loss": 1.2618,
"step": 238
},
{
"epoch": 0.12146623467378184,
"grad_norm": 3.7192506790161133,
"learning_rate": 4.857723577235773e-06,
"loss": 1.3259,
"step": 239
},
{
"epoch": 0.12197446159710311,
"grad_norm": 3.3653564453125,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.2904,
"step": 240
},
{
"epoch": 0.12248268852042436,
"grad_norm": 3.636655330657959,
"learning_rate": 4.898373983739837e-06,
"loss": 1.3524,
"step": 241
},
{
"epoch": 0.12299091544374563,
"grad_norm": 4.0803446769714355,
"learning_rate": 4.918699186991871e-06,
"loss": 1.3442,
"step": 242
},
{
"epoch": 0.1234991423670669,
"grad_norm": 3.5182483196258545,
"learning_rate": 4.9390243902439025e-06,
"loss": 1.2444,
"step": 243
},
{
"epoch": 0.12400736929038816,
"grad_norm": 3.481665849685669,
"learning_rate": 4.959349593495935e-06,
"loss": 1.181,
"step": 244
},
{
"epoch": 0.12451559621370942,
"grad_norm": 3.4673781394958496,
"learning_rate": 4.979674796747968e-06,
"loss": 1.3207,
"step": 245
},
{
"epoch": 0.12502382313703067,
"grad_norm": 3.4575881958007812,
"learning_rate": 5e-06,
"loss": 1.3064,
"step": 246
},
{
"epoch": 0.12553205006035195,
"grad_norm": 4.137662887573242,
"learning_rate": 5.020325203252033e-06,
"loss": 1.2268,
"step": 247
},
{
"epoch": 0.1260402769836732,
"grad_norm": 3.655907392501831,
"learning_rate": 5.040650406504065e-06,
"loss": 1.3024,
"step": 248
},
{
"epoch": 0.1265485039069945,
"grad_norm": 8.318976402282715,
"learning_rate": 5.060975609756098e-06,
"loss": 1.3418,
"step": 249
},
{
"epoch": 0.12705673083031574,
"grad_norm": 3.5912580490112305,
"learning_rate": 5.081300813008131e-06,
"loss": 1.2041,
"step": 250
},
{
"epoch": 0.127564957753637,
"grad_norm": 4.007481575012207,
"learning_rate": 5.101626016260163e-06,
"loss": 1.1676,
"step": 251
},
{
"epoch": 0.12807318467695827,
"grad_norm": 3.766157388687134,
"learning_rate": 5.121951219512195e-06,
"loss": 1.3185,
"step": 252
},
{
"epoch": 0.12858141160027953,
"grad_norm": 3.528630495071411,
"learning_rate": 5.142276422764229e-06,
"loss": 1.2942,
"step": 253
},
{
"epoch": 0.12908963852360078,
"grad_norm": 3.672837257385254,
"learning_rate": 5.162601626016261e-06,
"loss": 1.3008,
"step": 254
},
{
"epoch": 0.12959786544692206,
"grad_norm": 3.592590808868408,
"learning_rate": 5.182926829268293e-06,
"loss": 1.3084,
"step": 255
},
{
"epoch": 0.13010609237024331,
"grad_norm": 3.557032823562622,
"learning_rate": 5.203252032520326e-06,
"loss": 1.2775,
"step": 256
},
{
"epoch": 0.13061431929356457,
"grad_norm": 3.6543917655944824,
"learning_rate": 5.223577235772358e-06,
"loss": 1.3496,
"step": 257
},
{
"epoch": 0.13112254621688585,
"grad_norm": 3.6346216201782227,
"learning_rate": 5.243902439024391e-06,
"loss": 1.2644,
"step": 258
},
{
"epoch": 0.1316307731402071,
"grad_norm": 3.5259435176849365,
"learning_rate": 5.264227642276423e-06,
"loss": 1.3134,
"step": 259
},
{
"epoch": 0.13213900006352836,
"grad_norm": 3.558912515640259,
"learning_rate": 5.2845528455284555e-06,
"loss": 1.1762,
"step": 260
},
{
"epoch": 0.13264722698684964,
"grad_norm": 3.6628079414367676,
"learning_rate": 5.304878048780488e-06,
"loss": 1.3849,
"step": 261
},
{
"epoch": 0.1331554539101709,
"grad_norm": 3.4435086250305176,
"learning_rate": 5.32520325203252e-06,
"loss": 1.2441,
"step": 262
},
{
"epoch": 0.13366368083349214,
"grad_norm": 4.010739803314209,
"learning_rate": 5.345528455284553e-06,
"loss": 1.3847,
"step": 263
},
{
"epoch": 0.13417190775681342,
"grad_norm": 3.626926898956299,
"learning_rate": 5.365853658536586e-06,
"loss": 1.2959,
"step": 264
},
{
"epoch": 0.13468013468013468,
"grad_norm": 3.5818004608154297,
"learning_rate": 5.386178861788618e-06,
"loss": 1.2967,
"step": 265
},
{
"epoch": 0.13518836160345593,
"grad_norm": 3.964972496032715,
"learning_rate": 5.4065040650406504e-06,
"loss": 1.3061,
"step": 266
},
{
"epoch": 0.1356965885267772,
"grad_norm": 3.8659842014312744,
"learning_rate": 5.426829268292684e-06,
"loss": 1.3736,
"step": 267
},
{
"epoch": 0.13620481545009847,
"grad_norm": 3.6874732971191406,
"learning_rate": 5.447154471544716e-06,
"loss": 1.2194,
"step": 268
},
{
"epoch": 0.13671304237341972,
"grad_norm": 3.744476556777954,
"learning_rate": 5.467479674796748e-06,
"loss": 1.2867,
"step": 269
},
{
"epoch": 0.137221269296741,
"grad_norm": 3.51850962638855,
"learning_rate": 5.487804878048781e-06,
"loss": 1.2741,
"step": 270
},
{
"epoch": 0.13772949622006225,
"grad_norm": 3.6498262882232666,
"learning_rate": 5.508130081300814e-06,
"loss": 1.2259,
"step": 271
},
{
"epoch": 0.13823772314338353,
"grad_norm": 3.7769477367401123,
"learning_rate": 5.528455284552846e-06,
"loss": 1.2216,
"step": 272
},
{
"epoch": 0.1387459500667048,
"grad_norm": 3.5332465171813965,
"learning_rate": 5.548780487804879e-06,
"loss": 1.211,
"step": 273
},
{
"epoch": 0.13925417699002604,
"grad_norm": 3.7396240234375,
"learning_rate": 5.569105691056911e-06,
"loss": 1.3535,
"step": 274
},
{
"epoch": 0.13976240391334732,
"grad_norm": 3.5387160778045654,
"learning_rate": 5.589430894308944e-06,
"loss": 1.3375,
"step": 275
},
{
"epoch": 0.14027063083666858,
"grad_norm": 3.4825077056884766,
"learning_rate": 5.609756097560977e-06,
"loss": 1.3417,
"step": 276
},
{
"epoch": 0.14077885775998983,
"grad_norm": 3.5783963203430176,
"learning_rate": 5.6300813008130085e-06,
"loss": 1.2573,
"step": 277
},
{
"epoch": 0.1412870846833111,
"grad_norm": 3.5096850395202637,
"learning_rate": 5.650406504065041e-06,
"loss": 1.2363,
"step": 278
},
{
"epoch": 0.14179531160663236,
"grad_norm": 3.574193239212036,
"learning_rate": 5.670731707317073e-06,
"loss": 1.4032,
"step": 279
},
{
"epoch": 0.14230353852995362,
"grad_norm": 3.4912261962890625,
"learning_rate": 5.691056910569106e-06,
"loss": 1.2603,
"step": 280
},
{
"epoch": 0.1428117654532749,
"grad_norm": 3.5065510272979736,
"learning_rate": 5.711382113821139e-06,
"loss": 1.3125,
"step": 281
},
{
"epoch": 0.14331999237659615,
"grad_norm": 3.6454124450683594,
"learning_rate": 5.731707317073171e-06,
"loss": 1.285,
"step": 282
},
{
"epoch": 0.1438282192999174,
"grad_norm": 3.704364776611328,
"learning_rate": 5.7520325203252034e-06,
"loss": 1.1501,
"step": 283
},
{
"epoch": 0.14433644622323868,
"grad_norm": 3.756485939025879,
"learning_rate": 5.772357723577237e-06,
"loss": 1.3346,
"step": 284
},
{
"epoch": 0.14484467314655994,
"grad_norm": 3.815615177154541,
"learning_rate": 5.792682926829269e-06,
"loss": 1.3682,
"step": 285
},
{
"epoch": 0.1453529000698812,
"grad_norm": 3.9333648681640625,
"learning_rate": 5.813008130081301e-06,
"loss": 1.2763,
"step": 286
},
{
"epoch": 0.14586112699320247,
"grad_norm": 3.455777883529663,
"learning_rate": 5.833333333333334e-06,
"loss": 1.151,
"step": 287
},
{
"epoch": 0.14636935391652373,
"grad_norm": 3.815992593765259,
"learning_rate": 5.853658536585366e-06,
"loss": 1.3023,
"step": 288
},
{
"epoch": 0.14687758083984498,
"grad_norm": 3.914978504180908,
"learning_rate": 5.873983739837399e-06,
"loss": 1.25,
"step": 289
},
{
"epoch": 0.14738580776316626,
"grad_norm": 3.6481759548187256,
"learning_rate": 5.894308943089432e-06,
"loss": 1.2893,
"step": 290
},
{
"epoch": 0.1478940346864875,
"grad_norm": 3.5571045875549316,
"learning_rate": 5.914634146341464e-06,
"loss": 1.3232,
"step": 291
},
{
"epoch": 0.14840226160980877,
"grad_norm": 3.597348690032959,
"learning_rate": 5.934959349593496e-06,
"loss": 1.192,
"step": 292
},
{
"epoch": 0.14891048853313005,
"grad_norm": 3.44991397857666,
"learning_rate": 5.95528455284553e-06,
"loss": 1.1843,
"step": 293
},
{
"epoch": 0.1494187154564513,
"grad_norm": 3.8357386589050293,
"learning_rate": 5.9756097560975615e-06,
"loss": 1.2407,
"step": 294
},
{
"epoch": 0.14992694237977258,
"grad_norm": 3.804199457168579,
"learning_rate": 5.995934959349594e-06,
"loss": 1.2215,
"step": 295
},
{
"epoch": 0.15043516930309384,
"grad_norm": 3.6634774208068848,
"learning_rate": 6.016260162601627e-06,
"loss": 1.347,
"step": 296
},
{
"epoch": 0.1509433962264151,
"grad_norm": 3.491067886352539,
"learning_rate": 6.0365853658536585e-06,
"loss": 1.2255,
"step": 297
},
{
"epoch": 0.15145162314973637,
"grad_norm": 3.578895330429077,
"learning_rate": 6.056910569105692e-06,
"loss": 1.2312,
"step": 298
},
{
"epoch": 0.15195985007305762,
"grad_norm": 3.9656708240509033,
"learning_rate": 6.077235772357724e-06,
"loss": 1.3773,
"step": 299
},
{
"epoch": 0.15246807699637888,
"grad_norm": 3.67789888381958,
"learning_rate": 6.0975609756097564e-06,
"loss": 1.3023,
"step": 300
},
{
"epoch": 0.15297630391970016,
"grad_norm": 3.6001689434051514,
"learning_rate": 6.117886178861789e-06,
"loss": 1.2729,
"step": 301
},
{
"epoch": 0.1534845308430214,
"grad_norm": 3.572338581085205,
"learning_rate": 6.138211382113821e-06,
"loss": 1.3521,
"step": 302
},
{
"epoch": 0.15399275776634266,
"grad_norm": 3.7971441745758057,
"learning_rate": 6.158536585365854e-06,
"loss": 1.2599,
"step": 303
},
{
"epoch": 0.15450098468966394,
"grad_norm": 4.001463413238525,
"learning_rate": 6.178861788617887e-06,
"loss": 1.344,
"step": 304
},
{
"epoch": 0.1550092116129852,
"grad_norm": 3.4792215824127197,
"learning_rate": 6.199186991869919e-06,
"loss": 1.2284,
"step": 305
},
{
"epoch": 0.15551743853630645,
"grad_norm": 3.7361996173858643,
"learning_rate": 6.219512195121951e-06,
"loss": 1.2382,
"step": 306
},
{
"epoch": 0.15602566545962773,
"grad_norm": 3.6837079524993896,
"learning_rate": 6.239837398373985e-06,
"loss": 1.3571,
"step": 307
},
{
"epoch": 0.15653389238294899,
"grad_norm": 3.793705463409424,
"learning_rate": 6.260162601626017e-06,
"loss": 1.3289,
"step": 308
},
{
"epoch": 0.15704211930627024,
"grad_norm": 3.567331075668335,
"learning_rate": 6.280487804878049e-06,
"loss": 1.3228,
"step": 309
},
{
"epoch": 0.15755034622959152,
"grad_norm": 3.763274669647217,
"learning_rate": 6.300813008130082e-06,
"loss": 1.3429,
"step": 310
},
{
"epoch": 0.15805857315291277,
"grad_norm": 3.717379093170166,
"learning_rate": 6.321138211382114e-06,
"loss": 1.3641,
"step": 311
},
{
"epoch": 0.15856680007623403,
"grad_norm": 3.8312816619873047,
"learning_rate": 6.341463414634147e-06,
"loss": 1.3155,
"step": 312
},
{
"epoch": 0.1590750269995553,
"grad_norm": 3.651553153991699,
"learning_rate": 6.36178861788618e-06,
"loss": 1.2838,
"step": 313
},
{
"epoch": 0.15958325392287656,
"grad_norm": 3.682612895965576,
"learning_rate": 6.3821138211382115e-06,
"loss": 1.3848,
"step": 314
},
{
"epoch": 0.16009148084619781,
"grad_norm": 3.6725523471832275,
"learning_rate": 6.402439024390244e-06,
"loss": 1.2029,
"step": 315
},
{
"epoch": 0.1605997077695191,
"grad_norm": 3.7922701835632324,
"learning_rate": 6.422764227642278e-06,
"loss": 1.3111,
"step": 316
},
{
"epoch": 0.16110793469284035,
"grad_norm": 3.7131593227386475,
"learning_rate": 6.4430894308943094e-06,
"loss": 1.32,
"step": 317
},
{
"epoch": 0.16161616161616163,
"grad_norm": 3.859788656234741,
"learning_rate": 6.463414634146342e-06,
"loss": 1.3625,
"step": 318
},
{
"epoch": 0.16212438853948288,
"grad_norm": 3.674773693084717,
"learning_rate": 6.483739837398374e-06,
"loss": 1.2244,
"step": 319
},
{
"epoch": 0.16263261546280414,
"grad_norm": 3.4736006259918213,
"learning_rate": 6.504065040650407e-06,
"loss": 1.2257,
"step": 320
},
{
"epoch": 0.16314084238612542,
"grad_norm": 3.9480464458465576,
"learning_rate": 6.52439024390244e-06,
"loss": 1.4528,
"step": 321
},
{
"epoch": 0.16364906930944667,
"grad_norm": 3.6919679641723633,
"learning_rate": 6.544715447154472e-06,
"loss": 1.2453,
"step": 322
},
{
"epoch": 0.16415729623276792,
"grad_norm": 3.6807546615600586,
"learning_rate": 6.565040650406504e-06,
"loss": 1.2104,
"step": 323
},
{
"epoch": 0.1646655231560892,
"grad_norm": 3.67043137550354,
"learning_rate": 6.585365853658538e-06,
"loss": 1.3452,
"step": 324
},
{
"epoch": 0.16517375007941046,
"grad_norm": 3.3604013919830322,
"learning_rate": 6.60569105691057e-06,
"loss": 1.2311,
"step": 325
},
{
"epoch": 0.1656819770027317,
"grad_norm": 3.487772226333618,
"learning_rate": 6.626016260162602e-06,
"loss": 1.2692,
"step": 326
},
{
"epoch": 0.166190203926053,
"grad_norm": 3.803863286972046,
"learning_rate": 6.646341463414635e-06,
"loss": 1.4371,
"step": 327
},
{
"epoch": 0.16669843084937425,
"grad_norm": 3.3784923553466797,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2383,
"step": 328
},
{
"epoch": 0.1672066577726955,
"grad_norm": 3.524672746658325,
"learning_rate": 6.6869918699187e-06,
"loss": 1.2487,
"step": 329
},
{
"epoch": 0.16771488469601678,
"grad_norm": 3.207425832748413,
"learning_rate": 6.707317073170733e-06,
"loss": 1.2083,
"step": 330
},
{
"epoch": 0.16822311161933803,
"grad_norm": 3.3784162998199463,
"learning_rate": 6.7276422764227645e-06,
"loss": 1.2829,
"step": 331
},
{
"epoch": 0.1687313385426593,
"grad_norm": 4.187244415283203,
"learning_rate": 6.747967479674797e-06,
"loss": 1.3114,
"step": 332
},
{
"epoch": 0.16923956546598057,
"grad_norm": 3.5479447841644287,
"learning_rate": 6.768292682926831e-06,
"loss": 1.2949,
"step": 333
},
{
"epoch": 0.16974779238930182,
"grad_norm": 3.4103052616119385,
"learning_rate": 6.788617886178862e-06,
"loss": 1.1889,
"step": 334
},
{
"epoch": 0.17025601931262307,
"grad_norm": 3.217073678970337,
"learning_rate": 6.808943089430895e-06,
"loss": 1.3049,
"step": 335
},
{
"epoch": 0.17076424623594436,
"grad_norm": 3.2264113426208496,
"learning_rate": 6.829268292682928e-06,
"loss": 1.1391,
"step": 336
},
{
"epoch": 0.1712724731592656,
"grad_norm": 3.488623857498169,
"learning_rate": 6.8495934959349595e-06,
"loss": 1.17,
"step": 337
},
{
"epoch": 0.17178070008258686,
"grad_norm": 3.76481556892395,
"learning_rate": 6.869918699186993e-06,
"loss": 1.3463,
"step": 338
},
{
"epoch": 0.17228892700590814,
"grad_norm": 3.5634756088256836,
"learning_rate": 6.890243902439025e-06,
"loss": 1.2973,
"step": 339
},
{
"epoch": 0.1727971539292294,
"grad_norm": 3.3373970985412598,
"learning_rate": 6.910569105691057e-06,
"loss": 1.2365,
"step": 340
},
{
"epoch": 0.17330538085255065,
"grad_norm": 3.5796754360198975,
"learning_rate": 6.93089430894309e-06,
"loss": 1.405,
"step": 341
},
{
"epoch": 0.17381360777587193,
"grad_norm": 3.383561849594116,
"learning_rate": 6.951219512195122e-06,
"loss": 1.1957,
"step": 342
},
{
"epoch": 0.17432183469919318,
"grad_norm": 3.610441207885742,
"learning_rate": 6.971544715447155e-06,
"loss": 1.2192,
"step": 343
},
{
"epoch": 0.17483006162251447,
"grad_norm": 3.319985866546631,
"learning_rate": 6.991869918699188e-06,
"loss": 1.2916,
"step": 344
},
{
"epoch": 0.17533828854583572,
"grad_norm": 3.5332345962524414,
"learning_rate": 7.01219512195122e-06,
"loss": 1.2721,
"step": 345
},
{
"epoch": 0.17584651546915697,
"grad_norm": 3.552676200866699,
"learning_rate": 7.032520325203252e-06,
"loss": 1.3467,
"step": 346
},
{
"epoch": 0.17635474239247825,
"grad_norm": 3.745915412902832,
"learning_rate": 7.052845528455286e-06,
"loss": 1.3653,
"step": 347
},
{
"epoch": 0.1768629693157995,
"grad_norm": 3.4070985317230225,
"learning_rate": 7.0731707317073175e-06,
"loss": 1.3137,
"step": 348
},
{
"epoch": 0.17737119623912076,
"grad_norm": 3.583345890045166,
"learning_rate": 7.09349593495935e-06,
"loss": 1.2447,
"step": 349
},
{
"epoch": 0.17787942316244204,
"grad_norm": 3.593552350997925,
"learning_rate": 7.113821138211383e-06,
"loss": 1.2614,
"step": 350
},
{
"epoch": 0.1783876500857633,
"grad_norm": 3.6274521350860596,
"learning_rate": 7.1341463414634146e-06,
"loss": 1.2424,
"step": 351
},
{
"epoch": 0.17889587700908455,
"grad_norm": 3.4343936443328857,
"learning_rate": 7.154471544715448e-06,
"loss": 1.0972,
"step": 352
},
{
"epoch": 0.17940410393240583,
"grad_norm": 3.4829659461975098,
"learning_rate": 7.174796747967481e-06,
"loss": 1.3234,
"step": 353
},
{
"epoch": 0.17991233085572708,
"grad_norm": 3.9330294132232666,
"learning_rate": 7.1951219512195125e-06,
"loss": 1.2978,
"step": 354
},
{
"epoch": 0.18042055777904834,
"grad_norm": 3.7791481018066406,
"learning_rate": 7.215447154471545e-06,
"loss": 1.3102,
"step": 355
},
{
"epoch": 0.18092878470236962,
"grad_norm": 3.5597262382507324,
"learning_rate": 7.2357723577235786e-06,
"loss": 1.3284,
"step": 356
},
{
"epoch": 0.18143701162569087,
"grad_norm": 3.4017419815063477,
"learning_rate": 7.25609756097561e-06,
"loss": 1.2043,
"step": 357
},
{
"epoch": 0.18194523854901212,
"grad_norm": 3.3661866188049316,
"learning_rate": 7.276422764227643e-06,
"loss": 1.2812,
"step": 358
},
{
"epoch": 0.1824534654723334,
"grad_norm": 3.6549904346466064,
"learning_rate": 7.296747967479675e-06,
"loss": 1.2439,
"step": 359
},
{
"epoch": 0.18296169239565466,
"grad_norm": 3.5217676162719727,
"learning_rate": 7.317073170731707e-06,
"loss": 1.2781,
"step": 360
},
{
"epoch": 0.1834699193189759,
"grad_norm": 4.081654071807861,
"learning_rate": 7.337398373983741e-06,
"loss": 1.2801,
"step": 361
},
{
"epoch": 0.1839781462422972,
"grad_norm": 4.09951114654541,
"learning_rate": 7.357723577235773e-06,
"loss": 1.3082,
"step": 362
},
{
"epoch": 0.18448637316561844,
"grad_norm": 3.354565382003784,
"learning_rate": 7.378048780487805e-06,
"loss": 1.2412,
"step": 363
},
{
"epoch": 0.1849946000889397,
"grad_norm": 3.285402297973633,
"learning_rate": 7.398373983739838e-06,
"loss": 1.1878,
"step": 364
},
{
"epoch": 0.18550282701226098,
"grad_norm": 4.071623802185059,
"learning_rate": 7.41869918699187e-06,
"loss": 1.4499,
"step": 365
},
{
"epoch": 0.18601105393558223,
"grad_norm": 3.3457748889923096,
"learning_rate": 7.439024390243903e-06,
"loss": 1.3129,
"step": 366
},
{
"epoch": 0.1865192808589035,
"grad_norm": 3.6435835361480713,
"learning_rate": 7.459349593495936e-06,
"loss": 1.2058,
"step": 367
},
{
"epoch": 0.18702750778222477,
"grad_norm": 3.8403193950653076,
"learning_rate": 7.4796747967479676e-06,
"loss": 1.3017,
"step": 368
},
{
"epoch": 0.18753573470554602,
"grad_norm": 3.588543653488159,
"learning_rate": 7.500000000000001e-06,
"loss": 1.2786,
"step": 369
},
{
"epoch": 0.1880439616288673,
"grad_norm": 3.3542251586914062,
"learning_rate": 7.520325203252034e-06,
"loss": 1.28,
"step": 370
},
{
"epoch": 0.18855218855218855,
"grad_norm": 3.4125912189483643,
"learning_rate": 7.5406504065040654e-06,
"loss": 1.2436,
"step": 371
},
{
"epoch": 0.1890604154755098,
"grad_norm": 3.2614572048187256,
"learning_rate": 7.560975609756098e-06,
"loss": 1.2692,
"step": 372
},
{
"epoch": 0.1895686423988311,
"grad_norm": 3.295055866241455,
"learning_rate": 7.5813008130081316e-06,
"loss": 1.2411,
"step": 373
},
{
"epoch": 0.19007686932215234,
"grad_norm": 3.7534825801849365,
"learning_rate": 7.601626016260163e-06,
"loss": 1.2341,
"step": 374
},
{
"epoch": 0.1905850962454736,
"grad_norm": 3.991771936416626,
"learning_rate": 7.621951219512196e-06,
"loss": 1.2379,
"step": 375
},
{
"epoch": 0.19109332316879488,
"grad_norm": 3.7469890117645264,
"learning_rate": 7.64227642276423e-06,
"loss": 1.3563,
"step": 376
},
{
"epoch": 0.19160155009211613,
"grad_norm": 3.7260825634002686,
"learning_rate": 7.66260162601626e-06,
"loss": 1.2481,
"step": 377
},
{
"epoch": 0.19210977701543738,
"grad_norm": 3.3605759143829346,
"learning_rate": 7.682926829268293e-06,
"loss": 1.2917,
"step": 378
},
{
"epoch": 0.19261800393875866,
"grad_norm": 4.850787162780762,
"learning_rate": 7.703252032520326e-06,
"loss": 1.4126,
"step": 379
},
{
"epoch": 0.19312623086207992,
"grad_norm": 3.4996542930603027,
"learning_rate": 7.723577235772358e-06,
"loss": 1.4338,
"step": 380
},
{
"epoch": 0.19363445778540117,
"grad_norm": 3.6611642837524414,
"learning_rate": 7.743902439024391e-06,
"loss": 1.3108,
"step": 381
},
{
"epoch": 0.19414268470872245,
"grad_norm": 3.5380356311798096,
"learning_rate": 7.764227642276424e-06,
"loss": 1.3453,
"step": 382
},
{
"epoch": 0.1946509116320437,
"grad_norm": 3.764770984649658,
"learning_rate": 7.784552845528456e-06,
"loss": 1.2773,
"step": 383
},
{
"epoch": 0.19515913855536496,
"grad_norm": 3.463135004043579,
"learning_rate": 7.804878048780489e-06,
"loss": 1.314,
"step": 384
},
{
"epoch": 0.19566736547868624,
"grad_norm": 3.4924633502960205,
"learning_rate": 7.82520325203252e-06,
"loss": 1.3208,
"step": 385
},
{
"epoch": 0.1961755924020075,
"grad_norm": 3.3984928131103516,
"learning_rate": 7.845528455284554e-06,
"loss": 1.2752,
"step": 386
},
{
"epoch": 0.19668381932532875,
"grad_norm": 3.5272583961486816,
"learning_rate": 7.865853658536587e-06,
"loss": 1.2225,
"step": 387
},
{
"epoch": 0.19719204624865003,
"grad_norm": 3.674283027648926,
"learning_rate": 7.886178861788618e-06,
"loss": 1.2883,
"step": 388
},
{
"epoch": 0.19770027317197128,
"grad_norm": 3.394155263900757,
"learning_rate": 7.90650406504065e-06,
"loss": 1.3093,
"step": 389
},
{
"epoch": 0.19820850009529256,
"grad_norm": 3.619893789291382,
"learning_rate": 7.926829268292685e-06,
"loss": 1.2639,
"step": 390
},
{
"epoch": 0.19871672701861381,
"grad_norm": 3.583444833755493,
"learning_rate": 7.947154471544715e-06,
"loss": 1.2722,
"step": 391
},
{
"epoch": 0.19922495394193507,
"grad_norm": 3.5035605430603027,
"learning_rate": 7.967479674796748e-06,
"loss": 1.3141,
"step": 392
},
{
"epoch": 0.19973318086525635,
"grad_norm": 3.4563138484954834,
"learning_rate": 7.98780487804878e-06,
"loss": 1.3688,
"step": 393
},
{
"epoch": 0.2002414077885776,
"grad_norm": 3.50997257232666,
"learning_rate": 8.008130081300813e-06,
"loss": 1.2373,
"step": 394
},
{
"epoch": 0.20074963471189886,
"grad_norm": 3.5368010997772217,
"learning_rate": 8.028455284552846e-06,
"loss": 1.3064,
"step": 395
},
{
"epoch": 0.20125786163522014,
"grad_norm": 3.5220799446105957,
"learning_rate": 8.048780487804879e-06,
"loss": 1.2372,
"step": 396
},
{
"epoch": 0.2017660885585414,
"grad_norm": 3.81137752532959,
"learning_rate": 8.069105691056911e-06,
"loss": 1.5465,
"step": 397
},
{
"epoch": 0.20227431548186264,
"grad_norm": 3.8925790786743164,
"learning_rate": 8.089430894308944e-06,
"loss": 1.3473,
"step": 398
},
{
"epoch": 0.20278254240518392,
"grad_norm": 3.4865732192993164,
"learning_rate": 8.109756097560977e-06,
"loss": 1.2192,
"step": 399
},
{
"epoch": 0.20329076932850518,
"grad_norm": 3.5314934253692627,
"learning_rate": 8.130081300813009e-06,
"loss": 1.3106,
"step": 400
},
{
"epoch": 0.20379899625182643,
"grad_norm": 11.417930603027344,
"learning_rate": 8.150406504065042e-06,
"loss": 1.4589,
"step": 401
},
{
"epoch": 0.2043072231751477,
"grad_norm": 3.5613293647766113,
"learning_rate": 8.170731707317073e-06,
"loss": 1.3619,
"step": 402
},
{
"epoch": 0.20481545009846897,
"grad_norm": 5.17199182510376,
"learning_rate": 8.191056910569107e-06,
"loss": 1.341,
"step": 403
},
{
"epoch": 0.20532367702179022,
"grad_norm": 4.516615390777588,
"learning_rate": 8.21138211382114e-06,
"loss": 1.3727,
"step": 404
},
{
"epoch": 0.2058319039451115,
"grad_norm": 3.745323896408081,
"learning_rate": 8.23170731707317e-06,
"loss": 1.2878,
"step": 405
},
{
"epoch": 0.20634013086843275,
"grad_norm": 3.2874369621276855,
"learning_rate": 8.252032520325203e-06,
"loss": 1.172,
"step": 406
},
{
"epoch": 0.206848357791754,
"grad_norm": 3.345372438430786,
"learning_rate": 8.272357723577238e-06,
"loss": 1.3093,
"step": 407
},
{
"epoch": 0.2073565847150753,
"grad_norm": 3.8618834018707275,
"learning_rate": 8.292682926829268e-06,
"loss": 1.2398,
"step": 408
},
{
"epoch": 0.20786481163839654,
"grad_norm": 3.3758747577667236,
"learning_rate": 8.313008130081301e-06,
"loss": 1.3063,
"step": 409
},
{
"epoch": 0.2083730385617178,
"grad_norm": 3.501466751098633,
"learning_rate": 8.333333333333334e-06,
"loss": 1.3748,
"step": 410
},
{
"epoch": 0.20888126548503907,
"grad_norm": 3.5670862197875977,
"learning_rate": 8.353658536585366e-06,
"loss": 1.3696,
"step": 411
},
{
"epoch": 0.20938949240836033,
"grad_norm": 3.628492593765259,
"learning_rate": 8.373983739837399e-06,
"loss": 1.2935,
"step": 412
},
{
"epoch": 0.2098977193316816,
"grad_norm": 3.188523769378662,
"learning_rate": 8.394308943089432e-06,
"loss": 1.2003,
"step": 413
},
{
"epoch": 0.21040594625500286,
"grad_norm": 3.282963991165161,
"learning_rate": 8.414634146341464e-06,
"loss": 1.2503,
"step": 414
},
{
"epoch": 0.21091417317832412,
"grad_norm": 3.601407527923584,
"learning_rate": 8.434959349593497e-06,
"loss": 1.2435,
"step": 415
},
{
"epoch": 0.2114224001016454,
"grad_norm": 4.200768947601318,
"learning_rate": 8.45528455284553e-06,
"loss": 1.3499,
"step": 416
},
{
"epoch": 0.21193062702496665,
"grad_norm": 3.487779378890991,
"learning_rate": 8.475609756097562e-06,
"loss": 1.2928,
"step": 417
},
{
"epoch": 0.2124388539482879,
"grad_norm": 3.47430157661438,
"learning_rate": 8.495934959349595e-06,
"loss": 1.3469,
"step": 418
},
{
"epoch": 0.21294708087160918,
"grad_norm": 3.8267080783843994,
"learning_rate": 8.516260162601627e-06,
"loss": 1.3764,
"step": 419
},
{
"epoch": 0.21345530779493044,
"grad_norm": 3.6177916526794434,
"learning_rate": 8.536585365853658e-06,
"loss": 1.4348,
"step": 420
},
{
"epoch": 0.2139635347182517,
"grad_norm": 3.4687182903289795,
"learning_rate": 8.556910569105693e-06,
"loss": 1.319,
"step": 421
},
{
"epoch": 0.21447176164157297,
"grad_norm": 3.39560866355896,
"learning_rate": 8.577235772357724e-06,
"loss": 1.3131,
"step": 422
},
{
"epoch": 0.21497998856489423,
"grad_norm": 3.492347240447998,
"learning_rate": 8.597560975609756e-06,
"loss": 1.3446,
"step": 423
},
{
"epoch": 0.21548821548821548,
"grad_norm": 3.751417636871338,
"learning_rate": 8.617886178861789e-06,
"loss": 1.3222,
"step": 424
},
{
"epoch": 0.21599644241153676,
"grad_norm": 3.345554828643799,
"learning_rate": 8.638211382113821e-06,
"loss": 1.2489,
"step": 425
},
{
"epoch": 0.216504669334858,
"grad_norm": 3.6721158027648926,
"learning_rate": 8.658536585365854e-06,
"loss": 1.2827,
"step": 426
},
{
"epoch": 0.21701289625817927,
"grad_norm": 3.5361924171447754,
"learning_rate": 8.678861788617887e-06,
"loss": 1.3585,
"step": 427
},
{
"epoch": 0.21752112318150055,
"grad_norm": 3.324645757675171,
"learning_rate": 8.69918699186992e-06,
"loss": 1.3114,
"step": 428
},
{
"epoch": 0.2180293501048218,
"grad_norm": 3.320855140686035,
"learning_rate": 8.719512195121952e-06,
"loss": 1.2281,
"step": 429
},
{
"epoch": 0.21853757702814305,
"grad_norm": 3.440333127975464,
"learning_rate": 8.739837398373985e-06,
"loss": 1.4005,
"step": 430
},
{
"epoch": 0.21904580395146434,
"grad_norm": 3.48341965675354,
"learning_rate": 8.760162601626017e-06,
"loss": 1.363,
"step": 431
},
{
"epoch": 0.2195540308747856,
"grad_norm": 3.2691972255706787,
"learning_rate": 8.78048780487805e-06,
"loss": 1.2695,
"step": 432
},
{
"epoch": 0.22006225779810684,
"grad_norm": 4.021475791931152,
"learning_rate": 8.800813008130082e-06,
"loss": 1.4454,
"step": 433
},
{
"epoch": 0.22057048472142812,
"grad_norm": 3.26725697517395,
"learning_rate": 8.821138211382113e-06,
"loss": 1.3682,
"step": 434
},
{
"epoch": 0.22107871164474938,
"grad_norm": 3.592050790786743,
"learning_rate": 8.841463414634148e-06,
"loss": 1.3953,
"step": 435
},
{
"epoch": 0.22158693856807066,
"grad_norm": 3.366631031036377,
"learning_rate": 8.86178861788618e-06,
"loss": 1.29,
"step": 436
},
{
"epoch": 0.2220951654913919,
"grad_norm": 3.5437285900115967,
"learning_rate": 8.882113821138211e-06,
"loss": 1.2646,
"step": 437
},
{
"epoch": 0.22260339241471316,
"grad_norm": 3.404071569442749,
"learning_rate": 8.902439024390244e-06,
"loss": 1.2194,
"step": 438
},
{
"epoch": 0.22311161933803444,
"grad_norm": 3.740020275115967,
"learning_rate": 8.922764227642278e-06,
"loss": 1.1974,
"step": 439
},
{
"epoch": 0.2236198462613557,
"grad_norm": 3.812560558319092,
"learning_rate": 8.94308943089431e-06,
"loss": 1.2404,
"step": 440
},
{
"epoch": 0.22412807318467695,
"grad_norm": 3.365743637084961,
"learning_rate": 8.963414634146342e-06,
"loss": 1.3007,
"step": 441
},
{
"epoch": 0.22463630010799823,
"grad_norm": 3.463697671890259,
"learning_rate": 8.983739837398374e-06,
"loss": 1.2529,
"step": 442
},
{
"epoch": 0.22514452703131949,
"grad_norm": 3.325098991394043,
"learning_rate": 9.004065040650407e-06,
"loss": 1.2782,
"step": 443
},
{
"epoch": 0.22565275395464074,
"grad_norm": 3.305267810821533,
"learning_rate": 9.02439024390244e-06,
"loss": 1.3544,
"step": 444
},
{
"epoch": 0.22616098087796202,
"grad_norm": 3.480679750442505,
"learning_rate": 9.044715447154472e-06,
"loss": 1.3709,
"step": 445
},
{
"epoch": 0.22666920780128327,
"grad_norm": 3.7187793254852295,
"learning_rate": 9.065040650406505e-06,
"loss": 1.2159,
"step": 446
},
{
"epoch": 0.22717743472460453,
"grad_norm": 3.6196069717407227,
"learning_rate": 9.085365853658538e-06,
"loss": 1.312,
"step": 447
},
{
"epoch": 0.2276856616479258,
"grad_norm": 3.43747878074646,
"learning_rate": 9.10569105691057e-06,
"loss": 1.2508,
"step": 448
},
{
"epoch": 0.22819388857124706,
"grad_norm": 3.117326021194458,
"learning_rate": 9.126016260162603e-06,
"loss": 1.2848,
"step": 449
},
{
"epoch": 0.22870211549456831,
"grad_norm": 3.348893642425537,
"learning_rate": 9.146341463414635e-06,
"loss": 1.2183,
"step": 450
},
{
"epoch": 0.2292103424178896,
"grad_norm": 3.716628074645996,
"learning_rate": 9.166666666666666e-06,
"loss": 1.4024,
"step": 451
},
{
"epoch": 0.22971856934121085,
"grad_norm": 3.6212241649627686,
"learning_rate": 9.1869918699187e-06,
"loss": 1.3003,
"step": 452
},
{
"epoch": 0.2302267962645321,
"grad_norm": 3.806009292602539,
"learning_rate": 9.207317073170733e-06,
"loss": 1.3927,
"step": 453
},
{
"epoch": 0.23073502318785338,
"grad_norm": 3.6030616760253906,
"learning_rate": 9.227642276422764e-06,
"loss": 1.2962,
"step": 454
},
{
"epoch": 0.23124325011117464,
"grad_norm": 3.7318930625915527,
"learning_rate": 9.247967479674797e-06,
"loss": 1.2296,
"step": 455
},
{
"epoch": 0.2317514770344959,
"grad_norm": 3.260894775390625,
"learning_rate": 9.268292682926831e-06,
"loss": 1.3221,
"step": 456
},
{
"epoch": 0.23225970395781717,
"grad_norm": 3.47714900970459,
"learning_rate": 9.288617886178862e-06,
"loss": 1.1855,
"step": 457
},
{
"epoch": 0.23276793088113842,
"grad_norm": 4.364900588989258,
"learning_rate": 9.308943089430895e-06,
"loss": 1.3621,
"step": 458
},
{
"epoch": 0.2332761578044597,
"grad_norm": 3.5738487243652344,
"learning_rate": 9.329268292682927e-06,
"loss": 1.3473,
"step": 459
},
{
"epoch": 0.23378438472778096,
"grad_norm": 4.652425289154053,
"learning_rate": 9.34959349593496e-06,
"loss": 1.3563,
"step": 460
},
{
"epoch": 0.2342926116511022,
"grad_norm": 7.233104705810547,
"learning_rate": 9.369918699186993e-06,
"loss": 1.4006,
"step": 461
},
{
"epoch": 0.2348008385744235,
"grad_norm": 3.273244857788086,
"learning_rate": 9.390243902439025e-06,
"loss": 1.3137,
"step": 462
},
{
"epoch": 0.23530906549774475,
"grad_norm": 3.6843795776367188,
"learning_rate": 9.410569105691058e-06,
"loss": 1.3714,
"step": 463
},
{
"epoch": 0.235817292421066,
"grad_norm": 3.619368553161621,
"learning_rate": 9.43089430894309e-06,
"loss": 1.282,
"step": 464
},
{
"epoch": 0.23632551934438728,
"grad_norm": 3.4482295513153076,
"learning_rate": 9.451219512195122e-06,
"loss": 1.2551,
"step": 465
},
{
"epoch": 0.23683374626770853,
"grad_norm": 3.2826528549194336,
"learning_rate": 9.471544715447156e-06,
"loss": 1.2826,
"step": 466
},
{
"epoch": 0.2373419731910298,
"grad_norm": 3.5899658203125,
"learning_rate": 9.491869918699188e-06,
"loss": 1.3268,
"step": 467
},
{
"epoch": 0.23785020011435107,
"grad_norm": 3.3438339233398438,
"learning_rate": 9.51219512195122e-06,
"loss": 1.3673,
"step": 468
},
{
"epoch": 0.23835842703767232,
"grad_norm": 3.659921407699585,
"learning_rate": 9.532520325203252e-06,
"loss": 1.2785,
"step": 469
},
{
"epoch": 0.23886665396099357,
"grad_norm": 3.542293071746826,
"learning_rate": 9.552845528455286e-06,
"loss": 1.2533,
"step": 470
},
{
"epoch": 0.23937488088431486,
"grad_norm": 3.669058084487915,
"learning_rate": 9.573170731707317e-06,
"loss": 1.1636,
"step": 471
},
{
"epoch": 0.2398831078076361,
"grad_norm": 3.8697493076324463,
"learning_rate": 9.59349593495935e-06,
"loss": 1.3559,
"step": 472
},
{
"epoch": 0.24039133473095736,
"grad_norm": 3.661998987197876,
"learning_rate": 9.613821138211383e-06,
"loss": 1.3293,
"step": 473
},
{
"epoch": 0.24089956165427864,
"grad_norm": 3.7692317962646484,
"learning_rate": 9.634146341463415e-06,
"loss": 1.2875,
"step": 474
},
{
"epoch": 0.2414077885775999,
"grad_norm": 3.5682339668273926,
"learning_rate": 9.654471544715448e-06,
"loss": 1.3229,
"step": 475
},
{
"epoch": 0.24191601550092115,
"grad_norm": 3.4052696228027344,
"learning_rate": 9.67479674796748e-06,
"loss": 1.3713,
"step": 476
},
{
"epoch": 0.24242424242424243,
"grad_norm": 3.3954174518585205,
"learning_rate": 9.695121951219513e-06,
"loss": 1.2427,
"step": 477
},
{
"epoch": 0.24293246934756368,
"grad_norm": 3.2011301517486572,
"learning_rate": 9.715447154471546e-06,
"loss": 1.2075,
"step": 478
},
{
"epoch": 0.24344069627088494,
"grad_norm": 3.5140979290008545,
"learning_rate": 9.735772357723578e-06,
"loss": 1.4365,
"step": 479
},
{
"epoch": 0.24394892319420622,
"grad_norm": 3.40429425239563,
"learning_rate": 9.756097560975611e-06,
"loss": 1.1789,
"step": 480
},
{
"epoch": 0.24445715011752747,
"grad_norm": 3.4835615158081055,
"learning_rate": 9.776422764227644e-06,
"loss": 1.2674,
"step": 481
},
{
"epoch": 0.24496537704084873,
"grad_norm": 3.3621158599853516,
"learning_rate": 9.796747967479675e-06,
"loss": 1.2595,
"step": 482
},
{
"epoch": 0.24547360396417,
"grad_norm": 3.61655855178833,
"learning_rate": 9.817073170731707e-06,
"loss": 1.2872,
"step": 483
},
{
"epoch": 0.24598183088749126,
"grad_norm": 3.48075795173645,
"learning_rate": 9.837398373983741e-06,
"loss": 1.3344,
"step": 484
},
{
"epoch": 0.24649005781081254,
"grad_norm": 3.713700294494629,
"learning_rate": 9.857723577235772e-06,
"loss": 1.3467,
"step": 485
},
{
"epoch": 0.2469982847341338,
"grad_norm": 3.270226001739502,
"learning_rate": 9.878048780487805e-06,
"loss": 1.334,
"step": 486
},
{
"epoch": 0.24750651165745505,
"grad_norm": 3.2157111167907715,
"learning_rate": 9.898373983739838e-06,
"loss": 1.3273,
"step": 487
},
{
"epoch": 0.24801473858077633,
"grad_norm": 3.4948418140411377,
"learning_rate": 9.91869918699187e-06,
"loss": 1.3266,
"step": 488
},
{
"epoch": 0.24852296550409758,
"grad_norm": 3.462024450302124,
"learning_rate": 9.939024390243903e-06,
"loss": 1.3567,
"step": 489
},
{
"epoch": 0.24903119242741883,
"grad_norm": 3.0976338386535645,
"learning_rate": 9.959349593495936e-06,
"loss": 1.2992,
"step": 490
},
{
"epoch": 0.24953941935074012,
"grad_norm": 3.3008170127868652,
"learning_rate": 9.979674796747968e-06,
"loss": 1.3137,
"step": 491
},
{
"epoch": 0.25004764627406134,
"grad_norm": 3.765357494354248,
"learning_rate": 1e-05,
"loss": 1.2204,
"step": 492
},
{
"epoch": 0.25055587319738265,
"grad_norm": 3.619002342224121,
"learning_rate": 9.999999717338245e-06,
"loss": 1.339,
"step": 493
},
{
"epoch": 0.2510641001207039,
"grad_norm": 3.694655418395996,
"learning_rate": 9.99999886935301e-06,
"loss": 1.3753,
"step": 494
},
{
"epoch": 0.25157232704402516,
"grad_norm": 3.6122829914093018,
"learning_rate": 9.99999745604439e-06,
"loss": 1.3275,
"step": 495
},
{
"epoch": 0.2520805539673464,
"grad_norm": 3.870494842529297,
"learning_rate": 9.999995477412547e-06,
"loss": 1.3107,
"step": 496
},
{
"epoch": 0.25258878089066766,
"grad_norm": 3.936599016189575,
"learning_rate": 9.999992933457705e-06,
"loss": 1.2448,
"step": 497
},
{
"epoch": 0.253097007813989,
"grad_norm": 3.2846243381500244,
"learning_rate": 9.99998982418015e-06,
"loss": 1.3264,
"step": 498
},
{
"epoch": 0.2536052347373102,
"grad_norm": 3.724277973175049,
"learning_rate": 9.999986149580232e-06,
"loss": 1.3372,
"step": 499
},
{
"epoch": 0.2541134616606315,
"grad_norm": 3.324705123901367,
"learning_rate": 9.99998190965837e-06,
"loss": 1.3758,
"step": 500
},
{
"epoch": 0.2541134616606315,
"eval_loss": 1.3164880275726318,
"eval_runtime": 13.0856,
"eval_samples_per_second": 30.568,
"eval_steps_per_second": 3.821,
"step": 500
},
{
"epoch": 0.25462168858395273,
"grad_norm": 4.158553600311279,
"learning_rate": 9.999977104415042e-06,
"loss": 1.4618,
"step": 501
},
{
"epoch": 0.255129915507274,
"grad_norm": 4.20340633392334,
"learning_rate": 9.99997173385079e-06,
"loss": 1.3603,
"step": 502
},
{
"epoch": 0.25563814243059524,
"grad_norm": 3.5411834716796875,
"learning_rate": 9.999965797966223e-06,
"loss": 1.3046,
"step": 503
},
{
"epoch": 0.25614636935391655,
"grad_norm": 3.406993865966797,
"learning_rate": 9.999959296762012e-06,
"loss": 1.3119,
"step": 504
},
{
"epoch": 0.2566545962772378,
"grad_norm": 3.4021811485290527,
"learning_rate": 9.999952230238893e-06,
"loss": 1.3131,
"step": 505
},
{
"epoch": 0.25716282320055905,
"grad_norm": 3.237227201461792,
"learning_rate": 9.99994459839766e-06,
"loss": 1.2948,
"step": 506
},
{
"epoch": 0.2576710501238803,
"grad_norm": 3.6270179748535156,
"learning_rate": 9.999936401239181e-06,
"loss": 1.378,
"step": 507
},
{
"epoch": 0.25817927704720156,
"grad_norm": 3.573146343231201,
"learning_rate": 9.999927638764382e-06,
"loss": 1.3479,
"step": 508
},
{
"epoch": 0.2586875039705228,
"grad_norm": 3.4049582481384277,
"learning_rate": 9.999918310974252e-06,
"loss": 1.3017,
"step": 509
},
{
"epoch": 0.2591957308938441,
"grad_norm": 3.151167392730713,
"learning_rate": 9.999908417869846e-06,
"loss": 1.2649,
"step": 510
},
{
"epoch": 0.2597039578171654,
"grad_norm": 3.395052194595337,
"learning_rate": 9.999897959452286e-06,
"loss": 1.2947,
"step": 511
},
{
"epoch": 0.26021218474048663,
"grad_norm": 3.3076987266540527,
"learning_rate": 9.999886935722749e-06,
"loss": 1.201,
"step": 512
},
{
"epoch": 0.2607204116638079,
"grad_norm": 3.6244215965270996,
"learning_rate": 9.999875346682483e-06,
"loss": 1.3617,
"step": 513
},
{
"epoch": 0.26122863858712914,
"grad_norm": 3.355215311050415,
"learning_rate": 9.999863192332803e-06,
"loss": 1.2969,
"step": 514
},
{
"epoch": 0.2617368655104504,
"grad_norm": 3.464101552963257,
"learning_rate": 9.999850472675076e-06,
"loss": 1.2228,
"step": 515
},
{
"epoch": 0.2622450924337717,
"grad_norm": 3.1731834411621094,
"learning_rate": 9.999837187710746e-06,
"loss": 1.314,
"step": 516
},
{
"epoch": 0.26275331935709295,
"grad_norm": 3.4594202041625977,
"learning_rate": 9.999823337441312e-06,
"loss": 1.2405,
"step": 517
},
{
"epoch": 0.2632615462804142,
"grad_norm": 3.259009599685669,
"learning_rate": 9.999808921868341e-06,
"loss": 1.2927,
"step": 518
},
{
"epoch": 0.26376977320373546,
"grad_norm": 3.5948798656463623,
"learning_rate": 9.999793940993463e-06,
"loss": 1.2082,
"step": 519
},
{
"epoch": 0.2642780001270567,
"grad_norm": 3.314972162246704,
"learning_rate": 9.99977839481837e-06,
"loss": 1.2475,
"step": 520
},
{
"epoch": 0.264786227050378,
"grad_norm": 3.383493661880493,
"learning_rate": 9.999762283344825e-06,
"loss": 1.2592,
"step": 521
},
{
"epoch": 0.2652944539736993,
"grad_norm": 3.365828275680542,
"learning_rate": 9.999745606574642e-06,
"loss": 1.3599,
"step": 522
},
{
"epoch": 0.2658026808970205,
"grad_norm": 3.2802915573120117,
"learning_rate": 9.99972836450971e-06,
"loss": 1.3388,
"step": 523
},
{
"epoch": 0.2663109078203418,
"grad_norm": 3.3013274669647217,
"learning_rate": 9.999710557151983e-06,
"loss": 1.2858,
"step": 524
},
{
"epoch": 0.26681913474366303,
"grad_norm": 3.198275089263916,
"learning_rate": 9.999692184503466e-06,
"loss": 1.2994,
"step": 525
},
{
"epoch": 0.2673273616669843,
"grad_norm": 3.4907963275909424,
"learning_rate": 9.999673246566242e-06,
"loss": 1.3816,
"step": 526
},
{
"epoch": 0.2678355885903056,
"grad_norm": 3.2818679809570312,
"learning_rate": 9.999653743342452e-06,
"loss": 1.186,
"step": 527
},
{
"epoch": 0.26834381551362685,
"grad_norm": 3.373699903488159,
"learning_rate": 9.999633674834299e-06,
"loss": 1.2908,
"step": 528
},
{
"epoch": 0.2688520424369481,
"grad_norm": 3.4973933696746826,
"learning_rate": 9.999613041044051e-06,
"loss": 1.4183,
"step": 529
},
{
"epoch": 0.26936026936026936,
"grad_norm": 3.5590484142303467,
"learning_rate": 9.999591841974045e-06,
"loss": 1.3278,
"step": 530
},
{
"epoch": 0.2698684962835906,
"grad_norm": 3.671595573425293,
"learning_rate": 9.999570077626676e-06,
"loss": 1.3794,
"step": 531
},
{
"epoch": 0.27037672320691186,
"grad_norm": 3.295187473297119,
"learning_rate": 9.999547748004403e-06,
"loss": 1.3537,
"step": 532
},
{
"epoch": 0.27088495013023317,
"grad_norm": 3.641406536102295,
"learning_rate": 9.999524853109755e-06,
"loss": 1.3603,
"step": 533
},
{
"epoch": 0.2713931770535544,
"grad_norm": 3.371995449066162,
"learning_rate": 9.999501392945314e-06,
"loss": 1.2268,
"step": 534
},
{
"epoch": 0.2719014039768757,
"grad_norm": 3.432286024093628,
"learning_rate": 9.999477367513739e-06,
"loss": 1.3287,
"step": 535
},
{
"epoch": 0.27240963090019693,
"grad_norm": 3.212390184402466,
"learning_rate": 9.999452776817741e-06,
"loss": 1.2798,
"step": 536
},
{
"epoch": 0.2729178578235182,
"grad_norm": 3.8736019134521484,
"learning_rate": 9.999427620860107e-06,
"loss": 1.3578,
"step": 537
},
{
"epoch": 0.27342608474683944,
"grad_norm": 3.1469552516937256,
"learning_rate": 9.999401899643675e-06,
"loss": 1.3325,
"step": 538
},
{
"epoch": 0.27393431167016075,
"grad_norm": 4.098660945892334,
"learning_rate": 9.999375613171356e-06,
"loss": 1.3981,
"step": 539
},
{
"epoch": 0.274442538593482,
"grad_norm": 3.2645022869110107,
"learning_rate": 9.999348761446122e-06,
"loss": 1.3094,
"step": 540
},
{
"epoch": 0.27495076551680325,
"grad_norm": 3.239898204803467,
"learning_rate": 9.999321344471007e-06,
"loss": 1.2965,
"step": 541
},
{
"epoch": 0.2754589924401245,
"grad_norm": 3.435715913772583,
"learning_rate": 9.999293362249114e-06,
"loss": 1.3529,
"step": 542
},
{
"epoch": 0.27596721936344576,
"grad_norm": 3.2523412704467773,
"learning_rate": 9.999264814783603e-06,
"loss": 1.3146,
"step": 543
},
{
"epoch": 0.27647544628676707,
"grad_norm": 3.3631367683410645,
"learning_rate": 9.999235702077707e-06,
"loss": 1.2696,
"step": 544
},
{
"epoch": 0.2769836732100883,
"grad_norm": 3.2622344493865967,
"learning_rate": 9.999206024134714e-06,
"loss": 1.3845,
"step": 545
},
{
"epoch": 0.2774919001334096,
"grad_norm": 3.6121559143066406,
"learning_rate": 9.999175780957976e-06,
"loss": 1.3381,
"step": 546
},
{
"epoch": 0.27800012705673083,
"grad_norm": 3.354872941970825,
"learning_rate": 9.999144972550922e-06,
"loss": 1.3214,
"step": 547
},
{
"epoch": 0.2785083539800521,
"grad_norm": 3.4644815921783447,
"learning_rate": 9.999113598917027e-06,
"loss": 1.3543,
"step": 548
},
{
"epoch": 0.27901658090337333,
"grad_norm": 3.3032991886138916,
"learning_rate": 9.999081660059842e-06,
"loss": 1.3811,
"step": 549
},
{
"epoch": 0.27952480782669464,
"grad_norm": 3.470670461654663,
"learning_rate": 9.999049155982977e-06,
"loss": 1.3831,
"step": 550
},
{
"epoch": 0.2800330347500159,
"grad_norm": 3.5726518630981445,
"learning_rate": 9.999016086690108e-06,
"loss": 1.2807,
"step": 551
},
{
"epoch": 0.28054126167333715,
"grad_norm": 3.480273962020874,
"learning_rate": 9.998982452184974e-06,
"loss": 1.3818,
"step": 552
},
{
"epoch": 0.2810494885966584,
"grad_norm": 3.783210277557373,
"learning_rate": 9.998948252471375e-06,
"loss": 1.2638,
"step": 553
},
{
"epoch": 0.28155771551997966,
"grad_norm": 3.0054821968078613,
"learning_rate": 9.998913487553182e-06,
"loss": 1.2592,
"step": 554
},
{
"epoch": 0.2820659424433009,
"grad_norm": 3.3007564544677734,
"learning_rate": 9.998878157434322e-06,
"loss": 1.3479,
"step": 555
},
{
"epoch": 0.2825741693666222,
"grad_norm": 3.2451131343841553,
"learning_rate": 9.99884226211879e-06,
"loss": 1.263,
"step": 556
},
{
"epoch": 0.28308239628994347,
"grad_norm": 3.73813796043396,
"learning_rate": 9.99880580161065e-06,
"loss": 1.3618,
"step": 557
},
{
"epoch": 0.2835906232132647,
"grad_norm": 3.4133875370025635,
"learning_rate": 9.998768775914017e-06,
"loss": 1.3835,
"step": 558
},
{
"epoch": 0.284098850136586,
"grad_norm": 3.248453140258789,
"learning_rate": 9.998731185033081e-06,
"loss": 1.3094,
"step": 559
},
{
"epoch": 0.28460707705990723,
"grad_norm": 3.074777603149414,
"learning_rate": 9.998693028972092e-06,
"loss": 1.1955,
"step": 560
},
{
"epoch": 0.2851153039832285,
"grad_norm": 3.389275312423706,
"learning_rate": 9.998654307735364e-06,
"loss": 1.3009,
"step": 561
},
{
"epoch": 0.2856235309065498,
"grad_norm": 3.305894374847412,
"learning_rate": 9.998615021327274e-06,
"loss": 1.2888,
"step": 562
},
{
"epoch": 0.28613175782987105,
"grad_norm": 3.0569679737091064,
"learning_rate": 9.998575169752265e-06,
"loss": 1.301,
"step": 563
},
{
"epoch": 0.2866399847531923,
"grad_norm": 3.3297672271728516,
"learning_rate": 9.998534753014842e-06,
"loss": 1.2979,
"step": 564
},
{
"epoch": 0.28714821167651355,
"grad_norm": 3.3406970500946045,
"learning_rate": 9.998493771119576e-06,
"loss": 1.3016,
"step": 565
},
{
"epoch": 0.2876564385998348,
"grad_norm": 3.455514430999756,
"learning_rate": 9.9984522240711e-06,
"loss": 1.2808,
"step": 566
},
{
"epoch": 0.2881646655231561,
"grad_norm": 3.438077211380005,
"learning_rate": 9.99841011187411e-06,
"loss": 1.3682,
"step": 567
},
{
"epoch": 0.28867289244647737,
"grad_norm": 3.4340884685516357,
"learning_rate": 9.99836743453337e-06,
"loss": 1.2293,
"step": 568
},
{
"epoch": 0.2891811193697986,
"grad_norm": 3.3622660636901855,
"learning_rate": 9.998324192053704e-06,
"loss": 1.3429,
"step": 569
},
{
"epoch": 0.2896893462931199,
"grad_norm": 3.2343058586120605,
"learning_rate": 9.99828038444e-06,
"loss": 1.2378,
"step": 570
},
{
"epoch": 0.29019757321644113,
"grad_norm": 3.1985490322113037,
"learning_rate": 9.998236011697214e-06,
"loss": 1.3157,
"step": 571
},
{
"epoch": 0.2907058001397624,
"grad_norm": 3.379235029220581,
"learning_rate": 9.99819107383036e-06,
"loss": 1.3078,
"step": 572
},
{
"epoch": 0.2912140270630837,
"grad_norm": 3.259159564971924,
"learning_rate": 9.998145570844519e-06,
"loss": 1.3411,
"step": 573
},
{
"epoch": 0.29172225398640494,
"grad_norm": 3.191131591796875,
"learning_rate": 9.99809950274484e-06,
"loss": 1.2504,
"step": 574
},
{
"epoch": 0.2922304809097262,
"grad_norm": 3.2074849605560303,
"learning_rate": 9.998052869536526e-06,
"loss": 1.3674,
"step": 575
},
{
"epoch": 0.29273870783304745,
"grad_norm": 3.2082672119140625,
"learning_rate": 9.998005671224852e-06,
"loss": 1.2857,
"step": 576
},
{
"epoch": 0.2932469347563687,
"grad_norm": 3.390986919403076,
"learning_rate": 9.997957907815158e-06,
"loss": 1.4165,
"step": 577
},
{
"epoch": 0.29375516167968996,
"grad_norm": 3.38319993019104,
"learning_rate": 9.997909579312839e-06,
"loss": 1.2715,
"step": 578
},
{
"epoch": 0.29426338860301127,
"grad_norm": 4.208193302154541,
"learning_rate": 9.997860685723361e-06,
"loss": 1.2918,
"step": 579
},
{
"epoch": 0.2947716155263325,
"grad_norm": 3.22011137008667,
"learning_rate": 9.997811227052251e-06,
"loss": 1.2389,
"step": 580
},
{
"epoch": 0.2952798424496538,
"grad_norm": 3.2726387977600098,
"learning_rate": 9.997761203305105e-06,
"loss": 1.3157,
"step": 581
},
{
"epoch": 0.295788069372975,
"grad_norm": 3.379770040512085,
"learning_rate": 9.997710614487575e-06,
"loss": 1.2954,
"step": 582
},
{
"epoch": 0.2962962962962963,
"grad_norm": 3.0684187412261963,
"learning_rate": 9.997659460605382e-06,
"loss": 1.309,
"step": 583
},
{
"epoch": 0.29680452321961753,
"grad_norm": 3.5520968437194824,
"learning_rate": 9.99760774166431e-06,
"loss": 1.2515,
"step": 584
},
{
"epoch": 0.29731275014293884,
"grad_norm": 3.340465784072876,
"learning_rate": 9.997555457670207e-06,
"loss": 1.1975,
"step": 585
},
{
"epoch": 0.2978209770662601,
"grad_norm": 3.183685779571533,
"learning_rate": 9.997502608628984e-06,
"loss": 1.2544,
"step": 586
},
{
"epoch": 0.29832920398958135,
"grad_norm": 3.2117257118225098,
"learning_rate": 9.997449194546616e-06,
"loss": 1.2248,
"step": 587
},
{
"epoch": 0.2988374309129026,
"grad_norm": 3.3444666862487793,
"learning_rate": 9.997395215429142e-06,
"loss": 1.2858,
"step": 588
},
{
"epoch": 0.29934565783622386,
"grad_norm": 3.0064151287078857,
"learning_rate": 9.997340671282667e-06,
"loss": 1.2255,
"step": 589
},
{
"epoch": 0.29985388475954516,
"grad_norm": 3.2752397060394287,
"learning_rate": 9.997285562113355e-06,
"loss": 1.3126,
"step": 590
},
{
"epoch": 0.3003621116828664,
"grad_norm": 3.286292791366577,
"learning_rate": 9.99722988792744e-06,
"loss": 1.3219,
"step": 591
},
{
"epoch": 0.30087033860618767,
"grad_norm": 4.162260055541992,
"learning_rate": 9.997173648731214e-06,
"loss": 1.3552,
"step": 592
},
{
"epoch": 0.3013785655295089,
"grad_norm": 3.4235987663269043,
"learning_rate": 9.997116844531039e-06,
"loss": 1.294,
"step": 593
},
{
"epoch": 0.3018867924528302,
"grad_norm": 3.3392674922943115,
"learning_rate": 9.997059475333332e-06,
"loss": 1.4294,
"step": 594
},
{
"epoch": 0.30239501937615143,
"grad_norm": 3.367549180984497,
"learning_rate": 9.997001541144587e-06,
"loss": 1.3199,
"step": 595
},
{
"epoch": 0.30290324629947274,
"grad_norm": 3.3252546787261963,
"learning_rate": 9.996943041971348e-06,
"loss": 1.3147,
"step": 596
},
{
"epoch": 0.303411473222794,
"grad_norm": 3.1721370220184326,
"learning_rate": 9.996883977820233e-06,
"loss": 1.2498,
"step": 597
},
{
"epoch": 0.30391970014611525,
"grad_norm": 3.716733694076538,
"learning_rate": 9.996824348697917e-06,
"loss": 1.2548,
"step": 598
},
{
"epoch": 0.3044279270694365,
"grad_norm": 3.3994574546813965,
"learning_rate": 9.996764154611145e-06,
"loss": 1.3619,
"step": 599
},
{
"epoch": 0.30493615399275775,
"grad_norm": 3.4203522205352783,
"learning_rate": 9.996703395566721e-06,
"loss": 1.2884,
"step": 600
},
{
"epoch": 0.305444380916079,
"grad_norm": 3.305091381072998,
"learning_rate": 9.996642071571514e-06,
"loss": 1.3636,
"step": 601
},
{
"epoch": 0.3059526078394003,
"grad_norm": 3.121256113052368,
"learning_rate": 9.996580182632459e-06,
"loss": 1.4095,
"step": 602
},
{
"epoch": 0.30646083476272157,
"grad_norm": 3.227128267288208,
"learning_rate": 9.996517728756554e-06,
"loss": 1.3859,
"step": 603
},
{
"epoch": 0.3069690616860428,
"grad_norm": 3.152439594268799,
"learning_rate": 9.996454709950859e-06,
"loss": 1.3499,
"step": 604
},
{
"epoch": 0.3074772886093641,
"grad_norm": 3.302140235900879,
"learning_rate": 9.996391126222499e-06,
"loss": 1.3407,
"step": 605
},
{
"epoch": 0.30798551553268533,
"grad_norm": 3.436461925506592,
"learning_rate": 9.996326977578664e-06,
"loss": 1.2528,
"step": 606
},
{
"epoch": 0.3084937424560066,
"grad_norm": 3.0147430896759033,
"learning_rate": 9.996262264026608e-06,
"loss": 1.1042,
"step": 607
},
{
"epoch": 0.3090019693793279,
"grad_norm": 3.2218759059906006,
"learning_rate": 9.996196985573644e-06,
"loss": 1.431,
"step": 608
},
{
"epoch": 0.30951019630264914,
"grad_norm": 3.731808662414551,
"learning_rate": 9.996131142227156e-06,
"loss": 1.4065,
"step": 609
},
{
"epoch": 0.3100184232259704,
"grad_norm": 3.240323781967163,
"learning_rate": 9.996064733994588e-06,
"loss": 1.3583,
"step": 610
},
{
"epoch": 0.31052665014929165,
"grad_norm": 3.2610456943511963,
"learning_rate": 9.99599776088345e-06,
"loss": 1.2872,
"step": 611
},
{
"epoch": 0.3110348770726129,
"grad_norm": 3.4224603176116943,
"learning_rate": 9.99593022290131e-06,
"loss": 1.2538,
"step": 612
},
{
"epoch": 0.3115431039959342,
"grad_norm": 3.205958843231201,
"learning_rate": 9.995862120055807e-06,
"loss": 1.2848,
"step": 613
},
{
"epoch": 0.31205133091925547,
"grad_norm": 2.9460086822509766,
"learning_rate": 9.995793452354641e-06,
"loss": 1.2136,
"step": 614
},
{
"epoch": 0.3125595578425767,
"grad_norm": 3.2204792499542236,
"learning_rate": 9.995724219805575e-06,
"loss": 1.2838,
"step": 615
},
{
"epoch": 0.31306778476589797,
"grad_norm": 3.413954019546509,
"learning_rate": 9.99565442241644e-06,
"loss": 1.4099,
"step": 616
},
{
"epoch": 0.3135760116892192,
"grad_norm": 3.393963098526001,
"learning_rate": 9.99558406019512e-06,
"loss": 1.3108,
"step": 617
},
{
"epoch": 0.3140842386125405,
"grad_norm": 3.3361024856567383,
"learning_rate": 9.99551313314958e-06,
"loss": 1.3209,
"step": 618
},
{
"epoch": 0.3145924655358618,
"grad_norm": 3.162201404571533,
"learning_rate": 9.995441641287833e-06,
"loss": 1.2169,
"step": 619
},
{
"epoch": 0.31510069245918304,
"grad_norm": 3.283411979675293,
"learning_rate": 9.995369584617962e-06,
"loss": 1.3413,
"step": 620
},
{
"epoch": 0.3156089193825043,
"grad_norm": 3.4232754707336426,
"learning_rate": 9.995296963148118e-06,
"loss": 1.2927,
"step": 621
},
{
"epoch": 0.31611714630582555,
"grad_norm": 3.652552604675293,
"learning_rate": 9.99522377688651e-06,
"loss": 1.4328,
"step": 622
},
{
"epoch": 0.3166253732291468,
"grad_norm": 3.1629154682159424,
"learning_rate": 9.995150025841412e-06,
"loss": 1.2648,
"step": 623
},
{
"epoch": 0.31713360015246805,
"grad_norm": 3.021181106567383,
"learning_rate": 9.995075710021165e-06,
"loss": 1.2518,
"step": 624
},
{
"epoch": 0.31764182707578936,
"grad_norm": 3.2148756980895996,
"learning_rate": 9.995000829434167e-06,
"loss": 1.3312,
"step": 625
},
{
"epoch": 0.3181500539991106,
"grad_norm": 3.3323326110839844,
"learning_rate": 9.994925384088889e-06,
"loss": 1.2723,
"step": 626
},
{
"epoch": 0.31865828092243187,
"grad_norm": 3.3048861026763916,
"learning_rate": 9.994849373993861e-06,
"loss": 1.372,
"step": 627
},
{
"epoch": 0.3191665078457531,
"grad_norm": 3.1596617698669434,
"learning_rate": 9.994772799157672e-06,
"loss": 1.159,
"step": 628
},
{
"epoch": 0.3196747347690744,
"grad_norm": 3.392035484313965,
"learning_rate": 9.994695659588985e-06,
"loss": 1.4064,
"step": 629
},
{
"epoch": 0.32018296169239563,
"grad_norm": 3.708467483520508,
"learning_rate": 9.99461795529652e-06,
"loss": 1.425,
"step": 630
},
{
"epoch": 0.32069118861571694,
"grad_norm": 3.287665843963623,
"learning_rate": 9.994539686289063e-06,
"loss": 1.2154,
"step": 631
},
{
"epoch": 0.3211994155390382,
"grad_norm": 3.2387781143188477,
"learning_rate": 9.994460852575463e-06,
"loss": 1.3697,
"step": 632
},
{
"epoch": 0.32170764246235944,
"grad_norm": 3.511781692504883,
"learning_rate": 9.994381454164635e-06,
"loss": 1.3696,
"step": 633
},
{
"epoch": 0.3222158693856807,
"grad_norm": 3.1286818981170654,
"learning_rate": 9.994301491065552e-06,
"loss": 1.2287,
"step": 634
},
{
"epoch": 0.32272409630900195,
"grad_norm": 3.539268970489502,
"learning_rate": 9.994220963287258e-06,
"loss": 1.2992,
"step": 635
},
{
"epoch": 0.32323232323232326,
"grad_norm": 3.2066617012023926,
"learning_rate": 9.994139870838859e-06,
"loss": 1.3689,
"step": 636
},
{
"epoch": 0.3237405501556445,
"grad_norm": 3.4815847873687744,
"learning_rate": 9.994058213729523e-06,
"loss": 1.2067,
"step": 637
},
{
"epoch": 0.32424877707896577,
"grad_norm": 3.814072370529175,
"learning_rate": 9.993975991968478e-06,
"loss": 1.2652,
"step": 638
},
{
"epoch": 0.324757004002287,
"grad_norm": 3.1743524074554443,
"learning_rate": 9.993893205565029e-06,
"loss": 1.3056,
"step": 639
},
{
"epoch": 0.3252652309256083,
"grad_norm": 3.4408047199249268,
"learning_rate": 9.993809854528529e-06,
"loss": 1.3515,
"step": 640
},
{
"epoch": 0.3257734578489295,
"grad_norm": 3.353102922439575,
"learning_rate": 9.993725938868404e-06,
"loss": 1.322,
"step": 641
},
{
"epoch": 0.32628168477225084,
"grad_norm": 4.640409469604492,
"learning_rate": 9.993641458594142e-06,
"loss": 1.4992,
"step": 642
},
{
"epoch": 0.3267899116955721,
"grad_norm": 3.294832706451416,
"learning_rate": 9.993556413715294e-06,
"loss": 1.3659,
"step": 643
},
{
"epoch": 0.32729813861889334,
"grad_norm": 3.26865553855896,
"learning_rate": 9.993470804241481e-06,
"loss": 1.3908,
"step": 644
},
{
"epoch": 0.3278063655422146,
"grad_norm": 3.2061288356781006,
"learning_rate": 9.993384630182375e-06,
"loss": 1.2603,
"step": 645
},
{
"epoch": 0.32831459246553585,
"grad_norm": 3.1718034744262695,
"learning_rate": 9.993297891547722e-06,
"loss": 1.3821,
"step": 646
},
{
"epoch": 0.3288228193888571,
"grad_norm": 3.1801249980926514,
"learning_rate": 9.99321058834733e-06,
"loss": 1.2118,
"step": 647
},
{
"epoch": 0.3293310463121784,
"grad_norm": 3.2288734912872314,
"learning_rate": 9.99312272059107e-06,
"loss": 1.2868,
"step": 648
},
{
"epoch": 0.32983927323549966,
"grad_norm": 3.5571651458740234,
"learning_rate": 9.993034288288874e-06,
"loss": 1.223,
"step": 649
},
{
"epoch": 0.3303475001588209,
"grad_norm": 3.352027654647827,
"learning_rate": 9.992945291450744e-06,
"loss": 1.2518,
"step": 650
},
{
"epoch": 0.33085572708214217,
"grad_norm": 3.242868185043335,
"learning_rate": 9.992855730086741e-06,
"loss": 1.2442,
"step": 651
},
{
"epoch": 0.3313639540054634,
"grad_norm": 3.3032219409942627,
"learning_rate": 9.992765604206992e-06,
"loss": 1.3753,
"step": 652
},
{
"epoch": 0.3318721809287847,
"grad_norm": 3.234017848968506,
"learning_rate": 9.992674913821685e-06,
"loss": 1.2213,
"step": 653
},
{
"epoch": 0.332380407852106,
"grad_norm": 3.0645787715911865,
"learning_rate": 9.992583658941075e-06,
"loss": 1.2599,
"step": 654
},
{
"epoch": 0.33288863477542724,
"grad_norm": 3.3873555660247803,
"learning_rate": 9.992491839575481e-06,
"loss": 1.2812,
"step": 655
},
{
"epoch": 0.3333968616987485,
"grad_norm": 3.0735232830047607,
"learning_rate": 9.992399455735283e-06,
"loss": 1.1829,
"step": 656
},
{
"epoch": 0.33390508862206975,
"grad_norm": 3.1945180892944336,
"learning_rate": 9.992306507430927e-06,
"loss": 1.2562,
"step": 657
},
{
"epoch": 0.334413315545391,
"grad_norm": 3.20089054107666,
"learning_rate": 9.992212994672921e-06,
"loss": 1.3315,
"step": 658
},
{
"epoch": 0.3349215424687123,
"grad_norm": 3.3600375652313232,
"learning_rate": 9.99211891747184e-06,
"loss": 1.3288,
"step": 659
},
{
"epoch": 0.33542976939203356,
"grad_norm": 3.2655248641967773,
"learning_rate": 9.992024275838318e-06,
"loss": 1.2318,
"step": 660
},
{
"epoch": 0.3359379963153548,
"grad_norm": 3.1854372024536133,
"learning_rate": 9.991929069783058e-06,
"loss": 1.2953,
"step": 661
},
{
"epoch": 0.33644622323867607,
"grad_norm": 3.1260249614715576,
"learning_rate": 9.991833299316824e-06,
"loss": 1.3619,
"step": 662
},
{
"epoch": 0.3369544501619973,
"grad_norm": 3.1407597064971924,
"learning_rate": 9.991736964450445e-06,
"loss": 1.2393,
"step": 663
},
{
"epoch": 0.3374626770853186,
"grad_norm": 3.2042787075042725,
"learning_rate": 9.991640065194812e-06,
"loss": 1.3299,
"step": 664
},
{
"epoch": 0.3379709040086399,
"grad_norm": 3.058418035507202,
"learning_rate": 9.99154260156088e-06,
"loss": 1.2894,
"step": 665
},
{
"epoch": 0.33847913093196114,
"grad_norm": 3.146761178970337,
"learning_rate": 9.99144457355967e-06,
"loss": 1.4489,
"step": 666
},
{
"epoch": 0.3389873578552824,
"grad_norm": 11.600865364074707,
"learning_rate": 9.991345981202265e-06,
"loss": 1.5436,
"step": 667
},
{
"epoch": 0.33949558477860364,
"grad_norm": 3.060974359512329,
"learning_rate": 9.991246824499812e-06,
"loss": 1.2756,
"step": 668
},
{
"epoch": 0.3400038117019249,
"grad_norm": 3.2085535526275635,
"learning_rate": 9.991147103463523e-06,
"loss": 1.1935,
"step": 669
},
{
"epoch": 0.34051203862524615,
"grad_norm": 3.497408628463745,
"learning_rate": 9.991046818104674e-06,
"loss": 1.3223,
"step": 670
},
{
"epoch": 0.34102026554856746,
"grad_norm": 3.2515928745269775,
"learning_rate": 9.990945968434601e-06,
"loss": 1.2761,
"step": 671
},
{
"epoch": 0.3415284924718887,
"grad_norm": 3.371119737625122,
"learning_rate": 9.990844554464709e-06,
"loss": 1.245,
"step": 672
},
{
"epoch": 0.34203671939520996,
"grad_norm": 3.2016313076019287,
"learning_rate": 9.990742576206462e-06,
"loss": 1.3644,
"step": 673
},
{
"epoch": 0.3425449463185312,
"grad_norm": 3.163677453994751,
"learning_rate": 9.990640033671391e-06,
"loss": 1.271,
"step": 674
},
{
"epoch": 0.34305317324185247,
"grad_norm": 3.464029312133789,
"learning_rate": 9.99053692687109e-06,
"loss": 1.3403,
"step": 675
},
{
"epoch": 0.3435614001651737,
"grad_norm": 3.115363836288452,
"learning_rate": 9.990433255817218e-06,
"loss": 1.2434,
"step": 676
},
{
"epoch": 0.34406962708849503,
"grad_norm": 3.0379855632781982,
"learning_rate": 9.990329020521497e-06,
"loss": 1.2424,
"step": 677
},
{
"epoch": 0.3445778540118163,
"grad_norm": 3.1256349086761475,
"learning_rate": 9.990224220995709e-06,
"loss": 1.2773,
"step": 678
},
{
"epoch": 0.34508608093513754,
"grad_norm": 2.9989559650421143,
"learning_rate": 9.990118857251706e-06,
"loss": 1.2307,
"step": 679
},
{
"epoch": 0.3455943078584588,
"grad_norm": 3.4447340965270996,
"learning_rate": 9.990012929301399e-06,
"loss": 1.3264,
"step": 680
},
{
"epoch": 0.34610253478178005,
"grad_norm": 3.2726187705993652,
"learning_rate": 9.989906437156766e-06,
"loss": 1.3172,
"step": 681
},
{
"epoch": 0.3466107617051013,
"grad_norm": 3.2503907680511475,
"learning_rate": 9.989799380829846e-06,
"loss": 1.2419,
"step": 682
},
{
"epoch": 0.3471189886284226,
"grad_norm": 3.216642141342163,
"learning_rate": 9.989691760332748e-06,
"loss": 1.275,
"step": 683
},
{
"epoch": 0.34762721555174386,
"grad_norm": 3.044985055923462,
"learning_rate": 9.989583575677633e-06,
"loss": 1.2534,
"step": 684
},
{
"epoch": 0.3481354424750651,
"grad_norm": 3.3953421115875244,
"learning_rate": 9.989474826876736e-06,
"loss": 1.3845,
"step": 685
},
{
"epoch": 0.34864366939838637,
"grad_norm": 3.6470160484313965,
"learning_rate": 9.989365513942356e-06,
"loss": 1.3019,
"step": 686
},
{
"epoch": 0.3491518963217076,
"grad_norm": 3.700324296951294,
"learning_rate": 9.989255636886848e-06,
"loss": 1.3368,
"step": 687
},
{
"epoch": 0.34966012324502893,
"grad_norm": 2.9334194660186768,
"learning_rate": 9.989145195722636e-06,
"loss": 1.1772,
"step": 688
},
{
"epoch": 0.3501683501683502,
"grad_norm": 3.1360538005828857,
"learning_rate": 9.989034190462207e-06,
"loss": 1.3372,
"step": 689
},
{
"epoch": 0.35067657709167144,
"grad_norm": 3.0413472652435303,
"learning_rate": 9.988922621118115e-06,
"loss": 1.3548,
"step": 690
},
{
"epoch": 0.3511848040149927,
"grad_norm": 3.3083596229553223,
"learning_rate": 9.988810487702971e-06,
"loss": 1.3764,
"step": 691
},
{
"epoch": 0.35169303093831394,
"grad_norm": 3.088041067123413,
"learning_rate": 9.988697790229454e-06,
"loss": 1.3161,
"step": 692
},
{
"epoch": 0.3522012578616352,
"grad_norm": 3.1266753673553467,
"learning_rate": 9.988584528710306e-06,
"loss": 1.3091,
"step": 693
},
{
"epoch": 0.3527094847849565,
"grad_norm": 3.1496315002441406,
"learning_rate": 9.988470703158334e-06,
"loss": 1.2456,
"step": 694
},
{
"epoch": 0.35321771170827776,
"grad_norm": 3.72305965423584,
"learning_rate": 9.988356313586407e-06,
"loss": 1.3824,
"step": 695
},
{
"epoch": 0.353725938631599,
"grad_norm": 3.113633632659912,
"learning_rate": 9.988241360007459e-06,
"loss": 1.385,
"step": 696
},
{
"epoch": 0.35423416555492027,
"grad_norm": 2.981914758682251,
"learning_rate": 9.988125842434484e-06,
"loss": 1.1441,
"step": 697
},
{
"epoch": 0.3547423924782415,
"grad_norm": 3.1952383518218994,
"learning_rate": 9.988009760880548e-06,
"loss": 1.3209,
"step": 698
},
{
"epoch": 0.3552506194015628,
"grad_norm": 3.1060612201690674,
"learning_rate": 9.987893115358773e-06,
"loss": 1.2458,
"step": 699
},
{
"epoch": 0.3557588463248841,
"grad_norm": 3.365842819213867,
"learning_rate": 9.987775905882346e-06,
"loss": 1.338,
"step": 700
},
{
"epoch": 0.35626707324820533,
"grad_norm": 3.0432286262512207,
"learning_rate": 9.987658132464524e-06,
"loss": 1.2491,
"step": 701
},
{
"epoch": 0.3567753001715266,
"grad_norm": 3.0596561431884766,
"learning_rate": 9.987539795118617e-06,
"loss": 1.3572,
"step": 702
},
{
"epoch": 0.35728352709484784,
"grad_norm": 3.2221055030822754,
"learning_rate": 9.987420893858011e-06,
"loss": 1.3876,
"step": 703
},
{
"epoch": 0.3577917540181691,
"grad_norm": 3.2124743461608887,
"learning_rate": 9.987301428696144e-06,
"loss": 1.2375,
"step": 704
},
{
"epoch": 0.35829998094149035,
"grad_norm": 3.352320671081543,
"learning_rate": 9.987181399646526e-06,
"loss": 1.4334,
"step": 705
},
{
"epoch": 0.35880820786481166,
"grad_norm": 3.2828238010406494,
"learning_rate": 9.987060806722727e-06,
"loss": 1.2911,
"step": 706
},
{
"epoch": 0.3593164347881329,
"grad_norm": 3.1434576511383057,
"learning_rate": 9.986939649938385e-06,
"loss": 1.3936,
"step": 707
},
{
"epoch": 0.35982466171145416,
"grad_norm": 3.1314871311187744,
"learning_rate": 9.986817929307194e-06,
"loss": 1.2858,
"step": 708
},
{
"epoch": 0.3603328886347754,
"grad_norm": 3.170621156692505,
"learning_rate": 9.986695644842918e-06,
"loss": 1.2604,
"step": 709
},
{
"epoch": 0.36084111555809667,
"grad_norm": 3.3497283458709717,
"learning_rate": 9.986572796559386e-06,
"loss": 1.2838,
"step": 710
},
{
"epoch": 0.361349342481418,
"grad_norm": 3.2710769176483154,
"learning_rate": 9.986449384470483e-06,
"loss": 1.315,
"step": 711
},
{
"epoch": 0.36185756940473923,
"grad_norm": 3.350487232208252,
"learning_rate": 9.986325408590165e-06,
"loss": 1.2497,
"step": 712
},
{
"epoch": 0.3623657963280605,
"grad_norm": 3.4346296787261963,
"learning_rate": 9.98620086893245e-06,
"loss": 1.3931,
"step": 713
},
{
"epoch": 0.36287402325138174,
"grad_norm": 3.1220874786376953,
"learning_rate": 9.986075765511417e-06,
"loss": 1.3431,
"step": 714
},
{
"epoch": 0.363382250174703,
"grad_norm": 3.2858989238739014,
"learning_rate": 9.985950098341213e-06,
"loss": 1.304,
"step": 715
},
{
"epoch": 0.36389047709802425,
"grad_norm": 3.1637048721313477,
"learning_rate": 9.985823867436045e-06,
"loss": 1.3185,
"step": 716
},
{
"epoch": 0.36439870402134555,
"grad_norm": 3.1585402488708496,
"learning_rate": 9.985697072810185e-06,
"loss": 1.3015,
"step": 717
},
{
"epoch": 0.3649069309446668,
"grad_norm": 3.1651861667633057,
"learning_rate": 9.98556971447797e-06,
"loss": 1.3635,
"step": 718
},
{
"epoch": 0.36541515786798806,
"grad_norm": 3.2013018131256104,
"learning_rate": 9.9854417924538e-06,
"loss": 1.381,
"step": 719
},
{
"epoch": 0.3659233847913093,
"grad_norm": 3.0635321140289307,
"learning_rate": 9.985313306752136e-06,
"loss": 1.2533,
"step": 720
},
{
"epoch": 0.36643161171463057,
"grad_norm": 2.983309507369995,
"learning_rate": 9.98518425738751e-06,
"loss": 1.2858,
"step": 721
},
{
"epoch": 0.3669398386379518,
"grad_norm": 3.1740927696228027,
"learning_rate": 9.985054644374509e-06,
"loss": 1.2483,
"step": 722
},
{
"epoch": 0.36744806556127313,
"grad_norm": 3.0193238258361816,
"learning_rate": 9.984924467727787e-06,
"loss": 1.3102,
"step": 723
},
{
"epoch": 0.3679562924845944,
"grad_norm": 3.6168391704559326,
"learning_rate": 9.984793727462065e-06,
"loss": 1.2824,
"step": 724
},
{
"epoch": 0.36846451940791564,
"grad_norm": 3.6449429988861084,
"learning_rate": 9.984662423592124e-06,
"loss": 1.4594,
"step": 725
},
{
"epoch": 0.3689727463312369,
"grad_norm": 3.096966505050659,
"learning_rate": 9.984530556132812e-06,
"loss": 1.2573,
"step": 726
},
{
"epoch": 0.36948097325455814,
"grad_norm": 3.231896162033081,
"learning_rate": 9.984398125099033e-06,
"loss": 1.2727,
"step": 727
},
{
"epoch": 0.3699892001778794,
"grad_norm": 3.1200449466705322,
"learning_rate": 9.984265130505766e-06,
"loss": 1.3387,
"step": 728
},
{
"epoch": 0.3704974271012007,
"grad_norm": 3.424175977706909,
"learning_rate": 9.984131572368045e-06,
"loss": 1.3011,
"step": 729
},
{
"epoch": 0.37100565402452196,
"grad_norm": 3.364169120788574,
"learning_rate": 9.983997450700973e-06,
"loss": 1.3665,
"step": 730
},
{
"epoch": 0.3715138809478432,
"grad_norm": 3.1565613746643066,
"learning_rate": 9.983862765519711e-06,
"loss": 1.2281,
"step": 731
},
{
"epoch": 0.37202210787116446,
"grad_norm": 3.174419403076172,
"learning_rate": 9.98372751683949e-06,
"loss": 1.3035,
"step": 732
},
{
"epoch": 0.3725303347944857,
"grad_norm": 2.9651894569396973,
"learning_rate": 9.983591704675602e-06,
"loss": 1.2217,
"step": 733
},
{
"epoch": 0.373038561717807,
"grad_norm": 3.3082499504089355,
"learning_rate": 9.9834553290434e-06,
"loss": 1.3253,
"step": 734
},
{
"epoch": 0.3735467886411283,
"grad_norm": 3.055314064025879,
"learning_rate": 9.983318389958305e-06,
"loss": 1.2681,
"step": 735
},
{
"epoch": 0.37405501556444953,
"grad_norm": 3.4626822471618652,
"learning_rate": 9.983180887435799e-06,
"loss": 1.2864,
"step": 736
},
{
"epoch": 0.3745632424877708,
"grad_norm": 2.935825824737549,
"learning_rate": 9.983042821491432e-06,
"loss": 1.1635,
"step": 737
},
{
"epoch": 0.37507146941109204,
"grad_norm": 3.4077136516571045,
"learning_rate": 9.982904192140808e-06,
"loss": 1.56,
"step": 738
},
{
"epoch": 0.3755796963344133,
"grad_norm": 3.5357930660247803,
"learning_rate": 9.982764999399607e-06,
"loss": 1.3316,
"step": 739
},
{
"epoch": 0.3760879232577346,
"grad_norm": 3.308767080307007,
"learning_rate": 9.982625243283566e-06,
"loss": 1.4096,
"step": 740
},
{
"epoch": 0.37659615018105586,
"grad_norm": 3.031561851501465,
"learning_rate": 9.982484923808484e-06,
"loss": 1.3236,
"step": 741
},
{
"epoch": 0.3771043771043771,
"grad_norm": 3.082707643508911,
"learning_rate": 9.982344040990226e-06,
"loss": 1.3657,
"step": 742
},
{
"epoch": 0.37761260402769836,
"grad_norm": 2.883720636367798,
"learning_rate": 9.982202594844723e-06,
"loss": 1.1881,
"step": 743
},
{
"epoch": 0.3781208309510196,
"grad_norm": 3.01926851272583,
"learning_rate": 9.982060585387968e-06,
"loss": 1.3477,
"step": 744
},
{
"epoch": 0.37862905787434087,
"grad_norm": 2.99509596824646,
"learning_rate": 9.981918012636015e-06,
"loss": 1.2324,
"step": 745
},
{
"epoch": 0.3791372847976622,
"grad_norm": 3.1339457035064697,
"learning_rate": 9.981774876604985e-06,
"loss": 1.2635,
"step": 746
},
{
"epoch": 0.37964551172098343,
"grad_norm": 3.1058597564697266,
"learning_rate": 9.981631177311061e-06,
"loss": 1.3046,
"step": 747
},
{
"epoch": 0.3801537386443047,
"grad_norm": 3.1269471645355225,
"learning_rate": 9.981486914770493e-06,
"loss": 1.2447,
"step": 748
},
{
"epoch": 0.38066196556762594,
"grad_norm": 3.224168539047241,
"learning_rate": 9.981342088999588e-06,
"loss": 1.2274,
"step": 749
},
{
"epoch": 0.3811701924909472,
"grad_norm": 3.2049806118011475,
"learning_rate": 9.981196700014724e-06,
"loss": 1.2978,
"step": 750
},
{
"epoch": 0.38167841941426844,
"grad_norm": 3.1496620178222656,
"learning_rate": 9.981050747832336e-06,
"loss": 1.273,
"step": 751
},
{
"epoch": 0.38218664633758975,
"grad_norm": 3.535106897354126,
"learning_rate": 9.98090423246893e-06,
"loss": 1.3022,
"step": 752
},
{
"epoch": 0.382694873260911,
"grad_norm": 3.1526551246643066,
"learning_rate": 9.980757153941069e-06,
"loss": 1.1942,
"step": 753
},
{
"epoch": 0.38320310018423226,
"grad_norm": 3.3968474864959717,
"learning_rate": 9.980609512265383e-06,
"loss": 1.3029,
"step": 754
},
{
"epoch": 0.3837113271075535,
"grad_norm": 3.6863186359405518,
"learning_rate": 9.980461307458564e-06,
"loss": 1.3164,
"step": 755
},
{
"epoch": 0.38421955403087477,
"grad_norm": 2.9728426933288574,
"learning_rate": 9.980312539537373e-06,
"loss": 1.2588,
"step": 756
},
{
"epoch": 0.3847277809541961,
"grad_norm": 3.1564176082611084,
"learning_rate": 9.980163208518626e-06,
"loss": 1.3021,
"step": 757
},
{
"epoch": 0.38523600787751733,
"grad_norm": 3.3139936923980713,
"learning_rate": 9.980013314419208e-06,
"loss": 1.2729,
"step": 758
},
{
"epoch": 0.3857442348008386,
"grad_norm": 3.0863771438598633,
"learning_rate": 9.979862857256066e-06,
"loss": 1.3166,
"step": 759
},
{
"epoch": 0.38625246172415983,
"grad_norm": 3.377894639968872,
"learning_rate": 9.979711837046212e-06,
"loss": 1.3912,
"step": 760
},
{
"epoch": 0.3867606886474811,
"grad_norm": 3.1915252208709717,
"learning_rate": 9.979560253806723e-06,
"loss": 1.3662,
"step": 761
},
{
"epoch": 0.38726891557080234,
"grad_norm": 3.0366125106811523,
"learning_rate": 9.979408107554738e-06,
"loss": 1.231,
"step": 762
},
{
"epoch": 0.38777714249412365,
"grad_norm": 3.1416783332824707,
"learning_rate": 9.979255398307457e-06,
"loss": 1.2466,
"step": 763
},
{
"epoch": 0.3882853694174449,
"grad_norm": 2.884857416152954,
"learning_rate": 9.979102126082145e-06,
"loss": 1.2442,
"step": 764
},
{
"epoch": 0.38879359634076616,
"grad_norm": 3.1883974075317383,
"learning_rate": 9.978948290896134e-06,
"loss": 1.3042,
"step": 765
},
{
"epoch": 0.3893018232640874,
"grad_norm": 3.1092233657836914,
"learning_rate": 9.978793892766817e-06,
"loss": 1.3102,
"step": 766
},
{
"epoch": 0.38981005018740866,
"grad_norm": 3.001688241958618,
"learning_rate": 9.978638931711651e-06,
"loss": 1.3254,
"step": 767
},
{
"epoch": 0.3903182771107299,
"grad_norm": 3.205700635910034,
"learning_rate": 9.978483407748154e-06,
"loss": 1.3245,
"step": 768
},
{
"epoch": 0.3908265040340512,
"grad_norm": 3.2046477794647217,
"learning_rate": 9.978327320893915e-06,
"loss": 1.2614,
"step": 769
},
{
"epoch": 0.3913347309573725,
"grad_norm": 3.1941304206848145,
"learning_rate": 9.978170671166578e-06,
"loss": 1.353,
"step": 770
},
{
"epoch": 0.39184295788069373,
"grad_norm": 3.317028522491455,
"learning_rate": 9.978013458583857e-06,
"loss": 1.2896,
"step": 771
},
{
"epoch": 0.392351184804015,
"grad_norm": 3.0112125873565674,
"learning_rate": 9.977855683163526e-06,
"loss": 1.276,
"step": 772
},
{
"epoch": 0.39285941172733624,
"grad_norm": 3.0274596214294434,
"learning_rate": 9.977697344923425e-06,
"loss": 1.2585,
"step": 773
},
{
"epoch": 0.3933676386506575,
"grad_norm": 2.992523193359375,
"learning_rate": 9.977538443881454e-06,
"loss": 1.28,
"step": 774
},
{
"epoch": 0.3938758655739788,
"grad_norm": 3.1852054595947266,
"learning_rate": 9.97737898005558e-06,
"loss": 1.3497,
"step": 775
},
{
"epoch": 0.39438409249730005,
"grad_norm": 3.218014717102051,
"learning_rate": 9.977218953463836e-06,
"loss": 1.2833,
"step": 776
},
{
"epoch": 0.3948923194206213,
"grad_norm": 2.910120725631714,
"learning_rate": 9.97705836412431e-06,
"loss": 1.2687,
"step": 777
},
{
"epoch": 0.39540054634394256,
"grad_norm": 3.407662868499756,
"learning_rate": 9.976897212055164e-06,
"loss": 1.3764,
"step": 778
},
{
"epoch": 0.3959087732672638,
"grad_norm": 3.326226234436035,
"learning_rate": 9.976735497274615e-06,
"loss": 1.3304,
"step": 779
},
{
"epoch": 0.3964170001905851,
"grad_norm": 2.9093177318573,
"learning_rate": 9.976573219800948e-06,
"loss": 1.277,
"step": 780
},
{
"epoch": 0.3969252271139064,
"grad_norm": 3.1852495670318604,
"learning_rate": 9.976410379652512e-06,
"loss": 1.3158,
"step": 781
},
{
"epoch": 0.39743345403722763,
"grad_norm": 3.149109125137329,
"learning_rate": 9.97624697684772e-06,
"loss": 1.2381,
"step": 782
},
{
"epoch": 0.3979416809605489,
"grad_norm": 3.0496628284454346,
"learning_rate": 9.976083011405042e-06,
"loss": 1.2591,
"step": 783
},
{
"epoch": 0.39844990788387014,
"grad_norm": 2.9263885021209717,
"learning_rate": 9.975918483343022e-06,
"loss": 1.2457,
"step": 784
},
{
"epoch": 0.3989581348071914,
"grad_norm": 2.949040412902832,
"learning_rate": 9.975753392680258e-06,
"loss": 1.2433,
"step": 785
},
{
"epoch": 0.3994663617305127,
"grad_norm": 3.1974003314971924,
"learning_rate": 9.975587739435418e-06,
"loss": 1.2861,
"step": 786
},
{
"epoch": 0.39997458865383395,
"grad_norm": 3.366123914718628,
"learning_rate": 9.975421523627232e-06,
"loss": 1.2619,
"step": 787
},
{
"epoch": 0.4004828155771552,
"grad_norm": 3.0037221908569336,
"learning_rate": 9.975254745274492e-06,
"loss": 1.3039,
"step": 788
},
{
"epoch": 0.40099104250047646,
"grad_norm": 3.247976303100586,
"learning_rate": 9.975087404396057e-06,
"loss": 1.3495,
"step": 789
},
{
"epoch": 0.4014992694237977,
"grad_norm": 2.977108955383301,
"learning_rate": 9.974919501010844e-06,
"loss": 1.1731,
"step": 790
},
{
"epoch": 0.40200749634711896,
"grad_norm": 3.743683099746704,
"learning_rate": 9.97475103513784e-06,
"loss": 1.4369,
"step": 791
},
{
"epoch": 0.4025157232704403,
"grad_norm": 3.533647298812866,
"learning_rate": 9.97458200679609e-06,
"loss": 1.3664,
"step": 792
},
{
"epoch": 0.4030239501937615,
"grad_norm": 3.04760479927063,
"learning_rate": 9.974412416004706e-06,
"loss": 1.1608,
"step": 793
},
{
"epoch": 0.4035321771170828,
"grad_norm": 3.0548715591430664,
"learning_rate": 9.974242262782865e-06,
"loss": 1.1694,
"step": 794
},
{
"epoch": 0.40404040404040403,
"grad_norm": 2.859910726547241,
"learning_rate": 9.974071547149801e-06,
"loss": 1.2936,
"step": 795
},
{
"epoch": 0.4045486309637253,
"grad_norm": 3.3869526386260986,
"learning_rate": 9.973900269124818e-06,
"loss": 1.4214,
"step": 796
},
{
"epoch": 0.40505685788704654,
"grad_norm": 3.380077600479126,
"learning_rate": 9.973728428727284e-06,
"loss": 1.3634,
"step": 797
},
{
"epoch": 0.40556508481036785,
"grad_norm": 3.0257716178894043,
"learning_rate": 9.973556025976625e-06,
"loss": 1.2793,
"step": 798
},
{
"epoch": 0.4060733117336891,
"grad_norm": 3.1302125453948975,
"learning_rate": 9.973383060892335e-06,
"loss": 1.3027,
"step": 799
},
{
"epoch": 0.40658153865701036,
"grad_norm": 3.309006690979004,
"learning_rate": 9.973209533493969e-06,
"loss": 1.2625,
"step": 800
},
{
"epoch": 0.4070897655803316,
"grad_norm": 3.024994373321533,
"learning_rate": 9.973035443801147e-06,
"loss": 1.2243,
"step": 801
},
{
"epoch": 0.40759799250365286,
"grad_norm": 3.1751198768615723,
"learning_rate": 9.972860791833555e-06,
"loss": 1.2211,
"step": 802
},
{
"epoch": 0.40810621942697417,
"grad_norm": 3.170717716217041,
"learning_rate": 9.972685577610936e-06,
"loss": 1.2553,
"step": 803
},
{
"epoch": 0.4086144463502954,
"grad_norm": 3.22538161277771,
"learning_rate": 9.972509801153102e-06,
"loss": 1.2277,
"step": 804
},
{
"epoch": 0.4091226732736167,
"grad_norm": 3.1638424396514893,
"learning_rate": 9.972333462479931e-06,
"loss": 1.2627,
"step": 805
},
{
"epoch": 0.40963090019693793,
"grad_norm": 2.9831383228302,
"learning_rate": 9.972156561611354e-06,
"loss": 1.2155,
"step": 806
},
{
"epoch": 0.4101391271202592,
"grad_norm": 3.119858980178833,
"learning_rate": 9.971979098567377e-06,
"loss": 1.198,
"step": 807
},
{
"epoch": 0.41064735404358044,
"grad_norm": 3.1125288009643555,
"learning_rate": 9.971801073368062e-06,
"loss": 1.2545,
"step": 808
},
{
"epoch": 0.41115558096690175,
"grad_norm": 3.114292621612549,
"learning_rate": 9.97162248603354e-06,
"loss": 1.2087,
"step": 809
},
{
"epoch": 0.411663807890223,
"grad_norm": 3.1119182109832764,
"learning_rate": 9.971443336584002e-06,
"loss": 1.2883,
"step": 810
},
{
"epoch": 0.41217203481354425,
"grad_norm": 3.3735485076904297,
"learning_rate": 9.971263625039702e-06,
"loss": 1.2603,
"step": 811
},
{
"epoch": 0.4126802617368655,
"grad_norm": 3.0008327960968018,
"learning_rate": 9.97108335142096e-06,
"loss": 1.3121,
"step": 812
},
{
"epoch": 0.41318848866018676,
"grad_norm": 3.1853764057159424,
"learning_rate": 9.97090251574816e-06,
"loss": 1.2841,
"step": 813
},
{
"epoch": 0.413696715583508,
"grad_norm": 3.3970205783843994,
"learning_rate": 9.970721118041746e-06,
"loss": 1.3358,
"step": 814
},
{
"epoch": 0.4142049425068293,
"grad_norm": 3.416800022125244,
"learning_rate": 9.970539158322229e-06,
"loss": 1.3436,
"step": 815
},
{
"epoch": 0.4147131694301506,
"grad_norm": 2.908444404602051,
"learning_rate": 9.970356636610181e-06,
"loss": 1.3395,
"step": 816
},
{
"epoch": 0.4152213963534718,
"grad_norm": 3.0709686279296875,
"learning_rate": 9.97017355292624e-06,
"loss": 1.298,
"step": 817
},
{
"epoch": 0.4157296232767931,
"grad_norm": 3.745266914367676,
"learning_rate": 9.969989907291106e-06,
"loss": 1.2785,
"step": 818
},
{
"epoch": 0.41623785020011433,
"grad_norm": 2.99845290184021,
"learning_rate": 9.969805699725542e-06,
"loss": 1.2763,
"step": 819
},
{
"epoch": 0.4167460771234356,
"grad_norm": 3.5009357929229736,
"learning_rate": 9.969620930250377e-06,
"loss": 1.4035,
"step": 820
},
{
"epoch": 0.4172543040467569,
"grad_norm": 3.1333866119384766,
"learning_rate": 9.9694355988865e-06,
"loss": 1.2492,
"step": 821
},
{
"epoch": 0.41776253097007815,
"grad_norm": 3.015458583831787,
"learning_rate": 9.969249705654866e-06,
"loss": 1.3015,
"step": 822
},
{
"epoch": 0.4182707578933994,
"grad_norm": 2.9285178184509277,
"learning_rate": 9.969063250576494e-06,
"loss": 1.2905,
"step": 823
},
{
"epoch": 0.41877898481672066,
"grad_norm": 3.2691152095794678,
"learning_rate": 9.968876233672466e-06,
"loss": 1.2708,
"step": 824
},
{
"epoch": 0.4192872117400419,
"grad_norm": 3.1857168674468994,
"learning_rate": 9.968688654963926e-06,
"loss": 1.2818,
"step": 825
},
{
"epoch": 0.4197954386633632,
"grad_norm": 3.2709298133850098,
"learning_rate": 9.96850051447208e-06,
"loss": 1.2403,
"step": 826
},
{
"epoch": 0.42030366558668447,
"grad_norm": 3.037520170211792,
"learning_rate": 9.968311812218203e-06,
"loss": 1.2857,
"step": 827
},
{
"epoch": 0.4208118925100057,
"grad_norm": 3.4567365646362305,
"learning_rate": 9.96812254822363e-06,
"loss": 1.3809,
"step": 828
},
{
"epoch": 0.421320119433327,
"grad_norm": 3.0860140323638916,
"learning_rate": 9.967932722509762e-06,
"loss": 1.3025,
"step": 829
},
{
"epoch": 0.42182834635664823,
"grad_norm": 3.1566691398620605,
"learning_rate": 9.967742335098058e-06,
"loss": 1.3849,
"step": 830
},
{
"epoch": 0.4223365732799695,
"grad_norm": 3.086601734161377,
"learning_rate": 9.967551386010046e-06,
"loss": 1.335,
"step": 831
},
{
"epoch": 0.4228448002032908,
"grad_norm": 3.1381146907806396,
"learning_rate": 9.967359875267315e-06,
"loss": 1.1581,
"step": 832
},
{
"epoch": 0.42335302712661205,
"grad_norm": 3.1009199619293213,
"learning_rate": 9.967167802891519e-06,
"loss": 1.2917,
"step": 833
},
{
"epoch": 0.4238612540499333,
"grad_norm": 3.1351935863494873,
"learning_rate": 9.966975168904373e-06,
"loss": 1.3964,
"step": 834
},
{
"epoch": 0.42436948097325455,
"grad_norm": 2.7338829040527344,
"learning_rate": 9.966781973327661e-06,
"loss": 1.239,
"step": 835
},
{
"epoch": 0.4248777078965758,
"grad_norm": 3.2059786319732666,
"learning_rate": 9.966588216183221e-06,
"loss": 1.1639,
"step": 836
},
{
"epoch": 0.42538593481989706,
"grad_norm": 3.4231512546539307,
"learning_rate": 9.966393897492962e-06,
"loss": 1.319,
"step": 837
},
{
"epoch": 0.42589416174321837,
"grad_norm": 3.154146909713745,
"learning_rate": 9.966199017278859e-06,
"loss": 1.1938,
"step": 838
},
{
"epoch": 0.4264023886665396,
"grad_norm": 3.007706642150879,
"learning_rate": 9.96600357556294e-06,
"loss": 1.3219,
"step": 839
},
{
"epoch": 0.4269106155898609,
"grad_norm": 3.235159397125244,
"learning_rate": 9.965807572367306e-06,
"loss": 1.3359,
"step": 840
},
{
"epoch": 0.42741884251318213,
"grad_norm": 3.1410484313964844,
"learning_rate": 9.965611007714117e-06,
"loss": 1.3004,
"step": 841
},
{
"epoch": 0.4279270694365034,
"grad_norm": 3.1803131103515625,
"learning_rate": 9.965413881625597e-06,
"loss": 1.2798,
"step": 842
},
{
"epoch": 0.42843529635982464,
"grad_norm": 2.8185393810272217,
"learning_rate": 9.965216194124035e-06,
"loss": 1.2421,
"step": 843
},
{
"epoch": 0.42894352328314594,
"grad_norm": 3.0903513431549072,
"learning_rate": 9.965017945231783e-06,
"loss": 1.3236,
"step": 844
},
{
"epoch": 0.4294517502064672,
"grad_norm": 3.6765925884246826,
"learning_rate": 9.964819134971255e-06,
"loss": 1.3905,
"step": 845
},
{
"epoch": 0.42995997712978845,
"grad_norm": 3.137418031692505,
"learning_rate": 9.964619763364928e-06,
"loss": 1.173,
"step": 846
},
{
"epoch": 0.4304682040531097,
"grad_norm": 2.982210159301758,
"learning_rate": 9.964419830435346e-06,
"loss": 1.2189,
"step": 847
},
{
"epoch": 0.43097643097643096,
"grad_norm": 3.154118776321411,
"learning_rate": 9.964219336205114e-06,
"loss": 1.2155,
"step": 848
},
{
"epoch": 0.43148465789975227,
"grad_norm": 3.562628984451294,
"learning_rate": 9.9640182806969e-06,
"loss": 1.3424,
"step": 849
},
{
"epoch": 0.4319928848230735,
"grad_norm": 3.1238515377044678,
"learning_rate": 9.963816663933438e-06,
"loss": 1.3475,
"step": 850
},
{
"epoch": 0.4325011117463948,
"grad_norm": 3.4061684608459473,
"learning_rate": 9.963614485937522e-06,
"loss": 1.3098,
"step": 851
},
{
"epoch": 0.433009338669716,
"grad_norm": 2.9898059368133545,
"learning_rate": 9.963411746732012e-06,
"loss": 1.2531,
"step": 852
},
{
"epoch": 0.4335175655930373,
"grad_norm": 2.9392600059509277,
"learning_rate": 9.963208446339829e-06,
"loss": 1.2618,
"step": 853
},
{
"epoch": 0.43402579251635853,
"grad_norm": 3.1422648429870605,
"learning_rate": 9.963004584783961e-06,
"loss": 1.3015,
"step": 854
},
{
"epoch": 0.43453401943967984,
"grad_norm": 3.061648368835449,
"learning_rate": 9.962800162087458e-06,
"loss": 1.2793,
"step": 855
},
{
"epoch": 0.4350422463630011,
"grad_norm": 3.354825496673584,
"learning_rate": 9.962595178273432e-06,
"loss": 1.2846,
"step": 856
},
{
"epoch": 0.43555047328632235,
"grad_norm": 3.2007317543029785,
"learning_rate": 9.962389633365059e-06,
"loss": 1.246,
"step": 857
},
{
"epoch": 0.4360587002096436,
"grad_norm": 3.1026949882507324,
"learning_rate": 9.96218352738558e-06,
"loss": 1.2304,
"step": 858
},
{
"epoch": 0.43656692713296485,
"grad_norm": 3.2969212532043457,
"learning_rate": 9.961976860358298e-06,
"loss": 1.1946,
"step": 859
},
{
"epoch": 0.4370751540562861,
"grad_norm": 3.416917324066162,
"learning_rate": 9.961769632306579e-06,
"loss": 1.2282,
"step": 860
},
{
"epoch": 0.4375833809796074,
"grad_norm": 3.0532281398773193,
"learning_rate": 9.961561843253853e-06,
"loss": 1.2293,
"step": 861
},
{
"epoch": 0.43809160790292867,
"grad_norm": 3.875426769256592,
"learning_rate": 9.961353493223613e-06,
"loss": 1.3623,
"step": 862
},
{
"epoch": 0.4385998348262499,
"grad_norm": 3.1366961002349854,
"learning_rate": 9.961144582239418e-06,
"loss": 1.1868,
"step": 863
},
{
"epoch": 0.4391080617495712,
"grad_norm": 3.866417646408081,
"learning_rate": 9.96093511032489e-06,
"loss": 1.3833,
"step": 864
},
{
"epoch": 0.43961628867289243,
"grad_norm": 3.051649808883667,
"learning_rate": 9.96072507750371e-06,
"loss": 1.2557,
"step": 865
},
{
"epoch": 0.4401245155962137,
"grad_norm": 3.038184881210327,
"learning_rate": 9.960514483799624e-06,
"loss": 1.267,
"step": 866
},
{
"epoch": 0.440632742519535,
"grad_norm": 3.4575061798095703,
"learning_rate": 9.960303329236447e-06,
"loss": 1.4039,
"step": 867
},
{
"epoch": 0.44114096944285625,
"grad_norm": 3.2219109535217285,
"learning_rate": 9.960091613838048e-06,
"loss": 1.3335,
"step": 868
},
{
"epoch": 0.4416491963661775,
"grad_norm": 3.134032964706421,
"learning_rate": 9.959879337628368e-06,
"loss": 1.3197,
"step": 869
},
{
"epoch": 0.44215742328949875,
"grad_norm": 3.1833622455596924,
"learning_rate": 9.95966650063141e-06,
"loss": 1.2531,
"step": 870
},
{
"epoch": 0.44266565021282,
"grad_norm": 2.999913215637207,
"learning_rate": 9.959453102871231e-06,
"loss": 1.1841,
"step": 871
},
{
"epoch": 0.4431738771361413,
"grad_norm": 3.2226994037628174,
"learning_rate": 9.959239144371966e-06,
"loss": 1.302,
"step": 872
},
{
"epoch": 0.44368210405946257,
"grad_norm": 3.1408486366271973,
"learning_rate": 9.959024625157804e-06,
"loss": 1.2729,
"step": 873
},
{
"epoch": 0.4441903309827838,
"grad_norm": 3.2160913944244385,
"learning_rate": 9.958809545252997e-06,
"loss": 1.266,
"step": 874
},
{
"epoch": 0.4446985579061051,
"grad_norm": 3.3626604080200195,
"learning_rate": 9.958593904681866e-06,
"loss": 1.3973,
"step": 875
},
{
"epoch": 0.4452067848294263,
"grad_norm": 3.3469786643981934,
"learning_rate": 9.958377703468792e-06,
"loss": 1.282,
"step": 876
},
{
"epoch": 0.4457150117527476,
"grad_norm": 3.2448103427886963,
"learning_rate": 9.95816094163822e-06,
"loss": 1.2757,
"step": 877
},
{
"epoch": 0.4462232386760689,
"grad_norm": 4.24213171005249,
"learning_rate": 9.957943619214653e-06,
"loss": 1.3377,
"step": 878
},
{
"epoch": 0.44673146559939014,
"grad_norm": 3.2333717346191406,
"learning_rate": 9.95772573622267e-06,
"loss": 1.3042,
"step": 879
},
{
"epoch": 0.4472396925227114,
"grad_norm": 3.0316765308380127,
"learning_rate": 9.957507292686902e-06,
"loss": 1.3528,
"step": 880
},
{
"epoch": 0.44774791944603265,
"grad_norm": 2.985063314437866,
"learning_rate": 9.957288288632048e-06,
"loss": 1.2457,
"step": 881
},
{
"epoch": 0.4482561463693539,
"grad_norm": 2.8933520317077637,
"learning_rate": 9.957068724082868e-06,
"loss": 1.2641,
"step": 882
},
{
"epoch": 0.44876437329267516,
"grad_norm": 3.3127031326293945,
"learning_rate": 9.95684859906419e-06,
"loss": 1.3061,
"step": 883
},
{
"epoch": 0.44927260021599646,
"grad_norm": 3.223618984222412,
"learning_rate": 9.9566279136009e-06,
"loss": 1.3157,
"step": 884
},
{
"epoch": 0.4497808271393177,
"grad_norm": 2.9213273525238037,
"learning_rate": 9.956406667717951e-06,
"loss": 1.307,
"step": 885
},
{
"epoch": 0.45028905406263897,
"grad_norm": 3.295760154724121,
"learning_rate": 9.956184861440357e-06,
"loss": 1.1735,
"step": 886
},
{
"epoch": 0.4507972809859602,
"grad_norm": 3.401263952255249,
"learning_rate": 9.955962494793197e-06,
"loss": 1.3738,
"step": 887
},
{
"epoch": 0.4513055079092815,
"grad_norm": 2.9773504734039307,
"learning_rate": 9.955739567801613e-06,
"loss": 1.229,
"step": 888
},
{
"epoch": 0.45181373483260273,
"grad_norm": 6.719383239746094,
"learning_rate": 9.95551608049081e-06,
"loss": 1.4442,
"step": 889
},
{
"epoch": 0.45232196175592404,
"grad_norm": 3.0398476123809814,
"learning_rate": 9.955292032886057e-06,
"loss": 1.2627,
"step": 890
},
{
"epoch": 0.4528301886792453,
"grad_norm": 2.933922290802002,
"learning_rate": 9.955067425012685e-06,
"loss": 1.2333,
"step": 891
},
{
"epoch": 0.45333841560256655,
"grad_norm": 3.1984505653381348,
"learning_rate": 9.95484225689609e-06,
"loss": 1.3416,
"step": 892
},
{
"epoch": 0.4538466425258878,
"grad_norm": 3.189798593521118,
"learning_rate": 9.95461652856173e-06,
"loss": 1.2928,
"step": 893
},
{
"epoch": 0.45435486944920905,
"grad_norm": 3.028228759765625,
"learning_rate": 9.954390240035127e-06,
"loss": 1.2474,
"step": 894
},
{
"epoch": 0.45486309637253036,
"grad_norm": 3.0100460052490234,
"learning_rate": 9.954163391341867e-06,
"loss": 1.2952,
"step": 895
},
{
"epoch": 0.4553713232958516,
"grad_norm": 3.1047329902648926,
"learning_rate": 9.953935982507597e-06,
"loss": 1.2254,
"step": 896
},
{
"epoch": 0.45587955021917287,
"grad_norm": 3.1082210540771484,
"learning_rate": 9.95370801355803e-06,
"loss": 1.1121,
"step": 897
},
{
"epoch": 0.4563877771424941,
"grad_norm": 3.420098304748535,
"learning_rate": 9.953479484518943e-06,
"loss": 1.221,
"step": 898
},
{
"epoch": 0.4568960040658154,
"grad_norm": 3.4203615188598633,
"learning_rate": 9.953250395416172e-06,
"loss": 1.2991,
"step": 899
},
{
"epoch": 0.45740423098913663,
"grad_norm": 3.020646572113037,
"learning_rate": 9.953020746275618e-06,
"loss": 1.2723,
"step": 900
},
{
"epoch": 0.45791245791245794,
"grad_norm": 3.2635576725006104,
"learning_rate": 9.95279053712325e-06,
"loss": 1.3714,
"step": 901
},
{
"epoch": 0.4584206848357792,
"grad_norm": 2.987079381942749,
"learning_rate": 9.952559767985093e-06,
"loss": 1.2517,
"step": 902
},
{
"epoch": 0.45892891175910044,
"grad_norm": 2.9069972038269043,
"learning_rate": 9.95232843888724e-06,
"loss": 1.2647,
"step": 903
},
{
"epoch": 0.4594371386824217,
"grad_norm": 3.121272087097168,
"learning_rate": 9.952096549855846e-06,
"loss": 1.3379,
"step": 904
},
{
"epoch": 0.45994536560574295,
"grad_norm": 2.9536068439483643,
"learning_rate": 9.95186410091713e-06,
"loss": 1.2483,
"step": 905
},
{
"epoch": 0.4604535925290642,
"grad_norm": 3.0364537239074707,
"learning_rate": 9.951631092097373e-06,
"loss": 1.2642,
"step": 906
},
{
"epoch": 0.4609618194523855,
"grad_norm": 3.0341713428497314,
"learning_rate": 9.951397523422923e-06,
"loss": 1.3138,
"step": 907
},
{
"epoch": 0.46147004637570677,
"grad_norm": 3.261298656463623,
"learning_rate": 9.951163394920185e-06,
"loss": 1.286,
"step": 908
},
{
"epoch": 0.461978273299028,
"grad_norm": 3.1730971336364746,
"learning_rate": 9.95092870661563e-06,
"loss": 1.2841,
"step": 909
},
{
"epoch": 0.4624865002223493,
"grad_norm": 3.372532606124878,
"learning_rate": 9.950693458535796e-06,
"loss": 1.3713,
"step": 910
},
{
"epoch": 0.4629947271456705,
"grad_norm": 3.6603589057922363,
"learning_rate": 9.950457650707281e-06,
"loss": 1.3572,
"step": 911
},
{
"epoch": 0.4635029540689918,
"grad_norm": 3.153555154800415,
"learning_rate": 9.950221283156744e-06,
"loss": 1.3132,
"step": 912
},
{
"epoch": 0.4640111809923131,
"grad_norm": 2.9425718784332275,
"learning_rate": 9.94998435591091e-06,
"loss": 1.1842,
"step": 913
},
{
"epoch": 0.46451940791563434,
"grad_norm": 3.12605357170105,
"learning_rate": 9.94974686899657e-06,
"loss": 1.3627,
"step": 914
},
{
"epoch": 0.4650276348389556,
"grad_norm": 3.0458600521087646,
"learning_rate": 9.949508822440574e-06,
"loss": 1.2577,
"step": 915
},
{
"epoch": 0.46553586176227685,
"grad_norm": 3.2679193019866943,
"learning_rate": 9.949270216269837e-06,
"loss": 1.2647,
"step": 916
},
{
"epoch": 0.4660440886855981,
"grad_norm": 3.032907724380493,
"learning_rate": 9.949031050511335e-06,
"loss": 1.2442,
"step": 917
},
{
"epoch": 0.4665523156089194,
"grad_norm": 3.104398727416992,
"learning_rate": 9.94879132519211e-06,
"loss": 1.3335,
"step": 918
},
{
"epoch": 0.46706054253224066,
"grad_norm": 3.429504632949829,
"learning_rate": 9.948551040339269e-06,
"loss": 1.3438,
"step": 919
},
{
"epoch": 0.4675687694555619,
"grad_norm": 3.1915969848632812,
"learning_rate": 9.948310195979976e-06,
"loss": 1.2604,
"step": 920
},
{
"epoch": 0.46807699637888317,
"grad_norm": 3.0310678482055664,
"learning_rate": 9.948068792141465e-06,
"loss": 1.253,
"step": 921
},
{
"epoch": 0.4685852233022044,
"grad_norm": 3.172191858291626,
"learning_rate": 9.947826828851029e-06,
"loss": 1.2546,
"step": 922
},
{
"epoch": 0.4690934502255257,
"grad_norm": 3.4849483966827393,
"learning_rate": 9.947584306136024e-06,
"loss": 1.2744,
"step": 923
},
{
"epoch": 0.469601677148847,
"grad_norm": 3.4134442806243896,
"learning_rate": 9.947341224023875e-06,
"loss": 1.4603,
"step": 924
},
{
"epoch": 0.47010990407216824,
"grad_norm": 3.0923573970794678,
"learning_rate": 9.94709758254206e-06,
"loss": 1.3375,
"step": 925
},
{
"epoch": 0.4706181309954895,
"grad_norm": 3.329230546951294,
"learning_rate": 9.946853381718133e-06,
"loss": 1.1899,
"step": 926
},
{
"epoch": 0.47112635791881075,
"grad_norm": 2.9873125553131104,
"learning_rate": 9.946608621579698e-06,
"loss": 1.3432,
"step": 927
},
{
"epoch": 0.471634584842132,
"grad_norm": 3.530097723007202,
"learning_rate": 9.946363302154434e-06,
"loss": 1.1975,
"step": 928
},
{
"epoch": 0.47214281176545325,
"grad_norm": 3.5325372219085693,
"learning_rate": 9.946117423470074e-06,
"loss": 1.2736,
"step": 929
},
{
"epoch": 0.47265103868877456,
"grad_norm": 3.143618106842041,
"learning_rate": 9.94587098555442e-06,
"loss": 1.3366,
"step": 930
},
{
"epoch": 0.4731592656120958,
"grad_norm": 3.117429256439209,
"learning_rate": 9.945623988435336e-06,
"loss": 1.3636,
"step": 931
},
{
"epoch": 0.47366749253541707,
"grad_norm": 3.4205844402313232,
"learning_rate": 9.94537643214075e-06,
"loss": 1.3578,
"step": 932
},
{
"epoch": 0.4741757194587383,
"grad_norm": 3.8048481941223145,
"learning_rate": 9.945128316698647e-06,
"loss": 1.4087,
"step": 933
},
{
"epoch": 0.4746839463820596,
"grad_norm": 4.365840435028076,
"learning_rate": 9.944879642137085e-06,
"loss": 1.1789,
"step": 934
},
{
"epoch": 0.4751921733053808,
"grad_norm": 3.3367462158203125,
"learning_rate": 9.944630408484177e-06,
"loss": 1.2769,
"step": 935
},
{
"epoch": 0.47570040022870214,
"grad_norm": 3.1642816066741943,
"learning_rate": 9.944380615768104e-06,
"loss": 1.3854,
"step": 936
},
{
"epoch": 0.4762086271520234,
"grad_norm": 3.0635826587677,
"learning_rate": 9.944130264017109e-06,
"loss": 1.2968,
"step": 937
},
{
"epoch": 0.47671685407534464,
"grad_norm": 3.5414836406707764,
"learning_rate": 9.943879353259496e-06,
"loss": 1.2829,
"step": 938
},
{
"epoch": 0.4772250809986659,
"grad_norm": 2.936600923538208,
"learning_rate": 9.943627883523638e-06,
"loss": 1.2875,
"step": 939
},
{
"epoch": 0.47773330792198715,
"grad_norm": 3.4069905281066895,
"learning_rate": 9.943375854837963e-06,
"loss": 1.3088,
"step": 940
},
{
"epoch": 0.4782415348453084,
"grad_norm": 2.994814872741699,
"learning_rate": 9.94312326723097e-06,
"loss": 1.25,
"step": 941
},
{
"epoch": 0.4787497617686297,
"grad_norm": 3.145922899246216,
"learning_rate": 9.942870120731217e-06,
"loss": 1.1929,
"step": 942
},
{
"epoch": 0.47925798869195096,
"grad_norm": 2.976090908050537,
"learning_rate": 9.942616415367323e-06,
"loss": 1.2835,
"step": 943
},
{
"epoch": 0.4797662156152722,
"grad_norm": 3.158318281173706,
"learning_rate": 9.942362151167977e-06,
"loss": 1.3596,
"step": 944
},
{
"epoch": 0.48027444253859347,
"grad_norm": 3.21836519241333,
"learning_rate": 9.942107328161926e-06,
"loss": 1.3446,
"step": 945
},
{
"epoch": 0.4807826694619147,
"grad_norm": 2.979194402694702,
"learning_rate": 9.941851946377979e-06,
"loss": 1.2835,
"step": 946
},
{
"epoch": 0.48129089638523603,
"grad_norm": 3.823063850402832,
"learning_rate": 9.941596005845014e-06,
"loss": 1.2849,
"step": 947
},
{
"epoch": 0.4817991233085573,
"grad_norm": 3.020623207092285,
"learning_rate": 9.941339506591968e-06,
"loss": 1.3398,
"step": 948
},
{
"epoch": 0.48230735023187854,
"grad_norm": 3.188835382461548,
"learning_rate": 9.941082448647842e-06,
"loss": 1.3944,
"step": 949
},
{
"epoch": 0.4828155771551998,
"grad_norm": 3.160069704055786,
"learning_rate": 9.9408248320417e-06,
"loss": 1.264,
"step": 950
},
{
"epoch": 0.48332380407852105,
"grad_norm": 2.99892258644104,
"learning_rate": 9.940566656802667e-06,
"loss": 1.2279,
"step": 951
},
{
"epoch": 0.4838320310018423,
"grad_norm": 3.09138560295105,
"learning_rate": 9.940307922959938e-06,
"loss": 1.2021,
"step": 952
},
{
"epoch": 0.4843402579251636,
"grad_norm": 2.999363660812378,
"learning_rate": 9.940048630542765e-06,
"loss": 1.2779,
"step": 953
},
{
"epoch": 0.48484848484848486,
"grad_norm": 3.062927484512329,
"learning_rate": 9.93978877958046e-06,
"loss": 1.2581,
"step": 954
},
{
"epoch": 0.4853567117718061,
"grad_norm": 3.0736305713653564,
"learning_rate": 9.939528370102412e-06,
"loss": 1.2768,
"step": 955
},
{
"epoch": 0.48586493869512737,
"grad_norm": 3.21579647064209,
"learning_rate": 9.939267402138058e-06,
"loss": 1.2204,
"step": 956
},
{
"epoch": 0.4863731656184486,
"grad_norm": 3.127753973007202,
"learning_rate": 9.939005875716904e-06,
"loss": 1.2109,
"step": 957
},
{
"epoch": 0.4868813925417699,
"grad_norm": 3.4368927478790283,
"learning_rate": 9.938743790868523e-06,
"loss": 1.3368,
"step": 958
},
{
"epoch": 0.4873896194650912,
"grad_norm": 3.072741985321045,
"learning_rate": 9.938481147622545e-06,
"loss": 1.2094,
"step": 959
},
{
"epoch": 0.48789784638841244,
"grad_norm": 3.4925167560577393,
"learning_rate": 9.938217946008665e-06,
"loss": 1.3443,
"step": 960
},
{
"epoch": 0.4884060733117337,
"grad_norm": 3.1357178688049316,
"learning_rate": 9.937954186056644e-06,
"loss": 1.2344,
"step": 961
},
{
"epoch": 0.48891430023505494,
"grad_norm": 2.8915724754333496,
"learning_rate": 9.937689867796303e-06,
"loss": 1.2941,
"step": 962
},
{
"epoch": 0.4894225271583762,
"grad_norm": 2.945512533187866,
"learning_rate": 9.937424991257526e-06,
"loss": 1.3199,
"step": 963
},
{
"epoch": 0.48993075408169745,
"grad_norm": 3.0827341079711914,
"learning_rate": 9.937159556470263e-06,
"loss": 1.2625,
"step": 964
},
{
"epoch": 0.49043898100501876,
"grad_norm": 2.878173828125,
"learning_rate": 9.936893563464525e-06,
"loss": 1.3022,
"step": 965
},
{
"epoch": 0.49094720792834,
"grad_norm": 2.975311040878296,
"learning_rate": 9.936627012270385e-06,
"loss": 1.2563,
"step": 966
},
{
"epoch": 0.49145543485166127,
"grad_norm": 3.058943510055542,
"learning_rate": 9.93635990291798e-06,
"loss": 1.2574,
"step": 967
},
{
"epoch": 0.4919636617749825,
"grad_norm": 3.2917304039001465,
"learning_rate": 9.936092235437515e-06,
"loss": 1.2649,
"step": 968
},
{
"epoch": 0.4924718886983038,
"grad_norm": 3.0306715965270996,
"learning_rate": 9.93582400985925e-06,
"loss": 1.2129,
"step": 969
},
{
"epoch": 0.4929801156216251,
"grad_norm": 3.389181137084961,
"learning_rate": 9.935555226213512e-06,
"loss": 1.2894,
"step": 970
},
{
"epoch": 0.49348834254494633,
"grad_norm": 2.8703081607818604,
"learning_rate": 9.935285884530693e-06,
"loss": 1.2568,
"step": 971
},
{
"epoch": 0.4939965694682676,
"grad_norm": 3.097668170928955,
"learning_rate": 9.935015984841244e-06,
"loss": 1.1949,
"step": 972
},
{
"epoch": 0.49450479639158884,
"grad_norm": 3.344644546508789,
"learning_rate": 9.93474552717568e-06,
"loss": 1.3539,
"step": 973
},
{
"epoch": 0.4950130233149101,
"grad_norm": 2.9466795921325684,
"learning_rate": 9.934474511564583e-06,
"loss": 1.2893,
"step": 974
},
{
"epoch": 0.49552125023823135,
"grad_norm": 3.2382895946502686,
"learning_rate": 9.934202938038595e-06,
"loss": 1.1904,
"step": 975
},
{
"epoch": 0.49602947716155266,
"grad_norm": 3.703711986541748,
"learning_rate": 9.93393080662842e-06,
"loss": 1.3855,
"step": 976
},
{
"epoch": 0.4965377040848739,
"grad_norm": 2.887328863143921,
"learning_rate": 9.933658117364829e-06,
"loss": 1.1818,
"step": 977
},
{
"epoch": 0.49704593100819516,
"grad_norm": 3.141327381134033,
"learning_rate": 9.93338487027865e-06,
"loss": 1.3616,
"step": 978
},
{
"epoch": 0.4975541579315164,
"grad_norm": 3.216190814971924,
"learning_rate": 9.93311106540078e-06,
"loss": 1.3995,
"step": 979
},
{
"epoch": 0.49806238485483767,
"grad_norm": 2.990403175354004,
"learning_rate": 9.932836702762173e-06,
"loss": 1.1847,
"step": 980
},
{
"epoch": 0.4985706117781589,
"grad_norm": 2.8127925395965576,
"learning_rate": 9.932561782393858e-06,
"loss": 1.1195,
"step": 981
},
{
"epoch": 0.49907883870148023,
"grad_norm": 3.067380428314209,
"learning_rate": 9.93228630432691e-06,
"loss": 1.3241,
"step": 982
},
{
"epoch": 0.4995870656248015,
"grad_norm": 3.2635014057159424,
"learning_rate": 9.932010268592479e-06,
"loss": 1.4408,
"step": 983
},
{
"epoch": 0.5000952925481227,
"grad_norm": 3.01632022857666,
"learning_rate": 9.931733675221776e-06,
"loss": 1.3519,
"step": 984
},
{
"epoch": 0.500603519471444,
"grad_norm": 3.1168856620788574,
"learning_rate": 9.931456524246073e-06,
"loss": 1.2522,
"step": 985
},
{
"epoch": 0.5011117463947653,
"grad_norm": 3.0207486152648926,
"learning_rate": 9.931178815696706e-06,
"loss": 1.3152,
"step": 986
},
{
"epoch": 0.5016199733180865,
"grad_norm": 3.0515527725219727,
"learning_rate": 9.930900549605077e-06,
"loss": 1.2104,
"step": 987
},
{
"epoch": 0.5021282002414078,
"grad_norm": 2.985316514968872,
"learning_rate": 9.93062172600264e-06,
"loss": 1.2172,
"step": 988
},
{
"epoch": 0.502636427164729,
"grad_norm": 3.1258912086486816,
"learning_rate": 9.930342344920929e-06,
"loss": 1.2094,
"step": 989
},
{
"epoch": 0.5031446540880503,
"grad_norm": 3.497823476791382,
"learning_rate": 9.930062406391527e-06,
"loss": 1.2589,
"step": 990
},
{
"epoch": 0.5036528810113716,
"grad_norm": 2.91703462600708,
"learning_rate": 9.929781910446088e-06,
"loss": 1.2083,
"step": 991
},
{
"epoch": 0.5041611079346928,
"grad_norm": 2.9708058834075928,
"learning_rate": 9.929500857116326e-06,
"loss": 1.2771,
"step": 992
},
{
"epoch": 0.5046693348580141,
"grad_norm": 3.113933563232422,
"learning_rate": 9.929219246434014e-06,
"loss": 1.1901,
"step": 993
},
{
"epoch": 0.5051775617813353,
"grad_norm": 3.2545571327209473,
"learning_rate": 9.928937078430996e-06,
"loss": 1.4007,
"step": 994
},
{
"epoch": 0.5056857887046566,
"grad_norm": 3.0928285121917725,
"learning_rate": 9.928654353139175e-06,
"loss": 1.2483,
"step": 995
},
{
"epoch": 0.506194015627978,
"grad_norm": 3.1192171573638916,
"learning_rate": 9.928371070590517e-06,
"loss": 1.2464,
"step": 996
},
{
"epoch": 0.5067022425512991,
"grad_norm": 3.0406901836395264,
"learning_rate": 9.928087230817053e-06,
"loss": 1.3043,
"step": 997
},
{
"epoch": 0.5072104694746205,
"grad_norm": 3.2588446140289307,
"learning_rate": 9.92780283385087e-06,
"loss": 1.2525,
"step": 998
},
{
"epoch": 0.5077186963979416,
"grad_norm": 3.1698226928710938,
"learning_rate": 9.927517879724127e-06,
"loss": 1.2424,
"step": 999
},
{
"epoch": 0.508226923321263,
"grad_norm": 3.1326828002929688,
"learning_rate": 9.927232368469044e-06,
"loss": 1.2272,
"step": 1000
},
{
"epoch": 0.508226923321263,
"eval_loss": 1.2929835319519043,
"eval_runtime": 12.5577,
"eval_samples_per_second": 31.853,
"eval_steps_per_second": 3.982,
"step": 1000
},
{
"epoch": 0.5087351502445842,
"grad_norm": 2.9654858112335205,
"learning_rate": 9.926946300117897e-06,
"loss": 1.2446,
"step": 1001
},
{
"epoch": 0.5092433771679055,
"grad_norm": 2.9097492694854736,
"learning_rate": 9.926659674703036e-06,
"loss": 1.3136,
"step": 1002
},
{
"epoch": 0.5097516040912268,
"grad_norm": 3.0150370597839355,
"learning_rate": 9.926372492256864e-06,
"loss": 1.356,
"step": 1003
},
{
"epoch": 0.510259831014548,
"grad_norm": 3.2294318675994873,
"learning_rate": 9.926084752811853e-06,
"loss": 1.276,
"step": 1004
},
{
"epoch": 0.5107680579378693,
"grad_norm": 2.965230703353882,
"learning_rate": 9.925796456400535e-06,
"loss": 1.2202,
"step": 1005
},
{
"epoch": 0.5112762848611905,
"grad_norm": 2.934131145477295,
"learning_rate": 9.92550760305551e-06,
"loss": 1.2714,
"step": 1006
},
{
"epoch": 0.5117845117845118,
"grad_norm": 3.065397262573242,
"learning_rate": 9.92521819280943e-06,
"loss": 1.2414,
"step": 1007
},
{
"epoch": 0.5122927387078331,
"grad_norm": 3.265735387802124,
"learning_rate": 9.924928225695026e-06,
"loss": 1.2842,
"step": 1008
},
{
"epoch": 0.5128009656311543,
"grad_norm": 3.2375340461730957,
"learning_rate": 9.924637701745075e-06,
"loss": 1.1905,
"step": 1009
},
{
"epoch": 0.5133091925544756,
"grad_norm": 3.048048257827759,
"learning_rate": 9.924346620992429e-06,
"loss": 1.3127,
"step": 1010
},
{
"epoch": 0.5138174194777968,
"grad_norm": 2.9338512420654297,
"learning_rate": 9.924054983469999e-06,
"loss": 1.173,
"step": 1011
},
{
"epoch": 0.5143256464011181,
"grad_norm": 2.960909366607666,
"learning_rate": 9.923762789210757e-06,
"loss": 1.2117,
"step": 1012
},
{
"epoch": 0.5148338733244394,
"grad_norm": 2.8854153156280518,
"learning_rate": 9.923470038247741e-06,
"loss": 1.1573,
"step": 1013
},
{
"epoch": 0.5153421002477606,
"grad_norm": 3.157883644104004,
"learning_rate": 9.923176730614052e-06,
"loss": 1.2489,
"step": 1014
},
{
"epoch": 0.5158503271710819,
"grad_norm": 3.11163067817688,
"learning_rate": 9.92288286634285e-06,
"loss": 1.3366,
"step": 1015
},
{
"epoch": 0.5163585540944031,
"grad_norm": 3.2269506454467773,
"learning_rate": 9.922588445467362e-06,
"loss": 1.41,
"step": 1016
},
{
"epoch": 0.5168667810177244,
"grad_norm": 3.2312417030334473,
"learning_rate": 9.92229346802088e-06,
"loss": 1.332,
"step": 1017
},
{
"epoch": 0.5173750079410456,
"grad_norm": 3.2907750606536865,
"learning_rate": 9.921997934036749e-06,
"loss": 1.2556,
"step": 1018
},
{
"epoch": 0.5178832348643669,
"grad_norm": 2.9131078720092773,
"learning_rate": 9.921701843548389e-06,
"loss": 1.3176,
"step": 1019
},
{
"epoch": 0.5183914617876882,
"grad_norm": 3.4000084400177,
"learning_rate": 9.921405196589273e-06,
"loss": 1.2849,
"step": 1020
},
{
"epoch": 0.5188996887110094,
"grad_norm": 3.0663211345672607,
"learning_rate": 9.921107993192946e-06,
"loss": 1.2214,
"step": 1021
},
{
"epoch": 0.5194079156343308,
"grad_norm": 2.9851553440093994,
"learning_rate": 9.920810233393007e-06,
"loss": 1.1617,
"step": 1022
},
{
"epoch": 0.519916142557652,
"grad_norm": 3.3432230949401855,
"learning_rate": 9.920511917223125e-06,
"loss": 1.2762,
"step": 1023
},
{
"epoch": 0.5204243694809733,
"grad_norm": 3.3022565841674805,
"learning_rate": 9.920213044717027e-06,
"loss": 1.3154,
"step": 1024
},
{
"epoch": 0.5209325964042946,
"grad_norm": 3.4665110111236572,
"learning_rate": 9.919913615908505e-06,
"loss": 1.2879,
"step": 1025
},
{
"epoch": 0.5214408233276158,
"grad_norm": 3.0947935581207275,
"learning_rate": 9.919613630831416e-06,
"loss": 1.2294,
"step": 1026
},
{
"epoch": 0.5219490502509371,
"grad_norm": 3.237161874771118,
"learning_rate": 9.919313089519677e-06,
"loss": 1.2859,
"step": 1027
},
{
"epoch": 0.5224572771742583,
"grad_norm": 3.29890775680542,
"learning_rate": 9.919011992007266e-06,
"loss": 1.2226,
"step": 1028
},
{
"epoch": 0.5229655040975796,
"grad_norm": 3.400012969970703,
"learning_rate": 9.91871033832823e-06,
"loss": 1.3052,
"step": 1029
},
{
"epoch": 0.5234737310209008,
"grad_norm": 3.583190679550171,
"learning_rate": 9.918408128516674e-06,
"loss": 1.3402,
"step": 1030
},
{
"epoch": 0.5239819579442221,
"grad_norm": 3.0629453659057617,
"learning_rate": 9.918105362606766e-06,
"loss": 1.258,
"step": 1031
},
{
"epoch": 0.5244901848675434,
"grad_norm": 3.27661395072937,
"learning_rate": 9.91780204063274e-06,
"loss": 1.4624,
"step": 1032
},
{
"epoch": 0.5249984117908646,
"grad_norm": 3.9633708000183105,
"learning_rate": 9.917498162628888e-06,
"loss": 1.2498,
"step": 1033
},
{
"epoch": 0.5255066387141859,
"grad_norm": 3.0484509468078613,
"learning_rate": 9.917193728629574e-06,
"loss": 1.2621,
"step": 1034
},
{
"epoch": 0.5260148656375071,
"grad_norm": 3.034428596496582,
"learning_rate": 9.916888738669212e-06,
"loss": 1.2793,
"step": 1035
},
{
"epoch": 0.5265230925608284,
"grad_norm": 3.1338136196136475,
"learning_rate": 9.91658319278229e-06,
"loss": 1.3162,
"step": 1036
},
{
"epoch": 0.5270313194841497,
"grad_norm": 3.1185007095336914,
"learning_rate": 9.916277091003352e-06,
"loss": 1.2203,
"step": 1037
},
{
"epoch": 0.5275395464074709,
"grad_norm": 3.052046060562134,
"learning_rate": 9.915970433367009e-06,
"loss": 1.2556,
"step": 1038
},
{
"epoch": 0.5280477733307922,
"grad_norm": 3.055419921875,
"learning_rate": 9.915663219907933e-06,
"loss": 1.2842,
"step": 1039
},
{
"epoch": 0.5285560002541134,
"grad_norm": 3.175314426422119,
"learning_rate": 9.915355450660858e-06,
"loss": 1.2761,
"step": 1040
},
{
"epoch": 0.5290642271774347,
"grad_norm": 2.6530027389526367,
"learning_rate": 9.915047125660581e-06,
"loss": 1.2134,
"step": 1041
},
{
"epoch": 0.529572454100756,
"grad_norm": 3.3357229232788086,
"learning_rate": 9.914738244941965e-06,
"loss": 1.3765,
"step": 1042
},
{
"epoch": 0.5300806810240772,
"grad_norm": 2.9852263927459717,
"learning_rate": 9.91442880853993e-06,
"loss": 1.2577,
"step": 1043
},
{
"epoch": 0.5305889079473985,
"grad_norm": 2.864121913909912,
"learning_rate": 9.914118816489469e-06,
"loss": 1.3375,
"step": 1044
},
{
"epoch": 0.5310971348707197,
"grad_norm": 2.9069125652313232,
"learning_rate": 9.913808268825625e-06,
"loss": 1.2162,
"step": 1045
},
{
"epoch": 0.531605361794041,
"grad_norm": 3.2001500129699707,
"learning_rate": 9.91349716558351e-06,
"loss": 1.2921,
"step": 1046
},
{
"epoch": 0.5321135887173623,
"grad_norm": 2.888265371322632,
"learning_rate": 9.913185506798302e-06,
"loss": 1.1466,
"step": 1047
},
{
"epoch": 0.5326218156406836,
"grad_norm": 3.1221208572387695,
"learning_rate": 9.912873292505238e-06,
"loss": 1.2126,
"step": 1048
},
{
"epoch": 0.5331300425640049,
"grad_norm": 3.3143773078918457,
"learning_rate": 9.912560522739618e-06,
"loss": 1.3249,
"step": 1049
},
{
"epoch": 0.5336382694873261,
"grad_norm": 3.1017792224884033,
"learning_rate": 9.912247197536804e-06,
"loss": 1.3083,
"step": 1050
},
{
"epoch": 0.5341464964106474,
"grad_norm": 2.9904158115386963,
"learning_rate": 9.911933316932223e-06,
"loss": 1.2244,
"step": 1051
},
{
"epoch": 0.5346547233339686,
"grad_norm": 3.5156807899475098,
"learning_rate": 9.911618880961365e-06,
"loss": 1.3113,
"step": 1052
},
{
"epoch": 0.5351629502572899,
"grad_norm": 3.0118355751037598,
"learning_rate": 9.91130388965978e-06,
"loss": 1.2591,
"step": 1053
},
{
"epoch": 0.5356711771806112,
"grad_norm": 3.93129301071167,
"learning_rate": 9.910988343063081e-06,
"loss": 1.3097,
"step": 1054
},
{
"epoch": 0.5361794041039324,
"grad_norm": 2.846911668777466,
"learning_rate": 9.910672241206948e-06,
"loss": 1.1875,
"step": 1055
},
{
"epoch": 0.5366876310272537,
"grad_norm": 2.836031913757324,
"learning_rate": 9.91035558412712e-06,
"loss": 1.302,
"step": 1056
},
{
"epoch": 0.5371958579505749,
"grad_norm": 3.446367025375366,
"learning_rate": 9.910038371859399e-06,
"loss": 1.327,
"step": 1057
},
{
"epoch": 0.5377040848738962,
"grad_norm": 2.8755125999450684,
"learning_rate": 9.909720604439652e-06,
"loss": 1.2768,
"step": 1058
},
{
"epoch": 0.5382123117972175,
"grad_norm": 2.974616765975952,
"learning_rate": 9.909402281903808e-06,
"loss": 1.3633,
"step": 1059
},
{
"epoch": 0.5387205387205387,
"grad_norm": 3.0021567344665527,
"learning_rate": 9.909083404287853e-06,
"loss": 1.3469,
"step": 1060
},
{
"epoch": 0.53922876564386,
"grad_norm": 2.866323709487915,
"learning_rate": 9.908763971627846e-06,
"loss": 1.2739,
"step": 1061
},
{
"epoch": 0.5397369925671812,
"grad_norm": 3.079787254333496,
"learning_rate": 9.908443983959903e-06,
"loss": 1.2476,
"step": 1062
},
{
"epoch": 0.5402452194905025,
"grad_norm": 2.970996141433716,
"learning_rate": 9.9081234413202e-06,
"loss": 1.2822,
"step": 1063
},
{
"epoch": 0.5407534464138237,
"grad_norm": 3.0350842475891113,
"learning_rate": 9.907802343744983e-06,
"loss": 1.2566,
"step": 1064
},
{
"epoch": 0.541261673337145,
"grad_norm": 2.901156425476074,
"learning_rate": 9.907480691270554e-06,
"loss": 1.2111,
"step": 1065
},
{
"epoch": 0.5417699002604663,
"grad_norm": 3.4042131900787354,
"learning_rate": 9.907158483933283e-06,
"loss": 1.388,
"step": 1066
},
{
"epoch": 0.5422781271837875,
"grad_norm": 2.9463226795196533,
"learning_rate": 9.906835721769597e-06,
"loss": 1.1387,
"step": 1067
},
{
"epoch": 0.5427863541071088,
"grad_norm": 2.950364589691162,
"learning_rate": 9.90651240481599e-06,
"loss": 1.2743,
"step": 1068
},
{
"epoch": 0.54329458103043,
"grad_norm": 3.0166707038879395,
"learning_rate": 9.906188533109022e-06,
"loss": 1.2999,
"step": 1069
},
{
"epoch": 0.5438028079537514,
"grad_norm": 2.9995715618133545,
"learning_rate": 9.905864106685305e-06,
"loss": 1.3692,
"step": 1070
},
{
"epoch": 0.5443110348770727,
"grad_norm": 2.8355770111083984,
"learning_rate": 9.905539125581525e-06,
"loss": 1.222,
"step": 1071
},
{
"epoch": 0.5448192618003939,
"grad_norm": 3.0823659896850586,
"learning_rate": 9.905213589834424e-06,
"loss": 1.2928,
"step": 1072
},
{
"epoch": 0.5453274887237152,
"grad_norm": 3.1366348266601562,
"learning_rate": 9.90488749948081e-06,
"loss": 1.2437,
"step": 1073
},
{
"epoch": 0.5458357156470364,
"grad_norm": 3.1095762252807617,
"learning_rate": 9.904560854557548e-06,
"loss": 1.2076,
"step": 1074
},
{
"epoch": 0.5463439425703577,
"grad_norm": 2.9151086807250977,
"learning_rate": 9.904233655101574e-06,
"loss": 1.2691,
"step": 1075
},
{
"epoch": 0.5468521694936789,
"grad_norm": 2.994748830795288,
"learning_rate": 9.903905901149881e-06,
"loss": 1.2917,
"step": 1076
},
{
"epoch": 0.5473603964170002,
"grad_norm": 3.118807315826416,
"learning_rate": 9.903577592739528e-06,
"loss": 1.2359,
"step": 1077
},
{
"epoch": 0.5478686233403215,
"grad_norm": 3.042778253555298,
"learning_rate": 9.903248729907635e-06,
"loss": 1.283,
"step": 1078
},
{
"epoch": 0.5483768502636427,
"grad_norm": 2.8278987407684326,
"learning_rate": 9.902919312691384e-06,
"loss": 1.2585,
"step": 1079
},
{
"epoch": 0.548885077186964,
"grad_norm": 2.88580322265625,
"learning_rate": 9.902589341128019e-06,
"loss": 1.2512,
"step": 1080
},
{
"epoch": 0.5493933041102852,
"grad_norm": 3.03999400138855,
"learning_rate": 9.902258815254851e-06,
"loss": 1.2731,
"step": 1081
},
{
"epoch": 0.5499015310336065,
"grad_norm": 3.7839131355285645,
"learning_rate": 9.901927735109249e-06,
"loss": 1.3055,
"step": 1082
},
{
"epoch": 0.5504097579569278,
"grad_norm": 3.0038340091705322,
"learning_rate": 9.901596100728646e-06,
"loss": 1.2088,
"step": 1083
},
{
"epoch": 0.550917984880249,
"grad_norm": 3.1675291061401367,
"learning_rate": 9.90126391215054e-06,
"loss": 1.2438,
"step": 1084
},
{
"epoch": 0.5514262118035703,
"grad_norm": 3.0010335445404053,
"learning_rate": 9.900931169412488e-06,
"loss": 1.2682,
"step": 1085
},
{
"epoch": 0.5519344387268915,
"grad_norm": 2.973571300506592,
"learning_rate": 9.900597872552113e-06,
"loss": 1.283,
"step": 1086
},
{
"epoch": 0.5524426656502128,
"grad_norm": 3.2726941108703613,
"learning_rate": 9.9002640216071e-06,
"loss": 1.2838,
"step": 1087
},
{
"epoch": 0.5529508925735341,
"grad_norm": 3.167182207107544,
"learning_rate": 9.899929616615192e-06,
"loss": 1.2879,
"step": 1088
},
{
"epoch": 0.5534591194968553,
"grad_norm": 3.0281550884246826,
"learning_rate": 9.899594657614201e-06,
"loss": 1.1682,
"step": 1089
},
{
"epoch": 0.5539673464201766,
"grad_norm": 3.0986578464508057,
"learning_rate": 9.899259144641999e-06,
"loss": 1.3208,
"step": 1090
},
{
"epoch": 0.5544755733434978,
"grad_norm": 3.445312023162842,
"learning_rate": 9.89892307773652e-06,
"loss": 1.224,
"step": 1091
},
{
"epoch": 0.5549838002668191,
"grad_norm": 3.1991617679595947,
"learning_rate": 9.898586456935761e-06,
"loss": 1.3483,
"step": 1092
},
{
"epoch": 0.5554920271901403,
"grad_norm": 3.3592443466186523,
"learning_rate": 9.898249282277784e-06,
"loss": 1.3855,
"step": 1093
},
{
"epoch": 0.5560002541134617,
"grad_norm": 3.050511121749878,
"learning_rate": 9.897911553800709e-06,
"loss": 1.3756,
"step": 1094
},
{
"epoch": 0.556508481036783,
"grad_norm": 3.1178085803985596,
"learning_rate": 9.897573271542721e-06,
"loss": 1.3593,
"step": 1095
},
{
"epoch": 0.5570167079601042,
"grad_norm": 3.3286967277526855,
"learning_rate": 9.897234435542072e-06,
"loss": 1.2354,
"step": 1096
},
{
"epoch": 0.5575249348834255,
"grad_norm": 3.2614622116088867,
"learning_rate": 9.896895045837067e-06,
"loss": 1.3017,
"step": 1097
},
{
"epoch": 0.5580331618067467,
"grad_norm": 3.1033172607421875,
"learning_rate": 9.896555102466083e-06,
"loss": 1.3554,
"step": 1098
},
{
"epoch": 0.558541388730068,
"grad_norm": 3.0228354930877686,
"learning_rate": 9.896214605467553e-06,
"loss": 1.2444,
"step": 1099
},
{
"epoch": 0.5590496156533893,
"grad_norm": 2.8342230319976807,
"learning_rate": 9.895873554879978e-06,
"loss": 1.2475,
"step": 1100
},
{
"epoch": 0.5595578425767105,
"grad_norm": 3.0209481716156006,
"learning_rate": 9.895531950741915e-06,
"loss": 1.2892,
"step": 1101
},
{
"epoch": 0.5600660695000318,
"grad_norm": 2.9123499393463135,
"learning_rate": 9.89518979309199e-06,
"loss": 1.26,
"step": 1102
},
{
"epoch": 0.560574296423353,
"grad_norm": 2.979750394821167,
"learning_rate": 9.894847081968888e-06,
"loss": 1.2042,
"step": 1103
},
{
"epoch": 0.5610825233466743,
"grad_norm": 3.2477877140045166,
"learning_rate": 9.894503817411358e-06,
"loss": 1.553,
"step": 1104
},
{
"epoch": 0.5615907502699955,
"grad_norm": 3.2751965522766113,
"learning_rate": 9.89415999945821e-06,
"loss": 1.2902,
"step": 1105
},
{
"epoch": 0.5620989771933168,
"grad_norm": 3.260960578918457,
"learning_rate": 9.89381562814832e-06,
"loss": 1.2309,
"step": 1106
},
{
"epoch": 0.5626072041166381,
"grad_norm": 2.87548565864563,
"learning_rate": 9.893470703520622e-06,
"loss": 1.2196,
"step": 1107
},
{
"epoch": 0.5631154310399593,
"grad_norm": 3.0245654582977295,
"learning_rate": 9.893125225614117e-06,
"loss": 1.2439,
"step": 1108
},
{
"epoch": 0.5636236579632806,
"grad_norm": 2.7714860439300537,
"learning_rate": 9.892779194467864e-06,
"loss": 1.3271,
"step": 1109
},
{
"epoch": 0.5641318848866018,
"grad_norm": 2.8270699977874756,
"learning_rate": 9.892432610120987e-06,
"loss": 1.1949,
"step": 1110
},
{
"epoch": 0.5646401118099231,
"grad_norm": 3.2219133377075195,
"learning_rate": 9.892085472612675e-06,
"loss": 1.241,
"step": 1111
},
{
"epoch": 0.5651483387332444,
"grad_norm": 3.015878677368164,
"learning_rate": 9.891737781982174e-06,
"loss": 1.3107,
"step": 1112
},
{
"epoch": 0.5656565656565656,
"grad_norm": 3.113751173019409,
"learning_rate": 9.891389538268799e-06,
"loss": 1.3017,
"step": 1113
},
{
"epoch": 0.5661647925798869,
"grad_norm": 3.0058841705322266,
"learning_rate": 9.89104074151192e-06,
"loss": 1.2783,
"step": 1114
},
{
"epoch": 0.5666730195032081,
"grad_norm": 2.8917829990386963,
"learning_rate": 9.890691391750977e-06,
"loss": 1.2405,
"step": 1115
},
{
"epoch": 0.5671812464265295,
"grad_norm": 3.019864082336426,
"learning_rate": 9.890341489025466e-06,
"loss": 1.1901,
"step": 1116
},
{
"epoch": 0.5676894733498508,
"grad_norm": 2.9965898990631104,
"learning_rate": 9.889991033374952e-06,
"loss": 1.3086,
"step": 1117
},
{
"epoch": 0.568197700273172,
"grad_norm": 2.688847780227661,
"learning_rate": 9.889640024839057e-06,
"loss": 1.2379,
"step": 1118
},
{
"epoch": 0.5687059271964933,
"grad_norm": 3.068826198577881,
"learning_rate": 9.889288463457468e-06,
"loss": 1.2525,
"step": 1119
},
{
"epoch": 0.5692141541198145,
"grad_norm": 3.1524131298065186,
"learning_rate": 9.888936349269934e-06,
"loss": 1.2592,
"step": 1120
},
{
"epoch": 0.5697223810431358,
"grad_norm": 2.97160267829895,
"learning_rate": 9.888583682316268e-06,
"loss": 1.2293,
"step": 1121
},
{
"epoch": 0.570230607966457,
"grad_norm": 3.040951728820801,
"learning_rate": 9.888230462636343e-06,
"loss": 1.2587,
"step": 1122
},
{
"epoch": 0.5707388348897783,
"grad_norm": 3.0704641342163086,
"learning_rate": 9.887876690270095e-06,
"loss": 1.3122,
"step": 1123
},
{
"epoch": 0.5712470618130996,
"grad_norm": 3.068542242050171,
"learning_rate": 9.887522365257525e-06,
"loss": 1.3523,
"step": 1124
},
{
"epoch": 0.5717552887364208,
"grad_norm": 3.050361394882202,
"learning_rate": 9.887167487638693e-06,
"loss": 1.2626,
"step": 1125
},
{
"epoch": 0.5722635156597421,
"grad_norm": 3.1941027641296387,
"learning_rate": 9.886812057453726e-06,
"loss": 1.389,
"step": 1126
},
{
"epoch": 0.5727717425830633,
"grad_norm": 3.0776960849761963,
"learning_rate": 9.886456074742806e-06,
"loss": 1.2869,
"step": 1127
},
{
"epoch": 0.5732799695063846,
"grad_norm": 3.1108217239379883,
"learning_rate": 9.886099539546185e-06,
"loss": 1.2325,
"step": 1128
},
{
"epoch": 0.5737881964297059,
"grad_norm": 2.865870714187622,
"learning_rate": 9.885742451904174e-06,
"loss": 1.2044,
"step": 1129
},
{
"epoch": 0.5742964233530271,
"grad_norm": 2.8582499027252197,
"learning_rate": 9.885384811857148e-06,
"loss": 1.1932,
"step": 1130
},
{
"epoch": 0.5748046502763484,
"grad_norm": 3.5153896808624268,
"learning_rate": 9.885026619445544e-06,
"loss": 1.3823,
"step": 1131
},
{
"epoch": 0.5753128771996696,
"grad_norm": 2.8332269191741943,
"learning_rate": 9.884667874709857e-06,
"loss": 1.2556,
"step": 1132
},
{
"epoch": 0.5758211041229909,
"grad_norm": 2.7498703002929688,
"learning_rate": 9.88430857769065e-06,
"loss": 1.1875,
"step": 1133
},
{
"epoch": 0.5763293310463122,
"grad_norm": 2.9405388832092285,
"learning_rate": 9.883948728428551e-06,
"loss": 1.1411,
"step": 1134
},
{
"epoch": 0.5768375579696334,
"grad_norm": 2.9063611030578613,
"learning_rate": 9.883588326964242e-06,
"loss": 1.2758,
"step": 1135
},
{
"epoch": 0.5773457848929547,
"grad_norm": 3.066329002380371,
"learning_rate": 9.883227373338472e-06,
"loss": 1.2635,
"step": 1136
},
{
"epoch": 0.5778540118162759,
"grad_norm": 3.026329755783081,
"learning_rate": 9.882865867592054e-06,
"loss": 1.327,
"step": 1137
},
{
"epoch": 0.5783622387395972,
"grad_norm": 2.8590166568756104,
"learning_rate": 9.882503809765858e-06,
"loss": 1.2706,
"step": 1138
},
{
"epoch": 0.5788704656629184,
"grad_norm": 3.33844256401062,
"learning_rate": 9.882141199900823e-06,
"loss": 1.2434,
"step": 1139
},
{
"epoch": 0.5793786925862398,
"grad_norm": 2.910153865814209,
"learning_rate": 9.881778038037946e-06,
"loss": 1.2609,
"step": 1140
},
{
"epoch": 0.5798869195095611,
"grad_norm": 3.2438127994537354,
"learning_rate": 9.88141432421829e-06,
"loss": 1.308,
"step": 1141
},
{
"epoch": 0.5803951464328823,
"grad_norm": 3.1046183109283447,
"learning_rate": 9.881050058482976e-06,
"loss": 1.3514,
"step": 1142
},
{
"epoch": 0.5809033733562036,
"grad_norm": 2.9112555980682373,
"learning_rate": 9.88068524087319e-06,
"loss": 1.3074,
"step": 1143
},
{
"epoch": 0.5814116002795248,
"grad_norm": 3.073887586593628,
"learning_rate": 9.880319871430179e-06,
"loss": 1.219,
"step": 1144
},
{
"epoch": 0.5819198272028461,
"grad_norm": 2.8623321056365967,
"learning_rate": 9.879953950195255e-06,
"loss": 1.1971,
"step": 1145
},
{
"epoch": 0.5824280541261674,
"grad_norm": 2.9542438983917236,
"learning_rate": 9.879587477209793e-06,
"loss": 1.2554,
"step": 1146
},
{
"epoch": 0.5829362810494886,
"grad_norm": 3.502727508544922,
"learning_rate": 9.879220452515224e-06,
"loss": 1.254,
"step": 1147
},
{
"epoch": 0.5834445079728099,
"grad_norm": 2.9458866119384766,
"learning_rate": 9.878852876153047e-06,
"loss": 1.2976,
"step": 1148
},
{
"epoch": 0.5839527348961311,
"grad_norm": 3.059884786605835,
"learning_rate": 9.87848474816482e-06,
"loss": 1.3303,
"step": 1149
},
{
"epoch": 0.5844609618194524,
"grad_norm": 2.8677780628204346,
"learning_rate": 9.878116068592169e-06,
"loss": 1.2808,
"step": 1150
},
{
"epoch": 0.5849691887427736,
"grad_norm": 3.375119209289551,
"learning_rate": 9.877746837476777e-06,
"loss": 1.2412,
"step": 1151
},
{
"epoch": 0.5854774156660949,
"grad_norm": 3.057594060897827,
"learning_rate": 9.877377054860391e-06,
"loss": 1.2625,
"step": 1152
},
{
"epoch": 0.5859856425894162,
"grad_norm": 3.1959619522094727,
"learning_rate": 9.87700672078482e-06,
"loss": 1.306,
"step": 1153
},
{
"epoch": 0.5864938695127374,
"grad_norm": 2.947911262512207,
"learning_rate": 9.876635835291936e-06,
"loss": 1.2275,
"step": 1154
},
{
"epoch": 0.5870020964360587,
"grad_norm": 4.026703834533691,
"learning_rate": 9.876264398423672e-06,
"loss": 1.3709,
"step": 1155
},
{
"epoch": 0.5875103233593799,
"grad_norm": 2.906632661819458,
"learning_rate": 9.875892410222027e-06,
"loss": 1.3088,
"step": 1156
},
{
"epoch": 0.5880185502827012,
"grad_norm": 2.9481449127197266,
"learning_rate": 9.875519870729057e-06,
"loss": 1.3556,
"step": 1157
},
{
"epoch": 0.5885267772060225,
"grad_norm": 2.9592795372009277,
"learning_rate": 9.875146779986885e-06,
"loss": 1.2336,
"step": 1158
},
{
"epoch": 0.5890350041293437,
"grad_norm": 2.996302604675293,
"learning_rate": 9.874773138037693e-06,
"loss": 1.2626,
"step": 1159
},
{
"epoch": 0.589543231052665,
"grad_norm": 2.965101480484009,
"learning_rate": 9.874398944923728e-06,
"loss": 1.3835,
"step": 1160
},
{
"epoch": 0.5900514579759862,
"grad_norm": 2.9105746746063232,
"learning_rate": 9.874024200687297e-06,
"loss": 1.1297,
"step": 1161
},
{
"epoch": 0.5905596848993075,
"grad_norm": 2.9277119636535645,
"learning_rate": 9.873648905370769e-06,
"loss": 1.1621,
"step": 1162
},
{
"epoch": 0.5910679118226289,
"grad_norm": 3.346733808517456,
"learning_rate": 9.873273059016582e-06,
"loss": 1.3174,
"step": 1163
},
{
"epoch": 0.59157613874595,
"grad_norm": 3.2384955883026123,
"learning_rate": 9.872896661667224e-06,
"loss": 1.2219,
"step": 1164
},
{
"epoch": 0.5920843656692714,
"grad_norm": 2.9235384464263916,
"learning_rate": 9.872519713365259e-06,
"loss": 1.264,
"step": 1165
},
{
"epoch": 0.5925925925925926,
"grad_norm": 3.221442222595215,
"learning_rate": 9.8721422141533e-06,
"loss": 1.1639,
"step": 1166
},
{
"epoch": 0.5931008195159139,
"grad_norm": 2.9388232231140137,
"learning_rate": 9.871764164074033e-06,
"loss": 1.216,
"step": 1167
},
{
"epoch": 0.5936090464392351,
"grad_norm": 3.0020532608032227,
"learning_rate": 9.871385563170201e-06,
"loss": 1.2731,
"step": 1168
},
{
"epoch": 0.5941172733625564,
"grad_norm": 3.0851593017578125,
"learning_rate": 9.87100641148461e-06,
"loss": 1.1301,
"step": 1169
},
{
"epoch": 0.5946255002858777,
"grad_norm": 2.9967799186706543,
"learning_rate": 9.870626709060131e-06,
"loss": 1.22,
"step": 1170
},
{
"epoch": 0.5951337272091989,
"grad_norm": 3.1237094402313232,
"learning_rate": 9.870246455939692e-06,
"loss": 1.2942,
"step": 1171
},
{
"epoch": 0.5956419541325202,
"grad_norm": 3.2442684173583984,
"learning_rate": 9.869865652166287e-06,
"loss": 1.2948,
"step": 1172
},
{
"epoch": 0.5961501810558414,
"grad_norm": 3.2860963344573975,
"learning_rate": 9.869484297782971e-06,
"loss": 1.3071,
"step": 1173
},
{
"epoch": 0.5966584079791627,
"grad_norm": 2.9791018962860107,
"learning_rate": 9.869102392832863e-06,
"loss": 1.2806,
"step": 1174
},
{
"epoch": 0.597166634902484,
"grad_norm": 2.7118618488311768,
"learning_rate": 9.868719937359144e-06,
"loss": 1.2168,
"step": 1175
},
{
"epoch": 0.5976748618258052,
"grad_norm": 2.7597343921661377,
"learning_rate": 9.868336931405054e-06,
"loss": 1.2258,
"step": 1176
},
{
"epoch": 0.5981830887491265,
"grad_norm": 3.0382118225097656,
"learning_rate": 9.867953375013897e-06,
"loss": 1.3343,
"step": 1177
},
{
"epoch": 0.5986913156724477,
"grad_norm": 3.269522190093994,
"learning_rate": 9.86756926822904e-06,
"loss": 1.2483,
"step": 1178
},
{
"epoch": 0.599199542595769,
"grad_norm": 2.7839956283569336,
"learning_rate": 9.867184611093914e-06,
"loss": 1.2309,
"step": 1179
},
{
"epoch": 0.5997077695190903,
"grad_norm": 2.8881192207336426,
"learning_rate": 9.86679940365201e-06,
"loss": 1.2939,
"step": 1180
},
{
"epoch": 0.6002159964424115,
"grad_norm": 2.9655847549438477,
"learning_rate": 9.86641364594688e-06,
"loss": 1.2051,
"step": 1181
},
{
"epoch": 0.6007242233657328,
"grad_norm": 3.159656047821045,
"learning_rate": 9.866027338022139e-06,
"loss": 1.3687,
"step": 1182
},
{
"epoch": 0.601232450289054,
"grad_norm": 3.0268661975860596,
"learning_rate": 9.865640479921465e-06,
"loss": 1.218,
"step": 1183
},
{
"epoch": 0.6017406772123753,
"grad_norm": 3.583407402038574,
"learning_rate": 9.865253071688598e-06,
"loss": 1.2427,
"step": 1184
},
{
"epoch": 0.6022489041356965,
"grad_norm": 3.025599718093872,
"learning_rate": 9.864865113367344e-06,
"loss": 1.2514,
"step": 1185
},
{
"epoch": 0.6027571310590178,
"grad_norm": 2.75777006149292,
"learning_rate": 9.864476605001561e-06,
"loss": 1.2296,
"step": 1186
},
{
"epoch": 0.6032653579823392,
"grad_norm": 2.9044742584228516,
"learning_rate": 9.864087546635181e-06,
"loss": 1.2544,
"step": 1187
},
{
"epoch": 0.6037735849056604,
"grad_norm": 3.1498332023620605,
"learning_rate": 9.86369793831219e-06,
"loss": 1.3202,
"step": 1188
},
{
"epoch": 0.6042818118289817,
"grad_norm": 3.185675859451294,
"learning_rate": 9.863307780076638e-06,
"loss": 1.2586,
"step": 1189
},
{
"epoch": 0.6047900387523029,
"grad_norm": 3.4412953853607178,
"learning_rate": 9.86291707197264e-06,
"loss": 1.3381,
"step": 1190
},
{
"epoch": 0.6052982656756242,
"grad_norm": 3.0474026203155518,
"learning_rate": 9.862525814044373e-06,
"loss": 1.2852,
"step": 1191
},
{
"epoch": 0.6058064925989455,
"grad_norm": 2.7538821697235107,
"learning_rate": 9.86213400633607e-06,
"loss": 1.2725,
"step": 1192
},
{
"epoch": 0.6063147195222667,
"grad_norm": 3.0935001373291016,
"learning_rate": 9.861741648892035e-06,
"loss": 1.2087,
"step": 1193
},
{
"epoch": 0.606822946445588,
"grad_norm": 2.796851396560669,
"learning_rate": 9.861348741756626e-06,
"loss": 1.2487,
"step": 1194
},
{
"epoch": 0.6073311733689092,
"grad_norm": 3.0847465991973877,
"learning_rate": 9.86095528497427e-06,
"loss": 1.2479,
"step": 1195
},
{
"epoch": 0.6078394002922305,
"grad_norm": 2.979198932647705,
"learning_rate": 9.860561278589452e-06,
"loss": 1.2393,
"step": 1196
},
{
"epoch": 0.6083476272155517,
"grad_norm": 3.056978464126587,
"learning_rate": 9.860166722646718e-06,
"loss": 1.1733,
"step": 1197
},
{
"epoch": 0.608855854138873,
"grad_norm": 2.78646183013916,
"learning_rate": 9.859771617190681e-06,
"loss": 1.2877,
"step": 1198
},
{
"epoch": 0.6093640810621943,
"grad_norm": 2.911860704421997,
"learning_rate": 9.859375962266014e-06,
"loss": 1.2914,
"step": 1199
},
{
"epoch": 0.6098723079855155,
"grad_norm": 2.7991490364074707,
"learning_rate": 9.85897975791745e-06,
"loss": 1.2194,
"step": 1200
},
{
"epoch": 0.6103805349088368,
"grad_norm": 2.8022921085357666,
"learning_rate": 9.858583004189785e-06,
"loss": 1.2472,
"step": 1201
},
{
"epoch": 0.610888761832158,
"grad_norm": 3.0368905067443848,
"learning_rate": 9.85818570112788e-06,
"loss": 1.3095,
"step": 1202
},
{
"epoch": 0.6113969887554793,
"grad_norm": 2.757432460784912,
"learning_rate": 9.857787848776656e-06,
"loss": 1.1634,
"step": 1203
},
{
"epoch": 0.6119052156788006,
"grad_norm": 3.2205071449279785,
"learning_rate": 9.857389447181093e-06,
"loss": 1.2799,
"step": 1204
},
{
"epoch": 0.6124134426021218,
"grad_norm": 3.149803876876831,
"learning_rate": 9.85699049638624e-06,
"loss": 1.312,
"step": 1205
},
{
"epoch": 0.6129216695254431,
"grad_norm": 2.9970386028289795,
"learning_rate": 9.8565909964372e-06,
"loss": 1.2576,
"step": 1206
},
{
"epoch": 0.6134298964487643,
"grad_norm": 3.1370797157287598,
"learning_rate": 9.856190947379148e-06,
"loss": 1.3491,
"step": 1207
},
{
"epoch": 0.6139381233720856,
"grad_norm": 3.0502049922943115,
"learning_rate": 9.855790349257311e-06,
"loss": 1.1822,
"step": 1208
},
{
"epoch": 0.614446350295407,
"grad_norm": 3.278427839279175,
"learning_rate": 9.855389202116983e-06,
"loss": 1.2727,
"step": 1209
},
{
"epoch": 0.6149545772187281,
"grad_norm": 3.1668384075164795,
"learning_rate": 9.85498750600352e-06,
"loss": 1.3367,
"step": 1210
},
{
"epoch": 0.6154628041420495,
"grad_norm": 2.8745815753936768,
"learning_rate": 9.85458526096234e-06,
"loss": 1.2038,
"step": 1211
},
{
"epoch": 0.6159710310653707,
"grad_norm": 2.781729221343994,
"learning_rate": 9.854182467038922e-06,
"loss": 1.224,
"step": 1212
},
{
"epoch": 0.616479257988692,
"grad_norm": 2.9090940952301025,
"learning_rate": 9.85377912427881e-06,
"loss": 1.2572,
"step": 1213
},
{
"epoch": 0.6169874849120132,
"grad_norm": 2.9433419704437256,
"learning_rate": 9.853375232727606e-06,
"loss": 1.1687,
"step": 1214
},
{
"epoch": 0.6174957118353345,
"grad_norm": 3.9726810455322266,
"learning_rate": 9.852970792430976e-06,
"loss": 1.1999,
"step": 1215
},
{
"epoch": 0.6180039387586558,
"grad_norm": 3.0864198207855225,
"learning_rate": 9.852565803434649e-06,
"loss": 1.2704,
"step": 1216
},
{
"epoch": 0.618512165681977,
"grad_norm": 2.8298897743225098,
"learning_rate": 9.852160265784411e-06,
"loss": 1.2681,
"step": 1217
},
{
"epoch": 0.6190203926052983,
"grad_norm": 2.9570887088775635,
"learning_rate": 9.851754179526118e-06,
"loss": 1.1922,
"step": 1218
},
{
"epoch": 0.6195286195286195,
"grad_norm": 2.864625930786133,
"learning_rate": 9.851347544705686e-06,
"loss": 1.2429,
"step": 1219
},
{
"epoch": 0.6200368464519408,
"grad_norm": 2.9287493228912354,
"learning_rate": 9.850940361369085e-06,
"loss": 1.1807,
"step": 1220
},
{
"epoch": 0.6205450733752621,
"grad_norm": 3.0884289741516113,
"learning_rate": 9.850532629562357e-06,
"loss": 1.3063,
"step": 1221
},
{
"epoch": 0.6210533002985833,
"grad_norm": 2.916370153427124,
"learning_rate": 9.850124349331602e-06,
"loss": 1.3281,
"step": 1222
},
{
"epoch": 0.6215615272219046,
"grad_norm": 2.9838948249816895,
"learning_rate": 9.84971552072298e-06,
"loss": 1.2799,
"step": 1223
},
{
"epoch": 0.6220697541452258,
"grad_norm": 2.813861846923828,
"learning_rate": 9.849306143782717e-06,
"loss": 1.2931,
"step": 1224
},
{
"epoch": 0.6225779810685471,
"grad_norm": 2.860564708709717,
"learning_rate": 9.848896218557098e-06,
"loss": 1.2828,
"step": 1225
},
{
"epoch": 0.6230862079918684,
"grad_norm": 2.733185291290283,
"learning_rate": 9.848485745092472e-06,
"loss": 1.1781,
"step": 1226
},
{
"epoch": 0.6235944349151896,
"grad_norm": 4.069754600524902,
"learning_rate": 9.848074723435248e-06,
"loss": 1.2646,
"step": 1227
},
{
"epoch": 0.6241026618385109,
"grad_norm": 2.9285528659820557,
"learning_rate": 9.8476631536319e-06,
"loss": 1.3353,
"step": 1228
},
{
"epoch": 0.6246108887618321,
"grad_norm": 2.9530718326568604,
"learning_rate": 9.84725103572896e-06,
"loss": 1.2233,
"step": 1229
},
{
"epoch": 0.6251191156851534,
"grad_norm": 2.9010536670684814,
"learning_rate": 9.846838369773024e-06,
"loss": 1.304,
"step": 1230
},
{
"epoch": 0.6256273426084746,
"grad_norm": 2.8730621337890625,
"learning_rate": 9.84642515581075e-06,
"loss": 1.2007,
"step": 1231
},
{
"epoch": 0.6261355695317959,
"grad_norm": 3.3889389038085938,
"learning_rate": 9.84601139388886e-06,
"loss": 1.3055,
"step": 1232
},
{
"epoch": 0.6266437964551173,
"grad_norm": 2.939222812652588,
"learning_rate": 9.845597084054135e-06,
"loss": 1.1747,
"step": 1233
},
{
"epoch": 0.6271520233784385,
"grad_norm": 3.0841636657714844,
"learning_rate": 9.845182226353415e-06,
"loss": 1.3309,
"step": 1234
},
{
"epoch": 0.6276602503017598,
"grad_norm": 3.2949295043945312,
"learning_rate": 9.844766820833613e-06,
"loss": 1.3251,
"step": 1235
},
{
"epoch": 0.628168477225081,
"grad_norm": 2.994581699371338,
"learning_rate": 9.84435086754169e-06,
"loss": 1.4239,
"step": 1236
},
{
"epoch": 0.6286767041484023,
"grad_norm": 2.904791831970215,
"learning_rate": 9.843934366524679e-06,
"loss": 1.1277,
"step": 1237
},
{
"epoch": 0.6291849310717236,
"grad_norm": 2.857452630996704,
"learning_rate": 9.843517317829672e-06,
"loss": 1.2775,
"step": 1238
},
{
"epoch": 0.6296931579950448,
"grad_norm": 3.0897974967956543,
"learning_rate": 9.84309972150382e-06,
"loss": 1.4043,
"step": 1239
},
{
"epoch": 0.6302013849183661,
"grad_norm": 2.9603357315063477,
"learning_rate": 9.84268157759434e-06,
"loss": 1.2107,
"step": 1240
},
{
"epoch": 0.6307096118416873,
"grad_norm": 3.1953182220458984,
"learning_rate": 9.842262886148509e-06,
"loss": 1.292,
"step": 1241
},
{
"epoch": 0.6312178387650086,
"grad_norm": 3.0074422359466553,
"learning_rate": 9.841843647213664e-06,
"loss": 1.3658,
"step": 1242
},
{
"epoch": 0.6317260656883298,
"grad_norm": 3.2771244049072266,
"learning_rate": 9.84142386083721e-06,
"loss": 1.2754,
"step": 1243
},
{
"epoch": 0.6322342926116511,
"grad_norm": 2.9563822746276855,
"learning_rate": 9.84100352706661e-06,
"loss": 1.2131,
"step": 1244
},
{
"epoch": 0.6327425195349724,
"grad_norm": 2.826014995574951,
"learning_rate": 9.840582645949388e-06,
"loss": 1.1562,
"step": 1245
},
{
"epoch": 0.6332507464582936,
"grad_norm": 2.9703335762023926,
"learning_rate": 9.840161217533129e-06,
"loss": 1.4529,
"step": 1246
},
{
"epoch": 0.6337589733816149,
"grad_norm": 2.9779446125030518,
"learning_rate": 9.83973924186548e-06,
"loss": 1.2196,
"step": 1247
},
{
"epoch": 0.6342672003049361,
"grad_norm": 2.989461898803711,
"learning_rate": 9.839316718994159e-06,
"loss": 1.2317,
"step": 1248
},
{
"epoch": 0.6347754272282574,
"grad_norm": 3.122593402862549,
"learning_rate": 9.838893648966931e-06,
"loss": 1.2885,
"step": 1249
},
{
"epoch": 0.6352836541515787,
"grad_norm": 2.9813296794891357,
"learning_rate": 9.838470031831632e-06,
"loss": 1.2475,
"step": 1250
},
{
"epoch": 0.6357918810748999,
"grad_norm": 3.026923894882202,
"learning_rate": 9.838045867636163e-06,
"loss": 1.2436,
"step": 1251
},
{
"epoch": 0.6363001079982212,
"grad_norm": 2.8064677715301514,
"learning_rate": 9.837621156428476e-06,
"loss": 1.2575,
"step": 1252
},
{
"epoch": 0.6368083349215424,
"grad_norm": 3.0424234867095947,
"learning_rate": 9.837195898256593e-06,
"loss": 1.288,
"step": 1253
},
{
"epoch": 0.6373165618448637,
"grad_norm": 2.877368688583374,
"learning_rate": 9.836770093168595e-06,
"loss": 1.2892,
"step": 1254
},
{
"epoch": 0.637824788768185,
"grad_norm": 3.133418560028076,
"learning_rate": 9.836343741212628e-06,
"loss": 1.3596,
"step": 1255
},
{
"epoch": 0.6383330156915062,
"grad_norm": 9.114967346191406,
"learning_rate": 9.835916842436895e-06,
"loss": 1.3345,
"step": 1256
},
{
"epoch": 0.6388412426148276,
"grad_norm": 3.0029051303863525,
"learning_rate": 9.835489396889663e-06,
"loss": 1.2896,
"step": 1257
},
{
"epoch": 0.6393494695381488,
"grad_norm": 3.1740221977233887,
"learning_rate": 9.835061404619263e-06,
"loss": 1.2226,
"step": 1258
},
{
"epoch": 0.6398576964614701,
"grad_norm": 3.1588032245635986,
"learning_rate": 9.834632865674084e-06,
"loss": 1.2797,
"step": 1259
},
{
"epoch": 0.6403659233847913,
"grad_norm": 2.870164394378662,
"learning_rate": 9.834203780102579e-06,
"loss": 1.3561,
"step": 1260
},
{
"epoch": 0.6408741503081126,
"grad_norm": 3.0878357887268066,
"learning_rate": 9.833774147953264e-06,
"loss": 1.3606,
"step": 1261
},
{
"epoch": 0.6413823772314339,
"grad_norm": 2.916350841522217,
"learning_rate": 9.833343969274712e-06,
"loss": 1.2902,
"step": 1262
},
{
"epoch": 0.6418906041547551,
"grad_norm": 3.019193172454834,
"learning_rate": 9.832913244115565e-06,
"loss": 1.3008,
"step": 1263
},
{
"epoch": 0.6423988310780764,
"grad_norm": 3.3435311317443848,
"learning_rate": 9.83248197252452e-06,
"loss": 1.2686,
"step": 1264
},
{
"epoch": 0.6429070580013976,
"grad_norm": 2.869995594024658,
"learning_rate": 9.832050154550338e-06,
"loss": 1.1683,
"step": 1265
},
{
"epoch": 0.6434152849247189,
"grad_norm": 2.8468031883239746,
"learning_rate": 9.831617790241845e-06,
"loss": 1.2572,
"step": 1266
},
{
"epoch": 0.6439235118480402,
"grad_norm": 2.917226552963257,
"learning_rate": 9.831184879647927e-06,
"loss": 1.3825,
"step": 1267
},
{
"epoch": 0.6444317387713614,
"grad_norm": 3.3933417797088623,
"learning_rate": 9.830751422817526e-06,
"loss": 1.3198,
"step": 1268
},
{
"epoch": 0.6449399656946827,
"grad_norm": 2.893857717514038,
"learning_rate": 9.830317419799654e-06,
"loss": 1.2115,
"step": 1269
},
{
"epoch": 0.6454481926180039,
"grad_norm": 3.2240967750549316,
"learning_rate": 9.82988287064338e-06,
"loss": 1.3072,
"step": 1270
},
{
"epoch": 0.6459564195413252,
"grad_norm": 2.896242141723633,
"learning_rate": 9.829447775397837e-06,
"loss": 1.3173,
"step": 1271
},
{
"epoch": 0.6464646464646465,
"grad_norm": 3.0197970867156982,
"learning_rate": 9.829012134112222e-06,
"loss": 1.2142,
"step": 1272
},
{
"epoch": 0.6469728733879677,
"grad_norm": 2.990753650665283,
"learning_rate": 9.828575946835786e-06,
"loss": 1.3508,
"step": 1273
},
{
"epoch": 0.647481100311289,
"grad_norm": 3.1516451835632324,
"learning_rate": 9.828139213617847e-06,
"loss": 1.2211,
"step": 1274
},
{
"epoch": 0.6479893272346102,
"grad_norm": 2.989999771118164,
"learning_rate": 9.827701934507785e-06,
"loss": 1.3364,
"step": 1275
},
{
"epoch": 0.6484975541579315,
"grad_norm": 2.891176700592041,
"learning_rate": 9.827264109555041e-06,
"loss": 1.2299,
"step": 1276
},
{
"epoch": 0.6490057810812527,
"grad_norm": 3.024106025695801,
"learning_rate": 9.826825738809119e-06,
"loss": 1.2658,
"step": 1277
},
{
"epoch": 0.649514008004574,
"grad_norm": 3.742095470428467,
"learning_rate": 9.826386822319582e-06,
"loss": 1.2443,
"step": 1278
},
{
"epoch": 0.6500222349278953,
"grad_norm": 3.057175397872925,
"learning_rate": 9.825947360136055e-06,
"loss": 1.2077,
"step": 1279
},
{
"epoch": 0.6505304618512165,
"grad_norm": 3.2410778999328613,
"learning_rate": 9.825507352308225e-06,
"loss": 1.2809,
"step": 1280
},
{
"epoch": 0.6510386887745379,
"grad_norm": 2.82974910736084,
"learning_rate": 9.825066798885843e-06,
"loss": 1.2053,
"step": 1281
},
{
"epoch": 0.651546915697859,
"grad_norm": 3.046499013900757,
"learning_rate": 9.824625699918723e-06,
"loss": 1.2027,
"step": 1282
},
{
"epoch": 0.6520551426211804,
"grad_norm": 3.305159330368042,
"learning_rate": 9.824184055456729e-06,
"loss": 1.3742,
"step": 1283
},
{
"epoch": 0.6525633695445017,
"grad_norm": 3.1315276622772217,
"learning_rate": 9.823741865549805e-06,
"loss": 1.2914,
"step": 1284
},
{
"epoch": 0.6530715964678229,
"grad_norm": 3.0194857120513916,
"learning_rate": 9.823299130247941e-06,
"loss": 1.2446,
"step": 1285
},
{
"epoch": 0.6535798233911442,
"grad_norm": 2.8847827911376953,
"learning_rate": 9.822855849601198e-06,
"loss": 1.3122,
"step": 1286
},
{
"epoch": 0.6540880503144654,
"grad_norm": 3.0671706199645996,
"learning_rate": 9.822412023659692e-06,
"loss": 1.2765,
"step": 1287
},
{
"epoch": 0.6545962772377867,
"grad_norm": 2.971421480178833,
"learning_rate": 9.82196765247361e-06,
"loss": 1.2641,
"step": 1288
},
{
"epoch": 0.6551045041611079,
"grad_norm": 2.988215923309326,
"learning_rate": 9.821522736093189e-06,
"loss": 1.3037,
"step": 1289
},
{
"epoch": 0.6556127310844292,
"grad_norm": 2.7589046955108643,
"learning_rate": 9.821077274568734e-06,
"loss": 1.056,
"step": 1290
},
{
"epoch": 0.6561209580077505,
"grad_norm": 2.976534366607666,
"learning_rate": 9.820631267950613e-06,
"loss": 1.1519,
"step": 1291
},
{
"epoch": 0.6566291849310717,
"grad_norm": 2.928953170776367,
"learning_rate": 9.820184716289252e-06,
"loss": 1.3055,
"step": 1292
},
{
"epoch": 0.657137411854393,
"grad_norm": 3.0303738117218018,
"learning_rate": 9.819737619635143e-06,
"loss": 1.2309,
"step": 1293
},
{
"epoch": 0.6576456387777142,
"grad_norm": 3.0870563983917236,
"learning_rate": 9.819289978038833e-06,
"loss": 1.3138,
"step": 1294
},
{
"epoch": 0.6581538657010355,
"grad_norm": 2.9288690090179443,
"learning_rate": 9.818841791550938e-06,
"loss": 1.2676,
"step": 1295
},
{
"epoch": 0.6586620926243568,
"grad_norm": 2.846304178237915,
"learning_rate": 9.818393060222128e-06,
"loss": 1.2641,
"step": 1296
},
{
"epoch": 0.659170319547678,
"grad_norm": 2.9624176025390625,
"learning_rate": 9.817943784103142e-06,
"loss": 1.2804,
"step": 1297
},
{
"epoch": 0.6596785464709993,
"grad_norm": 2.7913033962249756,
"learning_rate": 9.817493963244778e-06,
"loss": 1.3064,
"step": 1298
},
{
"epoch": 0.6601867733943205,
"grad_norm": 2.988194465637207,
"learning_rate": 9.81704359769789e-06,
"loss": 1.3552,
"step": 1299
},
{
"epoch": 0.6606950003176418,
"grad_norm": 5.625545978546143,
"learning_rate": 9.816592687513404e-06,
"loss": 1.2971,
"step": 1300
},
{
"epoch": 0.6612032272409631,
"grad_norm": 3.0586233139038086,
"learning_rate": 9.8161412327423e-06,
"loss": 1.4045,
"step": 1301
},
{
"epoch": 0.6617114541642843,
"grad_norm": 3.3030478954315186,
"learning_rate": 9.815689233435619e-06,
"loss": 1.2915,
"step": 1302
},
{
"epoch": 0.6622196810876056,
"grad_norm": 3.2344744205474854,
"learning_rate": 9.81523668964447e-06,
"loss": 1.199,
"step": 1303
},
{
"epoch": 0.6627279080109268,
"grad_norm": 2.973972797393799,
"learning_rate": 9.814783601420018e-06,
"loss": 1.3101,
"step": 1304
},
{
"epoch": 0.6632361349342482,
"grad_norm": 3.051959276199341,
"learning_rate": 9.814329968813493e-06,
"loss": 1.3287,
"step": 1305
},
{
"epoch": 0.6637443618575694,
"grad_norm": 3.0178143978118896,
"learning_rate": 9.81387579187618e-06,
"loss": 1.1582,
"step": 1306
},
{
"epoch": 0.6642525887808907,
"grad_norm": 2.748084306716919,
"learning_rate": 9.813421070659435e-06,
"loss": 1.1526,
"step": 1307
},
{
"epoch": 0.664760815704212,
"grad_norm": 3.0890631675720215,
"learning_rate": 9.81296580521467e-06,
"loss": 1.1412,
"step": 1308
},
{
"epoch": 0.6652690426275332,
"grad_norm": 3.0133931636810303,
"learning_rate": 9.812509995593357e-06,
"loss": 1.3093,
"step": 1309
},
{
"epoch": 0.6657772695508545,
"grad_norm": 2.998985528945923,
"learning_rate": 9.812053641847038e-06,
"loss": 1.2876,
"step": 1310
},
{
"epoch": 0.6662854964741757,
"grad_norm": 3.7526612281799316,
"learning_rate": 9.811596744027304e-06,
"loss": 1.3247,
"step": 1311
},
{
"epoch": 0.666793723397497,
"grad_norm": 3.112264394760132,
"learning_rate": 9.811139302185817e-06,
"loss": 1.2754,
"step": 1312
},
{
"epoch": 0.6673019503208183,
"grad_norm": 3.145580768585205,
"learning_rate": 9.810681316374296e-06,
"loss": 1.3328,
"step": 1313
},
{
"epoch": 0.6678101772441395,
"grad_norm": 2.926412343978882,
"learning_rate": 9.810222786644526e-06,
"loss": 1.2873,
"step": 1314
},
{
"epoch": 0.6683184041674608,
"grad_norm": 2.8454012870788574,
"learning_rate": 9.809763713048347e-06,
"loss": 1.2252,
"step": 1315
},
{
"epoch": 0.668826631090782,
"grad_norm": 3.048414945602417,
"learning_rate": 9.809304095637665e-06,
"loss": 1.2712,
"step": 1316
},
{
"epoch": 0.6693348580141033,
"grad_norm": 2.9404375553131104,
"learning_rate": 9.80884393446445e-06,
"loss": 1.1873,
"step": 1317
},
{
"epoch": 0.6698430849374246,
"grad_norm": 3.0222291946411133,
"learning_rate": 9.808383229580724e-06,
"loss": 1.27,
"step": 1318
},
{
"epoch": 0.6703513118607458,
"grad_norm": 3.297321081161499,
"learning_rate": 9.807921981038581e-06,
"loss": 1.2672,
"step": 1319
},
{
"epoch": 0.6708595387840671,
"grad_norm": 3.1562671661376953,
"learning_rate": 9.80746018889017e-06,
"loss": 1.2629,
"step": 1320
},
{
"epoch": 0.6713677657073883,
"grad_norm": 2.894879102706909,
"learning_rate": 9.806997853187705e-06,
"loss": 1.2885,
"step": 1321
},
{
"epoch": 0.6718759926307096,
"grad_norm": 2.8734283447265625,
"learning_rate": 9.806534973983458e-06,
"loss": 1.2711,
"step": 1322
},
{
"epoch": 0.6723842195540308,
"grad_norm": 2.9292004108428955,
"learning_rate": 9.806071551329766e-06,
"loss": 1.2032,
"step": 1323
},
{
"epoch": 0.6728924464773521,
"grad_norm": 2.841843843460083,
"learning_rate": 9.805607585279022e-06,
"loss": 1.2444,
"step": 1324
},
{
"epoch": 0.6734006734006734,
"grad_norm": 3.2029173374176025,
"learning_rate": 9.80514307588369e-06,
"loss": 1.2899,
"step": 1325
},
{
"epoch": 0.6739089003239946,
"grad_norm": 2.921074151992798,
"learning_rate": 9.804678023196286e-06,
"loss": 1.1842,
"step": 1326
},
{
"epoch": 0.674417127247316,
"grad_norm": 2.954253673553467,
"learning_rate": 9.80421242726939e-06,
"loss": 1.3056,
"step": 1327
},
{
"epoch": 0.6749253541706371,
"grad_norm": 3.026883840560913,
"learning_rate": 9.803746288155647e-06,
"loss": 1.2471,
"step": 1328
},
{
"epoch": 0.6754335810939585,
"grad_norm": 2.9767909049987793,
"learning_rate": 9.80327960590776e-06,
"loss": 1.3336,
"step": 1329
},
{
"epoch": 0.6759418080172798,
"grad_norm": 2.963109016418457,
"learning_rate": 9.802812380578495e-06,
"loss": 1.1492,
"step": 1330
},
{
"epoch": 0.676450034940601,
"grad_norm": 2.853429079055786,
"learning_rate": 9.802344612220677e-06,
"loss": 1.2281,
"step": 1331
},
{
"epoch": 0.6769582618639223,
"grad_norm": 2.979201316833496,
"learning_rate": 9.801876300887195e-06,
"loss": 1.2248,
"step": 1332
},
{
"epoch": 0.6774664887872435,
"grad_norm": 3.138261318206787,
"learning_rate": 9.801407446631e-06,
"loss": 1.4046,
"step": 1333
},
{
"epoch": 0.6779747157105648,
"grad_norm": 3.044326066970825,
"learning_rate": 9.8009380495051e-06,
"loss": 1.2961,
"step": 1334
},
{
"epoch": 0.678482942633886,
"grad_norm": 3.0363643169403076,
"learning_rate": 9.80046810956257e-06,
"loss": 1.349,
"step": 1335
},
{
"epoch": 0.6789911695572073,
"grad_norm": 2.967984914779663,
"learning_rate": 9.799997626856539e-06,
"loss": 1.2037,
"step": 1336
},
{
"epoch": 0.6794993964805286,
"grad_norm": 2.81664776802063,
"learning_rate": 9.799526601440207e-06,
"loss": 1.2094,
"step": 1337
},
{
"epoch": 0.6800076234038498,
"grad_norm": 3.0124945640563965,
"learning_rate": 9.79905503336683e-06,
"loss": 1.3336,
"step": 1338
},
{
"epoch": 0.6805158503271711,
"grad_norm": 2.7598769664764404,
"learning_rate": 9.798582922689724e-06,
"loss": 1.2539,
"step": 1339
},
{
"epoch": 0.6810240772504923,
"grad_norm": 3.0373761653900146,
"learning_rate": 9.798110269462266e-06,
"loss": 1.3217,
"step": 1340
},
{
"epoch": 0.6815323041738136,
"grad_norm": 3.097094774246216,
"learning_rate": 9.797637073737901e-06,
"loss": 1.2075,
"step": 1341
},
{
"epoch": 0.6820405310971349,
"grad_norm": 2.749882698059082,
"learning_rate": 9.797163335570127e-06,
"loss": 1.3328,
"step": 1342
},
{
"epoch": 0.6825487580204561,
"grad_norm": 3.4999477863311768,
"learning_rate": 9.79668905501251e-06,
"loss": 1.3211,
"step": 1343
},
{
"epoch": 0.6830569849437774,
"grad_norm": 3.1416807174682617,
"learning_rate": 9.796214232118672e-06,
"loss": 1.3246,
"step": 1344
},
{
"epoch": 0.6835652118670986,
"grad_norm": 2.8817014694213867,
"learning_rate": 9.7957388669423e-06,
"loss": 1.2774,
"step": 1345
},
{
"epoch": 0.6840734387904199,
"grad_norm": 2.8663389682769775,
"learning_rate": 9.795262959537143e-06,
"loss": 1.287,
"step": 1346
},
{
"epoch": 0.6845816657137412,
"grad_norm": 3.0212528705596924,
"learning_rate": 9.794786509957002e-06,
"loss": 1.1961,
"step": 1347
},
{
"epoch": 0.6850898926370624,
"grad_norm": 2.8918073177337646,
"learning_rate": 9.794309518255755e-06,
"loss": 1.192,
"step": 1348
},
{
"epoch": 0.6855981195603837,
"grad_norm": 2.9363107681274414,
"learning_rate": 9.79383198448733e-06,
"loss": 1.2341,
"step": 1349
},
{
"epoch": 0.6861063464837049,
"grad_norm": 2.7646443843841553,
"learning_rate": 9.793353908705716e-06,
"loss": 1.1832,
"step": 1350
},
{
"epoch": 0.6866145734070263,
"grad_norm": 2.9691295623779297,
"learning_rate": 9.792875290964971e-06,
"loss": 1.1755,
"step": 1351
},
{
"epoch": 0.6871228003303474,
"grad_norm": 2.821946382522583,
"learning_rate": 9.792396131319208e-06,
"loss": 1.263,
"step": 1352
},
{
"epoch": 0.6876310272536688,
"grad_norm": 2.7758054733276367,
"learning_rate": 9.791916429822604e-06,
"loss": 1.2741,
"step": 1353
},
{
"epoch": 0.6881392541769901,
"grad_norm": 3.110229730606079,
"learning_rate": 9.791436186529392e-06,
"loss": 1.2129,
"step": 1354
},
{
"epoch": 0.6886474811003113,
"grad_norm": 3.091493606567383,
"learning_rate": 9.790955401493878e-06,
"loss": 1.2326,
"step": 1355
},
{
"epoch": 0.6891557080236326,
"grad_norm": 2.8974857330322266,
"learning_rate": 9.790474074770415e-06,
"loss": 1.2713,
"step": 1356
},
{
"epoch": 0.6896639349469538,
"grad_norm": 3.016157627105713,
"learning_rate": 9.789992206413428e-06,
"loss": 1.2726,
"step": 1357
},
{
"epoch": 0.6901721618702751,
"grad_norm": 2.9709484577178955,
"learning_rate": 9.7895097964774e-06,
"loss": 1.4299,
"step": 1358
},
{
"epoch": 0.6906803887935964,
"grad_norm": 2.8930253982543945,
"learning_rate": 9.789026845016868e-06,
"loss": 1.2822,
"step": 1359
},
{
"epoch": 0.6911886157169176,
"grad_norm": 2.8750662803649902,
"learning_rate": 9.788543352086447e-06,
"loss": 1.2785,
"step": 1360
},
{
"epoch": 0.6916968426402389,
"grad_norm": 3.3684775829315186,
"learning_rate": 9.788059317740793e-06,
"loss": 1.3986,
"step": 1361
},
{
"epoch": 0.6922050695635601,
"grad_norm": 2.6956255435943604,
"learning_rate": 9.78757474203464e-06,
"loss": 1.2541,
"step": 1362
},
{
"epoch": 0.6927132964868814,
"grad_norm": 2.7483339309692383,
"learning_rate": 9.787089625022772e-06,
"loss": 1.2703,
"step": 1363
},
{
"epoch": 0.6932215234102026,
"grad_norm": 3.469676971435547,
"learning_rate": 9.786603966760042e-06,
"loss": 1.3139,
"step": 1364
},
{
"epoch": 0.6937297503335239,
"grad_norm": 2.8216028213500977,
"learning_rate": 9.786117767301359e-06,
"loss": 1.2917,
"step": 1365
},
{
"epoch": 0.6942379772568452,
"grad_norm": 2.97011399269104,
"learning_rate": 9.785631026701695e-06,
"loss": 1.2288,
"step": 1366
},
{
"epoch": 0.6947462041801664,
"grad_norm": 3.1733460426330566,
"learning_rate": 9.785143745016085e-06,
"loss": 1.3337,
"step": 1367
},
{
"epoch": 0.6952544311034877,
"grad_norm": 3.0609326362609863,
"learning_rate": 9.78465592229962e-06,
"loss": 1.1612,
"step": 1368
},
{
"epoch": 0.6957626580268089,
"grad_norm": 2.876577854156494,
"learning_rate": 9.78416755860746e-06,
"loss": 1.3396,
"step": 1369
},
{
"epoch": 0.6962708849501302,
"grad_norm": 2.9949982166290283,
"learning_rate": 9.783678653994817e-06,
"loss": 1.1953,
"step": 1370
},
{
"epoch": 0.6967791118734515,
"grad_norm": 3.092203140258789,
"learning_rate": 9.783189208516972e-06,
"loss": 1.1856,
"step": 1371
},
{
"epoch": 0.6972873387967727,
"grad_norm": 2.965151071548462,
"learning_rate": 9.782699222229264e-06,
"loss": 1.2374,
"step": 1372
},
{
"epoch": 0.697795565720094,
"grad_norm": 2.849785327911377,
"learning_rate": 9.78220869518709e-06,
"loss": 1.2187,
"step": 1373
},
{
"epoch": 0.6983037926434152,
"grad_norm": 3.1366140842437744,
"learning_rate": 9.781717627445915e-06,
"loss": 1.3324,
"step": 1374
},
{
"epoch": 0.6988120195667366,
"grad_norm": 2.859644889831543,
"learning_rate": 9.78122601906126e-06,
"loss": 1.2878,
"step": 1375
},
{
"epoch": 0.6993202464900579,
"grad_norm": 2.927549123764038,
"learning_rate": 9.780733870088708e-06,
"loss": 1.3861,
"step": 1376
},
{
"epoch": 0.6998284734133791,
"grad_norm": 2.8348424434661865,
"learning_rate": 9.780241180583905e-06,
"loss": 1.178,
"step": 1377
},
{
"epoch": 0.7003367003367004,
"grad_norm": 3.0390775203704834,
"learning_rate": 9.779747950602553e-06,
"loss": 1.312,
"step": 1378
},
{
"epoch": 0.7008449272600216,
"grad_norm": 3.0308146476745605,
"learning_rate": 9.779254180200426e-06,
"loss": 1.2044,
"step": 1379
},
{
"epoch": 0.7013531541833429,
"grad_norm": 2.860550880432129,
"learning_rate": 9.778759869433345e-06,
"loss": 1.3131,
"step": 1380
},
{
"epoch": 0.7018613811066641,
"grad_norm": 3.319129705429077,
"learning_rate": 9.778265018357203e-06,
"loss": 1.2236,
"step": 1381
},
{
"epoch": 0.7023696080299854,
"grad_norm": 2.9930241107940674,
"learning_rate": 9.77776962702795e-06,
"loss": 1.249,
"step": 1382
},
{
"epoch": 0.7028778349533067,
"grad_norm": 2.9247124195098877,
"learning_rate": 9.777273695501594e-06,
"loss": 1.2426,
"step": 1383
},
{
"epoch": 0.7033860618766279,
"grad_norm": 3.4090874195098877,
"learning_rate": 9.776777223834212e-06,
"loss": 1.1573,
"step": 1384
},
{
"epoch": 0.7038942887999492,
"grad_norm": 3.1676511764526367,
"learning_rate": 9.776280212081934e-06,
"loss": 1.2312,
"step": 1385
},
{
"epoch": 0.7044025157232704,
"grad_norm": 3.1893248558044434,
"learning_rate": 9.775782660300957e-06,
"loss": 1.2459,
"step": 1386
},
{
"epoch": 0.7049107426465917,
"grad_norm": 2.791271686553955,
"learning_rate": 9.775284568547536e-06,
"loss": 1.156,
"step": 1387
},
{
"epoch": 0.705418969569913,
"grad_norm": 3.0256097316741943,
"learning_rate": 9.774785936877983e-06,
"loss": 1.3832,
"step": 1388
},
{
"epoch": 0.7059271964932342,
"grad_norm": 3.114658832550049,
"learning_rate": 9.774286765348684e-06,
"loss": 1.3485,
"step": 1389
},
{
"epoch": 0.7064354234165555,
"grad_norm": 2.794233798980713,
"learning_rate": 9.77378705401607e-06,
"loss": 1.1272,
"step": 1390
},
{
"epoch": 0.7069436503398767,
"grad_norm": 3.010028123855591,
"learning_rate": 9.773286802936644e-06,
"loss": 1.2159,
"step": 1391
},
{
"epoch": 0.707451877263198,
"grad_norm": 2.803492307662964,
"learning_rate": 9.772786012166968e-06,
"loss": 1.1581,
"step": 1392
},
{
"epoch": 0.7079601041865193,
"grad_norm": 2.8336427211761475,
"learning_rate": 9.772284681763662e-06,
"loss": 1.2794,
"step": 1393
},
{
"epoch": 0.7084683311098405,
"grad_norm": 3.0411875247955322,
"learning_rate": 9.771782811783408e-06,
"loss": 1.2202,
"step": 1394
},
{
"epoch": 0.7089765580331618,
"grad_norm": 3.8096001148223877,
"learning_rate": 9.771280402282953e-06,
"loss": 1.3383,
"step": 1395
},
{
"epoch": 0.709484784956483,
"grad_norm": 3.175851821899414,
"learning_rate": 9.770777453319098e-06,
"loss": 1.3495,
"step": 1396
},
{
"epoch": 0.7099930118798043,
"grad_norm": 3.015300989151001,
"learning_rate": 9.77027396494871e-06,
"loss": 1.2694,
"step": 1397
},
{
"epoch": 0.7105012388031255,
"grad_norm": 4.530679225921631,
"learning_rate": 9.769769937228716e-06,
"loss": 1.2853,
"step": 1398
},
{
"epoch": 0.7110094657264469,
"grad_norm": 2.898129463195801,
"learning_rate": 9.769265370216106e-06,
"loss": 1.223,
"step": 1399
},
{
"epoch": 0.7115176926497682,
"grad_norm": 3.0743815898895264,
"learning_rate": 9.768760263967927e-06,
"loss": 1.2532,
"step": 1400
},
{
"epoch": 0.7120259195730894,
"grad_norm": 2.855799674987793,
"learning_rate": 9.768254618541287e-06,
"loss": 1.2243,
"step": 1401
},
{
"epoch": 0.7125341464964107,
"grad_norm": 2.8209400177001953,
"learning_rate": 9.767748433993357e-06,
"loss": 1.2282,
"step": 1402
},
{
"epoch": 0.7130423734197319,
"grad_norm": 2.9385292530059814,
"learning_rate": 9.767241710381372e-06,
"loss": 1.3617,
"step": 1403
},
{
"epoch": 0.7135506003430532,
"grad_norm": 2.8516132831573486,
"learning_rate": 9.76673444776262e-06,
"loss": 1.271,
"step": 1404
},
{
"epoch": 0.7140588272663745,
"grad_norm": 2.887547254562378,
"learning_rate": 9.766226646194459e-06,
"loss": 1.1764,
"step": 1405
},
{
"epoch": 0.7145670541896957,
"grad_norm": 2.8994688987731934,
"learning_rate": 9.765718305734299e-06,
"loss": 1.1985,
"step": 1406
},
{
"epoch": 0.715075281113017,
"grad_norm": 3.094647169113159,
"learning_rate": 9.765209426439619e-06,
"loss": 1.2047,
"step": 1407
},
{
"epoch": 0.7155835080363382,
"grad_norm": 3.0000064373016357,
"learning_rate": 9.764700008367952e-06,
"loss": 1.175,
"step": 1408
},
{
"epoch": 0.7160917349596595,
"grad_norm": 2.8988466262817383,
"learning_rate": 9.764190051576898e-06,
"loss": 1.2322,
"step": 1409
},
{
"epoch": 0.7165999618829807,
"grad_norm": 2.796241044998169,
"learning_rate": 9.763679556124115e-06,
"loss": 1.2739,
"step": 1410
},
{
"epoch": 0.717108188806302,
"grad_norm": 2.8092799186706543,
"learning_rate": 9.76316852206732e-06,
"loss": 1.2592,
"step": 1411
},
{
"epoch": 0.7176164157296233,
"grad_norm": 2.8349976539611816,
"learning_rate": 9.762656949464293e-06,
"loss": 1.2057,
"step": 1412
},
{
"epoch": 0.7181246426529445,
"grad_norm": 2.937993288040161,
"learning_rate": 9.762144838372879e-06,
"loss": 1.2728,
"step": 1413
},
{
"epoch": 0.7186328695762658,
"grad_norm": 2.7717621326446533,
"learning_rate": 9.761632188850973e-06,
"loss": 1.1492,
"step": 1414
},
{
"epoch": 0.719141096499587,
"grad_norm": 2.7713875770568848,
"learning_rate": 9.761119000956543e-06,
"loss": 1.1935,
"step": 1415
},
{
"epoch": 0.7196493234229083,
"grad_norm": 3.239586353302002,
"learning_rate": 9.76060527474761e-06,
"loss": 1.2105,
"step": 1416
},
{
"epoch": 0.7201575503462296,
"grad_norm": 2.891342878341675,
"learning_rate": 9.76009101028226e-06,
"loss": 1.2722,
"step": 1417
},
{
"epoch": 0.7206657772695508,
"grad_norm": 3.0239803791046143,
"learning_rate": 9.759576207618636e-06,
"loss": 1.2555,
"step": 1418
},
{
"epoch": 0.7211740041928721,
"grad_norm": 2.953406810760498,
"learning_rate": 9.759060866814944e-06,
"loss": 1.2832,
"step": 1419
},
{
"epoch": 0.7216822311161933,
"grad_norm": 2.8011319637298584,
"learning_rate": 9.758544987929453e-06,
"loss": 1.1223,
"step": 1420
},
{
"epoch": 0.7221904580395146,
"grad_norm": 2.819378137588501,
"learning_rate": 9.758028571020489e-06,
"loss": 1.2726,
"step": 1421
},
{
"epoch": 0.722698684962836,
"grad_norm": 2.6413331031799316,
"learning_rate": 9.757511616146441e-06,
"loss": 1.185,
"step": 1422
},
{
"epoch": 0.7232069118861572,
"grad_norm": 2.5989086627960205,
"learning_rate": 9.75699412336576e-06,
"loss": 1.2007,
"step": 1423
},
{
"epoch": 0.7237151388094785,
"grad_norm": 2.8236801624298096,
"learning_rate": 9.756476092736953e-06,
"loss": 1.1923,
"step": 1424
},
{
"epoch": 0.7242233657327997,
"grad_norm": 2.875715970993042,
"learning_rate": 9.755957524318592e-06,
"loss": 1.2214,
"step": 1425
},
{
"epoch": 0.724731592656121,
"grad_norm": 2.9543588161468506,
"learning_rate": 9.75543841816931e-06,
"loss": 1.232,
"step": 1426
},
{
"epoch": 0.7252398195794422,
"grad_norm": 3.108790874481201,
"learning_rate": 9.7549187743478e-06,
"loss": 1.2526,
"step": 1427
},
{
"epoch": 0.7257480465027635,
"grad_norm": 3.0500638484954834,
"learning_rate": 9.754398592912813e-06,
"loss": 1.2936,
"step": 1428
},
{
"epoch": 0.7262562734260848,
"grad_norm": 2.8262805938720703,
"learning_rate": 9.753877873923164e-06,
"loss": 1.1733,
"step": 1429
},
{
"epoch": 0.726764500349406,
"grad_norm": 3.081902265548706,
"learning_rate": 9.75335661743773e-06,
"loss": 1.2526,
"step": 1430
},
{
"epoch": 0.7272727272727273,
"grad_norm": 2.996305465698242,
"learning_rate": 9.752834823515444e-06,
"loss": 1.2552,
"step": 1431
},
{
"epoch": 0.7277809541960485,
"grad_norm": 3.2910454273223877,
"learning_rate": 9.752312492215304e-06,
"loss": 1.2484,
"step": 1432
},
{
"epoch": 0.7282891811193698,
"grad_norm": 3.036968469619751,
"learning_rate": 9.751789623596366e-06,
"loss": 1.2597,
"step": 1433
},
{
"epoch": 0.7287974080426911,
"grad_norm": 2.843050956726074,
"learning_rate": 9.75126621771775e-06,
"loss": 1.2877,
"step": 1434
},
{
"epoch": 0.7293056349660123,
"grad_norm": 2.860912561416626,
"learning_rate": 9.750742274638632e-06,
"loss": 1.2826,
"step": 1435
},
{
"epoch": 0.7298138618893336,
"grad_norm": 2.9277420043945312,
"learning_rate": 9.750217794418254e-06,
"loss": 1.241,
"step": 1436
},
{
"epoch": 0.7303220888126548,
"grad_norm": 2.8361499309539795,
"learning_rate": 9.749692777115916e-06,
"loss": 1.2782,
"step": 1437
},
{
"epoch": 0.7308303157359761,
"grad_norm": 2.8240644931793213,
"learning_rate": 9.749167222790976e-06,
"loss": 1.1875,
"step": 1438
},
{
"epoch": 0.7313385426592974,
"grad_norm": 3.042060613632202,
"learning_rate": 9.748641131502858e-06,
"loss": 1.267,
"step": 1439
},
{
"epoch": 0.7318467695826186,
"grad_norm": 3.223292827606201,
"learning_rate": 9.748114503311045e-06,
"loss": 1.2628,
"step": 1440
},
{
"epoch": 0.7323549965059399,
"grad_norm": 2.960662841796875,
"learning_rate": 9.74758733827508e-06,
"loss": 1.2386,
"step": 1441
},
{
"epoch": 0.7328632234292611,
"grad_norm": 3.0385453701019287,
"learning_rate": 9.747059636454566e-06,
"loss": 1.1821,
"step": 1442
},
{
"epoch": 0.7333714503525824,
"grad_norm": 2.8012921810150146,
"learning_rate": 9.746531397909165e-06,
"loss": 1.1459,
"step": 1443
},
{
"epoch": 0.7338796772759036,
"grad_norm": 2.8723814487457275,
"learning_rate": 9.746002622698607e-06,
"loss": 1.227,
"step": 1444
},
{
"epoch": 0.734387904199225,
"grad_norm": 2.9052135944366455,
"learning_rate": 9.745473310882674e-06,
"loss": 1.2176,
"step": 1445
},
{
"epoch": 0.7348961311225463,
"grad_norm": 2.8227717876434326,
"learning_rate": 9.744943462521214e-06,
"loss": 1.2584,
"step": 1446
},
{
"epoch": 0.7354043580458675,
"grad_norm": 2.986020565032959,
"learning_rate": 9.744413077674134e-06,
"loss": 1.2,
"step": 1447
},
{
"epoch": 0.7359125849691888,
"grad_norm": 3.091575860977173,
"learning_rate": 9.7438821564014e-06,
"loss": 1.1782,
"step": 1448
},
{
"epoch": 0.73642081189251,
"grad_norm": 2.812776565551758,
"learning_rate": 9.743350698763046e-06,
"loss": 1.2385,
"step": 1449
},
{
"epoch": 0.7369290388158313,
"grad_norm": 3.120871067047119,
"learning_rate": 9.742818704819155e-06,
"loss": 1.2487,
"step": 1450
},
{
"epoch": 0.7374372657391526,
"grad_norm": 2.802520513534546,
"learning_rate": 9.742286174629879e-06,
"loss": 1.2003,
"step": 1451
},
{
"epoch": 0.7379454926624738,
"grad_norm": 3.259707450866699,
"learning_rate": 9.741753108255429e-06,
"loss": 1.2654,
"step": 1452
},
{
"epoch": 0.7384537195857951,
"grad_norm": 2.960662841796875,
"learning_rate": 9.741219505756074e-06,
"loss": 1.2144,
"step": 1453
},
{
"epoch": 0.7389619465091163,
"grad_norm": 3.017399787902832,
"learning_rate": 9.740685367192149e-06,
"loss": 1.1627,
"step": 1454
},
{
"epoch": 0.7394701734324376,
"grad_norm": 2.763535737991333,
"learning_rate": 9.740150692624044e-06,
"loss": 1.2747,
"step": 1455
},
{
"epoch": 0.7399784003557588,
"grad_norm": 2.646120309829712,
"learning_rate": 9.73961548211221e-06,
"loss": 1.1098,
"step": 1456
},
{
"epoch": 0.7404866272790801,
"grad_norm": 3.0598561763763428,
"learning_rate": 9.739079735717165e-06,
"loss": 1.2503,
"step": 1457
},
{
"epoch": 0.7409948542024014,
"grad_norm": 3.1667909622192383,
"learning_rate": 9.738543453499478e-06,
"loss": 1.2446,
"step": 1458
},
{
"epoch": 0.7415030811257226,
"grad_norm": 3.006512403488159,
"learning_rate": 9.738006635519788e-06,
"loss": 1.2218,
"step": 1459
},
{
"epoch": 0.7420113080490439,
"grad_norm": 3.4957993030548096,
"learning_rate": 9.737469281838786e-06,
"loss": 1.32,
"step": 1460
},
{
"epoch": 0.7425195349723651,
"grad_norm": 3.0907366275787354,
"learning_rate": 9.736931392517234e-06,
"loss": 1.2451,
"step": 1461
},
{
"epoch": 0.7430277618956864,
"grad_norm": 3.0201332569122314,
"learning_rate": 9.736392967615941e-06,
"loss": 1.2959,
"step": 1462
},
{
"epoch": 0.7435359888190077,
"grad_norm": 2.7725820541381836,
"learning_rate": 9.735854007195789e-06,
"loss": 1.2061,
"step": 1463
},
{
"epoch": 0.7440442157423289,
"grad_norm": 3.0488088130950928,
"learning_rate": 9.735314511317711e-06,
"loss": 1.2159,
"step": 1464
},
{
"epoch": 0.7445524426656502,
"grad_norm": 3.0015316009521484,
"learning_rate": 9.73477448004271e-06,
"loss": 1.3594,
"step": 1465
},
{
"epoch": 0.7450606695889714,
"grad_norm": 3.141895294189453,
"learning_rate": 9.73423391343184e-06,
"loss": 1.297,
"step": 1466
},
{
"epoch": 0.7455688965122927,
"grad_norm": 2.7780303955078125,
"learning_rate": 9.733692811546222e-06,
"loss": 1.1672,
"step": 1467
},
{
"epoch": 0.746077123435614,
"grad_norm": 2.9647746086120605,
"learning_rate": 9.733151174447038e-06,
"loss": 1.3291,
"step": 1468
},
{
"epoch": 0.7465853503589353,
"grad_norm": 3.054515838623047,
"learning_rate": 9.732609002195523e-06,
"loss": 1.2656,
"step": 1469
},
{
"epoch": 0.7470935772822566,
"grad_norm": 2.7921688556671143,
"learning_rate": 9.73206629485298e-06,
"loss": 1.2288,
"step": 1470
},
{
"epoch": 0.7476018042055778,
"grad_norm": 3.1555871963500977,
"learning_rate": 9.731523052480772e-06,
"loss": 1.2941,
"step": 1471
},
{
"epoch": 0.7481100311288991,
"grad_norm": 3.1695942878723145,
"learning_rate": 9.730979275140318e-06,
"loss": 1.3829,
"step": 1472
},
{
"epoch": 0.7486182580522203,
"grad_norm": 2.928703546524048,
"learning_rate": 9.730434962893098e-06,
"loss": 1.143,
"step": 1473
},
{
"epoch": 0.7491264849755416,
"grad_norm": 2.8269565105438232,
"learning_rate": 9.72989011580066e-06,
"loss": 1.1911,
"step": 1474
},
{
"epoch": 0.7496347118988629,
"grad_norm": 2.864147663116455,
"learning_rate": 9.729344733924603e-06,
"loss": 1.3372,
"step": 1475
},
{
"epoch": 0.7501429388221841,
"grad_norm": 2.9000654220581055,
"learning_rate": 9.728798817326592e-06,
"loss": 1.2584,
"step": 1476
},
{
"epoch": 0.7506511657455054,
"grad_norm": 2.9683735370635986,
"learning_rate": 9.72825236606835e-06,
"loss": 1.2438,
"step": 1477
},
{
"epoch": 0.7511593926688266,
"grad_norm": 3.1077730655670166,
"learning_rate": 9.727705380211662e-06,
"loss": 1.2655,
"step": 1478
},
{
"epoch": 0.7516676195921479,
"grad_norm": 2.839165687561035,
"learning_rate": 9.727157859818372e-06,
"loss": 1.2896,
"step": 1479
},
{
"epoch": 0.7521758465154692,
"grad_norm": 2.8478798866271973,
"learning_rate": 9.726609804950388e-06,
"loss": 1.2452,
"step": 1480
},
{
"epoch": 0.7526840734387904,
"grad_norm": 3.012943744659424,
"learning_rate": 9.72606121566967e-06,
"loss": 1.2447,
"step": 1481
},
{
"epoch": 0.7531923003621117,
"grad_norm": 2.7149770259857178,
"learning_rate": 9.725512092038251e-06,
"loss": 1.1905,
"step": 1482
},
{
"epoch": 0.7537005272854329,
"grad_norm": 2.8013172149658203,
"learning_rate": 9.724962434118213e-06,
"loss": 1.0993,
"step": 1483
},
{
"epoch": 0.7542087542087542,
"grad_norm": 2.8769729137420654,
"learning_rate": 9.724412241971703e-06,
"loss": 1.3132,
"step": 1484
},
{
"epoch": 0.7547169811320755,
"grad_norm": 2.906467914581299,
"learning_rate": 9.723861515660931e-06,
"loss": 1.2811,
"step": 1485
},
{
"epoch": 0.7552252080553967,
"grad_norm": 2.7540318965911865,
"learning_rate": 9.72331025524816e-06,
"loss": 1.2457,
"step": 1486
},
{
"epoch": 0.755733434978718,
"grad_norm": 3.0037455558776855,
"learning_rate": 9.722758460795723e-06,
"loss": 1.2976,
"step": 1487
},
{
"epoch": 0.7562416619020392,
"grad_norm": 3.0428314208984375,
"learning_rate": 9.722206132366008e-06,
"loss": 1.2379,
"step": 1488
},
{
"epoch": 0.7567498888253605,
"grad_norm": 2.7325022220611572,
"learning_rate": 9.721653270021461e-06,
"loss": 1.2126,
"step": 1489
},
{
"epoch": 0.7572581157486817,
"grad_norm": 2.63283371925354,
"learning_rate": 9.72109987382459e-06,
"loss": 1.2667,
"step": 1490
},
{
"epoch": 0.757766342672003,
"grad_norm": 2.848900556564331,
"learning_rate": 9.720545943837972e-06,
"loss": 1.2651,
"step": 1491
},
{
"epoch": 0.7582745695953244,
"grad_norm": 2.9327495098114014,
"learning_rate": 9.71999148012423e-06,
"loss": 1.2489,
"step": 1492
},
{
"epoch": 0.7587827965186456,
"grad_norm": 3.18332576751709,
"learning_rate": 9.719436482746054e-06,
"loss": 1.3644,
"step": 1493
},
{
"epoch": 0.7592910234419669,
"grad_norm": 2.8493423461914062,
"learning_rate": 9.718880951766201e-06,
"loss": 1.1427,
"step": 1494
},
{
"epoch": 0.7597992503652881,
"grad_norm": 3.0256540775299072,
"learning_rate": 9.718324887247475e-06,
"loss": 1.3127,
"step": 1495
},
{
"epoch": 0.7603074772886094,
"grad_norm": 2.7205774784088135,
"learning_rate": 9.717768289252752e-06,
"loss": 1.1484,
"step": 1496
},
{
"epoch": 0.7608157042119307,
"grad_norm": 2.971435546875,
"learning_rate": 9.717211157844962e-06,
"loss": 1.2894,
"step": 1497
},
{
"epoch": 0.7613239311352519,
"grad_norm": 3.055706262588501,
"learning_rate": 9.716653493087096e-06,
"loss": 1.2505,
"step": 1498
},
{
"epoch": 0.7618321580585732,
"grad_norm": 2.809715747833252,
"learning_rate": 9.716095295042207e-06,
"loss": 1.1809,
"step": 1499
},
{
"epoch": 0.7623403849818944,
"grad_norm": 2.8183910846710205,
"learning_rate": 9.715536563773407e-06,
"loss": 1.148,
"step": 1500
},
{
"epoch": 0.7623403849818944,
"eval_loss": 1.2643159627914429,
"eval_runtime": 12.322,
"eval_samples_per_second": 32.462,
"eval_steps_per_second": 4.058,
"step": 1500
},
{
"epoch": 0.7628486119052157,
"grad_norm": 2.898142099380493,
"learning_rate": 9.71497729934387e-06,
"loss": 1.2616,
"step": 1501
},
{
"epoch": 0.7633568388285369,
"grad_norm": 2.7970736026763916,
"learning_rate": 9.714417501816826e-06,
"loss": 1.2414,
"step": 1502
},
{
"epoch": 0.7638650657518582,
"grad_norm": 2.9098377227783203,
"learning_rate": 9.713857171255574e-06,
"loss": 1.2983,
"step": 1503
},
{
"epoch": 0.7643732926751795,
"grad_norm": 2.860549211502075,
"learning_rate": 9.713296307723463e-06,
"loss": 1.1495,
"step": 1504
},
{
"epoch": 0.7648815195985007,
"grad_norm": 2.819836378097534,
"learning_rate": 9.712734911283907e-06,
"loss": 1.1737,
"step": 1505
},
{
"epoch": 0.765389746521822,
"grad_norm": 3.5737171173095703,
"learning_rate": 9.712172982000382e-06,
"loss": 1.3854,
"step": 1506
},
{
"epoch": 0.7658979734451432,
"grad_norm": 3.0363149642944336,
"learning_rate": 9.71161051993642e-06,
"loss": 1.2698,
"step": 1507
},
{
"epoch": 0.7664062003684645,
"grad_norm": 3.0048258304595947,
"learning_rate": 9.711047525155619e-06,
"loss": 1.3692,
"step": 1508
},
{
"epoch": 0.7669144272917858,
"grad_norm": 2.9466333389282227,
"learning_rate": 9.710483997721633e-06,
"loss": 1.2379,
"step": 1509
},
{
"epoch": 0.767422654215107,
"grad_norm": 2.9100375175476074,
"learning_rate": 9.709919937698175e-06,
"loss": 1.1373,
"step": 1510
},
{
"epoch": 0.7679308811384283,
"grad_norm": 2.9696006774902344,
"learning_rate": 9.70935534514902e-06,
"loss": 1.2764,
"step": 1511
},
{
"epoch": 0.7684391080617495,
"grad_norm": 2.826723098754883,
"learning_rate": 9.708790220138007e-06,
"loss": 1.2072,
"step": 1512
},
{
"epoch": 0.7689473349850708,
"grad_norm": 3.223733425140381,
"learning_rate": 9.708224562729027e-06,
"loss": 1.2815,
"step": 1513
},
{
"epoch": 0.7694555619083921,
"grad_norm": 2.8028769493103027,
"learning_rate": 9.70765837298604e-06,
"loss": 1.2197,
"step": 1514
},
{
"epoch": 0.7699637888317133,
"grad_norm": 2.8905370235443115,
"learning_rate": 9.707091650973061e-06,
"loss": 1.3065,
"step": 1515
},
{
"epoch": 0.7704720157550347,
"grad_norm": 2.9921021461486816,
"learning_rate": 9.706524396754164e-06,
"loss": 1.3296,
"step": 1516
},
{
"epoch": 0.7709802426783559,
"grad_norm": 2.9344661235809326,
"learning_rate": 9.70595661039349e-06,
"loss": 1.4179,
"step": 1517
},
{
"epoch": 0.7714884696016772,
"grad_norm": 2.6728525161743164,
"learning_rate": 9.70538829195523e-06,
"loss": 1.2245,
"step": 1518
},
{
"epoch": 0.7719966965249984,
"grad_norm": 2.7900071144104004,
"learning_rate": 9.704819441503646e-06,
"loss": 1.1504,
"step": 1519
},
{
"epoch": 0.7725049234483197,
"grad_norm": 3.0739340782165527,
"learning_rate": 9.704250059103051e-06,
"loss": 1.2744,
"step": 1520
},
{
"epoch": 0.773013150371641,
"grad_norm": 2.846035957336426,
"learning_rate": 9.703680144817821e-06,
"loss": 1.0986,
"step": 1521
},
{
"epoch": 0.7735213772949622,
"grad_norm": 3.0878632068634033,
"learning_rate": 9.703109698712401e-06,
"loss": 1.324,
"step": 1522
},
{
"epoch": 0.7740296042182835,
"grad_norm": 2.9029667377471924,
"learning_rate": 9.702538720851279e-06,
"loss": 1.2852,
"step": 1523
},
{
"epoch": 0.7745378311416047,
"grad_norm": 2.980501890182495,
"learning_rate": 9.701967211299017e-06,
"loss": 1.2395,
"step": 1524
},
{
"epoch": 0.775046058064926,
"grad_norm": 2.8804404735565186,
"learning_rate": 9.701395170120233e-06,
"loss": 1.1636,
"step": 1525
},
{
"epoch": 0.7755542849882473,
"grad_norm": 2.804990768432617,
"learning_rate": 9.700822597379604e-06,
"loss": 1.0939,
"step": 1526
},
{
"epoch": 0.7760625119115685,
"grad_norm": 2.904367208480835,
"learning_rate": 9.700249493141867e-06,
"loss": 1.3072,
"step": 1527
},
{
"epoch": 0.7765707388348898,
"grad_norm": 3.0249783992767334,
"learning_rate": 9.69967585747182e-06,
"loss": 1.274,
"step": 1528
},
{
"epoch": 0.777078965758211,
"grad_norm": 2.8509297370910645,
"learning_rate": 9.69910169043432e-06,
"loss": 1.2317,
"step": 1529
},
{
"epoch": 0.7775871926815323,
"grad_norm": 3.515911102294922,
"learning_rate": 9.698526992094288e-06,
"loss": 1.2212,
"step": 1530
},
{
"epoch": 0.7780954196048536,
"grad_norm": 2.891103982925415,
"learning_rate": 9.6979517625167e-06,
"loss": 1.2583,
"step": 1531
},
{
"epoch": 0.7786036465281748,
"grad_norm": 2.970613956451416,
"learning_rate": 9.697376001766595e-06,
"loss": 1.1725,
"step": 1532
},
{
"epoch": 0.7791118734514961,
"grad_norm": 2.938046932220459,
"learning_rate": 9.69679970990907e-06,
"loss": 1.2778,
"step": 1533
},
{
"epoch": 0.7796201003748173,
"grad_norm": 2.8662068843841553,
"learning_rate": 9.696222887009283e-06,
"loss": 1.2765,
"step": 1534
},
{
"epoch": 0.7801283272981386,
"grad_norm": 2.9136219024658203,
"learning_rate": 9.695645533132455e-06,
"loss": 1.2756,
"step": 1535
},
{
"epoch": 0.7806365542214598,
"grad_norm": 2.9310011863708496,
"learning_rate": 9.695067648343862e-06,
"loss": 1.2819,
"step": 1536
},
{
"epoch": 0.7811447811447811,
"grad_norm": 3.0941317081451416,
"learning_rate": 9.694489232708843e-06,
"loss": 1.2342,
"step": 1537
},
{
"epoch": 0.7816530080681025,
"grad_norm": 2.9651567935943604,
"learning_rate": 9.693910286292797e-06,
"loss": 1.3028,
"step": 1538
},
{
"epoch": 0.7821612349914236,
"grad_norm": 2.940019130706787,
"learning_rate": 9.69333080916118e-06,
"loss": 1.1719,
"step": 1539
},
{
"epoch": 0.782669461914745,
"grad_norm": 2.8346259593963623,
"learning_rate": 9.692750801379514e-06,
"loss": 1.3167,
"step": 1540
},
{
"epoch": 0.7831776888380662,
"grad_norm": 2.784411907196045,
"learning_rate": 9.692170263013376e-06,
"loss": 1.2454,
"step": 1541
},
{
"epoch": 0.7836859157613875,
"grad_norm": 2.9267518520355225,
"learning_rate": 9.691589194128403e-06,
"loss": 1.219,
"step": 1542
},
{
"epoch": 0.7841941426847088,
"grad_norm": 2.6732523441314697,
"learning_rate": 9.691007594790295e-06,
"loss": 1.2958,
"step": 1543
},
{
"epoch": 0.78470236960803,
"grad_norm": 3.058943510055542,
"learning_rate": 9.69042546506481e-06,
"loss": 1.3182,
"step": 1544
},
{
"epoch": 0.7852105965313513,
"grad_norm": 2.853072166442871,
"learning_rate": 9.689842805017765e-06,
"loss": 1.2758,
"step": 1545
},
{
"epoch": 0.7857188234546725,
"grad_norm": 3.0760834217071533,
"learning_rate": 9.689259614715039e-06,
"loss": 1.2394,
"step": 1546
},
{
"epoch": 0.7862270503779938,
"grad_norm": 2.931668758392334,
"learning_rate": 9.688675894222572e-06,
"loss": 1.3268,
"step": 1547
},
{
"epoch": 0.786735277301315,
"grad_norm": 2.7671284675598145,
"learning_rate": 9.68809164360636e-06,
"loss": 1.2555,
"step": 1548
},
{
"epoch": 0.7872435042246363,
"grad_norm": 3.0845117568969727,
"learning_rate": 9.687506862932464e-06,
"loss": 1.2875,
"step": 1549
},
{
"epoch": 0.7877517311479576,
"grad_norm": 3.1043455600738525,
"learning_rate": 9.686921552266997e-06,
"loss": 1.2578,
"step": 1550
},
{
"epoch": 0.7882599580712788,
"grad_norm": 2.8478760719299316,
"learning_rate": 9.686335711676142e-06,
"loss": 1.2669,
"step": 1551
},
{
"epoch": 0.7887681849946001,
"grad_norm": 2.740041494369507,
"learning_rate": 9.685749341226134e-06,
"loss": 1.2157,
"step": 1552
},
{
"epoch": 0.7892764119179213,
"grad_norm": 2.8490264415740967,
"learning_rate": 9.685162440983272e-06,
"loss": 1.2503,
"step": 1553
},
{
"epoch": 0.7897846388412426,
"grad_norm": 2.845862865447998,
"learning_rate": 9.684575011013912e-06,
"loss": 1.3621,
"step": 1554
},
{
"epoch": 0.7902928657645639,
"grad_norm": 2.9016470909118652,
"learning_rate": 9.683987051384475e-06,
"loss": 1.3163,
"step": 1555
},
{
"epoch": 0.7908010926878851,
"grad_norm": 3.1869518756866455,
"learning_rate": 9.683398562161434e-06,
"loss": 1.302,
"step": 1556
},
{
"epoch": 0.7913093196112064,
"grad_norm": 3.030754327774048,
"learning_rate": 9.68280954341133e-06,
"loss": 1.3103,
"step": 1557
},
{
"epoch": 0.7918175465345276,
"grad_norm": 3.1585705280303955,
"learning_rate": 9.68221999520076e-06,
"loss": 1.37,
"step": 1558
},
{
"epoch": 0.7923257734578489,
"grad_norm": 2.867959976196289,
"learning_rate": 9.68162991759638e-06,
"loss": 1.17,
"step": 1559
},
{
"epoch": 0.7928340003811702,
"grad_norm": 3.2136871814727783,
"learning_rate": 9.681039310664906e-06,
"loss": 1.2515,
"step": 1560
},
{
"epoch": 0.7933422273044914,
"grad_norm": 3.129521608352661,
"learning_rate": 9.680448174473116e-06,
"loss": 1.2155,
"step": 1561
},
{
"epoch": 0.7938504542278128,
"grad_norm": 2.799604654312134,
"learning_rate": 9.679856509087847e-06,
"loss": 1.2057,
"step": 1562
},
{
"epoch": 0.794358681151134,
"grad_norm": 2.9921875,
"learning_rate": 9.679264314575996e-06,
"loss": 1.2361,
"step": 1563
},
{
"epoch": 0.7948669080744553,
"grad_norm": 2.982118606567383,
"learning_rate": 9.678671591004517e-06,
"loss": 1.2876,
"step": 1564
},
{
"epoch": 0.7953751349977765,
"grad_norm": 2.834472179412842,
"learning_rate": 9.678078338440426e-06,
"loss": 1.1996,
"step": 1565
},
{
"epoch": 0.7958833619210978,
"grad_norm": 2.7313015460968018,
"learning_rate": 9.677484556950802e-06,
"loss": 1.1582,
"step": 1566
},
{
"epoch": 0.7963915888444191,
"grad_norm": 2.772125244140625,
"learning_rate": 9.676890246602778e-06,
"loss": 1.1159,
"step": 1567
},
{
"epoch": 0.7968998157677403,
"grad_norm": 2.912230968475342,
"learning_rate": 9.676295407463551e-06,
"loss": 1.2765,
"step": 1568
},
{
"epoch": 0.7974080426910616,
"grad_norm": 2.979102611541748,
"learning_rate": 9.675700039600377e-06,
"loss": 1.3157,
"step": 1569
},
{
"epoch": 0.7979162696143828,
"grad_norm": 2.7840914726257324,
"learning_rate": 9.675104143080569e-06,
"loss": 1.1945,
"step": 1570
},
{
"epoch": 0.7984244965377041,
"grad_norm": 2.832731008529663,
"learning_rate": 9.674507717971502e-06,
"loss": 1.2942,
"step": 1571
},
{
"epoch": 0.7989327234610254,
"grad_norm": 2.896554470062256,
"learning_rate": 9.673910764340613e-06,
"loss": 1.2832,
"step": 1572
},
{
"epoch": 0.7994409503843466,
"grad_norm": 2.8940999507904053,
"learning_rate": 9.673313282255395e-06,
"loss": 1.2314,
"step": 1573
},
{
"epoch": 0.7999491773076679,
"grad_norm": 2.7886762619018555,
"learning_rate": 9.6727152717834e-06,
"loss": 1.227,
"step": 1574
},
{
"epoch": 0.8004574042309891,
"grad_norm": 2.9096152782440186,
"learning_rate": 9.672116732992245e-06,
"loss": 1.211,
"step": 1575
},
{
"epoch": 0.8009656311543104,
"grad_norm": 3.0253443717956543,
"learning_rate": 9.6715176659496e-06,
"loss": 1.2943,
"step": 1576
},
{
"epoch": 0.8014738580776317,
"grad_norm": 3.041499376296997,
"learning_rate": 9.670918070723206e-06,
"loss": 1.2964,
"step": 1577
},
{
"epoch": 0.8019820850009529,
"grad_norm": 3.052034378051758,
"learning_rate": 9.670317947380847e-06,
"loss": 1.2971,
"step": 1578
},
{
"epoch": 0.8024903119242742,
"grad_norm": 2.8331234455108643,
"learning_rate": 9.66971729599038e-06,
"loss": 1.2349,
"step": 1579
},
{
"epoch": 0.8029985388475954,
"grad_norm": 2.987531900405884,
"learning_rate": 9.669116116619717e-06,
"loss": 1.2844,
"step": 1580
},
{
"epoch": 0.8035067657709167,
"grad_norm": 3.0655086040496826,
"learning_rate": 9.668514409336831e-06,
"loss": 1.2412,
"step": 1581
},
{
"epoch": 0.8040149926942379,
"grad_norm": 2.681715965270996,
"learning_rate": 9.667912174209753e-06,
"loss": 1.1691,
"step": 1582
},
{
"epoch": 0.8045232196175592,
"grad_norm": 2.923539876937866,
"learning_rate": 9.667309411306574e-06,
"loss": 1.3403,
"step": 1583
},
{
"epoch": 0.8050314465408805,
"grad_norm": 2.8867475986480713,
"learning_rate": 9.666706120695447e-06,
"loss": 1.336,
"step": 1584
},
{
"epoch": 0.8055396734642017,
"grad_norm": 2.9885010719299316,
"learning_rate": 9.66610230244458e-06,
"loss": 1.2957,
"step": 1585
},
{
"epoch": 0.806047900387523,
"grad_norm": 2.730257749557495,
"learning_rate": 9.665497956622247e-06,
"loss": 1.1617,
"step": 1586
},
{
"epoch": 0.8065561273108443,
"grad_norm": 3.0298240184783936,
"learning_rate": 9.664893083296777e-06,
"loss": 1.3732,
"step": 1587
},
{
"epoch": 0.8070643542341656,
"grad_norm": 2.7434775829315186,
"learning_rate": 9.664287682536558e-06,
"loss": 1.1253,
"step": 1588
},
{
"epoch": 0.8075725811574869,
"grad_norm": 2.753551483154297,
"learning_rate": 9.663681754410038e-06,
"loss": 1.2321,
"step": 1589
},
{
"epoch": 0.8080808080808081,
"grad_norm": 2.7053587436676025,
"learning_rate": 9.663075298985733e-06,
"loss": 1.2795,
"step": 1590
},
{
"epoch": 0.8085890350041294,
"grad_norm": 2.874924898147583,
"learning_rate": 9.662468316332205e-06,
"loss": 1.2494,
"step": 1591
},
{
"epoch": 0.8090972619274506,
"grad_norm": 3.1453142166137695,
"learning_rate": 9.661860806518086e-06,
"loss": 1.3158,
"step": 1592
},
{
"epoch": 0.8096054888507719,
"grad_norm": 2.962503433227539,
"learning_rate": 9.661252769612063e-06,
"loss": 1.3158,
"step": 1593
},
{
"epoch": 0.8101137157740931,
"grad_norm": 3.0778138637542725,
"learning_rate": 9.660644205682884e-06,
"loss": 1.2964,
"step": 1594
},
{
"epoch": 0.8106219426974144,
"grad_norm": 2.989445924758911,
"learning_rate": 9.660035114799353e-06,
"loss": 1.3058,
"step": 1595
},
{
"epoch": 0.8111301696207357,
"grad_norm": 2.8797903060913086,
"learning_rate": 9.659425497030339e-06,
"loss": 1.1792,
"step": 1596
},
{
"epoch": 0.8116383965440569,
"grad_norm": 3.105631113052368,
"learning_rate": 9.65881535244477e-06,
"loss": 1.303,
"step": 1597
},
{
"epoch": 0.8121466234673782,
"grad_norm": 2.780606269836426,
"learning_rate": 9.658204681111628e-06,
"loss": 1.1623,
"step": 1598
},
{
"epoch": 0.8126548503906994,
"grad_norm": 5.6422038078308105,
"learning_rate": 9.657593483099962e-06,
"loss": 1.4302,
"step": 1599
},
{
"epoch": 0.8131630773140207,
"grad_norm": 3.0730020999908447,
"learning_rate": 9.656981758478875e-06,
"loss": 1.2633,
"step": 1600
},
{
"epoch": 0.813671304237342,
"grad_norm": 3.3350472450256348,
"learning_rate": 9.656369507317532e-06,
"loss": 1.201,
"step": 1601
},
{
"epoch": 0.8141795311606632,
"grad_norm": 2.7912869453430176,
"learning_rate": 9.655756729685156e-06,
"loss": 1.1654,
"step": 1602
},
{
"epoch": 0.8146877580839845,
"grad_norm": 2.8811697959899902,
"learning_rate": 9.655143425651033e-06,
"loss": 1.1811,
"step": 1603
},
{
"epoch": 0.8151959850073057,
"grad_norm": 2.713759183883667,
"learning_rate": 9.654529595284503e-06,
"loss": 1.1562,
"step": 1604
},
{
"epoch": 0.815704211930627,
"grad_norm": 2.927468776702881,
"learning_rate": 9.653915238654972e-06,
"loss": 1.2829,
"step": 1605
},
{
"epoch": 0.8162124388539483,
"grad_norm": 2.8604557514190674,
"learning_rate": 9.653300355831898e-06,
"loss": 1.2372,
"step": 1606
},
{
"epoch": 0.8167206657772695,
"grad_norm": 2.864851236343384,
"learning_rate": 9.652684946884806e-06,
"loss": 1.3857,
"step": 1607
},
{
"epoch": 0.8172288927005908,
"grad_norm": 3.0702593326568604,
"learning_rate": 9.652069011883273e-06,
"loss": 1.2066,
"step": 1608
},
{
"epoch": 0.817737119623912,
"grad_norm": 2.893040180206299,
"learning_rate": 9.651452550896943e-06,
"loss": 1.1917,
"step": 1609
},
{
"epoch": 0.8182453465472334,
"grad_norm": 2.9085614681243896,
"learning_rate": 9.650835563995516e-06,
"loss": 1.246,
"step": 1610
},
{
"epoch": 0.8187535734705546,
"grad_norm": 3.080528974533081,
"learning_rate": 9.65021805124875e-06,
"loss": 1.2369,
"step": 1611
},
{
"epoch": 0.8192618003938759,
"grad_norm": 2.8631365299224854,
"learning_rate": 9.649600012726465e-06,
"loss": 1.2071,
"step": 1612
},
{
"epoch": 0.8197700273171972,
"grad_norm": 3.306487560272217,
"learning_rate": 9.648981448498538e-06,
"loss": 1.2006,
"step": 1613
},
{
"epoch": 0.8202782542405184,
"grad_norm": 2.7040047645568848,
"learning_rate": 9.648362358634907e-06,
"loss": 1.2456,
"step": 1614
},
{
"epoch": 0.8207864811638397,
"grad_norm": 3.003469228744507,
"learning_rate": 9.64774274320557e-06,
"loss": 1.192,
"step": 1615
},
{
"epoch": 0.8212947080871609,
"grad_norm": 3.2069551944732666,
"learning_rate": 9.647122602280585e-06,
"loss": 1.3296,
"step": 1616
},
{
"epoch": 0.8218029350104822,
"grad_norm": 2.9010188579559326,
"learning_rate": 9.646501935930064e-06,
"loss": 1.2709,
"step": 1617
},
{
"epoch": 0.8223111619338035,
"grad_norm": 3.0305323600769043,
"learning_rate": 9.645880744224185e-06,
"loss": 1.2166,
"step": 1618
},
{
"epoch": 0.8228193888571247,
"grad_norm": 2.9393057823181152,
"learning_rate": 9.645259027233185e-06,
"loss": 1.2345,
"step": 1619
},
{
"epoch": 0.823327615780446,
"grad_norm": 2.836444139480591,
"learning_rate": 9.644636785027355e-06,
"loss": 1.1531,
"step": 1620
},
{
"epoch": 0.8238358427037672,
"grad_norm": 3.178603172302246,
"learning_rate": 9.644014017677049e-06,
"loss": 1.2349,
"step": 1621
},
{
"epoch": 0.8243440696270885,
"grad_norm": 2.6164798736572266,
"learning_rate": 9.64339072525268e-06,
"loss": 1.244,
"step": 1622
},
{
"epoch": 0.8248522965504097,
"grad_norm": 2.7259740829467773,
"learning_rate": 9.642766907824721e-06,
"loss": 1.2564,
"step": 1623
},
{
"epoch": 0.825360523473731,
"grad_norm": 2.822526454925537,
"learning_rate": 9.642142565463705e-06,
"loss": 1.2629,
"step": 1624
},
{
"epoch": 0.8258687503970523,
"grad_norm": 2.8354594707489014,
"learning_rate": 9.641517698240221e-06,
"loss": 1.2838,
"step": 1625
},
{
"epoch": 0.8263769773203735,
"grad_norm": 2.7072620391845703,
"learning_rate": 9.64089230622492e-06,
"loss": 1.0654,
"step": 1626
},
{
"epoch": 0.8268852042436948,
"grad_norm": 3.053953170776367,
"learning_rate": 9.640266389488512e-06,
"loss": 1.2494,
"step": 1627
},
{
"epoch": 0.827393431167016,
"grad_norm": 2.87473201751709,
"learning_rate": 9.639639948101767e-06,
"loss": 1.169,
"step": 1628
},
{
"epoch": 0.8279016580903373,
"grad_norm": 3.2058591842651367,
"learning_rate": 9.639012982135512e-06,
"loss": 1.2292,
"step": 1629
},
{
"epoch": 0.8284098850136586,
"grad_norm": 3.0206425189971924,
"learning_rate": 9.638385491660633e-06,
"loss": 1.3061,
"step": 1630
},
{
"epoch": 0.8289181119369798,
"grad_norm": 3.0649890899658203,
"learning_rate": 9.637757476748081e-06,
"loss": 1.2873,
"step": 1631
},
{
"epoch": 0.8294263388603011,
"grad_norm": 3.119568109512329,
"learning_rate": 9.637128937468862e-06,
"loss": 1.2597,
"step": 1632
},
{
"epoch": 0.8299345657836223,
"grad_norm": 2.910027027130127,
"learning_rate": 9.636499873894038e-06,
"loss": 1.1835,
"step": 1633
},
{
"epoch": 0.8304427927069437,
"grad_norm": 3.029801845550537,
"learning_rate": 9.635870286094738e-06,
"loss": 1.3794,
"step": 1634
},
{
"epoch": 0.830951019630265,
"grad_norm": 2.6900525093078613,
"learning_rate": 9.635240174142142e-06,
"loss": 1.2792,
"step": 1635
},
{
"epoch": 0.8314592465535862,
"grad_norm": 2.8703951835632324,
"learning_rate": 9.634609538107498e-06,
"loss": 1.2806,
"step": 1636
},
{
"epoch": 0.8319674734769075,
"grad_norm": 2.82772159576416,
"learning_rate": 9.633978378062103e-06,
"loss": 1.1742,
"step": 1637
},
{
"epoch": 0.8324757004002287,
"grad_norm": 3.2928287982940674,
"learning_rate": 9.633346694077324e-06,
"loss": 1.2234,
"step": 1638
},
{
"epoch": 0.83298392732355,
"grad_norm": 3.0190470218658447,
"learning_rate": 9.632714486224581e-06,
"loss": 1.1061,
"step": 1639
},
{
"epoch": 0.8334921542468712,
"grad_norm": 3.1004772186279297,
"learning_rate": 9.632081754575352e-06,
"loss": 1.325,
"step": 1640
},
{
"epoch": 0.8340003811701925,
"grad_norm": 2.919175386428833,
"learning_rate": 9.63144849920118e-06,
"loss": 1.2204,
"step": 1641
},
{
"epoch": 0.8345086080935138,
"grad_norm": 2.95920729637146,
"learning_rate": 9.630814720173662e-06,
"loss": 1.2594,
"step": 1642
},
{
"epoch": 0.835016835016835,
"grad_norm": 2.7796289920806885,
"learning_rate": 9.630180417564456e-06,
"loss": 1.2342,
"step": 1643
},
{
"epoch": 0.8355250619401563,
"grad_norm": 3.0137064456939697,
"learning_rate": 9.62954559144528e-06,
"loss": 1.315,
"step": 1644
},
{
"epoch": 0.8360332888634775,
"grad_norm": 2.9403417110443115,
"learning_rate": 9.628910241887908e-06,
"loss": 1.3395,
"step": 1645
},
{
"epoch": 0.8365415157867988,
"grad_norm": 2.85813045501709,
"learning_rate": 9.628274368964178e-06,
"loss": 1.3317,
"step": 1646
},
{
"epoch": 0.8370497427101201,
"grad_norm": 2.6518867015838623,
"learning_rate": 9.627637972745986e-06,
"loss": 1.1876,
"step": 1647
},
{
"epoch": 0.8375579696334413,
"grad_norm": 2.998403549194336,
"learning_rate": 9.627001053305283e-06,
"loss": 1.274,
"step": 1648
},
{
"epoch": 0.8380661965567626,
"grad_norm": 2.8829715251922607,
"learning_rate": 9.626363610714084e-06,
"loss": 1.2354,
"step": 1649
},
{
"epoch": 0.8385744234800838,
"grad_norm": 2.7852256298065186,
"learning_rate": 9.62572564504446e-06,
"loss": 1.2655,
"step": 1650
},
{
"epoch": 0.8390826504034051,
"grad_norm": 2.878523349761963,
"learning_rate": 9.625087156368541e-06,
"loss": 1.2437,
"step": 1651
},
{
"epoch": 0.8395908773267264,
"grad_norm": 3.0157649517059326,
"learning_rate": 9.624448144758522e-06,
"loss": 1.2135,
"step": 1652
},
{
"epoch": 0.8400991042500476,
"grad_norm": 2.7613508701324463,
"learning_rate": 9.623808610286652e-06,
"loss": 1.26,
"step": 1653
},
{
"epoch": 0.8406073311733689,
"grad_norm": 2.9558663368225098,
"learning_rate": 9.623168553025235e-06,
"loss": 1.2329,
"step": 1654
},
{
"epoch": 0.8411155580966901,
"grad_norm": 2.719539165496826,
"learning_rate": 9.622527973046642e-06,
"loss": 1.1355,
"step": 1655
},
{
"epoch": 0.8416237850200115,
"grad_norm": 2.8478665351867676,
"learning_rate": 9.6218868704233e-06,
"loss": 1.309,
"step": 1656
},
{
"epoch": 0.8421320119433326,
"grad_norm": 2.840024948120117,
"learning_rate": 9.621245245227695e-06,
"loss": 1.1948,
"step": 1657
},
{
"epoch": 0.842640238866654,
"grad_norm": 2.674862861633301,
"learning_rate": 9.620603097532373e-06,
"loss": 1.2537,
"step": 1658
},
{
"epoch": 0.8431484657899753,
"grad_norm": 2.6723244190216064,
"learning_rate": 9.619960427409937e-06,
"loss": 1.2343,
"step": 1659
},
{
"epoch": 0.8436566927132965,
"grad_norm": 2.7692830562591553,
"learning_rate": 9.619317234933049e-06,
"loss": 1.2511,
"step": 1660
},
{
"epoch": 0.8441649196366178,
"grad_norm": 2.7434282302856445,
"learning_rate": 9.618673520174435e-06,
"loss": 1.2742,
"step": 1661
},
{
"epoch": 0.844673146559939,
"grad_norm": 2.9034934043884277,
"learning_rate": 9.618029283206873e-06,
"loss": 1.3008,
"step": 1662
},
{
"epoch": 0.8451813734832603,
"grad_norm": 2.9145328998565674,
"learning_rate": 9.617384524103207e-06,
"loss": 1.2975,
"step": 1663
},
{
"epoch": 0.8456896004065816,
"grad_norm": 2.774017810821533,
"learning_rate": 9.616739242936331e-06,
"loss": 1.1945,
"step": 1664
},
{
"epoch": 0.8461978273299028,
"grad_norm": 2.818248748779297,
"learning_rate": 9.61609343977921e-06,
"loss": 1.3295,
"step": 1665
},
{
"epoch": 0.8467060542532241,
"grad_norm": 3.614201307296753,
"learning_rate": 9.615447114704858e-06,
"loss": 1.2313,
"step": 1666
},
{
"epoch": 0.8472142811765453,
"grad_norm": 3.3795571327209473,
"learning_rate": 9.614800267786349e-06,
"loss": 1.248,
"step": 1667
},
{
"epoch": 0.8477225080998666,
"grad_norm": 3.0424909591674805,
"learning_rate": 9.614152899096824e-06,
"loss": 1.2607,
"step": 1668
},
{
"epoch": 0.8482307350231878,
"grad_norm": 2.789071798324585,
"learning_rate": 9.613505008709475e-06,
"loss": 1.1765,
"step": 1669
},
{
"epoch": 0.8487389619465091,
"grad_norm": 2.9772937297821045,
"learning_rate": 9.612856596697556e-06,
"loss": 1.2276,
"step": 1670
},
{
"epoch": 0.8492471888698304,
"grad_norm": 3.111518144607544,
"learning_rate": 9.612207663134376e-06,
"loss": 1.2703,
"step": 1671
},
{
"epoch": 0.8497554157931516,
"grad_norm": 3.206437110900879,
"learning_rate": 9.611558208093313e-06,
"loss": 1.265,
"step": 1672
},
{
"epoch": 0.8502636427164729,
"grad_norm": 3.0687997341156006,
"learning_rate": 9.610908231647794e-06,
"loss": 1.1979,
"step": 1673
},
{
"epoch": 0.8507718696397941,
"grad_norm": 2.947190761566162,
"learning_rate": 9.610257733871306e-06,
"loss": 1.2856,
"step": 1674
},
{
"epoch": 0.8512800965631154,
"grad_norm": 2.7396671772003174,
"learning_rate": 9.609606714837401e-06,
"loss": 1.1921,
"step": 1675
},
{
"epoch": 0.8517883234864367,
"grad_norm": 2.6573565006256104,
"learning_rate": 9.608955174619685e-06,
"loss": 1.1377,
"step": 1676
},
{
"epoch": 0.8522965504097579,
"grad_norm": 3.111696481704712,
"learning_rate": 9.608303113291825e-06,
"loss": 1.2351,
"step": 1677
},
{
"epoch": 0.8528047773330792,
"grad_norm": 2.96317458152771,
"learning_rate": 9.607650530927545e-06,
"loss": 1.3084,
"step": 1678
},
{
"epoch": 0.8533130042564004,
"grad_norm": 2.9022066593170166,
"learning_rate": 9.606997427600629e-06,
"loss": 1.2549,
"step": 1679
},
{
"epoch": 0.8538212311797218,
"grad_norm": 2.879927158355713,
"learning_rate": 9.60634380338492e-06,
"loss": 1.2083,
"step": 1680
},
{
"epoch": 0.8543294581030431,
"grad_norm": 2.751678705215454,
"learning_rate": 9.60568965835432e-06,
"loss": 1.2135,
"step": 1681
},
{
"epoch": 0.8548376850263643,
"grad_norm": 3.1005539894104004,
"learning_rate": 9.605034992582791e-06,
"loss": 1.3971,
"step": 1682
},
{
"epoch": 0.8553459119496856,
"grad_norm": 2.9313011169433594,
"learning_rate": 9.604379806144351e-06,
"loss": 1.2184,
"step": 1683
},
{
"epoch": 0.8558541388730068,
"grad_norm": 2.909487724304199,
"learning_rate": 9.603724099113078e-06,
"loss": 1.2142,
"step": 1684
},
{
"epoch": 0.8563623657963281,
"grad_norm": 2.8453476428985596,
"learning_rate": 9.603067871563112e-06,
"loss": 1.2028,
"step": 1685
},
{
"epoch": 0.8568705927196493,
"grad_norm": 2.707455635070801,
"learning_rate": 9.602411123568647e-06,
"loss": 1.2559,
"step": 1686
},
{
"epoch": 0.8573788196429706,
"grad_norm": 3.0561623573303223,
"learning_rate": 9.601753855203937e-06,
"loss": 1.2467,
"step": 1687
},
{
"epoch": 0.8578870465662919,
"grad_norm": 2.825486898422241,
"learning_rate": 9.601096066543299e-06,
"loss": 1.2824,
"step": 1688
},
{
"epoch": 0.8583952734896131,
"grad_norm": 3.058521032333374,
"learning_rate": 9.600437757661102e-06,
"loss": 1.2396,
"step": 1689
},
{
"epoch": 0.8589035004129344,
"grad_norm": 2.9022626876831055,
"learning_rate": 9.59977892863178e-06,
"loss": 1.2501,
"step": 1690
},
{
"epoch": 0.8594117273362556,
"grad_norm": 2.787989616394043,
"learning_rate": 9.599119579529823e-06,
"loss": 1.2036,
"step": 1691
},
{
"epoch": 0.8599199542595769,
"grad_norm": 3.1896774768829346,
"learning_rate": 9.598459710429781e-06,
"loss": 1.245,
"step": 1692
},
{
"epoch": 0.8604281811828982,
"grad_norm": 2.805469512939453,
"learning_rate": 9.597799321406261e-06,
"loss": 1.191,
"step": 1693
},
{
"epoch": 0.8609364081062194,
"grad_norm": 3.0362026691436768,
"learning_rate": 9.597138412533928e-06,
"loss": 1.2462,
"step": 1694
},
{
"epoch": 0.8614446350295407,
"grad_norm": 2.771352767944336,
"learning_rate": 9.596476983887508e-06,
"loss": 1.2599,
"step": 1695
},
{
"epoch": 0.8619528619528619,
"grad_norm": 2.9952127933502197,
"learning_rate": 9.595815035541789e-06,
"loss": 1.281,
"step": 1696
},
{
"epoch": 0.8624610888761832,
"grad_norm": 2.7725441455841064,
"learning_rate": 9.595152567571609e-06,
"loss": 1.2921,
"step": 1697
},
{
"epoch": 0.8629693157995045,
"grad_norm": 2.7685930728912354,
"learning_rate": 9.594489580051872e-06,
"loss": 1.3027,
"step": 1698
},
{
"epoch": 0.8634775427228257,
"grad_norm": 3.058549165725708,
"learning_rate": 9.593826073057538e-06,
"loss": 1.2497,
"step": 1699
},
{
"epoch": 0.863985769646147,
"grad_norm": 2.9856812953948975,
"learning_rate": 9.593162046663629e-06,
"loss": 1.3705,
"step": 1700
},
{
"epoch": 0.8644939965694682,
"grad_norm": 2.884981870651245,
"learning_rate": 9.592497500945218e-06,
"loss": 1.2894,
"step": 1701
},
{
"epoch": 0.8650022234927895,
"grad_norm": 2.938297986984253,
"learning_rate": 9.591832435977446e-06,
"loss": 1.2297,
"step": 1702
},
{
"epoch": 0.8655104504161107,
"grad_norm": 3.102844715118408,
"learning_rate": 9.591166851835505e-06,
"loss": 1.2453,
"step": 1703
},
{
"epoch": 0.866018677339432,
"grad_norm": 2.9945712089538574,
"learning_rate": 9.590500748594652e-06,
"loss": 1.3084,
"step": 1704
},
{
"epoch": 0.8665269042627534,
"grad_norm": 2.8621790409088135,
"learning_rate": 9.589834126330198e-06,
"loss": 1.2862,
"step": 1705
},
{
"epoch": 0.8670351311860746,
"grad_norm": 2.7755682468414307,
"learning_rate": 9.589166985117514e-06,
"loss": 1.3119,
"step": 1706
},
{
"epoch": 0.8675433581093959,
"grad_norm": 2.88777494430542,
"learning_rate": 9.588499325032031e-06,
"loss": 1.4133,
"step": 1707
},
{
"epoch": 0.8680515850327171,
"grad_norm": 2.8970770835876465,
"learning_rate": 9.58783114614924e-06,
"loss": 1.3324,
"step": 1708
},
{
"epoch": 0.8685598119560384,
"grad_norm": 5.2515716552734375,
"learning_rate": 9.587162448544684e-06,
"loss": 1.2924,
"step": 1709
},
{
"epoch": 0.8690680388793597,
"grad_norm": 2.7246832847595215,
"learning_rate": 9.586493232293973e-06,
"loss": 1.1798,
"step": 1710
},
{
"epoch": 0.8695762658026809,
"grad_norm": 2.7503769397735596,
"learning_rate": 9.585823497472769e-06,
"loss": 1.1128,
"step": 1711
},
{
"epoch": 0.8700844927260022,
"grad_norm": 2.8117806911468506,
"learning_rate": 9.585153244156795e-06,
"loss": 1.1741,
"step": 1712
},
{
"epoch": 0.8705927196493234,
"grad_norm": 2.8019652366638184,
"learning_rate": 9.584482472421837e-06,
"loss": 1.3051,
"step": 1713
},
{
"epoch": 0.8711009465726447,
"grad_norm": 3.00313138961792,
"learning_rate": 9.58381118234373e-06,
"loss": 1.2535,
"step": 1714
},
{
"epoch": 0.8716091734959659,
"grad_norm": 2.6497244834899902,
"learning_rate": 9.583139373998378e-06,
"loss": 1.2638,
"step": 1715
},
{
"epoch": 0.8721174004192872,
"grad_norm": 2.8147075176239014,
"learning_rate": 9.58246704746174e-06,
"loss": 1.193,
"step": 1716
},
{
"epoch": 0.8726256273426085,
"grad_norm": 2.795912265777588,
"learning_rate": 9.581794202809824e-06,
"loss": 1.2126,
"step": 1717
},
{
"epoch": 0.8731338542659297,
"grad_norm": 2.7988035678863525,
"learning_rate": 9.581120840118714e-06,
"loss": 1.1986,
"step": 1718
},
{
"epoch": 0.873642081189251,
"grad_norm": 2.717869758605957,
"learning_rate": 9.58044695946454e-06,
"loss": 1.2796,
"step": 1719
},
{
"epoch": 0.8741503081125722,
"grad_norm": 2.8445379734039307,
"learning_rate": 9.579772560923493e-06,
"loss": 1.0302,
"step": 1720
},
{
"epoch": 0.8746585350358935,
"grad_norm": 2.7780463695526123,
"learning_rate": 9.579097644571825e-06,
"loss": 1.3045,
"step": 1721
},
{
"epoch": 0.8751667619592148,
"grad_norm": 2.833652973175049,
"learning_rate": 9.578422210485844e-06,
"loss": 1.133,
"step": 1722
},
{
"epoch": 0.875674988882536,
"grad_norm": 2.707354784011841,
"learning_rate": 9.57774625874192e-06,
"loss": 1.2762,
"step": 1723
},
{
"epoch": 0.8761832158058573,
"grad_norm": 3.210391044616699,
"learning_rate": 9.577069789416477e-06,
"loss": 1.1706,
"step": 1724
},
{
"epoch": 0.8766914427291785,
"grad_norm": 2.731499671936035,
"learning_rate": 9.576392802586001e-06,
"loss": 1.245,
"step": 1725
},
{
"epoch": 0.8771996696524998,
"grad_norm": 2.9754645824432373,
"learning_rate": 9.575715298327037e-06,
"loss": 1.3256,
"step": 1726
},
{
"epoch": 0.8777078965758212,
"grad_norm": 2.9126806259155273,
"learning_rate": 9.575037276716184e-06,
"loss": 1.3404,
"step": 1727
},
{
"epoch": 0.8782161234991424,
"grad_norm": 3.192377805709839,
"learning_rate": 9.574358737830103e-06,
"loss": 1.2681,
"step": 1728
},
{
"epoch": 0.8787243504224637,
"grad_norm": 2.8953189849853516,
"learning_rate": 9.573679681745512e-06,
"loss": 1.2454,
"step": 1729
},
{
"epoch": 0.8792325773457849,
"grad_norm": 3.191070795059204,
"learning_rate": 9.57300010853919e-06,
"loss": 1.269,
"step": 1730
},
{
"epoch": 0.8797408042691062,
"grad_norm": 3.6386911869049072,
"learning_rate": 9.572320018287973e-06,
"loss": 1.2563,
"step": 1731
},
{
"epoch": 0.8802490311924274,
"grad_norm": 2.961223602294922,
"learning_rate": 9.571639411068754e-06,
"loss": 1.2032,
"step": 1732
},
{
"epoch": 0.8807572581157487,
"grad_norm": 2.9369919300079346,
"learning_rate": 9.570958286958485e-06,
"loss": 1.2041,
"step": 1733
},
{
"epoch": 0.88126548503907,
"grad_norm": 2.8557302951812744,
"learning_rate": 9.570276646034178e-06,
"loss": 1.1812,
"step": 1734
},
{
"epoch": 0.8817737119623912,
"grad_norm": 2.7387492656707764,
"learning_rate": 9.569594488372903e-06,
"loss": 1.2181,
"step": 1735
},
{
"epoch": 0.8822819388857125,
"grad_norm": 2.7892708778381348,
"learning_rate": 9.568911814051787e-06,
"loss": 1.1526,
"step": 1736
},
{
"epoch": 0.8827901658090337,
"grad_norm": 2.80728816986084,
"learning_rate": 9.568228623148018e-06,
"loss": 1.2098,
"step": 1737
},
{
"epoch": 0.883298392732355,
"grad_norm": 2.7470126152038574,
"learning_rate": 9.567544915738839e-06,
"loss": 1.2536,
"step": 1738
},
{
"epoch": 0.8838066196556763,
"grad_norm": 2.956306219100952,
"learning_rate": 9.566860691901554e-06,
"loss": 1.2589,
"step": 1739
},
{
"epoch": 0.8843148465789975,
"grad_norm": 2.9518215656280518,
"learning_rate": 9.566175951713524e-06,
"loss": 1.2662,
"step": 1740
},
{
"epoch": 0.8848230735023188,
"grad_norm": 2.8271007537841797,
"learning_rate": 9.565490695252171e-06,
"loss": 1.2346,
"step": 1741
},
{
"epoch": 0.88533130042564,
"grad_norm": 2.9564075469970703,
"learning_rate": 9.56480492259497e-06,
"loss": 1.2713,
"step": 1742
},
{
"epoch": 0.8858395273489613,
"grad_norm": 2.854062795639038,
"learning_rate": 9.564118633819458e-06,
"loss": 1.2513,
"step": 1743
},
{
"epoch": 0.8863477542722826,
"grad_norm": 2.643578290939331,
"learning_rate": 9.563431829003233e-06,
"loss": 1.2893,
"step": 1744
},
{
"epoch": 0.8868559811956038,
"grad_norm": 2.767890691757202,
"learning_rate": 9.562744508223947e-06,
"loss": 1.32,
"step": 1745
},
{
"epoch": 0.8873642081189251,
"grad_norm": 2.9053843021392822,
"learning_rate": 9.562056671559312e-06,
"loss": 1.2899,
"step": 1746
},
{
"epoch": 0.8878724350422463,
"grad_norm": 2.75801682472229,
"learning_rate": 9.561368319087097e-06,
"loss": 1.2051,
"step": 1747
},
{
"epoch": 0.8883806619655676,
"grad_norm": 2.966491460800171,
"learning_rate": 9.56067945088513e-06,
"loss": 1.3499,
"step": 1748
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.8148977756500244,
"learning_rate": 9.5599900670313e-06,
"loss": 1.213,
"step": 1749
},
{
"epoch": 0.8893971158122101,
"grad_norm": 2.659385919570923,
"learning_rate": 9.55930016760355e-06,
"loss": 1.1845,
"step": 1750
},
{
"epoch": 0.8899053427355315,
"grad_norm": 2.595902919769287,
"learning_rate": 9.558609752679884e-06,
"loss": 1.1405,
"step": 1751
},
{
"epoch": 0.8904135696588527,
"grad_norm": 2.6552109718322754,
"learning_rate": 9.557918822338362e-06,
"loss": 1.189,
"step": 1752
},
{
"epoch": 0.890921796582174,
"grad_norm": 2.9055867195129395,
"learning_rate": 9.557227376657106e-06,
"loss": 1.0663,
"step": 1753
},
{
"epoch": 0.8914300235054952,
"grad_norm": 3.767561435699463,
"learning_rate": 9.556535415714294e-06,
"loss": 1.3009,
"step": 1754
},
{
"epoch": 0.8919382504288165,
"grad_norm": 2.8781096935272217,
"learning_rate": 9.555842939588162e-06,
"loss": 1.177,
"step": 1755
},
{
"epoch": 0.8924464773521378,
"grad_norm": 2.7181549072265625,
"learning_rate": 9.555149948357004e-06,
"loss": 1.045,
"step": 1756
},
{
"epoch": 0.892954704275459,
"grad_norm": 2.6964972019195557,
"learning_rate": 9.554456442099171e-06,
"loss": 1.1419,
"step": 1757
},
{
"epoch": 0.8934629311987803,
"grad_norm": 2.87219500541687,
"learning_rate": 9.553762420893078e-06,
"loss": 1.2508,
"step": 1758
},
{
"epoch": 0.8939711581221015,
"grad_norm": 2.856064558029175,
"learning_rate": 9.553067884817193e-06,
"loss": 1.189,
"step": 1759
},
{
"epoch": 0.8944793850454228,
"grad_norm": 2.5938351154327393,
"learning_rate": 9.552372833950041e-06,
"loss": 1.2577,
"step": 1760
},
{
"epoch": 0.894987611968744,
"grad_norm": 2.557764768600464,
"learning_rate": 9.551677268370212e-06,
"loss": 1.1727,
"step": 1761
},
{
"epoch": 0.8954958388920653,
"grad_norm": 2.9965009689331055,
"learning_rate": 9.550981188156347e-06,
"loss": 1.2943,
"step": 1762
},
{
"epoch": 0.8960040658153866,
"grad_norm": 2.8568296432495117,
"learning_rate": 9.550284593387148e-06,
"loss": 1.1781,
"step": 1763
},
{
"epoch": 0.8965122927387078,
"grad_norm": 2.8139688968658447,
"learning_rate": 9.549587484141377e-06,
"loss": 1.2641,
"step": 1764
},
{
"epoch": 0.8970205196620291,
"grad_norm": 3.023052930831909,
"learning_rate": 9.54888986049785e-06,
"loss": 1.2337,
"step": 1765
},
{
"epoch": 0.8975287465853503,
"grad_norm": 2.8153154850006104,
"learning_rate": 9.548191722535447e-06,
"loss": 1.2938,
"step": 1766
},
{
"epoch": 0.8980369735086716,
"grad_norm": 3.049635887145996,
"learning_rate": 9.5474930703331e-06,
"loss": 1.3754,
"step": 1767
},
{
"epoch": 0.8985452004319929,
"grad_norm": 2.8150997161865234,
"learning_rate": 9.546793903969801e-06,
"loss": 1.1264,
"step": 1768
},
{
"epoch": 0.8990534273553141,
"grad_norm": 2.751206159591675,
"learning_rate": 9.546094223524605e-06,
"loss": 1.2231,
"step": 1769
},
{
"epoch": 0.8995616542786354,
"grad_norm": 3.0150442123413086,
"learning_rate": 9.545394029076619e-06,
"loss": 1.2937,
"step": 1770
},
{
"epoch": 0.9000698812019566,
"grad_norm": 2.9299299716949463,
"learning_rate": 9.54469332070501e-06,
"loss": 1.3397,
"step": 1771
},
{
"epoch": 0.9005781081252779,
"grad_norm": 3.0025529861450195,
"learning_rate": 9.543992098489003e-06,
"loss": 1.2489,
"step": 1772
},
{
"epoch": 0.9010863350485993,
"grad_norm": 2.807588815689087,
"learning_rate": 9.543290362507882e-06,
"loss": 1.2776,
"step": 1773
},
{
"epoch": 0.9015945619719204,
"grad_norm": 2.946342706680298,
"learning_rate": 9.542588112840989e-06,
"loss": 1.2245,
"step": 1774
},
{
"epoch": 0.9021027888952418,
"grad_norm": 2.9518632888793945,
"learning_rate": 9.541885349567724e-06,
"loss": 1.3245,
"step": 1775
},
{
"epoch": 0.902611015818563,
"grad_norm": 2.85158109664917,
"learning_rate": 9.541182072767544e-06,
"loss": 1.1866,
"step": 1776
},
{
"epoch": 0.9031192427418843,
"grad_norm": 2.706902503967285,
"learning_rate": 9.540478282519963e-06,
"loss": 1.258,
"step": 1777
},
{
"epoch": 0.9036274696652055,
"grad_norm": 2.9308853149414062,
"learning_rate": 9.539773978904558e-06,
"loss": 1.3477,
"step": 1778
},
{
"epoch": 0.9041356965885268,
"grad_norm": 2.65582275390625,
"learning_rate": 9.53906916200096e-06,
"loss": 1.1828,
"step": 1779
},
{
"epoch": 0.9046439235118481,
"grad_norm": 2.792782783508301,
"learning_rate": 9.538363831888858e-06,
"loss": 1.2049,
"step": 1780
},
{
"epoch": 0.9051521504351693,
"grad_norm": 2.8841593265533447,
"learning_rate": 9.537657988647999e-06,
"loss": 1.2875,
"step": 1781
},
{
"epoch": 0.9056603773584906,
"grad_norm": 2.751776695251465,
"learning_rate": 9.536951632358193e-06,
"loss": 1.1579,
"step": 1782
},
{
"epoch": 0.9061686042818118,
"grad_norm": 2.696763753890991,
"learning_rate": 9.5362447630993e-06,
"loss": 1.186,
"step": 1783
},
{
"epoch": 0.9066768312051331,
"grad_norm": 2.878833293914795,
"learning_rate": 9.535537380951242e-06,
"loss": 1.1926,
"step": 1784
},
{
"epoch": 0.9071850581284544,
"grad_norm": 2.6030893325805664,
"learning_rate": 9.534829485994002e-06,
"loss": 1.1238,
"step": 1785
},
{
"epoch": 0.9076932850517756,
"grad_norm": 2.6879279613494873,
"learning_rate": 9.534121078307615e-06,
"loss": 1.1932,
"step": 1786
},
{
"epoch": 0.9082015119750969,
"grad_norm": 2.800438404083252,
"learning_rate": 9.533412157972179e-06,
"loss": 1.2328,
"step": 1787
},
{
"epoch": 0.9087097388984181,
"grad_norm": 2.800389289855957,
"learning_rate": 9.532702725067846e-06,
"loss": 1.2804,
"step": 1788
},
{
"epoch": 0.9092179658217394,
"grad_norm": 2.87565016746521,
"learning_rate": 9.531992779674828e-06,
"loss": 1.1231,
"step": 1789
},
{
"epoch": 0.9097261927450607,
"grad_norm": 2.781198501586914,
"learning_rate": 9.531282321873398e-06,
"loss": 1.1642,
"step": 1790
},
{
"epoch": 0.9102344196683819,
"grad_norm": 3.292746067047119,
"learning_rate": 9.530571351743881e-06,
"loss": 1.1705,
"step": 1791
},
{
"epoch": 0.9107426465917032,
"grad_norm": 2.8538334369659424,
"learning_rate": 9.52985986936666e-06,
"loss": 1.1693,
"step": 1792
},
{
"epoch": 0.9112508735150244,
"grad_norm": 2.933720588684082,
"learning_rate": 9.529147874822184e-06,
"loss": 1.1758,
"step": 1793
},
{
"epoch": 0.9117591004383457,
"grad_norm": 3.115551710128784,
"learning_rate": 9.528435368190952e-06,
"loss": 1.2691,
"step": 1794
},
{
"epoch": 0.9122673273616669,
"grad_norm": 2.8642966747283936,
"learning_rate": 9.527722349553522e-06,
"loss": 1.1481,
"step": 1795
},
{
"epoch": 0.9127755542849882,
"grad_norm": 3.1207451820373535,
"learning_rate": 9.527008818990513e-06,
"loss": 1.3712,
"step": 1796
},
{
"epoch": 0.9132837812083096,
"grad_norm": 2.7371482849121094,
"learning_rate": 9.526294776582599e-06,
"loss": 1.2768,
"step": 1797
},
{
"epoch": 0.9137920081316308,
"grad_norm": 3.4604902267456055,
"learning_rate": 9.525580222410512e-06,
"loss": 1.3342,
"step": 1798
},
{
"epoch": 0.9143002350549521,
"grad_norm": 2.8706648349761963,
"learning_rate": 9.524865156555047e-06,
"loss": 1.2667,
"step": 1799
},
{
"epoch": 0.9148084619782733,
"grad_norm": 2.873488426208496,
"learning_rate": 9.52414957909705e-06,
"loss": 1.2392,
"step": 1800
},
{
"epoch": 0.9153166889015946,
"grad_norm": 2.964588165283203,
"learning_rate": 9.523433490117427e-06,
"loss": 1.3241,
"step": 1801
},
{
"epoch": 0.9158249158249159,
"grad_norm": 2.9600985050201416,
"learning_rate": 9.522716889697141e-06,
"loss": 1.3308,
"step": 1802
},
{
"epoch": 0.9163331427482371,
"grad_norm": 2.5625863075256348,
"learning_rate": 9.521999777917219e-06,
"loss": 1.1425,
"step": 1803
},
{
"epoch": 0.9168413696715584,
"grad_norm": 2.7706921100616455,
"learning_rate": 9.521282154858736e-06,
"loss": 1.3258,
"step": 1804
},
{
"epoch": 0.9173495965948796,
"grad_norm": 2.833293914794922,
"learning_rate": 9.520564020602834e-06,
"loss": 1.2726,
"step": 1805
},
{
"epoch": 0.9178578235182009,
"grad_norm": 2.7428948879241943,
"learning_rate": 9.519845375230706e-06,
"loss": 1.2617,
"step": 1806
},
{
"epoch": 0.9183660504415221,
"grad_norm": 2.8612327575683594,
"learning_rate": 9.519126218823607e-06,
"loss": 1.178,
"step": 1807
},
{
"epoch": 0.9188742773648434,
"grad_norm": 2.9736928939819336,
"learning_rate": 9.518406551462847e-06,
"loss": 1.279,
"step": 1808
},
{
"epoch": 0.9193825042881647,
"grad_norm": 3.0132932662963867,
"learning_rate": 9.517686373229795e-06,
"loss": 1.2099,
"step": 1809
},
{
"epoch": 0.9198907312114859,
"grad_norm": 2.5593981742858887,
"learning_rate": 9.516965684205877e-06,
"loss": 1.1039,
"step": 1810
},
{
"epoch": 0.9203989581348072,
"grad_norm": 2.7686641216278076,
"learning_rate": 9.51624448447258e-06,
"loss": 1.1157,
"step": 1811
},
{
"epoch": 0.9209071850581284,
"grad_norm": 2.81060528755188,
"learning_rate": 9.515522774111445e-06,
"loss": 1.1971,
"step": 1812
},
{
"epoch": 0.9214154119814497,
"grad_norm": 2.5526318550109863,
"learning_rate": 9.514800553204071e-06,
"loss": 1.1534,
"step": 1813
},
{
"epoch": 0.921923638904771,
"grad_norm": 2.841200590133667,
"learning_rate": 9.514077821832118e-06,
"loss": 1.2518,
"step": 1814
},
{
"epoch": 0.9224318658280922,
"grad_norm": 2.7869009971618652,
"learning_rate": 9.513354580077299e-06,
"loss": 1.2512,
"step": 1815
},
{
"epoch": 0.9229400927514135,
"grad_norm": 2.617814302444458,
"learning_rate": 9.512630828021387e-06,
"loss": 1.1089,
"step": 1816
},
{
"epoch": 0.9234483196747347,
"grad_norm": 2.8492302894592285,
"learning_rate": 9.511906565746214e-06,
"loss": 1.1446,
"step": 1817
},
{
"epoch": 0.923956546598056,
"grad_norm": 2.7213473320007324,
"learning_rate": 9.51118179333367e-06,
"loss": 1.1777,
"step": 1818
},
{
"epoch": 0.9244647735213773,
"grad_norm": 3.0611300468444824,
"learning_rate": 9.510456510865697e-06,
"loss": 1.1902,
"step": 1819
},
{
"epoch": 0.9249730004446985,
"grad_norm": 2.8940231800079346,
"learning_rate": 9.509730718424303e-06,
"loss": 1.2389,
"step": 1820
},
{
"epoch": 0.9254812273680199,
"grad_norm": 3.2034969329833984,
"learning_rate": 9.509004416091548e-06,
"loss": 1.3084,
"step": 1821
},
{
"epoch": 0.925989454291341,
"grad_norm": 2.7354447841644287,
"learning_rate": 9.50827760394955e-06,
"loss": 1.1467,
"step": 1822
},
{
"epoch": 0.9264976812146624,
"grad_norm": 4.729049205780029,
"learning_rate": 9.507550282080488e-06,
"loss": 1.2631,
"step": 1823
},
{
"epoch": 0.9270059081379836,
"grad_norm": 3.0362253189086914,
"learning_rate": 9.506822450566595e-06,
"loss": 1.2361,
"step": 1824
},
{
"epoch": 0.9275141350613049,
"grad_norm": 3.075381278991699,
"learning_rate": 9.506094109490161e-06,
"loss": 1.2362,
"step": 1825
},
{
"epoch": 0.9280223619846262,
"grad_norm": 2.9710774421691895,
"learning_rate": 9.505365258933542e-06,
"loss": 1.3233,
"step": 1826
},
{
"epoch": 0.9285305889079474,
"grad_norm": 2.99249529838562,
"learning_rate": 9.504635898979138e-06,
"loss": 1.1723,
"step": 1827
},
{
"epoch": 0.9290388158312687,
"grad_norm": 2.88806414604187,
"learning_rate": 9.503906029709418e-06,
"loss": 1.2333,
"step": 1828
},
{
"epoch": 0.9295470427545899,
"grad_norm": 2.997180938720703,
"learning_rate": 9.503175651206903e-06,
"loss": 1.3472,
"step": 1829
},
{
"epoch": 0.9300552696779112,
"grad_norm": 2.8601789474487305,
"learning_rate": 9.502444763554174e-06,
"loss": 1.2205,
"step": 1830
},
{
"epoch": 0.9305634966012325,
"grad_norm": 3.0461935997009277,
"learning_rate": 9.501713366833869e-06,
"loss": 1.16,
"step": 1831
},
{
"epoch": 0.9310717235245537,
"grad_norm": 2.8133318424224854,
"learning_rate": 9.500981461128681e-06,
"loss": 1.2924,
"step": 1832
},
{
"epoch": 0.931579950447875,
"grad_norm": 2.750631809234619,
"learning_rate": 9.500249046521365e-06,
"loss": 1.2311,
"step": 1833
},
{
"epoch": 0.9320881773711962,
"grad_norm": 3.502110004425049,
"learning_rate": 9.49951612309473e-06,
"loss": 1.3335,
"step": 1834
},
{
"epoch": 0.9325964042945175,
"grad_norm": 2.9846878051757812,
"learning_rate": 9.498782690931643e-06,
"loss": 1.2773,
"step": 1835
},
{
"epoch": 0.9331046312178388,
"grad_norm": 2.80678653717041,
"learning_rate": 9.498048750115032e-06,
"loss": 1.1365,
"step": 1836
},
{
"epoch": 0.93361285814116,
"grad_norm": 3.084103584289551,
"learning_rate": 9.497314300727877e-06,
"loss": 1.297,
"step": 1837
},
{
"epoch": 0.9341210850644813,
"grad_norm": 2.8763110637664795,
"learning_rate": 9.49657934285322e-06,
"loss": 1.3062,
"step": 1838
},
{
"epoch": 0.9346293119878025,
"grad_norm": 2.8453195095062256,
"learning_rate": 9.495843876574157e-06,
"loss": 1.2479,
"step": 1839
},
{
"epoch": 0.9351375389111238,
"grad_norm": 2.914537191390991,
"learning_rate": 9.495107901973846e-06,
"loss": 1.2901,
"step": 1840
},
{
"epoch": 0.935645765834445,
"grad_norm": 2.7122802734375,
"learning_rate": 9.494371419135498e-06,
"loss": 1.1318,
"step": 1841
},
{
"epoch": 0.9361539927577663,
"grad_norm": 2.932257890701294,
"learning_rate": 9.493634428142383e-06,
"loss": 1.3514,
"step": 1842
},
{
"epoch": 0.9366622196810876,
"grad_norm": 2.784000873565674,
"learning_rate": 9.492896929077828e-06,
"loss": 1.2715,
"step": 1843
},
{
"epoch": 0.9371704466044088,
"grad_norm": 2.914268732070923,
"learning_rate": 9.492158922025221e-06,
"loss": 1.1562,
"step": 1844
},
{
"epoch": 0.9376786735277302,
"grad_norm": 2.8161864280700684,
"learning_rate": 9.491420407068002e-06,
"loss": 1.1786,
"step": 1845
},
{
"epoch": 0.9381869004510514,
"grad_norm": 2.703287363052368,
"learning_rate": 9.49068138428967e-06,
"loss": 1.1797,
"step": 1846
},
{
"epoch": 0.9386951273743727,
"grad_norm": 2.7507104873657227,
"learning_rate": 9.489941853773787e-06,
"loss": 1.2552,
"step": 1847
},
{
"epoch": 0.939203354297694,
"grad_norm": 3.103407859802246,
"learning_rate": 9.489201815603964e-06,
"loss": 1.2224,
"step": 1848
},
{
"epoch": 0.9397115812210152,
"grad_norm": 2.6951043605804443,
"learning_rate": 9.488461269863873e-06,
"loss": 1.3135,
"step": 1849
},
{
"epoch": 0.9402198081443365,
"grad_norm": 2.7768237590789795,
"learning_rate": 9.487720216637247e-06,
"loss": 1.0811,
"step": 1850
},
{
"epoch": 0.9407280350676577,
"grad_norm": 2.717684030532837,
"learning_rate": 9.486978656007869e-06,
"loss": 1.1631,
"step": 1851
},
{
"epoch": 0.941236261990979,
"grad_norm": 3.163203001022339,
"learning_rate": 9.486236588059585e-06,
"loss": 1.2808,
"step": 1852
},
{
"epoch": 0.9417444889143002,
"grad_norm": 2.7564680576324463,
"learning_rate": 9.485494012876298e-06,
"loss": 1.2187,
"step": 1853
},
{
"epoch": 0.9422527158376215,
"grad_norm": 2.8404791355133057,
"learning_rate": 9.484750930541964e-06,
"loss": 1.3074,
"step": 1854
},
{
"epoch": 0.9427609427609428,
"grad_norm": 2.8263309001922607,
"learning_rate": 9.484007341140602e-06,
"loss": 1.2831,
"step": 1855
},
{
"epoch": 0.943269169684264,
"grad_norm": 2.9559013843536377,
"learning_rate": 9.483263244756284e-06,
"loss": 1.162,
"step": 1856
},
{
"epoch": 0.9437773966075853,
"grad_norm": 2.8240835666656494,
"learning_rate": 9.482518641473144e-06,
"loss": 1.2336,
"step": 1857
},
{
"epoch": 0.9442856235309065,
"grad_norm": 2.7373499870300293,
"learning_rate": 9.481773531375366e-06,
"loss": 1.293,
"step": 1858
},
{
"epoch": 0.9447938504542278,
"grad_norm": 2.891880512237549,
"learning_rate": 9.481027914547199e-06,
"loss": 1.2538,
"step": 1859
},
{
"epoch": 0.9453020773775491,
"grad_norm": 2.8699028491973877,
"learning_rate": 9.480281791072944e-06,
"loss": 1.1302,
"step": 1860
},
{
"epoch": 0.9458103043008703,
"grad_norm": 3.3577420711517334,
"learning_rate": 9.479535161036962e-06,
"loss": 1.2419,
"step": 1861
},
{
"epoch": 0.9463185312241916,
"grad_norm": 3.0245659351348877,
"learning_rate": 9.478788024523673e-06,
"loss": 1.33,
"step": 1862
},
{
"epoch": 0.9468267581475128,
"grad_norm": 2.950090169906616,
"learning_rate": 9.478040381617546e-06,
"loss": 1.213,
"step": 1863
},
{
"epoch": 0.9473349850708341,
"grad_norm": 2.874415397644043,
"learning_rate": 9.477292232403118e-06,
"loss": 1.1361,
"step": 1864
},
{
"epoch": 0.9478432119941554,
"grad_norm": 3.1284801959991455,
"learning_rate": 9.476543576964977e-06,
"loss": 1.3103,
"step": 1865
},
{
"epoch": 0.9483514389174766,
"grad_norm": 2.839769124984741,
"learning_rate": 9.475794415387766e-06,
"loss": 1.2267,
"step": 1866
},
{
"epoch": 0.948859665840798,
"grad_norm": 2.890130043029785,
"learning_rate": 9.475044747756195e-06,
"loss": 1.158,
"step": 1867
},
{
"epoch": 0.9493678927641191,
"grad_norm": 2.8990070819854736,
"learning_rate": 9.474294574155022e-06,
"loss": 1.2617,
"step": 1868
},
{
"epoch": 0.9498761196874405,
"grad_norm": 2.893882989883423,
"learning_rate": 9.473543894669063e-06,
"loss": 1.2091,
"step": 1869
},
{
"epoch": 0.9503843466107617,
"grad_norm": 2.8073065280914307,
"learning_rate": 9.472792709383197e-06,
"loss": 1.2089,
"step": 1870
},
{
"epoch": 0.950892573534083,
"grad_norm": 2.6496944427490234,
"learning_rate": 9.472041018382354e-06,
"loss": 1.1846,
"step": 1871
},
{
"epoch": 0.9514008004574043,
"grad_norm": 2.8289594650268555,
"learning_rate": 9.471288821751525e-06,
"loss": 1.2576,
"step": 1872
},
{
"epoch": 0.9519090273807255,
"grad_norm": 2.997814893722534,
"learning_rate": 9.470536119575757e-06,
"loss": 1.2837,
"step": 1873
},
{
"epoch": 0.9524172543040468,
"grad_norm": 2.66351318359375,
"learning_rate": 9.469782911940151e-06,
"loss": 1.2383,
"step": 1874
},
{
"epoch": 0.952925481227368,
"grad_norm": 2.7139089107513428,
"learning_rate": 9.469029198929873e-06,
"loss": 1.1613,
"step": 1875
},
{
"epoch": 0.9534337081506893,
"grad_norm": 2.67689847946167,
"learning_rate": 9.468274980630137e-06,
"loss": 1.2042,
"step": 1876
},
{
"epoch": 0.9539419350740106,
"grad_norm": 2.7813730239868164,
"learning_rate": 9.467520257126223e-06,
"loss": 1.2591,
"step": 1877
},
{
"epoch": 0.9544501619973318,
"grad_norm": 2.801579713821411,
"learning_rate": 9.46676502850346e-06,
"loss": 1.1587,
"step": 1878
},
{
"epoch": 0.9549583889206531,
"grad_norm": 2.7422478199005127,
"learning_rate": 9.466009294847238e-06,
"loss": 1.2799,
"step": 1879
},
{
"epoch": 0.9554666158439743,
"grad_norm": 2.8934004306793213,
"learning_rate": 9.465253056243005e-06,
"loss": 1.254,
"step": 1880
},
{
"epoch": 0.9559748427672956,
"grad_norm": 2.6929843425750732,
"learning_rate": 9.464496312776265e-06,
"loss": 1.0316,
"step": 1881
},
{
"epoch": 0.9564830696906168,
"grad_norm": 2.816587209701538,
"learning_rate": 9.463739064532578e-06,
"loss": 1.253,
"step": 1882
},
{
"epoch": 0.9569912966139381,
"grad_norm": 2.6673052310943604,
"learning_rate": 9.462981311597563e-06,
"loss": 1.2072,
"step": 1883
},
{
"epoch": 0.9574995235372594,
"grad_norm": 2.825695514678955,
"learning_rate": 9.462223054056894e-06,
"loss": 1.2092,
"step": 1884
},
{
"epoch": 0.9580077504605806,
"grad_norm": 3.181696653366089,
"learning_rate": 9.461464291996305e-06,
"loss": 1.2547,
"step": 1885
},
{
"epoch": 0.9585159773839019,
"grad_norm": 2.9147400856018066,
"learning_rate": 9.460705025501581e-06,
"loss": 1.2261,
"step": 1886
},
{
"epoch": 0.9590242043072231,
"grad_norm": 6.87190580368042,
"learning_rate": 9.459945254658574e-06,
"loss": 1.3751,
"step": 1887
},
{
"epoch": 0.9595324312305444,
"grad_norm": 2.883603096008301,
"learning_rate": 9.459184979553183e-06,
"loss": 1.3314,
"step": 1888
},
{
"epoch": 0.9600406581538657,
"grad_norm": 2.7869436740875244,
"learning_rate": 9.45842420027137e-06,
"loss": 1.1628,
"step": 1889
},
{
"epoch": 0.9605488850771869,
"grad_norm": 2.8722105026245117,
"learning_rate": 9.457662916899152e-06,
"loss": 1.2581,
"step": 1890
},
{
"epoch": 0.9610571120005083,
"grad_norm": 2.908513069152832,
"learning_rate": 9.456901129522605e-06,
"loss": 1.2924,
"step": 1891
},
{
"epoch": 0.9615653389238294,
"grad_norm": 2.925353765487671,
"learning_rate": 9.456138838227857e-06,
"loss": 1.2244,
"step": 1892
},
{
"epoch": 0.9620735658471508,
"grad_norm": 2.8243985176086426,
"learning_rate": 9.455376043101099e-06,
"loss": 1.2406,
"step": 1893
},
{
"epoch": 0.9625817927704721,
"grad_norm": 2.6665141582489014,
"learning_rate": 9.454612744228572e-06,
"loss": 1.1531,
"step": 1894
},
{
"epoch": 0.9630900196937933,
"grad_norm": 2.6943883895874023,
"learning_rate": 9.453848941696586e-06,
"loss": 1.313,
"step": 1895
},
{
"epoch": 0.9635982466171146,
"grad_norm": 2.8478856086730957,
"learning_rate": 9.453084635591491e-06,
"loss": 1.2133,
"step": 1896
},
{
"epoch": 0.9641064735404358,
"grad_norm": 2.70573091506958,
"learning_rate": 9.45231982599971e-06,
"loss": 1.1661,
"step": 1897
},
{
"epoch": 0.9646147004637571,
"grad_norm": 2.684609889984131,
"learning_rate": 9.451554513007712e-06,
"loss": 1.3076,
"step": 1898
},
{
"epoch": 0.9651229273870783,
"grad_norm": 2.785606861114502,
"learning_rate": 9.450788696702028e-06,
"loss": 1.0978,
"step": 1899
},
{
"epoch": 0.9656311543103996,
"grad_norm": 2.884321689605713,
"learning_rate": 9.450022377169246e-06,
"loss": 1.2179,
"step": 1900
},
{
"epoch": 0.9661393812337209,
"grad_norm": 2.9700825214385986,
"learning_rate": 9.449255554496007e-06,
"loss": 1.1781,
"step": 1901
},
{
"epoch": 0.9666476081570421,
"grad_norm": 3.0699474811553955,
"learning_rate": 9.448488228769015e-06,
"loss": 1.3785,
"step": 1902
},
{
"epoch": 0.9671558350803634,
"grad_norm": 2.7597365379333496,
"learning_rate": 9.447720400075024e-06,
"loss": 1.1666,
"step": 1903
},
{
"epoch": 0.9676640620036846,
"grad_norm": 2.7310798168182373,
"learning_rate": 9.446952068500852e-06,
"loss": 1.2326,
"step": 1904
},
{
"epoch": 0.9681722889270059,
"grad_norm": 2.821917772293091,
"learning_rate": 9.446183234133367e-06,
"loss": 1.2468,
"step": 1905
},
{
"epoch": 0.9686805158503272,
"grad_norm": 2.7148962020874023,
"learning_rate": 9.445413897059499e-06,
"loss": 1.273,
"step": 1906
},
{
"epoch": 0.9691887427736484,
"grad_norm": 3.648280143737793,
"learning_rate": 9.44464405736623e-06,
"loss": 1.2404,
"step": 1907
},
{
"epoch": 0.9696969696969697,
"grad_norm": 2.7357401847839355,
"learning_rate": 9.443873715140606e-06,
"loss": 1.1583,
"step": 1908
},
{
"epoch": 0.9702051966202909,
"grad_norm": 2.8272571563720703,
"learning_rate": 9.443102870469722e-06,
"loss": 1.224,
"step": 1909
},
{
"epoch": 0.9707134235436122,
"grad_norm": 3.024099826812744,
"learning_rate": 9.442331523440736e-06,
"loss": 1.2522,
"step": 1910
},
{
"epoch": 0.9712216504669335,
"grad_norm": 2.9257144927978516,
"learning_rate": 9.441559674140859e-06,
"loss": 1.2456,
"step": 1911
},
{
"epoch": 0.9717298773902547,
"grad_norm": 3.5403923988342285,
"learning_rate": 9.440787322657358e-06,
"loss": 1.3027,
"step": 1912
},
{
"epoch": 0.972238104313576,
"grad_norm": 3.196686267852783,
"learning_rate": 9.44001446907756e-06,
"loss": 1.2347,
"step": 1913
},
{
"epoch": 0.9727463312368972,
"grad_norm": 2.7601497173309326,
"learning_rate": 9.439241113488849e-06,
"loss": 1.2686,
"step": 1914
},
{
"epoch": 0.9732545581602186,
"grad_norm": 3.1352243423461914,
"learning_rate": 9.438467255978663e-06,
"loss": 1.2042,
"step": 1915
},
{
"epoch": 0.9737627850835398,
"grad_norm": 2.772083044052124,
"learning_rate": 9.437692896634498e-06,
"loss": 1.2699,
"step": 1916
},
{
"epoch": 0.9742710120068611,
"grad_norm": 3.0568454265594482,
"learning_rate": 9.436918035543907e-06,
"loss": 1.391,
"step": 1917
},
{
"epoch": 0.9747792389301824,
"grad_norm": 2.8727424144744873,
"learning_rate": 9.4361426727945e-06,
"loss": 1.2516,
"step": 1918
},
{
"epoch": 0.9752874658535036,
"grad_norm": 2.9823689460754395,
"learning_rate": 9.43536680847394e-06,
"loss": 1.2432,
"step": 1919
},
{
"epoch": 0.9757956927768249,
"grad_norm": 2.8589422702789307,
"learning_rate": 9.434590442669952e-06,
"loss": 1.2263,
"step": 1920
},
{
"epoch": 0.9763039197001461,
"grad_norm": 2.7224597930908203,
"learning_rate": 9.433813575470318e-06,
"loss": 1.2102,
"step": 1921
},
{
"epoch": 0.9768121466234674,
"grad_norm": 3.058126449584961,
"learning_rate": 9.433036206962871e-06,
"loss": 1.262,
"step": 1922
},
{
"epoch": 0.9773203735467887,
"grad_norm": 2.858962059020996,
"learning_rate": 9.432258337235505e-06,
"loss": 1.2711,
"step": 1923
},
{
"epoch": 0.9778286004701099,
"grad_norm": 3.059061050415039,
"learning_rate": 9.43147996637617e-06,
"loss": 1.2354,
"step": 1924
},
{
"epoch": 0.9783368273934312,
"grad_norm": 2.8909220695495605,
"learning_rate": 9.43070109447287e-06,
"loss": 1.2501,
"step": 1925
},
{
"epoch": 0.9788450543167524,
"grad_norm": 2.7637128829956055,
"learning_rate": 9.42992172161367e-06,
"loss": 1.2199,
"step": 1926
},
{
"epoch": 0.9793532812400737,
"grad_norm": 2.7772271633148193,
"learning_rate": 9.429141847886692e-06,
"loss": 1.2338,
"step": 1927
},
{
"epoch": 0.9798615081633949,
"grad_norm": 3.01302170753479,
"learning_rate": 9.428361473380108e-06,
"loss": 1.2147,
"step": 1928
},
{
"epoch": 0.9803697350867162,
"grad_norm": 2.8627138137817383,
"learning_rate": 9.427580598182151e-06,
"loss": 1.2039,
"step": 1929
},
{
"epoch": 0.9808779620100375,
"grad_norm": 2.6455531120300293,
"learning_rate": 9.426799222381114e-06,
"loss": 1.1395,
"step": 1930
},
{
"epoch": 0.9813861889333587,
"grad_norm": 2.8535947799682617,
"learning_rate": 9.426017346065339e-06,
"loss": 1.2505,
"step": 1931
},
{
"epoch": 0.98189441585668,
"grad_norm": 2.6990885734558105,
"learning_rate": 9.425234969323231e-06,
"loss": 1.2925,
"step": 1932
},
{
"epoch": 0.9824026427800012,
"grad_norm": 2.916191816329956,
"learning_rate": 9.424452092243248e-06,
"loss": 1.1982,
"step": 1933
},
{
"epoch": 0.9829108697033225,
"grad_norm": 2.7172672748565674,
"learning_rate": 9.423668714913907e-06,
"loss": 1.2339,
"step": 1934
},
{
"epoch": 0.9834190966266438,
"grad_norm": 3.132009983062744,
"learning_rate": 9.42288483742378e-06,
"loss": 1.3103,
"step": 1935
},
{
"epoch": 0.983927323549965,
"grad_norm": 2.990915536880493,
"learning_rate": 9.422100459861494e-06,
"loss": 1.3056,
"step": 1936
},
{
"epoch": 0.9844355504732863,
"grad_norm": 2.8419580459594727,
"learning_rate": 9.421315582315737e-06,
"loss": 1.2209,
"step": 1937
},
{
"epoch": 0.9849437773966075,
"grad_norm": 2.8363163471221924,
"learning_rate": 9.420530204875252e-06,
"loss": 1.2706,
"step": 1938
},
{
"epoch": 0.9854520043199289,
"grad_norm": 2.7801365852355957,
"learning_rate": 9.419744327628832e-06,
"loss": 1.2744,
"step": 1939
},
{
"epoch": 0.9859602312432502,
"grad_norm": 3.0915050506591797,
"learning_rate": 9.418957950665336e-06,
"loss": 1.1607,
"step": 1940
},
{
"epoch": 0.9864684581665714,
"grad_norm": 2.951573610305786,
"learning_rate": 9.418171074073675e-06,
"loss": 1.2566,
"step": 1941
},
{
"epoch": 0.9869766850898927,
"grad_norm": 2.769648551940918,
"learning_rate": 9.417383697942817e-06,
"loss": 1.2288,
"step": 1942
},
{
"epoch": 0.9874849120132139,
"grad_norm": 2.8848860263824463,
"learning_rate": 9.416595822361786e-06,
"loss": 1.2998,
"step": 1943
},
{
"epoch": 0.9879931389365352,
"grad_norm": 2.8908326625823975,
"learning_rate": 9.415807447419663e-06,
"loss": 1.2915,
"step": 1944
},
{
"epoch": 0.9885013658598564,
"grad_norm": 2.7161648273468018,
"learning_rate": 9.415018573205588e-06,
"loss": 1.2233,
"step": 1945
},
{
"epoch": 0.9890095927831777,
"grad_norm": 2.799499988555908,
"learning_rate": 9.414229199808748e-06,
"loss": 1.1483,
"step": 1946
},
{
"epoch": 0.989517819706499,
"grad_norm": 3.000262498855591,
"learning_rate": 9.413439327318402e-06,
"loss": 1.3221,
"step": 1947
},
{
"epoch": 0.9900260466298202,
"grad_norm": 4.373028755187988,
"learning_rate": 9.412648955823848e-06,
"loss": 1.3722,
"step": 1948
},
{
"epoch": 0.9905342735531415,
"grad_norm": 2.90043306350708,
"learning_rate": 9.411858085414456e-06,
"loss": 1.2587,
"step": 1949
},
{
"epoch": 0.9910425004764627,
"grad_norm": 3.2279412746429443,
"learning_rate": 9.411066716179643e-06,
"loss": 1.2173,
"step": 1950
},
{
"epoch": 0.991550727399784,
"grad_norm": 2.9404780864715576,
"learning_rate": 9.410274848208884e-06,
"loss": 1.2789,
"step": 1951
},
{
"epoch": 0.9920589543231053,
"grad_norm": 2.7545523643493652,
"learning_rate": 9.409482481591713e-06,
"loss": 1.1923,
"step": 1952
},
{
"epoch": 0.9925671812464265,
"grad_norm": 2.863680839538574,
"learning_rate": 9.408689616417718e-06,
"loss": 1.2571,
"step": 1953
},
{
"epoch": 0.9930754081697478,
"grad_norm": 2.761908531188965,
"learning_rate": 9.407896252776543e-06,
"loss": 1.1544,
"step": 1954
},
{
"epoch": 0.993583635093069,
"grad_norm": 2.7828609943389893,
"learning_rate": 9.40710239075789e-06,
"loss": 1.2349,
"step": 1955
},
{
"epoch": 0.9940918620163903,
"grad_norm": 2.771557092666626,
"learning_rate": 9.406308030451519e-06,
"loss": 1.2707,
"step": 1956
},
{
"epoch": 0.9946000889397116,
"grad_norm": 2.988478422164917,
"learning_rate": 9.40551317194724e-06,
"loss": 1.2264,
"step": 1957
},
{
"epoch": 0.9951083158630328,
"grad_norm": 2.8643558025360107,
"learning_rate": 9.404717815334928e-06,
"loss": 1.287,
"step": 1958
},
{
"epoch": 0.9956165427863541,
"grad_norm": 2.7615389823913574,
"learning_rate": 9.403921960704507e-06,
"loss": 1.1656,
"step": 1959
},
{
"epoch": 0.9961247697096753,
"grad_norm": 2.893112897872925,
"learning_rate": 9.40312560814596e-06,
"loss": 1.2117,
"step": 1960
},
{
"epoch": 0.9966329966329966,
"grad_norm": 2.8117706775665283,
"learning_rate": 9.402328757749327e-06,
"loss": 1.2288,
"step": 1961
},
{
"epoch": 0.9971412235563178,
"grad_norm": 2.8521831035614014,
"learning_rate": 9.401531409604702e-06,
"loss": 1.2678,
"step": 1962
},
{
"epoch": 0.9976494504796392,
"grad_norm": 2.985893726348877,
"learning_rate": 9.40073356380224e-06,
"loss": 1.2412,
"step": 1963
},
{
"epoch": 0.9981576774029605,
"grad_norm": 2.948859453201294,
"learning_rate": 9.399935220432148e-06,
"loss": 1.3356,
"step": 1964
},
{
"epoch": 0.9986659043262817,
"grad_norm": 2.862870454788208,
"learning_rate": 9.39913637958469e-06,
"loss": 1.2378,
"step": 1965
},
{
"epoch": 0.999174131249603,
"grad_norm": 2.5982418060302734,
"learning_rate": 9.398337041350186e-06,
"loss": 1.1617,
"step": 1966
},
{
"epoch": 0.9996823581729242,
"grad_norm": 2.7536323070526123,
"learning_rate": 9.397537205819014e-06,
"loss": 1.2863,
"step": 1967
},
{
"epoch": 1.0001905850962454,
"grad_norm": 2.6883037090301514,
"learning_rate": 9.396736873081607e-06,
"loss": 0.9807,
"step": 1968
},
{
"epoch": 1.0006988120195668,
"grad_norm": 3.1424267292022705,
"learning_rate": 9.395936043228455e-06,
"loss": 0.8711,
"step": 1969
},
{
"epoch": 1.001207038942888,
"grad_norm": 2.7037315368652344,
"learning_rate": 9.395134716350103e-06,
"loss": 0.8217,
"step": 1970
},
{
"epoch": 1.0017152658662092,
"grad_norm": 3.0894687175750732,
"learning_rate": 9.394332892537151e-06,
"loss": 0.9446,
"step": 1971
},
{
"epoch": 1.0022234927895306,
"grad_norm": 2.8337249755859375,
"learning_rate": 9.39353057188026e-06,
"loss": 0.9479,
"step": 1972
},
{
"epoch": 1.0027317197128518,
"grad_norm": 3.023442506790161,
"learning_rate": 9.392727754470142e-06,
"loss": 0.9362,
"step": 1973
},
{
"epoch": 1.003239946636173,
"grad_norm": 3.1481966972351074,
"learning_rate": 9.391924440397569e-06,
"loss": 0.9307,
"step": 1974
},
{
"epoch": 1.0037481735594944,
"grad_norm": 3.388725519180298,
"learning_rate": 9.391120629753367e-06,
"loss": 0.809,
"step": 1975
},
{
"epoch": 1.0042564004828156,
"grad_norm": 3.3043911457061768,
"learning_rate": 9.390316322628417e-06,
"loss": 0.8328,
"step": 1976
},
{
"epoch": 1.0047646274061368,
"grad_norm": 3.6037285327911377,
"learning_rate": 9.38951151911366e-06,
"loss": 0.7763,
"step": 1977
},
{
"epoch": 1.005272854329458,
"grad_norm": 3.6332485675811768,
"learning_rate": 9.388706219300088e-06,
"loss": 0.9359,
"step": 1978
},
{
"epoch": 1.0057810812527794,
"grad_norm": 3.833462715148926,
"learning_rate": 9.387900423278756e-06,
"loss": 0.8459,
"step": 1979
},
{
"epoch": 1.0062893081761006,
"grad_norm": 3.7308406829833984,
"learning_rate": 9.387094131140769e-06,
"loss": 0.8102,
"step": 1980
},
{
"epoch": 1.0067975350994218,
"grad_norm": 3.359941005706787,
"learning_rate": 9.386287342977287e-06,
"loss": 0.8305,
"step": 1981
},
{
"epoch": 1.0073057620227432,
"grad_norm": 2.9804437160491943,
"learning_rate": 9.385480058879534e-06,
"loss": 0.7978,
"step": 1982
},
{
"epoch": 1.0078139889460644,
"grad_norm": 2.755053997039795,
"learning_rate": 9.384672278938785e-06,
"loss": 0.8343,
"step": 1983
},
{
"epoch": 1.0083222158693856,
"grad_norm": 3.064114809036255,
"learning_rate": 9.383864003246369e-06,
"loss": 0.9288,
"step": 1984
},
{
"epoch": 1.0088304427927068,
"grad_norm": 3.0753376483917236,
"learning_rate": 9.383055231893674e-06,
"loss": 0.8818,
"step": 1985
},
{
"epoch": 1.0093386697160283,
"grad_norm": 2.8319714069366455,
"learning_rate": 9.382245964972146e-06,
"loss": 0.8849,
"step": 1986
},
{
"epoch": 1.0098468966393495,
"grad_norm": 2.89162278175354,
"learning_rate": 9.38143620257328e-06,
"loss": 0.8338,
"step": 1987
},
{
"epoch": 1.0103551235626707,
"grad_norm": 3.2012033462524414,
"learning_rate": 9.380625944788635e-06,
"loss": 0.8047,
"step": 1988
},
{
"epoch": 1.010863350485992,
"grad_norm": 3.0053231716156006,
"learning_rate": 9.379815191709823e-06,
"loss": 0.8174,
"step": 1989
},
{
"epoch": 1.0113715774093133,
"grad_norm": 3.070302963256836,
"learning_rate": 9.379003943428508e-06,
"loss": 0.8858,
"step": 1990
},
{
"epoch": 1.0118798043326345,
"grad_norm": 2.9416215419769287,
"learning_rate": 9.378192200036418e-06,
"loss": 0.8167,
"step": 1991
},
{
"epoch": 1.012388031255956,
"grad_norm": 3.202517509460449,
"learning_rate": 9.377379961625328e-06,
"loss": 0.9251,
"step": 1992
},
{
"epoch": 1.012896258179277,
"grad_norm": 3.0757060050964355,
"learning_rate": 9.376567228287078e-06,
"loss": 0.8752,
"step": 1993
},
{
"epoch": 1.0134044851025983,
"grad_norm": 2.960498571395874,
"learning_rate": 9.375754000113555e-06,
"loss": 0.8223,
"step": 1994
},
{
"epoch": 1.0139127120259195,
"grad_norm": 3.186260223388672,
"learning_rate": 9.374940277196709e-06,
"loss": 0.786,
"step": 1995
},
{
"epoch": 1.014420938949241,
"grad_norm": 3.0160162448883057,
"learning_rate": 9.374126059628545e-06,
"loss": 0.8998,
"step": 1996
},
{
"epoch": 1.014929165872562,
"grad_norm": 3.1801722049713135,
"learning_rate": 9.373311347501117e-06,
"loss": 0.8987,
"step": 1997
},
{
"epoch": 1.0154373927958833,
"grad_norm": 3.2074148654937744,
"learning_rate": 9.372496140906546e-06,
"loss": 0.8403,
"step": 1998
},
{
"epoch": 1.0159456197192047,
"grad_norm": 2.9431893825531006,
"learning_rate": 9.371680439936999e-06,
"loss": 0.8974,
"step": 1999
},
{
"epoch": 1.016453846642526,
"grad_norm": 2.9264800548553467,
"learning_rate": 9.370864244684705e-06,
"loss": 0.8356,
"step": 2000
},
{
"epoch": 1.016453846642526,
"eval_loss": 1.2894710302352905,
"eval_runtime": 14.6197,
"eval_samples_per_second": 27.36,
"eval_steps_per_second": 3.42,
"step": 2000
},
{
"epoch": 1.0169620735658471,
"grad_norm": 3.1898536682128906,
"learning_rate": 9.370047555241947e-06,
"loss": 0.9506,
"step": 2001
},
{
"epoch": 1.0174703004891683,
"grad_norm": 3.170736312866211,
"learning_rate": 9.369230371701063e-06,
"loss": 0.9416,
"step": 2002
},
{
"epoch": 1.0179785274124897,
"grad_norm": 3.0140738487243652,
"learning_rate": 9.368412694154447e-06,
"loss": 0.7751,
"step": 2003
},
{
"epoch": 1.018486754335811,
"grad_norm": 3.268325090408325,
"learning_rate": 9.36759452269455e-06,
"loss": 0.9212,
"step": 2004
},
{
"epoch": 1.0189949812591321,
"grad_norm": 3.0660955905914307,
"learning_rate": 9.36677585741388e-06,
"loss": 0.8109,
"step": 2005
},
{
"epoch": 1.0195032081824535,
"grad_norm": 3.21256160736084,
"learning_rate": 9.365956698404997e-06,
"loss": 0.8029,
"step": 2006
},
{
"epoch": 1.0200114351057747,
"grad_norm": 3.118746757507324,
"learning_rate": 9.365137045760519e-06,
"loss": 0.8211,
"step": 2007
},
{
"epoch": 1.020519662029096,
"grad_norm": 3.166558027267456,
"learning_rate": 9.36431689957312e-06,
"loss": 0.8778,
"step": 2008
},
{
"epoch": 1.0210278889524174,
"grad_norm": 3.0696887969970703,
"learning_rate": 9.363496259935531e-06,
"loss": 0.8701,
"step": 2009
},
{
"epoch": 1.0215361158757386,
"grad_norm": 3.5696535110473633,
"learning_rate": 9.362675126940536e-06,
"loss": 0.9573,
"step": 2010
},
{
"epoch": 1.0220443427990598,
"grad_norm": 3.436431884765625,
"learning_rate": 9.361853500680976e-06,
"loss": 0.875,
"step": 2011
},
{
"epoch": 1.022552569722381,
"grad_norm": 3.065523862838745,
"learning_rate": 9.36103138124975e-06,
"loss": 0.789,
"step": 2012
},
{
"epoch": 1.0230607966457024,
"grad_norm": 3.238952875137329,
"learning_rate": 9.360208768739807e-06,
"loss": 0.8384,
"step": 2013
},
{
"epoch": 1.0235690235690236,
"grad_norm": 3.1629438400268555,
"learning_rate": 9.359385663244158e-06,
"loss": 0.8615,
"step": 2014
},
{
"epoch": 1.0240772504923448,
"grad_norm": 3.0951144695281982,
"learning_rate": 9.358562064855868e-06,
"loss": 0.8759,
"step": 2015
},
{
"epoch": 1.0245854774156662,
"grad_norm": 2.992215871810913,
"learning_rate": 9.357737973668056e-06,
"loss": 0.8095,
"step": 2016
},
{
"epoch": 1.0250937043389874,
"grad_norm": 3.3016228675842285,
"learning_rate": 9.356913389773895e-06,
"loss": 0.784,
"step": 2017
},
{
"epoch": 1.0256019312623086,
"grad_norm": 2.8466439247131348,
"learning_rate": 9.35608831326662e-06,
"loss": 0.7665,
"step": 2018
},
{
"epoch": 1.0261101581856298,
"grad_norm": 2.907198667526245,
"learning_rate": 9.355262744239517e-06,
"loss": 0.7961,
"step": 2019
},
{
"epoch": 1.0266183851089512,
"grad_norm": 3.4752185344696045,
"learning_rate": 9.354436682785928e-06,
"loss": 0.7864,
"step": 2020
},
{
"epoch": 1.0271266120322724,
"grad_norm": 3.046924352645874,
"learning_rate": 9.35361012899925e-06,
"loss": 0.7442,
"step": 2021
},
{
"epoch": 1.0276348389555936,
"grad_norm": 3.305177688598633,
"learning_rate": 9.35278308297294e-06,
"loss": 0.859,
"step": 2022
},
{
"epoch": 1.028143065878915,
"grad_norm": 3.166316270828247,
"learning_rate": 9.351955544800509e-06,
"loss": 0.8661,
"step": 2023
},
{
"epoch": 1.0286512928022362,
"grad_norm": 3.3500163555145264,
"learning_rate": 9.351127514575517e-06,
"loss": 0.8477,
"step": 2024
},
{
"epoch": 1.0291595197255574,
"grad_norm": 3.2255213260650635,
"learning_rate": 9.350298992391589e-06,
"loss": 0.8366,
"step": 2025
},
{
"epoch": 1.0296677466488786,
"grad_norm": 3.1444172859191895,
"learning_rate": 9.3494699783424e-06,
"loss": 0.855,
"step": 2026
},
{
"epoch": 1.0301759735722,
"grad_norm": 3.118273973464966,
"learning_rate": 9.348640472521682e-06,
"loss": 0.8224,
"step": 2027
},
{
"epoch": 1.0306842004955212,
"grad_norm": 3.104978084564209,
"learning_rate": 9.347810475023225e-06,
"loss": 0.8456,
"step": 2028
},
{
"epoch": 1.0311924274188424,
"grad_norm": 3.197139263153076,
"learning_rate": 9.34697998594087e-06,
"loss": 0.8209,
"step": 2029
},
{
"epoch": 1.0317006543421638,
"grad_norm": 3.226208448410034,
"learning_rate": 9.346149005368516e-06,
"loss": 0.928,
"step": 2030
},
{
"epoch": 1.032208881265485,
"grad_norm": 2.900405168533325,
"learning_rate": 9.345317533400122e-06,
"loss": 0.7765,
"step": 2031
},
{
"epoch": 1.0327171081888062,
"grad_norm": 3.115267038345337,
"learning_rate": 9.344485570129692e-06,
"loss": 0.814,
"step": 2032
},
{
"epoch": 1.0332253351121277,
"grad_norm": 3.040104866027832,
"learning_rate": 9.343653115651295e-06,
"loss": 0.7718,
"step": 2033
},
{
"epoch": 1.0337335620354489,
"grad_norm": 2.962225914001465,
"learning_rate": 9.34282017005905e-06,
"loss": 0.7983,
"step": 2034
},
{
"epoch": 1.03424178895877,
"grad_norm": 3.1415133476257324,
"learning_rate": 9.341986733447137e-06,
"loss": 0.8133,
"step": 2035
},
{
"epoch": 1.0347500158820913,
"grad_norm": 3.1007273197174072,
"learning_rate": 9.341152805909786e-06,
"loss": 0.7765,
"step": 2036
},
{
"epoch": 1.0352582428054127,
"grad_norm": 3.0376369953155518,
"learning_rate": 9.340318387541285e-06,
"loss": 0.8321,
"step": 2037
},
{
"epoch": 1.0357664697287339,
"grad_norm": 2.9017696380615234,
"learning_rate": 9.339483478435979e-06,
"loss": 0.8479,
"step": 2038
},
{
"epoch": 1.036274696652055,
"grad_norm": 6.134103775024414,
"learning_rate": 9.338648078688263e-06,
"loss": 0.7849,
"step": 2039
},
{
"epoch": 1.0367829235753765,
"grad_norm": 3.308187246322632,
"learning_rate": 9.337812188392596e-06,
"loss": 0.8817,
"step": 2040
},
{
"epoch": 1.0372911504986977,
"grad_norm": 3.367530584335327,
"learning_rate": 9.336975807643485e-06,
"loss": 0.8884,
"step": 2041
},
{
"epoch": 1.0377993774220189,
"grad_norm": 3.5947296619415283,
"learning_rate": 9.336138936535494e-06,
"loss": 0.929,
"step": 2042
},
{
"epoch": 1.03830760434534,
"grad_norm": 3.12381649017334,
"learning_rate": 9.335301575163247e-06,
"loss": 0.7718,
"step": 2043
},
{
"epoch": 1.0388158312686615,
"grad_norm": 3.505775213241577,
"learning_rate": 9.334463723621415e-06,
"loss": 0.8644,
"step": 2044
},
{
"epoch": 1.0393240581919827,
"grad_norm": 3.172312021255493,
"learning_rate": 9.333625382004734e-06,
"loss": 0.8577,
"step": 2045
},
{
"epoch": 1.039832285115304,
"grad_norm": 3.0678904056549072,
"learning_rate": 9.332786550407989e-06,
"loss": 0.7207,
"step": 2046
},
{
"epoch": 1.0403405120386253,
"grad_norm": 2.993863105773926,
"learning_rate": 9.331947228926024e-06,
"loss": 0.7157,
"step": 2047
},
{
"epoch": 1.0408487389619465,
"grad_norm": 3.0315968990325928,
"learning_rate": 9.331107417653734e-06,
"loss": 0.8081,
"step": 2048
},
{
"epoch": 1.0413569658852677,
"grad_norm": 3.491834878921509,
"learning_rate": 9.330267116686072e-06,
"loss": 0.9326,
"step": 2049
},
{
"epoch": 1.0418651928085891,
"grad_norm": 2.9259064197540283,
"learning_rate": 9.32942632611805e-06,
"loss": 0.8041,
"step": 2050
},
{
"epoch": 1.0423734197319103,
"grad_norm": 3.325554847717285,
"learning_rate": 9.328585046044728e-06,
"loss": 0.8363,
"step": 2051
},
{
"epoch": 1.0428816466552315,
"grad_norm": 3.138277053833008,
"learning_rate": 9.327743276561226e-06,
"loss": 0.8907,
"step": 2052
},
{
"epoch": 1.0433898735785527,
"grad_norm": 2.964484214782715,
"learning_rate": 9.32690101776272e-06,
"loss": 0.9177,
"step": 2053
},
{
"epoch": 1.0438981005018741,
"grad_norm": 3.1464931964874268,
"learning_rate": 9.326058269744436e-06,
"loss": 0.7592,
"step": 2054
},
{
"epoch": 1.0444063274251953,
"grad_norm": 2.9225363731384277,
"learning_rate": 9.325215032601664e-06,
"loss": 0.8515,
"step": 2055
},
{
"epoch": 1.0449145543485165,
"grad_norm": 6.968871116638184,
"learning_rate": 9.32437130642974e-06,
"loss": 0.8868,
"step": 2056
},
{
"epoch": 1.045422781271838,
"grad_norm": 2.892380714416504,
"learning_rate": 9.323527091324062e-06,
"loss": 0.7601,
"step": 2057
},
{
"epoch": 1.0459310081951592,
"grad_norm": 3.065734386444092,
"learning_rate": 9.322682387380082e-06,
"loss": 0.9312,
"step": 2058
},
{
"epoch": 1.0464392351184804,
"grad_norm": 3.2454822063446045,
"learning_rate": 9.321837194693304e-06,
"loss": 0.8848,
"step": 2059
},
{
"epoch": 1.0469474620418016,
"grad_norm": 3.0628859996795654,
"learning_rate": 9.32099151335929e-06,
"loss": 0.8395,
"step": 2060
},
{
"epoch": 1.047455688965123,
"grad_norm": 2.7631242275238037,
"learning_rate": 9.320145343473656e-06,
"loss": 0.6984,
"step": 2061
},
{
"epoch": 1.0479639158884442,
"grad_norm": 3.4513697624206543,
"learning_rate": 9.319298685132076e-06,
"loss": 0.8301,
"step": 2062
},
{
"epoch": 1.0484721428117654,
"grad_norm": 3.0544557571411133,
"learning_rate": 9.318451538430277e-06,
"loss": 0.8076,
"step": 2063
},
{
"epoch": 1.0489803697350868,
"grad_norm": 3.6341161727905273,
"learning_rate": 9.31760390346404e-06,
"loss": 0.9105,
"step": 2064
},
{
"epoch": 1.049488596658408,
"grad_norm": 3.331022262573242,
"learning_rate": 9.316755780329201e-06,
"loss": 0.8577,
"step": 2065
},
{
"epoch": 1.0499968235817292,
"grad_norm": 3.1098642349243164,
"learning_rate": 9.315907169121657e-06,
"loss": 0.7183,
"step": 2066
},
{
"epoch": 1.0505050505050506,
"grad_norm": 3.2029342651367188,
"learning_rate": 9.315058069937352e-06,
"loss": 0.8624,
"step": 2067
},
{
"epoch": 1.0510132774283718,
"grad_norm": 2.9517741203308105,
"learning_rate": 9.31420848287229e-06,
"loss": 0.8058,
"step": 2068
},
{
"epoch": 1.051521504351693,
"grad_norm": 3.1466259956359863,
"learning_rate": 9.313358408022533e-06,
"loss": 0.868,
"step": 2069
},
{
"epoch": 1.0520297312750142,
"grad_norm": 3.0388054847717285,
"learning_rate": 9.31250784548419e-06,
"loss": 0.855,
"step": 2070
},
{
"epoch": 1.0525379581983356,
"grad_norm": 2.9505624771118164,
"learning_rate": 9.311656795353431e-06,
"loss": 0.7738,
"step": 2071
},
{
"epoch": 1.0530461851216568,
"grad_norm": 3.3491604328155518,
"learning_rate": 9.31080525772648e-06,
"loss": 0.8004,
"step": 2072
},
{
"epoch": 1.053554412044978,
"grad_norm": 2.904555082321167,
"learning_rate": 9.309953232699617e-06,
"loss": 0.8718,
"step": 2073
},
{
"epoch": 1.0540626389682994,
"grad_norm": 2.8941566944122314,
"learning_rate": 9.309100720369176e-06,
"loss": 0.7971,
"step": 2074
},
{
"epoch": 1.0545708658916206,
"grad_norm": 3.0532689094543457,
"learning_rate": 9.308247720831542e-06,
"loss": 0.8472,
"step": 2075
},
{
"epoch": 1.0550790928149418,
"grad_norm": 3.448359489440918,
"learning_rate": 9.307394234183162e-06,
"loss": 0.8943,
"step": 2076
},
{
"epoch": 1.055587319738263,
"grad_norm": 3.1240499019622803,
"learning_rate": 9.306540260520535e-06,
"loss": 0.9552,
"step": 2077
},
{
"epoch": 1.0560955466615845,
"grad_norm": 3.350869655609131,
"learning_rate": 9.305685799940218e-06,
"loss": 0.8265,
"step": 2078
},
{
"epoch": 1.0566037735849056,
"grad_norm": 3.4039957523345947,
"learning_rate": 9.304830852538817e-06,
"loss": 0.8602,
"step": 2079
},
{
"epoch": 1.0571120005082268,
"grad_norm": 3.3318874835968018,
"learning_rate": 9.303975418412996e-06,
"loss": 0.9006,
"step": 2080
},
{
"epoch": 1.0576202274315483,
"grad_norm": 2.9756813049316406,
"learning_rate": 9.303119497659476e-06,
"loss": 0.8273,
"step": 2081
},
{
"epoch": 1.0581284543548695,
"grad_norm": 2.9196019172668457,
"learning_rate": 9.302263090375032e-06,
"loss": 0.8361,
"step": 2082
},
{
"epoch": 1.0586366812781907,
"grad_norm": 3.343363046646118,
"learning_rate": 9.30140619665649e-06,
"loss": 0.8124,
"step": 2083
},
{
"epoch": 1.059144908201512,
"grad_norm": 3.252643585205078,
"learning_rate": 9.300548816600739e-06,
"loss": 0.8564,
"step": 2084
},
{
"epoch": 1.0596531351248333,
"grad_norm": 3.0798771381378174,
"learning_rate": 9.299690950304716e-06,
"loss": 0.8804,
"step": 2085
},
{
"epoch": 1.0601613620481545,
"grad_norm": 3.143292188644409,
"learning_rate": 9.298832597865416e-06,
"loss": 0.8426,
"step": 2086
},
{
"epoch": 1.0606695889714757,
"grad_norm": 3.0367817878723145,
"learning_rate": 9.297973759379888e-06,
"loss": 0.8423,
"step": 2087
},
{
"epoch": 1.061177815894797,
"grad_norm": 3.2499539852142334,
"learning_rate": 9.297114434945236e-06,
"loss": 0.9039,
"step": 2088
},
{
"epoch": 1.0616860428181183,
"grad_norm": 3.1852822303771973,
"learning_rate": 9.296254624658618e-06,
"loss": 0.7962,
"step": 2089
},
{
"epoch": 1.0621942697414395,
"grad_norm": 3.22925066947937,
"learning_rate": 9.295394328617251e-06,
"loss": 0.7997,
"step": 2090
},
{
"epoch": 1.062702496664761,
"grad_norm": 3.0404813289642334,
"learning_rate": 9.294533546918406e-06,
"loss": 0.8152,
"step": 2091
},
{
"epoch": 1.063210723588082,
"grad_norm": 3.014554977416992,
"learning_rate": 9.2936722796594e-06,
"loss": 0.8412,
"step": 2092
},
{
"epoch": 1.0637189505114033,
"grad_norm": 3.2184641361236572,
"learning_rate": 9.292810526937617e-06,
"loss": 0.8574,
"step": 2093
},
{
"epoch": 1.0642271774347245,
"grad_norm": 3.2080061435699463,
"learning_rate": 9.29194828885049e-06,
"loss": 0.8677,
"step": 2094
},
{
"epoch": 1.064735404358046,
"grad_norm": 3.276824474334717,
"learning_rate": 9.291085565495508e-06,
"loss": 0.8431,
"step": 2095
},
{
"epoch": 1.0652436312813671,
"grad_norm": 3.0697712898254395,
"learning_rate": 9.290222356970213e-06,
"loss": 0.9106,
"step": 2096
},
{
"epoch": 1.0657518582046883,
"grad_norm": 3.019782066345215,
"learning_rate": 9.289358663372204e-06,
"loss": 0.7905,
"step": 2097
},
{
"epoch": 1.0662600851280097,
"grad_norm": 3.2518410682678223,
"learning_rate": 9.288494484799136e-06,
"loss": 0.8393,
"step": 2098
},
{
"epoch": 1.066768312051331,
"grad_norm": 2.8931727409362793,
"learning_rate": 9.287629821348714e-06,
"loss": 0.7574,
"step": 2099
},
{
"epoch": 1.0672765389746521,
"grad_norm": 3.020138740539551,
"learning_rate": 9.286764673118705e-06,
"loss": 0.7832,
"step": 2100
},
{
"epoch": 1.0677847658979736,
"grad_norm": 3.068448305130005,
"learning_rate": 9.285899040206922e-06,
"loss": 0.7436,
"step": 2101
},
{
"epoch": 1.0682929928212948,
"grad_norm": 3.2184550762176514,
"learning_rate": 9.28503292271124e-06,
"loss": 0.9075,
"step": 2102
},
{
"epoch": 1.068801219744616,
"grad_norm": 2.9750399589538574,
"learning_rate": 9.284166320729588e-06,
"loss": 0.8305,
"step": 2103
},
{
"epoch": 1.0693094466679371,
"grad_norm": 3.4522347450256348,
"learning_rate": 9.283299234359946e-06,
"loss": 0.7978,
"step": 2104
},
{
"epoch": 1.0698176735912586,
"grad_norm": 3.1621932983398438,
"learning_rate": 9.28243166370035e-06,
"loss": 0.8388,
"step": 2105
},
{
"epoch": 1.0703259005145798,
"grad_norm": 3.238377809524536,
"learning_rate": 9.281563608848893e-06,
"loss": 0.7583,
"step": 2106
},
{
"epoch": 1.070834127437901,
"grad_norm": 3.1495258808135986,
"learning_rate": 9.280695069903722e-06,
"loss": 0.7382,
"step": 2107
},
{
"epoch": 1.0713423543612224,
"grad_norm": 3.1268153190612793,
"learning_rate": 9.279826046963037e-06,
"loss": 0.7512,
"step": 2108
},
{
"epoch": 1.0718505812845436,
"grad_norm": 3.2700624465942383,
"learning_rate": 9.278956540125094e-06,
"loss": 0.7999,
"step": 2109
},
{
"epoch": 1.0723588082078648,
"grad_norm": 2.898972272872925,
"learning_rate": 9.278086549488203e-06,
"loss": 0.7911,
"step": 2110
},
{
"epoch": 1.072867035131186,
"grad_norm": 3.0485572814941406,
"learning_rate": 9.27721607515073e-06,
"loss": 0.7897,
"step": 2111
},
{
"epoch": 1.0733752620545074,
"grad_norm": 2.9671947956085205,
"learning_rate": 9.276345117211096e-06,
"loss": 0.8024,
"step": 2112
},
{
"epoch": 1.0738834889778286,
"grad_norm": 3.489755868911743,
"learning_rate": 9.275473675767773e-06,
"loss": 0.8729,
"step": 2113
},
{
"epoch": 1.0743917159011498,
"grad_norm": 3.384394645690918,
"learning_rate": 9.274601750919292e-06,
"loss": 0.8471,
"step": 2114
},
{
"epoch": 1.0748999428244712,
"grad_norm": 3.0558526515960693,
"learning_rate": 9.273729342764237e-06,
"loss": 0.801,
"step": 2115
},
{
"epoch": 1.0754081697477924,
"grad_norm": 3.1915698051452637,
"learning_rate": 9.272856451401246e-06,
"loss": 0.8724,
"step": 2116
},
{
"epoch": 1.0759163966711136,
"grad_norm": 3.234802722930908,
"learning_rate": 9.271983076929012e-06,
"loss": 0.8306,
"step": 2117
},
{
"epoch": 1.076424623594435,
"grad_norm": 3.1662769317626953,
"learning_rate": 9.271109219446282e-06,
"loss": 0.8037,
"step": 2118
},
{
"epoch": 1.0769328505177562,
"grad_norm": 3.228738784790039,
"learning_rate": 9.270234879051861e-06,
"loss": 0.7598,
"step": 2119
},
{
"epoch": 1.0774410774410774,
"grad_norm": 11.493374824523926,
"learning_rate": 9.269360055844605e-06,
"loss": 0.8335,
"step": 2120
},
{
"epoch": 1.0779493043643986,
"grad_norm": 3.0331759452819824,
"learning_rate": 9.268484749923424e-06,
"loss": 0.6947,
"step": 2121
},
{
"epoch": 1.07845753128772,
"grad_norm": 3.314284563064575,
"learning_rate": 9.267608961387287e-06,
"loss": 0.909,
"step": 2122
},
{
"epoch": 1.0789657582110412,
"grad_norm": 3.0632483959198,
"learning_rate": 9.266732690335211e-06,
"loss": 0.8805,
"step": 2123
},
{
"epoch": 1.0794739851343624,
"grad_norm": 3.0312142372131348,
"learning_rate": 9.265855936866276e-06,
"loss": 0.8584,
"step": 2124
},
{
"epoch": 1.0799822120576839,
"grad_norm": 3.4391958713531494,
"learning_rate": 9.264978701079607e-06,
"loss": 0.7548,
"step": 2125
},
{
"epoch": 1.080490438981005,
"grad_norm": 2.9293901920318604,
"learning_rate": 9.264100983074394e-06,
"loss": 0.8314,
"step": 2126
},
{
"epoch": 1.0809986659043263,
"grad_norm": 3.2253024578094482,
"learning_rate": 9.26322278294987e-06,
"loss": 0.9104,
"step": 2127
},
{
"epoch": 1.0815068928276474,
"grad_norm": 3.0602898597717285,
"learning_rate": 9.262344100805332e-06,
"loss": 0.78,
"step": 2128
},
{
"epoch": 1.0820151197509689,
"grad_norm": 3.211329460144043,
"learning_rate": 9.261464936740127e-06,
"loss": 0.8241,
"step": 2129
},
{
"epoch": 1.08252334667429,
"grad_norm": 2.9432098865509033,
"learning_rate": 9.260585290853658e-06,
"loss": 0.7371,
"step": 2130
},
{
"epoch": 1.0830315735976113,
"grad_norm": 3.190213203430176,
"learning_rate": 9.259705163245381e-06,
"loss": 0.909,
"step": 2131
},
{
"epoch": 1.0835398005209327,
"grad_norm": 3.1257691383361816,
"learning_rate": 9.258824554014807e-06,
"loss": 0.8234,
"step": 2132
},
{
"epoch": 1.0840480274442539,
"grad_norm": 2.958376884460449,
"learning_rate": 9.257943463261503e-06,
"loss": 0.8303,
"step": 2133
},
{
"epoch": 1.084556254367575,
"grad_norm": 3.43859601020813,
"learning_rate": 9.257061891085091e-06,
"loss": 0.7861,
"step": 2134
},
{
"epoch": 1.0850644812908965,
"grad_norm": 2.9984450340270996,
"learning_rate": 9.256179837585242e-06,
"loss": 0.7126,
"step": 2135
},
{
"epoch": 1.0855727082142177,
"grad_norm": 3.1922214031219482,
"learning_rate": 9.255297302861685e-06,
"loss": 0.8999,
"step": 2136
},
{
"epoch": 1.086080935137539,
"grad_norm": 2.9793853759765625,
"learning_rate": 9.254414287014208e-06,
"loss": 0.8929,
"step": 2137
},
{
"epoch": 1.08658916206086,
"grad_norm": 3.271268129348755,
"learning_rate": 9.253530790142646e-06,
"loss": 0.8677,
"step": 2138
},
{
"epoch": 1.0870973889841815,
"grad_norm": 3.011582612991333,
"learning_rate": 9.25264681234689e-06,
"loss": 0.846,
"step": 2139
},
{
"epoch": 1.0876056159075027,
"grad_norm": 3.042726755142212,
"learning_rate": 9.251762353726887e-06,
"loss": 0.7305,
"step": 2140
},
{
"epoch": 1.088113842830824,
"grad_norm": 3.29084849357605,
"learning_rate": 9.250877414382641e-06,
"loss": 0.8388,
"step": 2141
},
{
"epoch": 1.0886220697541453,
"grad_norm": 3.143230676651001,
"learning_rate": 9.249991994414207e-06,
"loss": 0.9816,
"step": 2142
},
{
"epoch": 1.0891302966774665,
"grad_norm": 2.8965611457824707,
"learning_rate": 9.249106093921692e-06,
"loss": 0.7588,
"step": 2143
},
{
"epoch": 1.0896385236007877,
"grad_norm": 3.2397620677948,
"learning_rate": 9.24821971300526e-06,
"loss": 0.8879,
"step": 2144
},
{
"epoch": 1.090146750524109,
"grad_norm": 2.9761197566986084,
"learning_rate": 9.247332851765134e-06,
"loss": 0.797,
"step": 2145
},
{
"epoch": 1.0906549774474303,
"grad_norm": 3.0833804607391357,
"learning_rate": 9.24644551030158e-06,
"loss": 0.8104,
"step": 2146
},
{
"epoch": 1.0911632043707515,
"grad_norm": 2.98724365234375,
"learning_rate": 9.24555768871493e-06,
"loss": 0.812,
"step": 2147
},
{
"epoch": 1.0916714312940727,
"grad_norm": 3.2756662368774414,
"learning_rate": 9.244669387105563e-06,
"loss": 0.9076,
"step": 2148
},
{
"epoch": 1.0921796582173942,
"grad_norm": 3.199113130569458,
"learning_rate": 9.243780605573918e-06,
"loss": 0.8027,
"step": 2149
},
{
"epoch": 1.0926878851407154,
"grad_norm": 2.9473695755004883,
"learning_rate": 9.24289134422048e-06,
"loss": 0.8426,
"step": 2150
},
{
"epoch": 1.0931961120640366,
"grad_norm": 3.1321775913238525,
"learning_rate": 9.242001603145795e-06,
"loss": 0.8629,
"step": 2151
},
{
"epoch": 1.093704338987358,
"grad_norm": 3.111842155456543,
"learning_rate": 9.241111382450463e-06,
"loss": 0.8082,
"step": 2152
},
{
"epoch": 1.0942125659106792,
"grad_norm": 4.241421699523926,
"learning_rate": 9.240220682235133e-06,
"loss": 0.8441,
"step": 2153
},
{
"epoch": 1.0947207928340004,
"grad_norm": 3.283623218536377,
"learning_rate": 9.239329502600515e-06,
"loss": 0.7652,
"step": 2154
},
{
"epoch": 1.0952290197573216,
"grad_norm": 2.9305100440979004,
"learning_rate": 9.23843784364737e-06,
"loss": 0.8427,
"step": 2155
},
{
"epoch": 1.095737246680643,
"grad_norm": 2.994626998901367,
"learning_rate": 9.23754570547651e-06,
"loss": 0.7648,
"step": 2156
},
{
"epoch": 1.0962454736039642,
"grad_norm": 3.076044797897339,
"learning_rate": 9.236653088188807e-06,
"loss": 0.7861,
"step": 2157
},
{
"epoch": 1.0967537005272854,
"grad_norm": 3.4667749404907227,
"learning_rate": 9.235759991885185e-06,
"loss": 0.9786,
"step": 2158
},
{
"epoch": 1.0972619274506068,
"grad_norm": 3.2529866695404053,
"learning_rate": 9.234866416666619e-06,
"loss": 0.784,
"step": 2159
},
{
"epoch": 1.097770154373928,
"grad_norm": 3.1599793434143066,
"learning_rate": 9.233972362634143e-06,
"loss": 0.96,
"step": 2160
},
{
"epoch": 1.0982783812972492,
"grad_norm": 3.1152777671813965,
"learning_rate": 9.233077829888841e-06,
"loss": 0.7875,
"step": 2161
},
{
"epoch": 1.0987866082205704,
"grad_norm": 3.0375049114227295,
"learning_rate": 9.232182818531856e-06,
"loss": 0.9108,
"step": 2162
},
{
"epoch": 1.0992948351438918,
"grad_norm": 2.9311556816101074,
"learning_rate": 9.23128732866438e-06,
"loss": 0.8091,
"step": 2163
},
{
"epoch": 1.099803062067213,
"grad_norm": 2.9771041870117188,
"learning_rate": 9.230391360387661e-06,
"loss": 0.8187,
"step": 2164
},
{
"epoch": 1.1003112889905342,
"grad_norm": 3.184452533721924,
"learning_rate": 9.229494913803003e-06,
"loss": 0.7583,
"step": 2165
},
{
"epoch": 1.1008195159138556,
"grad_norm": 3.0859079360961914,
"learning_rate": 9.228597989011761e-06,
"loss": 0.813,
"step": 2166
},
{
"epoch": 1.1013277428371768,
"grad_norm": 3.111276865005493,
"learning_rate": 9.227700586115347e-06,
"loss": 0.7791,
"step": 2167
},
{
"epoch": 1.101835969760498,
"grad_norm": 3.0945050716400146,
"learning_rate": 9.226802705215224e-06,
"loss": 0.8495,
"step": 2168
},
{
"epoch": 1.1023441966838192,
"grad_norm": 3.492349863052368,
"learning_rate": 9.225904346412913e-06,
"loss": 0.8259,
"step": 2169
},
{
"epoch": 1.1028524236071406,
"grad_norm": 3.0135536193847656,
"learning_rate": 9.225005509809984e-06,
"loss": 0.7308,
"step": 2170
},
{
"epoch": 1.1033606505304618,
"grad_norm": 3.3793108463287354,
"learning_rate": 9.224106195508064e-06,
"loss": 0.8777,
"step": 2171
},
{
"epoch": 1.103868877453783,
"grad_norm": 3.311250925064087,
"learning_rate": 9.223206403608836e-06,
"loss": 0.8091,
"step": 2172
},
{
"epoch": 1.1043771043771045,
"grad_norm": 3.3394904136657715,
"learning_rate": 9.222306134214032e-06,
"loss": 0.898,
"step": 2173
},
{
"epoch": 1.1048853313004257,
"grad_norm": 2.9980368614196777,
"learning_rate": 9.221405387425441e-06,
"loss": 0.8628,
"step": 2174
},
{
"epoch": 1.1053935582237469,
"grad_norm": 3.0090014934539795,
"learning_rate": 9.22050416334491e-06,
"loss": 0.8591,
"step": 2175
},
{
"epoch": 1.105901785147068,
"grad_norm": 3.2262046337127686,
"learning_rate": 9.21960246207433e-06,
"loss": 0.9521,
"step": 2176
},
{
"epoch": 1.1064100120703895,
"grad_norm": 3.0029313564300537,
"learning_rate": 9.218700283715653e-06,
"loss": 0.9119,
"step": 2177
},
{
"epoch": 1.1069182389937107,
"grad_norm": 2.9279654026031494,
"learning_rate": 9.217797628370886e-06,
"loss": 0.8419,
"step": 2178
},
{
"epoch": 1.1074264659170319,
"grad_norm": 3.0237679481506348,
"learning_rate": 9.216894496142083e-06,
"loss": 0.8855,
"step": 2179
},
{
"epoch": 1.1079346928403533,
"grad_norm": 3.1915111541748047,
"learning_rate": 9.215990887131362e-06,
"loss": 0.9484,
"step": 2180
},
{
"epoch": 1.1084429197636745,
"grad_norm": 3.263805627822876,
"learning_rate": 9.215086801440885e-06,
"loss": 0.9143,
"step": 2181
},
{
"epoch": 1.1089511466869957,
"grad_norm": 2.8310515880584717,
"learning_rate": 9.214182239172875e-06,
"loss": 0.7704,
"step": 2182
},
{
"epoch": 1.109459373610317,
"grad_norm": 3.0871376991271973,
"learning_rate": 9.213277200429604e-06,
"loss": 0.9276,
"step": 2183
},
{
"epoch": 1.1099676005336383,
"grad_norm": 3.289386749267578,
"learning_rate": 9.2123716853134e-06,
"loss": 0.8827,
"step": 2184
},
{
"epoch": 1.1104758274569595,
"grad_norm": 3.0301473140716553,
"learning_rate": 9.211465693926644e-06,
"loss": 0.6892,
"step": 2185
},
{
"epoch": 1.1109840543802807,
"grad_norm": 3.2088818550109863,
"learning_rate": 9.210559226371775e-06,
"loss": 0.8858,
"step": 2186
},
{
"epoch": 1.1114922813036021,
"grad_norm": 3.0917153358459473,
"learning_rate": 9.20965228275128e-06,
"loss": 0.8285,
"step": 2187
},
{
"epoch": 1.1120005082269233,
"grad_norm": 3.0714948177337646,
"learning_rate": 9.208744863167704e-06,
"loss": 0.7709,
"step": 2188
},
{
"epoch": 1.1125087351502445,
"grad_norm": 3.212080955505371,
"learning_rate": 9.207836967723642e-06,
"loss": 0.8698,
"step": 2189
},
{
"epoch": 1.113016962073566,
"grad_norm": 2.982008695602417,
"learning_rate": 9.206928596521745e-06,
"loss": 0.8373,
"step": 2190
},
{
"epoch": 1.1135251889968871,
"grad_norm": 2.828354597091675,
"learning_rate": 9.206019749664721e-06,
"loss": 0.8131,
"step": 2191
},
{
"epoch": 1.1140334159202083,
"grad_norm": 2.826298952102661,
"learning_rate": 9.205110427255325e-06,
"loss": 0.824,
"step": 2192
},
{
"epoch": 1.1145416428435295,
"grad_norm": 3.315394878387451,
"learning_rate": 9.204200629396369e-06,
"loss": 0.9247,
"step": 2193
},
{
"epoch": 1.115049869766851,
"grad_norm": 2.9893481731414795,
"learning_rate": 9.203290356190722e-06,
"loss": 0.8431,
"step": 2194
},
{
"epoch": 1.1155580966901721,
"grad_norm": 3.145125150680542,
"learning_rate": 9.2023796077413e-06,
"loss": 0.8641,
"step": 2195
},
{
"epoch": 1.1160663236134933,
"grad_norm": 3.1989402770996094,
"learning_rate": 9.20146838415108e-06,
"loss": 0.8556,
"step": 2196
},
{
"epoch": 1.1165745505368148,
"grad_norm": 3.063964605331421,
"learning_rate": 9.20055668552309e-06,
"loss": 0.9002,
"step": 2197
},
{
"epoch": 1.117082777460136,
"grad_norm": 3.030367374420166,
"learning_rate": 9.199644511960406e-06,
"loss": 0.8305,
"step": 2198
},
{
"epoch": 1.1175910043834572,
"grad_norm": 3.0812602043151855,
"learning_rate": 9.198731863566167e-06,
"loss": 0.7413,
"step": 2199
},
{
"epoch": 1.1180992313067786,
"grad_norm": 3.024437189102173,
"learning_rate": 9.197818740443557e-06,
"loss": 0.7769,
"step": 2200
},
{
"epoch": 1.1186074582300998,
"grad_norm": 3.1418869495391846,
"learning_rate": 9.196905142695824e-06,
"loss": 0.8448,
"step": 2201
},
{
"epoch": 1.119115685153421,
"grad_norm": 3.3266446590423584,
"learning_rate": 9.19599107042626e-06,
"loss": 0.8207,
"step": 2202
},
{
"epoch": 1.1196239120767422,
"grad_norm": 3.2680680751800537,
"learning_rate": 9.195076523738214e-06,
"loss": 0.7964,
"step": 2203
},
{
"epoch": 1.1201321390000636,
"grad_norm": 3.283367872238159,
"learning_rate": 9.19416150273509e-06,
"loss": 0.8387,
"step": 2204
},
{
"epoch": 1.1206403659233848,
"grad_norm": 3.3058741092681885,
"learning_rate": 9.193246007520344e-06,
"loss": 0.8465,
"step": 2205
},
{
"epoch": 1.121148592846706,
"grad_norm": 3.558431386947632,
"learning_rate": 9.192330038197487e-06,
"loss": 0.8973,
"step": 2206
},
{
"epoch": 1.1216568197700274,
"grad_norm": 3.1155524253845215,
"learning_rate": 9.191413594870082e-06,
"loss": 0.8167,
"step": 2207
},
{
"epoch": 1.1221650466933486,
"grad_norm": 3.192988157272339,
"learning_rate": 9.190496677641745e-06,
"loss": 0.8652,
"step": 2208
},
{
"epoch": 1.1226732736166698,
"grad_norm": 3.0044095516204834,
"learning_rate": 9.189579286616151e-06,
"loss": 0.7597,
"step": 2209
},
{
"epoch": 1.123181500539991,
"grad_norm": 3.117872953414917,
"learning_rate": 9.18866142189702e-06,
"loss": 0.8327,
"step": 2210
},
{
"epoch": 1.1236897274633124,
"grad_norm": 3.1604981422424316,
"learning_rate": 9.187743083588135e-06,
"loss": 0.8148,
"step": 2211
},
{
"epoch": 1.1241979543866336,
"grad_norm": 3.1135852336883545,
"learning_rate": 9.186824271793324e-06,
"loss": 0.837,
"step": 2212
},
{
"epoch": 1.1247061813099548,
"grad_norm": 3.106766939163208,
"learning_rate": 9.185904986616471e-06,
"loss": 0.8302,
"step": 2213
},
{
"epoch": 1.1252144082332762,
"grad_norm": 3.023362874984741,
"learning_rate": 9.184985228161518e-06,
"loss": 0.89,
"step": 2214
},
{
"epoch": 1.1257226351565974,
"grad_norm": 3.0963006019592285,
"learning_rate": 9.184064996532457e-06,
"loss": 0.8387,
"step": 2215
},
{
"epoch": 1.1262308620799186,
"grad_norm": 3.141411542892456,
"learning_rate": 9.183144291833332e-06,
"loss": 0.8162,
"step": 2216
},
{
"epoch": 1.12673908900324,
"grad_norm": 3.1030666828155518,
"learning_rate": 9.182223114168243e-06,
"loss": 0.8868,
"step": 2217
},
{
"epoch": 1.1272473159265612,
"grad_norm": 3.0338220596313477,
"learning_rate": 9.181301463641343e-06,
"loss": 0.8492,
"step": 2218
},
{
"epoch": 1.1277555428498824,
"grad_norm": 3.1174585819244385,
"learning_rate": 9.180379340356837e-06,
"loss": 0.892,
"step": 2219
},
{
"epoch": 1.1282637697732036,
"grad_norm": 3.2138559818267822,
"learning_rate": 9.179456744418987e-06,
"loss": 0.849,
"step": 2220
},
{
"epoch": 1.128771996696525,
"grad_norm": 2.9782936573028564,
"learning_rate": 9.178533675932103e-06,
"loss": 0.7515,
"step": 2221
},
{
"epoch": 1.1292802236198463,
"grad_norm": 3.7740142345428467,
"learning_rate": 9.177610135000552e-06,
"loss": 0.7538,
"step": 2222
},
{
"epoch": 1.1297884505431675,
"grad_norm": 3.475064516067505,
"learning_rate": 9.176686121728755e-06,
"loss": 0.884,
"step": 2223
},
{
"epoch": 1.1302966774664889,
"grad_norm": 3.4748387336730957,
"learning_rate": 9.175761636221186e-06,
"loss": 0.8535,
"step": 2224
},
{
"epoch": 1.13080490438981,
"grad_norm": 3.3585240840911865,
"learning_rate": 9.17483667858237e-06,
"loss": 0.8299,
"step": 2225
},
{
"epoch": 1.1313131313131313,
"grad_norm": 2.91369891166687,
"learning_rate": 9.173911248916888e-06,
"loss": 0.7635,
"step": 2226
},
{
"epoch": 1.1318213582364525,
"grad_norm": 3.1783607006073,
"learning_rate": 9.172985347329374e-06,
"loss": 0.8534,
"step": 2227
},
{
"epoch": 1.1323295851597739,
"grad_norm": 3.3611485958099365,
"learning_rate": 9.172058973924514e-06,
"loss": 0.9793,
"step": 2228
},
{
"epoch": 1.132837812083095,
"grad_norm": 3.0700531005859375,
"learning_rate": 9.171132128807047e-06,
"loss": 0.8908,
"step": 2229
},
{
"epoch": 1.1333460390064163,
"grad_norm": 3.0375781059265137,
"learning_rate": 9.170204812081767e-06,
"loss": 0.8368,
"step": 2230
},
{
"epoch": 1.1338542659297377,
"grad_norm": 2.99582576751709,
"learning_rate": 9.169277023853523e-06,
"loss": 0.7991,
"step": 2231
},
{
"epoch": 1.134362492853059,
"grad_norm": 3.3543779850006104,
"learning_rate": 9.168348764227213e-06,
"loss": 0.9089,
"step": 2232
},
{
"epoch": 1.13487071977638,
"grad_norm": 2.9977941513061523,
"learning_rate": 9.16742003330779e-06,
"loss": 0.8454,
"step": 2233
},
{
"epoch": 1.1353789466997015,
"grad_norm": 2.8905301094055176,
"learning_rate": 9.166490831200264e-06,
"loss": 0.7581,
"step": 2234
},
{
"epoch": 1.1358871736230227,
"grad_norm": 3.1561331748962402,
"learning_rate": 9.165561158009689e-06,
"loss": 0.8404,
"step": 2235
},
{
"epoch": 1.136395400546344,
"grad_norm": 3.356651544570923,
"learning_rate": 9.164631013841184e-06,
"loss": 0.929,
"step": 2236
},
{
"epoch": 1.1369036274696651,
"grad_norm": 2.907170534133911,
"learning_rate": 9.163700398799913e-06,
"loss": 0.8456,
"step": 2237
},
{
"epoch": 1.1374118543929865,
"grad_norm": 3.214137077331543,
"learning_rate": 9.162769312991095e-06,
"loss": 0.7972,
"step": 2238
},
{
"epoch": 1.1379200813163077,
"grad_norm": 2.9030961990356445,
"learning_rate": 9.161837756520005e-06,
"loss": 0.8041,
"step": 2239
},
{
"epoch": 1.138428308239629,
"grad_norm": 3.315462112426758,
"learning_rate": 9.160905729491967e-06,
"loss": 0.8011,
"step": 2240
},
{
"epoch": 1.1389365351629503,
"grad_norm": 3.185739278793335,
"learning_rate": 9.159973232012363e-06,
"loss": 0.8687,
"step": 2241
},
{
"epoch": 1.1394447620862715,
"grad_norm": 3.2211828231811523,
"learning_rate": 9.159040264186621e-06,
"loss": 0.8402,
"step": 2242
},
{
"epoch": 1.1399529890095927,
"grad_norm": 3.1946299076080322,
"learning_rate": 9.158106826120232e-06,
"loss": 0.8323,
"step": 2243
},
{
"epoch": 1.140461215932914,
"grad_norm": 2.910707712173462,
"learning_rate": 9.157172917918732e-06,
"loss": 0.8432,
"step": 2244
},
{
"epoch": 1.1409694428562354,
"grad_norm": 3.3521809577941895,
"learning_rate": 9.156238539687713e-06,
"loss": 0.8958,
"step": 2245
},
{
"epoch": 1.1414776697795566,
"grad_norm": 2.8933801651000977,
"learning_rate": 9.155303691532821e-06,
"loss": 0.777,
"step": 2246
},
{
"epoch": 1.1419858967028778,
"grad_norm": 3.164515256881714,
"learning_rate": 9.154368373559754e-06,
"loss": 0.8503,
"step": 2247
},
{
"epoch": 1.1424941236261992,
"grad_norm": 2.9174115657806396,
"learning_rate": 9.153432585874265e-06,
"loss": 0.7781,
"step": 2248
},
{
"epoch": 1.1430023505495204,
"grad_norm": 3.1479575634002686,
"learning_rate": 9.152496328582156e-06,
"loss": 0.9578,
"step": 2249
},
{
"epoch": 1.1435105774728416,
"grad_norm": 3.2180874347686768,
"learning_rate": 9.151559601789286e-06,
"loss": 0.7281,
"step": 2250
},
{
"epoch": 1.144018804396163,
"grad_norm": 2.899796724319458,
"learning_rate": 9.150622405601564e-06,
"loss": 0.7567,
"step": 2251
},
{
"epoch": 1.1445270313194842,
"grad_norm": 3.3812904357910156,
"learning_rate": 9.149684740124958e-06,
"loss": 0.8009,
"step": 2252
},
{
"epoch": 1.1450352582428054,
"grad_norm": 3.2274460792541504,
"learning_rate": 9.14874660546548e-06,
"loss": 0.9155,
"step": 2253
},
{
"epoch": 1.1455434851661266,
"grad_norm": 3.4081389904022217,
"learning_rate": 9.147808001729203e-06,
"loss": 0.8662,
"step": 2254
},
{
"epoch": 1.146051712089448,
"grad_norm": 3.192394256591797,
"learning_rate": 9.14686892902225e-06,
"loss": 0.872,
"step": 2255
},
{
"epoch": 1.1465599390127692,
"grad_norm": 3.7580795288085938,
"learning_rate": 9.145929387450794e-06,
"loss": 0.9428,
"step": 2256
},
{
"epoch": 1.1470681659360904,
"grad_norm": 2.902574300765991,
"learning_rate": 9.144989377121067e-06,
"loss": 0.7778,
"step": 2257
},
{
"epoch": 1.1475763928594118,
"grad_norm": 3.1599409580230713,
"learning_rate": 9.14404889813935e-06,
"loss": 0.909,
"step": 2258
},
{
"epoch": 1.148084619782733,
"grad_norm": 3.0382742881774902,
"learning_rate": 9.143107950611978e-06,
"loss": 0.788,
"step": 2259
},
{
"epoch": 1.1485928467060542,
"grad_norm": 3.310295343399048,
"learning_rate": 9.14216653464534e-06,
"loss": 0.8701,
"step": 2260
},
{
"epoch": 1.1491010736293754,
"grad_norm": 3.244692325592041,
"learning_rate": 9.141224650345875e-06,
"loss": 0.8442,
"step": 2261
},
{
"epoch": 1.1496093005526968,
"grad_norm": 3.261472463607788,
"learning_rate": 9.140282297820078e-06,
"loss": 0.8507,
"step": 2262
},
{
"epoch": 1.150117527476018,
"grad_norm": 3.2070884704589844,
"learning_rate": 9.139339477174495e-06,
"loss": 0.8635,
"step": 2263
},
{
"epoch": 1.1506257543993392,
"grad_norm": 3.273611307144165,
"learning_rate": 9.138396188515725e-06,
"loss": 0.8498,
"step": 2264
},
{
"epoch": 1.1511339813226606,
"grad_norm": 3.6329290866851807,
"learning_rate": 9.137452431950424e-06,
"loss": 0.9368,
"step": 2265
},
{
"epoch": 1.1516422082459818,
"grad_norm": 3.0486176013946533,
"learning_rate": 9.136508207585295e-06,
"loss": 0.8328,
"step": 2266
},
{
"epoch": 1.152150435169303,
"grad_norm": 3.372185468673706,
"learning_rate": 9.135563515527098e-06,
"loss": 0.8505,
"step": 2267
},
{
"epoch": 1.1526586620926245,
"grad_norm": 3.2860240936279297,
"learning_rate": 9.134618355882641e-06,
"loss": 0.867,
"step": 2268
},
{
"epoch": 1.1531668890159457,
"grad_norm": 3.219965934753418,
"learning_rate": 9.133672728758791e-06,
"loss": 0.8907,
"step": 2269
},
{
"epoch": 1.1536751159392669,
"grad_norm": 3.027545928955078,
"learning_rate": 9.132726634262465e-06,
"loss": 0.856,
"step": 2270
},
{
"epoch": 1.154183342862588,
"grad_norm": 3.089707851409912,
"learning_rate": 9.131780072500633e-06,
"loss": 0.9343,
"step": 2271
},
{
"epoch": 1.1546915697859095,
"grad_norm": 3.1712076663970947,
"learning_rate": 9.130833043580315e-06,
"loss": 0.8669,
"step": 2272
},
{
"epoch": 1.1551997967092307,
"grad_norm": 2.896791458129883,
"learning_rate": 9.12988554760859e-06,
"loss": 0.7617,
"step": 2273
},
{
"epoch": 1.1557080236325519,
"grad_norm": 3.4459807872772217,
"learning_rate": 9.128937584692586e-06,
"loss": 0.8495,
"step": 2274
},
{
"epoch": 1.1562162505558733,
"grad_norm": 2.8953559398651123,
"learning_rate": 9.127989154939481e-06,
"loss": 0.834,
"step": 2275
},
{
"epoch": 1.1567244774791945,
"grad_norm": 3.0459115505218506,
"learning_rate": 9.127040258456512e-06,
"loss": 0.8592,
"step": 2276
},
{
"epoch": 1.1572327044025157,
"grad_norm": 2.9910728931427,
"learning_rate": 9.126090895350966e-06,
"loss": 0.8281,
"step": 2277
},
{
"epoch": 1.1577409313258369,
"grad_norm": 3.0232229232788086,
"learning_rate": 9.125141065730179e-06,
"loss": 0.868,
"step": 2278
},
{
"epoch": 1.1582491582491583,
"grad_norm": 4.885484218597412,
"learning_rate": 9.124190769701547e-06,
"loss": 0.8484,
"step": 2279
},
{
"epoch": 1.1587573851724795,
"grad_norm": 3.1473946571350098,
"learning_rate": 9.123240007372514e-06,
"loss": 0.9519,
"step": 2280
},
{
"epoch": 1.1592656120958007,
"grad_norm": 3.1233749389648438,
"learning_rate": 9.122288778850576e-06,
"loss": 0.748,
"step": 2281
},
{
"epoch": 1.1597738390191221,
"grad_norm": 3.5578534603118896,
"learning_rate": 9.121337084243284e-06,
"loss": 0.8351,
"step": 2282
},
{
"epoch": 1.1602820659424433,
"grad_norm": 3.0705373287200928,
"learning_rate": 9.120384923658242e-06,
"loss": 0.8245,
"step": 2283
},
{
"epoch": 1.1607902928657645,
"grad_norm": 3.356689453125,
"learning_rate": 9.119432297203104e-06,
"loss": 0.972,
"step": 2284
},
{
"epoch": 1.161298519789086,
"grad_norm": 3.0365214347839355,
"learning_rate": 9.118479204985582e-06,
"loss": 0.924,
"step": 2285
},
{
"epoch": 1.1618067467124071,
"grad_norm": 9.004101753234863,
"learning_rate": 9.117525647113433e-06,
"loss": 0.7769,
"step": 2286
},
{
"epoch": 1.1623149736357283,
"grad_norm": 3.2367615699768066,
"learning_rate": 9.116571623694473e-06,
"loss": 0.7716,
"step": 2287
},
{
"epoch": 1.1628232005590495,
"grad_norm": 3.1672203540802,
"learning_rate": 9.115617134836567e-06,
"loss": 0.7419,
"step": 2288
},
{
"epoch": 1.163331427482371,
"grad_norm": 3.166799545288086,
"learning_rate": 9.114662180647635e-06,
"loss": 0.7803,
"step": 2289
},
{
"epoch": 1.1638396544056921,
"grad_norm": 7.431529521942139,
"learning_rate": 9.11370676123565e-06,
"loss": 0.8892,
"step": 2290
},
{
"epoch": 1.1643478813290133,
"grad_norm": 3.1311194896698,
"learning_rate": 9.112750876708633e-06,
"loss": 0.8267,
"step": 2291
},
{
"epoch": 1.1648561082523345,
"grad_norm": 3.522717237472534,
"learning_rate": 9.111794527174665e-06,
"loss": 0.9574,
"step": 2292
},
{
"epoch": 1.165364335175656,
"grad_norm": 3.246248245239258,
"learning_rate": 9.110837712741871e-06,
"loss": 0.8789,
"step": 2293
},
{
"epoch": 1.1658725620989772,
"grad_norm": 3.2041945457458496,
"learning_rate": 9.109880433518434e-06,
"loss": 0.8074,
"step": 2294
},
{
"epoch": 1.1663807890222984,
"grad_norm": 3.2885286808013916,
"learning_rate": 9.10892268961259e-06,
"loss": 0.9215,
"step": 2295
},
{
"epoch": 1.1668890159456198,
"grad_norm": 2.9827210903167725,
"learning_rate": 9.107964481132625e-06,
"loss": 0.8479,
"step": 2296
},
{
"epoch": 1.167397242868941,
"grad_norm": 3.529890298843384,
"learning_rate": 9.10700580818688e-06,
"loss": 0.9017,
"step": 2297
},
{
"epoch": 1.1679054697922622,
"grad_norm": 3.1567318439483643,
"learning_rate": 9.106046670883745e-06,
"loss": 0.8741,
"step": 2298
},
{
"epoch": 1.1684136967155836,
"grad_norm": 2.972628116607666,
"learning_rate": 9.105087069331666e-06,
"loss": 0.7806,
"step": 2299
},
{
"epoch": 1.1689219236389048,
"grad_norm": 3.2747461795806885,
"learning_rate": 9.104127003639138e-06,
"loss": 0.8251,
"step": 2300
},
{
"epoch": 1.169430150562226,
"grad_norm": 3.3266758918762207,
"learning_rate": 9.103166473914714e-06,
"loss": 0.8261,
"step": 2301
},
{
"epoch": 1.1699383774855474,
"grad_norm": 3.083332061767578,
"learning_rate": 9.102205480266993e-06,
"loss": 0.8373,
"step": 2302
},
{
"epoch": 1.1704466044088686,
"grad_norm": 2.9103949069976807,
"learning_rate": 9.101244022804631e-06,
"loss": 0.7487,
"step": 2303
},
{
"epoch": 1.1709548313321898,
"grad_norm": 3.736177444458008,
"learning_rate": 9.100282101636334e-06,
"loss": 0.868,
"step": 2304
},
{
"epoch": 1.171463058255511,
"grad_norm": 2.9956789016723633,
"learning_rate": 9.099319716870863e-06,
"loss": 0.7916,
"step": 2305
},
{
"epoch": 1.1719712851788324,
"grad_norm": 2.974546194076538,
"learning_rate": 9.098356868617028e-06,
"loss": 0.8415,
"step": 2306
},
{
"epoch": 1.1724795121021536,
"grad_norm": 4.560594081878662,
"learning_rate": 9.097393556983694e-06,
"loss": 0.9999,
"step": 2307
},
{
"epoch": 1.1729877390254748,
"grad_norm": 3.2358243465423584,
"learning_rate": 9.096429782079777e-06,
"loss": 0.7266,
"step": 2308
},
{
"epoch": 1.173495965948796,
"grad_norm": 3.0477235317230225,
"learning_rate": 9.095465544014244e-06,
"loss": 0.9312,
"step": 2309
},
{
"epoch": 1.1740041928721174,
"grad_norm": 3.2846338748931885,
"learning_rate": 9.09450084289612e-06,
"loss": 0.8764,
"step": 2310
},
{
"epoch": 1.1745124197954386,
"grad_norm": 3.05623459815979,
"learning_rate": 9.093535678834479e-06,
"loss": 0.7985,
"step": 2311
},
{
"epoch": 1.1750206467187598,
"grad_norm": 2.8562092781066895,
"learning_rate": 9.092570051938444e-06,
"loss": 0.8054,
"step": 2312
},
{
"epoch": 1.1755288736420813,
"grad_norm": 3.0281214714050293,
"learning_rate": 9.091603962317192e-06,
"loss": 0.858,
"step": 2313
},
{
"epoch": 1.1760371005654024,
"grad_norm": 3.491211414337158,
"learning_rate": 9.090637410079958e-06,
"loss": 0.8533,
"step": 2314
},
{
"epoch": 1.1765453274887236,
"grad_norm": 3.175933599472046,
"learning_rate": 9.089670395336023e-06,
"loss": 0.7493,
"step": 2315
},
{
"epoch": 1.177053554412045,
"grad_norm": 3.269052505493164,
"learning_rate": 9.088702918194723e-06,
"loss": 0.7981,
"step": 2316
},
{
"epoch": 1.1775617813353663,
"grad_norm": 3.173762559890747,
"learning_rate": 9.087734978765443e-06,
"loss": 0.7655,
"step": 2317
},
{
"epoch": 1.1780700082586875,
"grad_norm": 3.22505521774292,
"learning_rate": 9.086766577157626e-06,
"loss": 0.8203,
"step": 2318
},
{
"epoch": 1.1785782351820089,
"grad_norm": 3.346877336502075,
"learning_rate": 9.085797713480763e-06,
"loss": 0.8404,
"step": 2319
},
{
"epoch": 1.17908646210533,
"grad_norm": 3.098677396774292,
"learning_rate": 9.084828387844396e-06,
"loss": 0.8589,
"step": 2320
},
{
"epoch": 1.1795946890286513,
"grad_norm": 3.0070483684539795,
"learning_rate": 9.083858600358125e-06,
"loss": 0.8285,
"step": 2321
},
{
"epoch": 1.1801029159519725,
"grad_norm": 3.2013142108917236,
"learning_rate": 9.082888351131596e-06,
"loss": 0.7647,
"step": 2322
},
{
"epoch": 1.180611142875294,
"grad_norm": 2.795560359954834,
"learning_rate": 9.08191764027451e-06,
"loss": 0.7127,
"step": 2323
},
{
"epoch": 1.181119369798615,
"grad_norm": 2.8931901454925537,
"learning_rate": 9.080946467896623e-06,
"loss": 0.7877,
"step": 2324
},
{
"epoch": 1.1816275967219363,
"grad_norm": 3.125441551208496,
"learning_rate": 9.07997483410774e-06,
"loss": 0.7735,
"step": 2325
},
{
"epoch": 1.1821358236452575,
"grad_norm": 3.3146045207977295,
"learning_rate": 9.079002739017713e-06,
"loss": 0.8159,
"step": 2326
},
{
"epoch": 1.182644050568579,
"grad_norm": 3.1576666831970215,
"learning_rate": 9.078030182736458e-06,
"loss": 0.8076,
"step": 2327
},
{
"epoch": 1.1831522774919,
"grad_norm": 3.0713062286376953,
"learning_rate": 9.077057165373932e-06,
"loss": 0.7745,
"step": 2328
},
{
"epoch": 1.1836605044152213,
"grad_norm": 3.206789255142212,
"learning_rate": 9.076083687040154e-06,
"loss": 0.7932,
"step": 2329
},
{
"epoch": 1.1841687313385427,
"grad_norm": 3.2853028774261475,
"learning_rate": 9.075109747845188e-06,
"loss": 0.8669,
"step": 2330
},
{
"epoch": 1.184676958261864,
"grad_norm": 3.4324445724487305,
"learning_rate": 9.07413534789915e-06,
"loss": 0.833,
"step": 2331
},
{
"epoch": 1.1851851851851851,
"grad_norm": 3.3072774410247803,
"learning_rate": 9.073160487312212e-06,
"loss": 0.9215,
"step": 2332
},
{
"epoch": 1.1856934121085065,
"grad_norm": 3.1970913410186768,
"learning_rate": 9.072185166194595e-06,
"loss": 0.8354,
"step": 2333
},
{
"epoch": 1.1862016390318277,
"grad_norm": 3.035238027572632,
"learning_rate": 9.071209384656576e-06,
"loss": 0.8417,
"step": 2334
},
{
"epoch": 1.186709865955149,
"grad_norm": 3.103426694869995,
"learning_rate": 9.070233142808478e-06,
"loss": 0.7325,
"step": 2335
},
{
"epoch": 1.1872180928784704,
"grad_norm": 2.9447386264801025,
"learning_rate": 9.069256440760683e-06,
"loss": 0.9334,
"step": 2336
},
{
"epoch": 1.1877263198017916,
"grad_norm": 3.1110117435455322,
"learning_rate": 9.06827927862362e-06,
"loss": 0.8528,
"step": 2337
},
{
"epoch": 1.1882345467251128,
"grad_norm": 2.9499640464782715,
"learning_rate": 9.06730165650777e-06,
"loss": 0.8304,
"step": 2338
},
{
"epoch": 1.188742773648434,
"grad_norm": 3.1496939659118652,
"learning_rate": 9.06632357452367e-06,
"loss": 0.9043,
"step": 2339
},
{
"epoch": 1.1892510005717554,
"grad_norm": 3.112644672393799,
"learning_rate": 9.065345032781906e-06,
"loss": 0.754,
"step": 2340
},
{
"epoch": 1.1897592274950766,
"grad_norm": 3.083366632461548,
"learning_rate": 9.064366031393114e-06,
"loss": 0.788,
"step": 2341
},
{
"epoch": 1.1902674544183978,
"grad_norm": 3.426002025604248,
"learning_rate": 9.06338657046799e-06,
"loss": 0.8761,
"step": 2342
},
{
"epoch": 1.190775681341719,
"grad_norm": 3.2204999923706055,
"learning_rate": 9.06240665011727e-06,
"loss": 0.9046,
"step": 2343
},
{
"epoch": 1.1912839082650404,
"grad_norm": 3.1768798828125,
"learning_rate": 9.061426270451752e-06,
"loss": 0.8119,
"step": 2344
},
{
"epoch": 1.1917921351883616,
"grad_norm": 3.0866265296936035,
"learning_rate": 9.060445431582282e-06,
"loss": 0.7913,
"step": 2345
},
{
"epoch": 1.1923003621116828,
"grad_norm": 3.2694597244262695,
"learning_rate": 9.05946413361976e-06,
"loss": 0.871,
"step": 2346
},
{
"epoch": 1.1928085890350042,
"grad_norm": 3.0038201808929443,
"learning_rate": 9.058482376675132e-06,
"loss": 0.8324,
"step": 2347
},
{
"epoch": 1.1933168159583254,
"grad_norm": 3.2698614597320557,
"learning_rate": 9.057500160859405e-06,
"loss": 0.9151,
"step": 2348
},
{
"epoch": 1.1938250428816466,
"grad_norm": 3.040255546569824,
"learning_rate": 9.056517486283626e-06,
"loss": 0.7836,
"step": 2349
},
{
"epoch": 1.194333269804968,
"grad_norm": 3.324594736099243,
"learning_rate": 9.055534353058907e-06,
"loss": 0.8665,
"step": 2350
},
{
"epoch": 1.1948414967282892,
"grad_norm": 2.856466293334961,
"learning_rate": 9.054550761296404e-06,
"loss": 0.761,
"step": 2351
},
{
"epoch": 1.1953497236516104,
"grad_norm": 3.116780996322632,
"learning_rate": 9.053566711107327e-06,
"loss": 0.8185,
"step": 2352
},
{
"epoch": 1.1958579505749318,
"grad_norm": 3.159078359603882,
"learning_rate": 9.052582202602935e-06,
"loss": 0.8617,
"step": 2353
},
{
"epoch": 1.196366177498253,
"grad_norm": 3.0667052268981934,
"learning_rate": 9.051597235894544e-06,
"loss": 0.8621,
"step": 2354
},
{
"epoch": 1.1968744044215742,
"grad_norm": 3.0409858226776123,
"learning_rate": 9.050611811093517e-06,
"loss": 0.8067,
"step": 2355
},
{
"epoch": 1.1973826313448954,
"grad_norm": 3.0293259620666504,
"learning_rate": 9.049625928311272e-06,
"loss": 0.7851,
"step": 2356
},
{
"epoch": 1.1978908582682168,
"grad_norm": 3.1196017265319824,
"learning_rate": 9.048639587659275e-06,
"loss": 0.88,
"step": 2357
},
{
"epoch": 1.198399085191538,
"grad_norm": 3.0092966556549072,
"learning_rate": 9.04765278924905e-06,
"loss": 0.8317,
"step": 2358
},
{
"epoch": 1.1989073121148592,
"grad_norm": 3.1430952548980713,
"learning_rate": 9.046665533192167e-06,
"loss": 0.7821,
"step": 2359
},
{
"epoch": 1.1994155390381804,
"grad_norm": 3.2352135181427,
"learning_rate": 9.04567781960025e-06,
"loss": 0.8412,
"step": 2360
},
{
"epoch": 1.1999237659615019,
"grad_norm": 3.115145206451416,
"learning_rate": 9.044689648584974e-06,
"loss": 0.784,
"step": 2361
},
{
"epoch": 1.200431992884823,
"grad_norm": 2.9744040966033936,
"learning_rate": 9.043701020258067e-06,
"loss": 0.8497,
"step": 2362
},
{
"epoch": 1.2009402198081442,
"grad_norm": 3.1320130825042725,
"learning_rate": 9.042711934731309e-06,
"loss": 0.8199,
"step": 2363
},
{
"epoch": 1.2014484467314657,
"grad_norm": 2.8396992683410645,
"learning_rate": 9.041722392116529e-06,
"loss": 0.7548,
"step": 2364
},
{
"epoch": 1.2019566736547869,
"grad_norm": 3.1364777088165283,
"learning_rate": 9.04073239252561e-06,
"loss": 0.7983,
"step": 2365
},
{
"epoch": 1.202464900578108,
"grad_norm": 3.106210708618164,
"learning_rate": 9.039741936070487e-06,
"loss": 0.8722,
"step": 2366
},
{
"epoch": 1.2029731275014295,
"grad_norm": 2.981907367706299,
"learning_rate": 9.038751022863144e-06,
"loss": 0.821,
"step": 2367
},
{
"epoch": 1.2034813544247507,
"grad_norm": 3.6937308311462402,
"learning_rate": 9.037759653015619e-06,
"loss": 1.0072,
"step": 2368
},
{
"epoch": 1.2039895813480719,
"grad_norm": 2.9094719886779785,
"learning_rate": 9.03676782664e-06,
"loss": 0.8495,
"step": 2369
},
{
"epoch": 1.2044978082713933,
"grad_norm": 3.194845676422119,
"learning_rate": 9.035775543848428e-06,
"loss": 0.8678,
"step": 2370
},
{
"epoch": 1.2050060351947145,
"grad_norm": 3.114051580429077,
"learning_rate": 9.034782804753097e-06,
"loss": 0.8427,
"step": 2371
},
{
"epoch": 1.2055142621180357,
"grad_norm": 3.27559757232666,
"learning_rate": 9.033789609466248e-06,
"loss": 0.8815,
"step": 2372
},
{
"epoch": 1.206022489041357,
"grad_norm": 2.918750286102295,
"learning_rate": 9.032795958100179e-06,
"loss": 0.7836,
"step": 2373
},
{
"epoch": 1.2065307159646783,
"grad_norm": 3.2416598796844482,
"learning_rate": 9.031801850767234e-06,
"loss": 0.811,
"step": 2374
},
{
"epoch": 1.2070389428879995,
"grad_norm": 3.0644783973693848,
"learning_rate": 9.030807287579814e-06,
"loss": 0.836,
"step": 2375
},
{
"epoch": 1.2075471698113207,
"grad_norm": 3.6824216842651367,
"learning_rate": 9.02981226865037e-06,
"loss": 0.9161,
"step": 2376
},
{
"epoch": 1.208055396734642,
"grad_norm": 3.1358485221862793,
"learning_rate": 9.028816794091397e-06,
"loss": 0.8101,
"step": 2377
},
{
"epoch": 1.2085636236579633,
"grad_norm": 3.423971652984619,
"learning_rate": 9.027820864015455e-06,
"loss": 0.8777,
"step": 2378
},
{
"epoch": 1.2090718505812845,
"grad_norm": 3.24017333984375,
"learning_rate": 9.026824478535145e-06,
"loss": 0.8798,
"step": 2379
},
{
"epoch": 1.2095800775046057,
"grad_norm": 3.0433313846588135,
"learning_rate": 9.025827637763125e-06,
"loss": 0.8052,
"step": 2380
},
{
"epoch": 1.2100883044279271,
"grad_norm": 3.0827200412750244,
"learning_rate": 9.024830341812103e-06,
"loss": 0.8905,
"step": 2381
},
{
"epoch": 1.2105965313512483,
"grad_norm": 3.2809956073760986,
"learning_rate": 9.023832590794834e-06,
"loss": 0.8415,
"step": 2382
},
{
"epoch": 1.2111047582745695,
"grad_norm": 3.0780837535858154,
"learning_rate": 9.022834384824133e-06,
"loss": 0.853,
"step": 2383
},
{
"epoch": 1.211612985197891,
"grad_norm": 3.268043041229248,
"learning_rate": 9.021835724012858e-06,
"loss": 0.8751,
"step": 2384
},
{
"epoch": 1.2121212121212122,
"grad_norm": 3.1368472576141357,
"learning_rate": 9.020836608473926e-06,
"loss": 0.8292,
"step": 2385
},
{
"epoch": 1.2126294390445334,
"grad_norm": 2.958005905151367,
"learning_rate": 9.019837038320298e-06,
"loss": 0.8687,
"step": 2386
},
{
"epoch": 1.2131376659678546,
"grad_norm": 3.2961275577545166,
"learning_rate": 9.018837013664993e-06,
"loss": 0.7909,
"step": 2387
},
{
"epoch": 1.213645892891176,
"grad_norm": 3.2279088497161865,
"learning_rate": 9.017836534621078e-06,
"loss": 0.791,
"step": 2388
},
{
"epoch": 1.2141541198144972,
"grad_norm": 3.077115535736084,
"learning_rate": 9.01683560130167e-06,
"loss": 0.741,
"step": 2389
},
{
"epoch": 1.2146623467378184,
"grad_norm": 3.03611159324646,
"learning_rate": 9.015834213819941e-06,
"loss": 0.9399,
"step": 2390
},
{
"epoch": 1.2151705736611398,
"grad_norm": 3.0374908447265625,
"learning_rate": 9.014832372289113e-06,
"loss": 0.7597,
"step": 2391
},
{
"epoch": 1.215678800584461,
"grad_norm": 3.051901340484619,
"learning_rate": 9.013830076822457e-06,
"loss": 0.8795,
"step": 2392
},
{
"epoch": 1.2161870275077822,
"grad_norm": 3.266887664794922,
"learning_rate": 9.012827327533297e-06,
"loss": 0.92,
"step": 2393
},
{
"epoch": 1.2166952544311034,
"grad_norm": 3.2254724502563477,
"learning_rate": 9.011824124535012e-06,
"loss": 0.791,
"step": 2394
},
{
"epoch": 1.2172034813544248,
"grad_norm": 2.9573566913604736,
"learning_rate": 9.010820467941026e-06,
"loss": 0.8311,
"step": 2395
},
{
"epoch": 1.217711708277746,
"grad_norm": 3.0001730918884277,
"learning_rate": 9.009816357864819e-06,
"loss": 0.8513,
"step": 2396
},
{
"epoch": 1.2182199352010672,
"grad_norm": 3.096930503845215,
"learning_rate": 9.008811794419917e-06,
"loss": 0.8505,
"step": 2397
},
{
"epoch": 1.2187281621243886,
"grad_norm": 3.2050721645355225,
"learning_rate": 9.007806777719904e-06,
"loss": 0.805,
"step": 2398
},
{
"epoch": 1.2192363890477098,
"grad_norm": 3.0328614711761475,
"learning_rate": 9.00680130787841e-06,
"loss": 0.8385,
"step": 2399
},
{
"epoch": 1.219744615971031,
"grad_norm": 3.149296283721924,
"learning_rate": 9.00579538500912e-06,
"loss": 0.7824,
"step": 2400
},
{
"epoch": 1.2202528428943524,
"grad_norm": 3.3050854206085205,
"learning_rate": 9.004789009225766e-06,
"loss": 0.8228,
"step": 2401
},
{
"epoch": 1.2207610698176736,
"grad_norm": 3.378373384475708,
"learning_rate": 9.003782180642137e-06,
"loss": 0.839,
"step": 2402
},
{
"epoch": 1.2212692967409948,
"grad_norm": 3.153099298477173,
"learning_rate": 9.002774899372065e-06,
"loss": 0.7567,
"step": 2403
},
{
"epoch": 1.221777523664316,
"grad_norm": 2.9937663078308105,
"learning_rate": 9.001767165529442e-06,
"loss": 0.8638,
"step": 2404
},
{
"epoch": 1.2222857505876374,
"grad_norm": 3.197364568710327,
"learning_rate": 9.000758979228206e-06,
"loss": 0.8708,
"step": 2405
},
{
"epoch": 1.2227939775109586,
"grad_norm": 3.2125697135925293,
"learning_rate": 8.999750340582347e-06,
"loss": 0.8009,
"step": 2406
},
{
"epoch": 1.2233022044342798,
"grad_norm": 3.165888786315918,
"learning_rate": 8.998741249705905e-06,
"loss": 0.8278,
"step": 2407
},
{
"epoch": 1.2238104313576013,
"grad_norm": 3.570157766342163,
"learning_rate": 8.997731706712976e-06,
"loss": 0.7706,
"step": 2408
},
{
"epoch": 1.2243186582809225,
"grad_norm": 3.1924993991851807,
"learning_rate": 8.9967217117177e-06,
"loss": 0.7931,
"step": 2409
},
{
"epoch": 1.2248268852042437,
"grad_norm": 3.422243356704712,
"learning_rate": 8.995711264834274e-06,
"loss": 0.8448,
"step": 2410
},
{
"epoch": 1.2253351121275649,
"grad_norm": 3.4591121673583984,
"learning_rate": 8.994700366176945e-06,
"loss": 0.9026,
"step": 2411
},
{
"epoch": 1.2258433390508863,
"grad_norm": 3.981348752975464,
"learning_rate": 8.993689015860006e-06,
"loss": 0.8046,
"step": 2412
},
{
"epoch": 1.2263515659742075,
"grad_norm": 3.094794273376465,
"learning_rate": 8.992677213997809e-06,
"loss": 0.8496,
"step": 2413
},
{
"epoch": 1.2268597928975287,
"grad_norm": 3.073066234588623,
"learning_rate": 8.991664960704749e-06,
"loss": 0.8681,
"step": 2414
},
{
"epoch": 1.22736801982085,
"grad_norm": 3.075650930404663,
"learning_rate": 8.99065225609528e-06,
"loss": 0.8837,
"step": 2415
},
{
"epoch": 1.2278762467441713,
"grad_norm": 3.204456090927124,
"learning_rate": 8.989639100283903e-06,
"loss": 0.8398,
"step": 2416
},
{
"epoch": 1.2283844736674925,
"grad_norm": 2.9511778354644775,
"learning_rate": 8.988625493385166e-06,
"loss": 0.7308,
"step": 2417
},
{
"epoch": 1.228892700590814,
"grad_norm": 3.3825843334198,
"learning_rate": 8.987611435513677e-06,
"loss": 0.8433,
"step": 2418
},
{
"epoch": 1.229400927514135,
"grad_norm": 3.2403564453125,
"learning_rate": 8.986596926784088e-06,
"loss": 0.8387,
"step": 2419
},
{
"epoch": 1.2299091544374563,
"grad_norm": 3.0978314876556396,
"learning_rate": 8.985581967311103e-06,
"loss": 0.9133,
"step": 2420
},
{
"epoch": 1.2304173813607775,
"grad_norm": 3.2604808807373047,
"learning_rate": 8.984566557209481e-06,
"loss": 0.8242,
"step": 2421
},
{
"epoch": 1.230925608284099,
"grad_norm": 3.1556198596954346,
"learning_rate": 8.983550696594026e-06,
"loss": 0.8673,
"step": 2422
},
{
"epoch": 1.2314338352074201,
"grad_norm": 2.9530467987060547,
"learning_rate": 8.982534385579598e-06,
"loss": 0.8397,
"step": 2423
},
{
"epoch": 1.2319420621307413,
"grad_norm": 3.1731629371643066,
"learning_rate": 8.981517624281106e-06,
"loss": 0.8845,
"step": 2424
},
{
"epoch": 1.2324502890540627,
"grad_norm": 3.053375244140625,
"learning_rate": 8.980500412813506e-06,
"loss": 0.8773,
"step": 2425
},
{
"epoch": 1.232958515977384,
"grad_norm": 3.229344606399536,
"learning_rate": 8.979482751291816e-06,
"loss": 0.8718,
"step": 2426
},
{
"epoch": 1.2334667429007051,
"grad_norm": 3.266913414001465,
"learning_rate": 8.97846463983109e-06,
"loss": 0.9393,
"step": 2427
},
{
"epoch": 1.2339749698240263,
"grad_norm": 3.1056594848632812,
"learning_rate": 8.977446078546445e-06,
"loss": 0.7848,
"step": 2428
},
{
"epoch": 1.2344831967473477,
"grad_norm": 2.832486867904663,
"learning_rate": 8.976427067553044e-06,
"loss": 0.8953,
"step": 2429
},
{
"epoch": 1.234991423670669,
"grad_norm": 3.0973379611968994,
"learning_rate": 8.9754076069661e-06,
"loss": 0.8394,
"step": 2430
},
{
"epoch": 1.2354996505939901,
"grad_norm": 3.237048625946045,
"learning_rate": 8.97438769690088e-06,
"loss": 0.9313,
"step": 2431
},
{
"epoch": 1.2360078775173116,
"grad_norm": 3.141131639480591,
"learning_rate": 8.973367337472694e-06,
"loss": 0.7916,
"step": 2432
},
{
"epoch": 1.2365161044406328,
"grad_norm": 3.2379703521728516,
"learning_rate": 8.972346528796916e-06,
"loss": 0.8643,
"step": 2433
},
{
"epoch": 1.237024331363954,
"grad_norm": 3.0718865394592285,
"learning_rate": 8.97132527098896e-06,
"loss": 0.8541,
"step": 2434
},
{
"epoch": 1.2375325582872754,
"grad_norm": 3.0552730560302734,
"learning_rate": 8.970303564164293e-06,
"loss": 0.7842,
"step": 2435
},
{
"epoch": 1.2380407852105966,
"grad_norm": 3.286994457244873,
"learning_rate": 8.969281408438437e-06,
"loss": 0.8628,
"step": 2436
},
{
"epoch": 1.2385490121339178,
"grad_norm": 3.58115291595459,
"learning_rate": 8.96825880392696e-06,
"loss": 0.911,
"step": 2437
},
{
"epoch": 1.239057239057239,
"grad_norm": 3.199500799179077,
"learning_rate": 8.967235750745483e-06,
"loss": 0.864,
"step": 2438
},
{
"epoch": 1.2395654659805604,
"grad_norm": 3.2523953914642334,
"learning_rate": 8.966212249009675e-06,
"loss": 0.8658,
"step": 2439
},
{
"epoch": 1.2400736929038816,
"grad_norm": 3.1522932052612305,
"learning_rate": 8.96518829883526e-06,
"loss": 0.8349,
"step": 2440
},
{
"epoch": 1.2405819198272028,
"grad_norm": 3.049180030822754,
"learning_rate": 8.964163900338011e-06,
"loss": 0.8032,
"step": 2441
},
{
"epoch": 1.2410901467505242,
"grad_norm": 3.3346316814422607,
"learning_rate": 8.963139053633752e-06,
"loss": 0.8094,
"step": 2442
},
{
"epoch": 1.2415983736738454,
"grad_norm": 3.185328245162964,
"learning_rate": 8.962113758838356e-06,
"loss": 0.8299,
"step": 2443
},
{
"epoch": 1.2421066005971666,
"grad_norm": 2.983642101287842,
"learning_rate": 8.961088016067744e-06,
"loss": 0.8406,
"step": 2444
},
{
"epoch": 1.2426148275204878,
"grad_norm": 2.7408642768859863,
"learning_rate": 8.960061825437897e-06,
"loss": 0.682,
"step": 2445
},
{
"epoch": 1.2431230544438092,
"grad_norm": 3.0992236137390137,
"learning_rate": 8.95903518706484e-06,
"loss": 0.856,
"step": 2446
},
{
"epoch": 1.2436312813671304,
"grad_norm": 3.2850234508514404,
"learning_rate": 8.958008101064646e-06,
"loss": 0.8097,
"step": 2447
},
{
"epoch": 1.2441395082904516,
"grad_norm": 3.040407419204712,
"learning_rate": 8.956980567553443e-06,
"loss": 0.8335,
"step": 2448
},
{
"epoch": 1.244647735213773,
"grad_norm": 3.125934362411499,
"learning_rate": 8.955952586647414e-06,
"loss": 0.8421,
"step": 2449
},
{
"epoch": 1.2451559621370942,
"grad_norm": 3.215177536010742,
"learning_rate": 8.954924158462782e-06,
"loss": 0.8339,
"step": 2450
},
{
"epoch": 1.2456641890604154,
"grad_norm": 3.099355459213257,
"learning_rate": 8.953895283115825e-06,
"loss": 0.7777,
"step": 2451
},
{
"epoch": 1.2461724159837368,
"grad_norm": 2.988253116607666,
"learning_rate": 8.952865960722876e-06,
"loss": 0.8,
"step": 2452
},
{
"epoch": 1.246680642907058,
"grad_norm": 2.885324716567993,
"learning_rate": 8.951836191400316e-06,
"loss": 0.8199,
"step": 2453
},
{
"epoch": 1.2471888698303792,
"grad_norm": 3.369645357131958,
"learning_rate": 8.950805975264572e-06,
"loss": 0.8281,
"step": 2454
},
{
"epoch": 1.2476970967537004,
"grad_norm": 3.1595754623413086,
"learning_rate": 8.949775312432125e-06,
"loss": 0.8552,
"step": 2455
},
{
"epoch": 1.2482053236770219,
"grad_norm": 3.157674551010132,
"learning_rate": 8.94874420301951e-06,
"loss": 0.8398,
"step": 2456
},
{
"epoch": 1.248713550600343,
"grad_norm": 2.965175151824951,
"learning_rate": 8.947712647143308e-06,
"loss": 0.824,
"step": 2457
},
{
"epoch": 1.2492217775236643,
"grad_norm": 3.188775062561035,
"learning_rate": 8.946680644920148e-06,
"loss": 0.9177,
"step": 2458
},
{
"epoch": 1.2497300044469857,
"grad_norm": 3.110813856124878,
"learning_rate": 8.945648196466718e-06,
"loss": 0.8316,
"step": 2459
},
{
"epoch": 1.2502382313703069,
"grad_norm": 3.100200653076172,
"learning_rate": 8.944615301899749e-06,
"loss": 0.8408,
"step": 2460
},
{
"epoch": 1.250746458293628,
"grad_norm": 2.9803078174591064,
"learning_rate": 8.943581961336023e-06,
"loss": 0.8405,
"step": 2461
},
{
"epoch": 1.2512546852169493,
"grad_norm": 3.0053930282592773,
"learning_rate": 8.942548174892379e-06,
"loss": 0.8902,
"step": 2462
},
{
"epoch": 1.2517629121402707,
"grad_norm": 3.080328941345215,
"learning_rate": 8.941513942685698e-06,
"loss": 0.8324,
"step": 2463
},
{
"epoch": 1.2522711390635919,
"grad_norm": 3.199618101119995,
"learning_rate": 8.940479264832918e-06,
"loss": 0.787,
"step": 2464
},
{
"epoch": 1.252779365986913,
"grad_norm": 3.244206428527832,
"learning_rate": 8.93944414145102e-06,
"loss": 0.8252,
"step": 2465
},
{
"epoch": 1.2532875929102345,
"grad_norm": 3.08567476272583,
"learning_rate": 8.938408572657045e-06,
"loss": 0.8402,
"step": 2466
},
{
"epoch": 1.2537958198335557,
"grad_norm": 3.227609157562256,
"learning_rate": 8.937372558568078e-06,
"loss": 0.8494,
"step": 2467
},
{
"epoch": 1.254304046756877,
"grad_norm": 3.0492734909057617,
"learning_rate": 8.936336099301253e-06,
"loss": 0.9403,
"step": 2468
},
{
"epoch": 1.2548122736801983,
"grad_norm": 2.8660738468170166,
"learning_rate": 8.93529919497376e-06,
"loss": 0.7848,
"step": 2469
},
{
"epoch": 1.2553205006035195,
"grad_norm": 2.914168119430542,
"learning_rate": 8.934261845702835e-06,
"loss": 0.8184,
"step": 2470
},
{
"epoch": 1.2558287275268407,
"grad_norm": 3.180852174758911,
"learning_rate": 8.933224051605764e-06,
"loss": 0.85,
"step": 2471
},
{
"epoch": 1.2563369544501621,
"grad_norm": 3.4860987663269043,
"learning_rate": 8.932185812799888e-06,
"loss": 0.8416,
"step": 2472
},
{
"epoch": 1.2568451813734833,
"grad_norm": 3.155968427658081,
"learning_rate": 8.931147129402592e-06,
"loss": 0.8476,
"step": 2473
},
{
"epoch": 1.2573534082968045,
"grad_norm": 3.176732063293457,
"learning_rate": 8.930108001531318e-06,
"loss": 0.8863,
"step": 2474
},
{
"epoch": 1.2578616352201257,
"grad_norm": 3.208754301071167,
"learning_rate": 8.929068429303553e-06,
"loss": 0.8382,
"step": 2475
},
{
"epoch": 1.258369862143447,
"grad_norm": 3.254345655441284,
"learning_rate": 8.928028412836835e-06,
"loss": 0.8497,
"step": 2476
},
{
"epoch": 1.2588780890667683,
"grad_norm": 3.4679901599884033,
"learning_rate": 8.926987952248753e-06,
"loss": 0.8932,
"step": 2477
},
{
"epoch": 1.2593863159900895,
"grad_norm": 3.174726963043213,
"learning_rate": 8.925947047656949e-06,
"loss": 0.771,
"step": 2478
},
{
"epoch": 1.2598945429134107,
"grad_norm": 3.153735399246216,
"learning_rate": 8.92490569917911e-06,
"loss": 0.8844,
"step": 2479
},
{
"epoch": 1.2604027698367322,
"grad_norm": 3.165095329284668,
"learning_rate": 8.923863906932976e-06,
"loss": 0.781,
"step": 2480
},
{
"epoch": 1.2609109967600534,
"grad_norm": 3.09627628326416,
"learning_rate": 8.922821671036338e-06,
"loss": 0.8963,
"step": 2481
},
{
"epoch": 1.2614192236833746,
"grad_norm": 3.1823904514312744,
"learning_rate": 8.921778991607036e-06,
"loss": 0.8274,
"step": 2482
},
{
"epoch": 1.261927450606696,
"grad_norm": 3.225573778152466,
"learning_rate": 8.920735868762957e-06,
"loss": 0.8876,
"step": 2483
},
{
"epoch": 1.2624356775300172,
"grad_norm": 2.9334287643432617,
"learning_rate": 8.919692302622048e-06,
"loss": 0.7982,
"step": 2484
},
{
"epoch": 1.2629439044533384,
"grad_norm": 11.20725154876709,
"learning_rate": 8.918648293302293e-06,
"loss": 0.869,
"step": 2485
},
{
"epoch": 1.2634521313766598,
"grad_norm": 3.3023571968078613,
"learning_rate": 8.917603840921736e-06,
"loss": 0.8895,
"step": 2486
},
{
"epoch": 1.263960358299981,
"grad_norm": 2.789487838745117,
"learning_rate": 8.916558945598469e-06,
"loss": 0.8183,
"step": 2487
},
{
"epoch": 1.2644685852233022,
"grad_norm": 3.5704424381256104,
"learning_rate": 8.915513607450627e-06,
"loss": 0.9285,
"step": 2488
},
{
"epoch": 1.2649768121466234,
"grad_norm": 2.936912775039673,
"learning_rate": 8.914467826596408e-06,
"loss": 0.7793,
"step": 2489
},
{
"epoch": 1.2654850390699448,
"grad_norm": 3.02742338180542,
"learning_rate": 8.913421603154046e-06,
"loss": 0.8367,
"step": 2490
},
{
"epoch": 1.265993265993266,
"grad_norm": 3.056135416030884,
"learning_rate": 8.91237493724184e-06,
"loss": 0.8521,
"step": 2491
},
{
"epoch": 1.2665014929165872,
"grad_norm": 3.128657102584839,
"learning_rate": 8.911327828978123e-06,
"loss": 0.9025,
"step": 2492
},
{
"epoch": 1.2670097198399084,
"grad_norm": 2.892381191253662,
"learning_rate": 8.910280278481289e-06,
"loss": 0.7111,
"step": 2493
},
{
"epoch": 1.2675179467632298,
"grad_norm": 2.8076236248016357,
"learning_rate": 8.90923228586978e-06,
"loss": 0.8013,
"step": 2494
},
{
"epoch": 1.268026173686551,
"grad_norm": 3.046893835067749,
"learning_rate": 8.908183851262087e-06,
"loss": 0.8518,
"step": 2495
},
{
"epoch": 1.2685344006098722,
"grad_norm": 3.2953848838806152,
"learning_rate": 8.90713497477675e-06,
"loss": 0.8759,
"step": 2496
},
{
"epoch": 1.2690426275331936,
"grad_norm": 3.101726770401001,
"learning_rate": 8.906085656532361e-06,
"loss": 0.7667,
"step": 2497
},
{
"epoch": 1.2695508544565148,
"grad_norm": 3.3615872859954834,
"learning_rate": 8.905035896647561e-06,
"loss": 0.8447,
"step": 2498
},
{
"epoch": 1.270059081379836,
"grad_norm": 3.2602908611297607,
"learning_rate": 8.903985695241037e-06,
"loss": 0.8351,
"step": 2499
},
{
"epoch": 1.2705673083031574,
"grad_norm": 3.361398458480835,
"learning_rate": 8.902935052431534e-06,
"loss": 0.8394,
"step": 2500
},
{
"epoch": 1.2705673083031574,
"eval_loss": 1.2745423316955566,
"eval_runtime": 15.0612,
"eval_samples_per_second": 26.558,
"eval_steps_per_second": 3.32,
"step": 2500
},
{
"epoch": 1.2710755352264786,
"grad_norm": 3.303532838821411,
"learning_rate": 8.90188396833784e-06,
"loss": 0.9127,
"step": 2501
},
{
"epoch": 1.2715837621497998,
"grad_norm": 3.171142578125,
"learning_rate": 8.9008324430788e-06,
"loss": 0.8087,
"step": 2502
},
{
"epoch": 1.2720919890731213,
"grad_norm": 3.1894915103912354,
"learning_rate": 8.899780476773297e-06,
"loss": 0.9523,
"step": 2503
},
{
"epoch": 1.2726002159964425,
"grad_norm": 3.1396098136901855,
"learning_rate": 8.898728069540278e-06,
"loss": 0.8368,
"step": 2504
},
{
"epoch": 1.2731084429197637,
"grad_norm": 3.1784250736236572,
"learning_rate": 8.897675221498729e-06,
"loss": 0.7707,
"step": 2505
},
{
"epoch": 1.2736166698430849,
"grad_norm": 3.0713679790496826,
"learning_rate": 8.896621932767692e-06,
"loss": 0.8648,
"step": 2506
},
{
"epoch": 1.2741248967664063,
"grad_norm": 3.134429693222046,
"learning_rate": 8.895568203466256e-06,
"loss": 0.7814,
"step": 2507
},
{
"epoch": 1.2746331236897275,
"grad_norm": 3.5291848182678223,
"learning_rate": 8.894514033713562e-06,
"loss": 0.8768,
"step": 2508
},
{
"epoch": 1.2751413506130487,
"grad_norm": 3.3426871299743652,
"learning_rate": 8.893459423628797e-06,
"loss": 0.941,
"step": 2509
},
{
"epoch": 1.2756495775363699,
"grad_norm": 3.3209519386291504,
"learning_rate": 8.8924043733312e-06,
"loss": 0.9354,
"step": 2510
},
{
"epoch": 1.2761578044596913,
"grad_norm": 2.953981876373291,
"learning_rate": 8.891348882940063e-06,
"loss": 0.8667,
"step": 2511
},
{
"epoch": 1.2766660313830125,
"grad_norm": 3.2882747650146484,
"learning_rate": 8.890292952574723e-06,
"loss": 0.8203,
"step": 2512
},
{
"epoch": 1.2771742583063337,
"grad_norm": 3.161607027053833,
"learning_rate": 8.889236582354568e-06,
"loss": 0.8898,
"step": 2513
},
{
"epoch": 1.277682485229655,
"grad_norm": 3.209338426589966,
"learning_rate": 8.888179772399038e-06,
"loss": 0.8284,
"step": 2514
},
{
"epoch": 1.2781907121529763,
"grad_norm": 3.230221748352051,
"learning_rate": 8.887122522827617e-06,
"loss": 0.8283,
"step": 2515
},
{
"epoch": 1.2786989390762975,
"grad_norm": 3.2188124656677246,
"learning_rate": 8.886064833759847e-06,
"loss": 0.8498,
"step": 2516
},
{
"epoch": 1.279207165999619,
"grad_norm": 3.1550827026367188,
"learning_rate": 8.885006705315313e-06,
"loss": 0.8682,
"step": 2517
},
{
"epoch": 1.2797153929229401,
"grad_norm": 3.071791648864746,
"learning_rate": 8.883948137613651e-06,
"loss": 0.7674,
"step": 2518
},
{
"epoch": 1.2802236198462613,
"grad_norm": 2.99682354927063,
"learning_rate": 8.882889130774551e-06,
"loss": 0.8389,
"step": 2519
},
{
"epoch": 1.2807318467695827,
"grad_norm": 3.1506733894348145,
"learning_rate": 8.881829684917746e-06,
"loss": 0.8242,
"step": 2520
},
{
"epoch": 1.281240073692904,
"grad_norm": 2.9105701446533203,
"learning_rate": 8.880769800163025e-06,
"loss": 0.7714,
"step": 2521
},
{
"epoch": 1.2817483006162251,
"grad_norm": 3.228342294692993,
"learning_rate": 8.879709476630219e-06,
"loss": 0.8571,
"step": 2522
},
{
"epoch": 1.2822565275395463,
"grad_norm": 3.045037031173706,
"learning_rate": 8.878648714439217e-06,
"loss": 0.8537,
"step": 2523
},
{
"epoch": 1.2827647544628678,
"grad_norm": 3.146073579788208,
"learning_rate": 8.877587513709954e-06,
"loss": 0.8636,
"step": 2524
},
{
"epoch": 1.283272981386189,
"grad_norm": 3.209416627883911,
"learning_rate": 8.876525874562413e-06,
"loss": 0.8199,
"step": 2525
},
{
"epoch": 1.2837812083095101,
"grad_norm": 2.9850914478302,
"learning_rate": 8.875463797116627e-06,
"loss": 0.8699,
"step": 2526
},
{
"epoch": 1.2842894352328313,
"grad_norm": 3.307227849960327,
"learning_rate": 8.874401281492681e-06,
"loss": 0.8231,
"step": 2527
},
{
"epoch": 1.2847976621561528,
"grad_norm": 2.9989612102508545,
"learning_rate": 8.873338327810708e-06,
"loss": 0.787,
"step": 2528
},
{
"epoch": 1.285305889079474,
"grad_norm": 3.090996742248535,
"learning_rate": 8.872274936190888e-06,
"loss": 0.8456,
"step": 2529
},
{
"epoch": 1.2858141160027952,
"grad_norm": 3.1071884632110596,
"learning_rate": 8.871211106753457e-06,
"loss": 0.7524,
"step": 2530
},
{
"epoch": 1.2863223429261166,
"grad_norm": 3.232839822769165,
"learning_rate": 8.870146839618694e-06,
"loss": 0.8982,
"step": 2531
},
{
"epoch": 1.2868305698494378,
"grad_norm": 3.2980551719665527,
"learning_rate": 8.869082134906931e-06,
"loss": 0.8118,
"step": 2532
},
{
"epoch": 1.287338796772759,
"grad_norm": 3.268399715423584,
"learning_rate": 8.868016992738548e-06,
"loss": 0.803,
"step": 2533
},
{
"epoch": 1.2878470236960804,
"grad_norm": 3.322483539581299,
"learning_rate": 8.866951413233976e-06,
"loss": 0.9056,
"step": 2534
},
{
"epoch": 1.2883552506194016,
"grad_norm": 3.203437328338623,
"learning_rate": 8.865885396513693e-06,
"loss": 0.9368,
"step": 2535
},
{
"epoch": 1.2888634775427228,
"grad_norm": 2.9805757999420166,
"learning_rate": 8.864818942698228e-06,
"loss": 0.8216,
"step": 2536
},
{
"epoch": 1.2893717044660442,
"grad_norm": 2.8534796237945557,
"learning_rate": 8.86375205190816e-06,
"loss": 0.78,
"step": 2537
},
{
"epoch": 1.2898799313893654,
"grad_norm": 2.94832444190979,
"learning_rate": 8.862684724264118e-06,
"loss": 0.7969,
"step": 2538
},
{
"epoch": 1.2903881583126866,
"grad_norm": 2.9740326404571533,
"learning_rate": 8.861616959886774e-06,
"loss": 0.9288,
"step": 2539
},
{
"epoch": 1.2908963852360078,
"grad_norm": 3.0878005027770996,
"learning_rate": 8.86054875889686e-06,
"loss": 0.7948,
"step": 2540
},
{
"epoch": 1.291404612159329,
"grad_norm": 3.220125198364258,
"learning_rate": 8.859480121415152e-06,
"loss": 0.8302,
"step": 2541
},
{
"epoch": 1.2919128390826504,
"grad_norm": 3.5187385082244873,
"learning_rate": 8.85841104756247e-06,
"loss": 0.8091,
"step": 2542
},
{
"epoch": 1.2924210660059716,
"grad_norm": 3.397118330001831,
"learning_rate": 8.857341537459693e-06,
"loss": 0.8509,
"step": 2543
},
{
"epoch": 1.2929292929292928,
"grad_norm": 3.103379726409912,
"learning_rate": 8.856271591227743e-06,
"loss": 0.8122,
"step": 2544
},
{
"epoch": 1.2934375198526142,
"grad_norm": 3.081847667694092,
"learning_rate": 8.855201208987593e-06,
"loss": 0.9056,
"step": 2545
},
{
"epoch": 1.2939457467759354,
"grad_norm": 3.5324161052703857,
"learning_rate": 8.854130390860268e-06,
"loss": 0.8944,
"step": 2546
},
{
"epoch": 1.2944539736992566,
"grad_norm": 3.22245192527771,
"learning_rate": 8.853059136966835e-06,
"loss": 0.8114,
"step": 2547
},
{
"epoch": 1.294962200622578,
"grad_norm": 3.203016996383667,
"learning_rate": 8.851987447428419e-06,
"loss": 0.8688,
"step": 2548
},
{
"epoch": 1.2954704275458993,
"grad_norm": 3.2853200435638428,
"learning_rate": 8.850915322366187e-06,
"loss": 0.7993,
"step": 2549
},
{
"epoch": 1.2959786544692204,
"grad_norm": 2.8735644817352295,
"learning_rate": 8.849842761901363e-06,
"loss": 0.8585,
"step": 2550
},
{
"epoch": 1.2964868813925419,
"grad_norm": 3.0382649898529053,
"learning_rate": 8.848769766155212e-06,
"loss": 0.8293,
"step": 2551
},
{
"epoch": 1.296995108315863,
"grad_norm": 2.963172197341919,
"learning_rate": 8.847696335249055e-06,
"loss": 0.8423,
"step": 2552
},
{
"epoch": 1.2975033352391843,
"grad_norm": 3.24950909614563,
"learning_rate": 8.846622469304255e-06,
"loss": 0.7968,
"step": 2553
},
{
"epoch": 1.2980115621625057,
"grad_norm": 3.5385589599609375,
"learning_rate": 8.845548168442232e-06,
"loss": 0.9819,
"step": 2554
},
{
"epoch": 1.2985197890858269,
"grad_norm": 3.3161239624023438,
"learning_rate": 8.844473432784448e-06,
"loss": 0.8769,
"step": 2555
},
{
"epoch": 1.299028016009148,
"grad_norm": 3.361607074737549,
"learning_rate": 8.843398262452422e-06,
"loss": 0.873,
"step": 2556
},
{
"epoch": 1.2995362429324693,
"grad_norm": 2.9351627826690674,
"learning_rate": 8.842322657567715e-06,
"loss": 0.8579,
"step": 2557
},
{
"epoch": 1.3000444698557905,
"grad_norm": 2.9046859741210938,
"learning_rate": 8.84124661825194e-06,
"loss": 0.8712,
"step": 2558
},
{
"epoch": 1.300552696779112,
"grad_norm": 3.100588798522949,
"learning_rate": 8.840170144626761e-06,
"loss": 0.8623,
"step": 2559
},
{
"epoch": 1.301060923702433,
"grad_norm": 3.147078275680542,
"learning_rate": 8.839093236813887e-06,
"loss": 0.8377,
"step": 2560
},
{
"epoch": 1.3015691506257543,
"grad_norm": 3.067751884460449,
"learning_rate": 8.83801589493508e-06,
"loss": 0.8867,
"step": 2561
},
{
"epoch": 1.3020773775490757,
"grad_norm": 3.0106406211853027,
"learning_rate": 8.836938119112145e-06,
"loss": 0.8218,
"step": 2562
},
{
"epoch": 1.302585604472397,
"grad_norm": 2.999750852584839,
"learning_rate": 8.835859909466949e-06,
"loss": 0.8377,
"step": 2563
},
{
"epoch": 1.303093831395718,
"grad_norm": 3.097104072570801,
"learning_rate": 8.834781266121391e-06,
"loss": 0.7746,
"step": 2564
},
{
"epoch": 1.3036020583190395,
"grad_norm": 3.1769418716430664,
"learning_rate": 8.83370218919743e-06,
"loss": 0.937,
"step": 2565
},
{
"epoch": 1.3041102852423607,
"grad_norm": 2.8542466163635254,
"learning_rate": 8.832622678817074e-06,
"loss": 0.8561,
"step": 2566
},
{
"epoch": 1.304618512165682,
"grad_norm": 3.1751227378845215,
"learning_rate": 8.831542735102375e-06,
"loss": 0.791,
"step": 2567
},
{
"epoch": 1.3051267390890033,
"grad_norm": 3.0102860927581787,
"learning_rate": 8.830462358175438e-06,
"loss": 0.9021,
"step": 2568
},
{
"epoch": 1.3056349660123245,
"grad_norm": 3.0185563564300537,
"learning_rate": 8.829381548158414e-06,
"loss": 0.7755,
"step": 2569
},
{
"epoch": 1.3061431929356457,
"grad_norm": 2.9850900173187256,
"learning_rate": 8.828300305173506e-06,
"loss": 0.854,
"step": 2570
},
{
"epoch": 1.3066514198589672,
"grad_norm": 3.0586602687835693,
"learning_rate": 8.827218629342962e-06,
"loss": 0.7996,
"step": 2571
},
{
"epoch": 1.3071596467822884,
"grad_norm": 3.3865036964416504,
"learning_rate": 8.826136520789084e-06,
"loss": 0.7912,
"step": 2572
},
{
"epoch": 1.3076678737056096,
"grad_norm": 2.9162116050720215,
"learning_rate": 8.82505397963422e-06,
"loss": 0.8339,
"step": 2573
},
{
"epoch": 1.3081761006289307,
"grad_norm": 3.1088786125183105,
"learning_rate": 8.823971006000767e-06,
"loss": 0.9219,
"step": 2574
},
{
"epoch": 1.308684327552252,
"grad_norm": 3.166175365447998,
"learning_rate": 8.822887600011168e-06,
"loss": 0.9238,
"step": 2575
},
{
"epoch": 1.3091925544755734,
"grad_norm": 3.029024124145508,
"learning_rate": 8.821803761787923e-06,
"loss": 0.7947,
"step": 2576
},
{
"epoch": 1.3097007813988946,
"grad_norm": 3.238969087600708,
"learning_rate": 8.820719491453572e-06,
"loss": 0.9903,
"step": 2577
},
{
"epoch": 1.3102090083222158,
"grad_norm": 3.3764801025390625,
"learning_rate": 8.819634789130709e-06,
"loss": 0.9136,
"step": 2578
},
{
"epoch": 1.3107172352455372,
"grad_norm": 3.1779088973999023,
"learning_rate": 8.818549654941976e-06,
"loss": 0.8954,
"step": 2579
},
{
"epoch": 1.3112254621688584,
"grad_norm": 2.949017286300659,
"learning_rate": 8.817464089010064e-06,
"loss": 0.8774,
"step": 2580
},
{
"epoch": 1.3117336890921796,
"grad_norm": 3.089338541030884,
"learning_rate": 8.81637809145771e-06,
"loss": 0.7818,
"step": 2581
},
{
"epoch": 1.312241916015501,
"grad_norm": 3.3381898403167725,
"learning_rate": 8.815291662407704e-06,
"loss": 0.7747,
"step": 2582
},
{
"epoch": 1.3127501429388222,
"grad_norm": 3.0036628246307373,
"learning_rate": 8.814204801982882e-06,
"loss": 0.802,
"step": 2583
},
{
"epoch": 1.3132583698621434,
"grad_norm": 3.6632609367370605,
"learning_rate": 8.813117510306128e-06,
"loss": 0.796,
"step": 2584
},
{
"epoch": 1.3137665967854648,
"grad_norm": 3.659998893737793,
"learning_rate": 8.812029787500379e-06,
"loss": 0.8787,
"step": 2585
},
{
"epoch": 1.314274823708786,
"grad_norm": 3.202430248260498,
"learning_rate": 8.810941633688617e-06,
"loss": 0.8552,
"step": 2586
},
{
"epoch": 1.3147830506321072,
"grad_norm": 3.068216562271118,
"learning_rate": 8.809853048993873e-06,
"loss": 0.8298,
"step": 2587
},
{
"epoch": 1.3152912775554286,
"grad_norm": 3.2713656425476074,
"learning_rate": 8.80876403353923e-06,
"loss": 0.8764,
"step": 2588
},
{
"epoch": 1.3157995044787498,
"grad_norm": 3.147080898284912,
"learning_rate": 8.807674587447813e-06,
"loss": 0.846,
"step": 2589
},
{
"epoch": 1.316307731402071,
"grad_norm": 3.5714316368103027,
"learning_rate": 8.806584710842803e-06,
"loss": 0.9365,
"step": 2590
},
{
"epoch": 1.3168159583253922,
"grad_norm": 3.3361597061157227,
"learning_rate": 8.805494403847426e-06,
"loss": 0.7961,
"step": 2591
},
{
"epoch": 1.3173241852487134,
"grad_norm": 3.182502269744873,
"learning_rate": 8.804403666584958e-06,
"loss": 0.8503,
"step": 2592
},
{
"epoch": 1.3178324121720348,
"grad_norm": 3.635493755340576,
"learning_rate": 8.803312499178722e-06,
"loss": 0.8862,
"step": 2593
},
{
"epoch": 1.318340639095356,
"grad_norm": 2.8551406860351562,
"learning_rate": 8.80222090175209e-06,
"loss": 0.7413,
"step": 2594
},
{
"epoch": 1.3188488660186772,
"grad_norm": 3.0634207725524902,
"learning_rate": 8.801128874428482e-06,
"loss": 0.9011,
"step": 2595
},
{
"epoch": 1.3193570929419987,
"grad_norm": 3.162566900253296,
"learning_rate": 8.800036417331372e-06,
"loss": 0.8009,
"step": 2596
},
{
"epoch": 1.3198653198653199,
"grad_norm": 3.1202633380889893,
"learning_rate": 8.798943530584275e-06,
"loss": 0.8532,
"step": 2597
},
{
"epoch": 1.320373546788641,
"grad_norm": 3.2355780601501465,
"learning_rate": 8.797850214310756e-06,
"loss": 0.8975,
"step": 2598
},
{
"epoch": 1.3208817737119625,
"grad_norm": 3.200838565826416,
"learning_rate": 8.796756468634436e-06,
"loss": 0.8297,
"step": 2599
},
{
"epoch": 1.3213900006352837,
"grad_norm": 3.2080655097961426,
"learning_rate": 8.795662293678976e-06,
"loss": 0.83,
"step": 2600
},
{
"epoch": 1.3218982275586049,
"grad_norm": 3.2180094718933105,
"learning_rate": 8.794567689568088e-06,
"loss": 0.9397,
"step": 2601
},
{
"epoch": 1.3224064544819263,
"grad_norm": 3.111396074295044,
"learning_rate": 8.793472656425533e-06,
"loss": 0.8781,
"step": 2602
},
{
"epoch": 1.3229146814052475,
"grad_norm": 3.1451263427734375,
"learning_rate": 8.792377194375123e-06,
"loss": 0.839,
"step": 2603
},
{
"epoch": 1.3234229083285687,
"grad_norm": 3.002424955368042,
"learning_rate": 8.791281303540714e-06,
"loss": 0.8521,
"step": 2604
},
{
"epoch": 1.32393113525189,
"grad_norm": 2.9210152626037598,
"learning_rate": 8.790184984046212e-06,
"loss": 0.8843,
"step": 2605
},
{
"epoch": 1.3244393621752113,
"grad_norm": 3.1625709533691406,
"learning_rate": 8.789088236015576e-06,
"loss": 0.871,
"step": 2606
},
{
"epoch": 1.3249475890985325,
"grad_norm": 3.112685441970825,
"learning_rate": 8.787991059572803e-06,
"loss": 0.7916,
"step": 2607
},
{
"epoch": 1.3254558160218537,
"grad_norm": 3.3765015602111816,
"learning_rate": 8.786893454841949e-06,
"loss": 0.8464,
"step": 2608
},
{
"epoch": 1.325964042945175,
"grad_norm": 3.056694507598877,
"learning_rate": 8.785795421947116e-06,
"loss": 0.8172,
"step": 2609
},
{
"epoch": 1.3264722698684963,
"grad_norm": 3.156933546066284,
"learning_rate": 8.784696961012448e-06,
"loss": 0.8663,
"step": 2610
},
{
"epoch": 1.3269804967918175,
"grad_norm": 2.98030161857605,
"learning_rate": 8.783598072162147e-06,
"loss": 0.7714,
"step": 2611
},
{
"epoch": 1.3274887237151387,
"grad_norm": 3.092323064804077,
"learning_rate": 8.782498755520457e-06,
"loss": 0.7489,
"step": 2612
},
{
"epoch": 1.3279969506384601,
"grad_norm": 3.140317916870117,
"learning_rate": 8.78139901121167e-06,
"loss": 0.8019,
"step": 2613
},
{
"epoch": 1.3285051775617813,
"grad_norm": 3.0484914779663086,
"learning_rate": 8.780298839360129e-06,
"loss": 0.9009,
"step": 2614
},
{
"epoch": 1.3290134044851025,
"grad_norm": 3.2454006671905518,
"learning_rate": 8.779198240090225e-06,
"loss": 0.8669,
"step": 2615
},
{
"epoch": 1.329521631408424,
"grad_norm": 3.0834341049194336,
"learning_rate": 8.778097213526398e-06,
"loss": 0.804,
"step": 2616
},
{
"epoch": 1.3300298583317451,
"grad_norm": 3.589625597000122,
"learning_rate": 8.776995759793132e-06,
"loss": 0.8648,
"step": 2617
},
{
"epoch": 1.3305380852550663,
"grad_norm": 2.9998013973236084,
"learning_rate": 8.775893879014968e-06,
"loss": 0.7427,
"step": 2618
},
{
"epoch": 1.3310463121783878,
"grad_norm": 3.2124462127685547,
"learning_rate": 8.774791571316484e-06,
"loss": 0.863,
"step": 2619
},
{
"epoch": 1.331554539101709,
"grad_norm": 3.1781957149505615,
"learning_rate": 8.773688836822317e-06,
"loss": 0.8429,
"step": 2620
},
{
"epoch": 1.3320627660250302,
"grad_norm": 3.172304391860962,
"learning_rate": 8.772585675657144e-06,
"loss": 0.882,
"step": 2621
},
{
"epoch": 1.3325709929483516,
"grad_norm": 2.9271175861358643,
"learning_rate": 8.771482087945693e-06,
"loss": 0.7754,
"step": 2622
},
{
"epoch": 1.3330792198716728,
"grad_norm": 3.295121669769287,
"learning_rate": 8.770378073812745e-06,
"loss": 0.7888,
"step": 2623
},
{
"epoch": 1.333587446794994,
"grad_norm": 3.0873188972473145,
"learning_rate": 8.769273633383122e-06,
"loss": 0.7987,
"step": 2624
},
{
"epoch": 1.3340956737183152,
"grad_norm": 3.130263090133667,
"learning_rate": 8.768168766781698e-06,
"loss": 0.8407,
"step": 2625
},
{
"epoch": 1.3346039006416364,
"grad_norm": 3.202841281890869,
"learning_rate": 8.767063474133392e-06,
"loss": 0.7984,
"step": 2626
},
{
"epoch": 1.3351121275649578,
"grad_norm": 2.8878347873687744,
"learning_rate": 8.765957755563177e-06,
"loss": 0.7478,
"step": 2627
},
{
"epoch": 1.335620354488279,
"grad_norm": 3.223191261291504,
"learning_rate": 8.76485161119607e-06,
"loss": 0.7901,
"step": 2628
},
{
"epoch": 1.3361285814116002,
"grad_norm": 3.7308144569396973,
"learning_rate": 8.763745041157136e-06,
"loss": 0.931,
"step": 2629
},
{
"epoch": 1.3366368083349216,
"grad_norm": 3.2447123527526855,
"learning_rate": 8.76263804557149e-06,
"loss": 0.9182,
"step": 2630
},
{
"epoch": 1.3371450352582428,
"grad_norm": 3.1200344562530518,
"learning_rate": 8.761530624564292e-06,
"loss": 0.7992,
"step": 2631
},
{
"epoch": 1.337653262181564,
"grad_norm": 3.198173761367798,
"learning_rate": 8.760422778260753e-06,
"loss": 0.8869,
"step": 2632
},
{
"epoch": 1.3381614891048854,
"grad_norm": 3.0903890132904053,
"learning_rate": 8.759314506786134e-06,
"loss": 0.8946,
"step": 2633
},
{
"epoch": 1.3386697160282066,
"grad_norm": 3.123501777648926,
"learning_rate": 8.75820581026574e-06,
"loss": 0.8356,
"step": 2634
},
{
"epoch": 1.3391779429515278,
"grad_norm": 3.3818912506103516,
"learning_rate": 8.757096688824925e-06,
"loss": 0.8841,
"step": 2635
},
{
"epoch": 1.3396861698748492,
"grad_norm": 3.03412127494812,
"learning_rate": 8.75598714258909e-06,
"loss": 0.8452,
"step": 2636
},
{
"epoch": 1.3401943967981704,
"grad_norm": 3.1534507274627686,
"learning_rate": 8.754877171683685e-06,
"loss": 0.9732,
"step": 2637
},
{
"epoch": 1.3407026237214916,
"grad_norm": 3.1218719482421875,
"learning_rate": 8.753766776234213e-06,
"loss": 0.8408,
"step": 2638
},
{
"epoch": 1.341210850644813,
"grad_norm": 3.4161899089813232,
"learning_rate": 8.752655956366217e-06,
"loss": 0.9102,
"step": 2639
},
{
"epoch": 1.3417190775681342,
"grad_norm": 3.1156539916992188,
"learning_rate": 8.751544712205293e-06,
"loss": 0.8302,
"step": 2640
},
{
"epoch": 1.3422273044914554,
"grad_norm": 3.08512020111084,
"learning_rate": 8.750433043877083e-06,
"loss": 0.8262,
"step": 2641
},
{
"epoch": 1.3427355314147766,
"grad_norm": 3.0877416133880615,
"learning_rate": 8.749320951507276e-06,
"loss": 0.8799,
"step": 2642
},
{
"epoch": 1.3432437583380978,
"grad_norm": 3.131417751312256,
"learning_rate": 8.748208435221614e-06,
"loss": 0.8745,
"step": 2643
},
{
"epoch": 1.3437519852614193,
"grad_norm": 3.1524205207824707,
"learning_rate": 8.747095495145878e-06,
"loss": 0.9559,
"step": 2644
},
{
"epoch": 1.3442602121847405,
"grad_norm": 3.236327648162842,
"learning_rate": 8.745982131405908e-06,
"loss": 0.8704,
"step": 2645
},
{
"epoch": 1.3447684391080617,
"grad_norm": 3.1059675216674805,
"learning_rate": 8.744868344127583e-06,
"loss": 0.8759,
"step": 2646
},
{
"epoch": 1.345276666031383,
"grad_norm": 3.2322580814361572,
"learning_rate": 8.743754133436832e-06,
"loss": 0.8551,
"step": 2647
},
{
"epoch": 1.3457848929547043,
"grad_norm": 3.067265510559082,
"learning_rate": 8.742639499459637e-06,
"loss": 0.8704,
"step": 2648
},
{
"epoch": 1.3462931198780255,
"grad_norm": 3.043553590774536,
"learning_rate": 8.74152444232202e-06,
"loss": 0.8944,
"step": 2649
},
{
"epoch": 1.3468013468013469,
"grad_norm": 2.9999492168426514,
"learning_rate": 8.740408962150055e-06,
"loss": 0.852,
"step": 2650
},
{
"epoch": 1.347309573724668,
"grad_norm": 3.1530864238739014,
"learning_rate": 8.739293059069864e-06,
"loss": 0.8197,
"step": 2651
},
{
"epoch": 1.3478178006479893,
"grad_norm": 3.741610527038574,
"learning_rate": 8.738176733207618e-06,
"loss": 0.8789,
"step": 2652
},
{
"epoch": 1.3483260275713107,
"grad_norm": 3.1385812759399414,
"learning_rate": 8.73705998468953e-06,
"loss": 0.8479,
"step": 2653
},
{
"epoch": 1.348834254494632,
"grad_norm": 3.3255321979522705,
"learning_rate": 8.735942813641869e-06,
"loss": 0.7281,
"step": 2654
},
{
"epoch": 1.349342481417953,
"grad_norm": 3.0691211223602295,
"learning_rate": 8.734825220190946e-06,
"loss": 0.8329,
"step": 2655
},
{
"epoch": 1.3498507083412743,
"grad_norm": 3.088752269744873,
"learning_rate": 8.733707204463121e-06,
"loss": 0.7821,
"step": 2656
},
{
"epoch": 1.3503589352645957,
"grad_norm": 3.136718511581421,
"learning_rate": 8.732588766584803e-06,
"loss": 0.9038,
"step": 2657
},
{
"epoch": 1.350867162187917,
"grad_norm": 2.992579460144043,
"learning_rate": 8.731469906682445e-06,
"loss": 0.8415,
"step": 2658
},
{
"epoch": 1.3513753891112381,
"grad_norm": 3.259535312652588,
"learning_rate": 8.730350624882557e-06,
"loss": 0.9561,
"step": 2659
},
{
"epoch": 1.3518836160345593,
"grad_norm": 3.0274555683135986,
"learning_rate": 8.729230921311682e-06,
"loss": 0.8513,
"step": 2660
},
{
"epoch": 1.3523918429578807,
"grad_norm": 3.5799143314361572,
"learning_rate": 8.728110796096426e-06,
"loss": 0.844,
"step": 2661
},
{
"epoch": 1.352900069881202,
"grad_norm": 3.2173969745635986,
"learning_rate": 8.726990249363432e-06,
"loss": 0.8714,
"step": 2662
},
{
"epoch": 1.3534082968045231,
"grad_norm": 3.0594699382781982,
"learning_rate": 8.725869281239395e-06,
"loss": 0.9004,
"step": 2663
},
{
"epoch": 1.3539165237278445,
"grad_norm": 2.9932353496551514,
"learning_rate": 8.724747891851055e-06,
"loss": 0.7776,
"step": 2664
},
{
"epoch": 1.3544247506511657,
"grad_norm": 3.293879270553589,
"learning_rate": 8.723626081325205e-06,
"loss": 0.8032,
"step": 2665
},
{
"epoch": 1.354932977574487,
"grad_norm": 3.299185037612915,
"learning_rate": 8.722503849788679e-06,
"loss": 0.9281,
"step": 2666
},
{
"epoch": 1.3554412044978084,
"grad_norm": 3.27127742767334,
"learning_rate": 8.721381197368366e-06,
"loss": 0.8855,
"step": 2667
},
{
"epoch": 1.3559494314211296,
"grad_norm": 3.081345319747925,
"learning_rate": 8.720258124191195e-06,
"loss": 0.8455,
"step": 2668
},
{
"epoch": 1.3564576583444508,
"grad_norm": 3.182535409927368,
"learning_rate": 8.719134630384144e-06,
"loss": 0.8738,
"step": 2669
},
{
"epoch": 1.3569658852677722,
"grad_norm": 3.1837494373321533,
"learning_rate": 8.718010716074246e-06,
"loss": 0.8641,
"step": 2670
},
{
"epoch": 1.3574741121910934,
"grad_norm": 3.0172135829925537,
"learning_rate": 8.716886381388573e-06,
"loss": 0.8186,
"step": 2671
},
{
"epoch": 1.3579823391144146,
"grad_norm": 3.1252171993255615,
"learning_rate": 8.715761626454248e-06,
"loss": 0.8675,
"step": 2672
},
{
"epoch": 1.3584905660377358,
"grad_norm": 3.1834468841552734,
"learning_rate": 8.71463645139844e-06,
"loss": 0.912,
"step": 2673
},
{
"epoch": 1.3589987929610572,
"grad_norm": 3.274007797241211,
"learning_rate": 8.713510856348368e-06,
"loss": 0.8753,
"step": 2674
},
{
"epoch": 1.3595070198843784,
"grad_norm": 3.550733804702759,
"learning_rate": 8.712384841431296e-06,
"loss": 0.8694,
"step": 2675
},
{
"epoch": 1.3600152468076996,
"grad_norm": 3.228518486022949,
"learning_rate": 8.711258406774536e-06,
"loss": 0.8589,
"step": 2676
},
{
"epoch": 1.3605234737310208,
"grad_norm": 3.438473701477051,
"learning_rate": 8.71013155250545e-06,
"loss": 0.8953,
"step": 2677
},
{
"epoch": 1.3610317006543422,
"grad_norm": 3.2976551055908203,
"learning_rate": 8.709004278751445e-06,
"loss": 0.8868,
"step": 2678
},
{
"epoch": 1.3615399275776634,
"grad_norm": 3.1462578773498535,
"learning_rate": 8.707876585639977e-06,
"loss": 0.8054,
"step": 2679
},
{
"epoch": 1.3620481545009846,
"grad_norm": 2.89199161529541,
"learning_rate": 8.706748473298544e-06,
"loss": 0.7397,
"step": 2680
},
{
"epoch": 1.362556381424306,
"grad_norm": 3.5015709400177,
"learning_rate": 8.705619941854698e-06,
"loss": 0.8578,
"step": 2681
},
{
"epoch": 1.3630646083476272,
"grad_norm": 3.020496368408203,
"learning_rate": 8.70449099143604e-06,
"loss": 0.8183,
"step": 2682
},
{
"epoch": 1.3635728352709484,
"grad_norm": 3.3509302139282227,
"learning_rate": 8.703361622170205e-06,
"loss": 0.7856,
"step": 2683
},
{
"epoch": 1.3640810621942698,
"grad_norm": 3.096768379211426,
"learning_rate": 8.702231834184895e-06,
"loss": 0.9488,
"step": 2684
},
{
"epoch": 1.364589289117591,
"grad_norm": 3.023076295852661,
"learning_rate": 8.701101627607844e-06,
"loss": 0.8422,
"step": 2685
},
{
"epoch": 1.3650975160409122,
"grad_norm": 4.890537738800049,
"learning_rate": 8.699971002566839e-06,
"loss": 0.838,
"step": 2686
},
{
"epoch": 1.3656057429642336,
"grad_norm": 3.220949172973633,
"learning_rate": 8.698839959189714e-06,
"loss": 0.8532,
"step": 2687
},
{
"epoch": 1.3661139698875548,
"grad_norm": 2.687530994415283,
"learning_rate": 8.697708497604352e-06,
"loss": 0.7821,
"step": 2688
},
{
"epoch": 1.366622196810876,
"grad_norm": 3.0187814235687256,
"learning_rate": 8.696576617938677e-06,
"loss": 0.9102,
"step": 2689
},
{
"epoch": 1.3671304237341972,
"grad_norm": 3.226120948791504,
"learning_rate": 8.695444320320668e-06,
"loss": 0.8591,
"step": 2690
},
{
"epoch": 1.3676386506575187,
"grad_norm": 3.4441635608673096,
"learning_rate": 8.694311604878346e-06,
"loss": 0.9067,
"step": 2691
},
{
"epoch": 1.3681468775808399,
"grad_norm": 3.1548378467559814,
"learning_rate": 8.693178471739782e-06,
"loss": 0.7731,
"step": 2692
},
{
"epoch": 1.368655104504161,
"grad_norm": 2.9003067016601562,
"learning_rate": 8.692044921033096e-06,
"loss": 0.7738,
"step": 2693
},
{
"epoch": 1.3691633314274823,
"grad_norm": 3.099714756011963,
"learning_rate": 8.690910952886449e-06,
"loss": 0.7917,
"step": 2694
},
{
"epoch": 1.3696715583508037,
"grad_norm": 3.210352897644043,
"learning_rate": 8.689776567428053e-06,
"loss": 0.8826,
"step": 2695
},
{
"epoch": 1.3701797852741249,
"grad_norm": 3.1537983417510986,
"learning_rate": 8.688641764786167e-06,
"loss": 0.8355,
"step": 2696
},
{
"epoch": 1.370688012197446,
"grad_norm": 3.399169683456421,
"learning_rate": 8.6875065450891e-06,
"loss": 0.9821,
"step": 2697
},
{
"epoch": 1.3711962391207675,
"grad_norm": 3.2011547088623047,
"learning_rate": 8.686370908465204e-06,
"loss": 0.8729,
"step": 2698
},
{
"epoch": 1.3717044660440887,
"grad_norm": 3.188690185546875,
"learning_rate": 8.685234855042876e-06,
"loss": 0.8369,
"step": 2699
},
{
"epoch": 1.3722126929674099,
"grad_norm": 4.217759132385254,
"learning_rate": 8.684098384950567e-06,
"loss": 0.8288,
"step": 2700
},
{
"epoch": 1.3727209198907313,
"grad_norm": 3.447901964187622,
"learning_rate": 8.682961498316772e-06,
"loss": 0.8944,
"step": 2701
},
{
"epoch": 1.3732291468140525,
"grad_norm": 2.8357911109924316,
"learning_rate": 8.68182419527003e-06,
"loss": 0.8125,
"step": 2702
},
{
"epoch": 1.3737373737373737,
"grad_norm": 2.925048828125,
"learning_rate": 8.680686475938933e-06,
"loss": 0.7786,
"step": 2703
},
{
"epoch": 1.3742456006606951,
"grad_norm": 3.1883702278137207,
"learning_rate": 8.679548340452115e-06,
"loss": 0.7921,
"step": 2704
},
{
"epoch": 1.3747538275840163,
"grad_norm": 3.2614142894744873,
"learning_rate": 8.678409788938259e-06,
"loss": 0.8351,
"step": 2705
},
{
"epoch": 1.3752620545073375,
"grad_norm": 3.193164825439453,
"learning_rate": 8.677270821526095e-06,
"loss": 0.7844,
"step": 2706
},
{
"epoch": 1.3757702814306587,
"grad_norm": 3.2156474590301514,
"learning_rate": 8.6761314383444e-06,
"loss": 0.8201,
"step": 2707
},
{
"epoch": 1.3762785083539801,
"grad_norm": 2.989922523498535,
"learning_rate": 8.674991639521997e-06,
"loss": 0.8055,
"step": 2708
},
{
"epoch": 1.3767867352773013,
"grad_norm": 3.1420819759368896,
"learning_rate": 8.673851425187762e-06,
"loss": 0.9387,
"step": 2709
},
{
"epoch": 1.3772949622006225,
"grad_norm": 2.995516061782837,
"learning_rate": 8.672710795470606e-06,
"loss": 0.8184,
"step": 2710
},
{
"epoch": 1.3778031891239437,
"grad_norm": 3.6818063259124756,
"learning_rate": 8.6715697504995e-06,
"loss": 0.9301,
"step": 2711
},
{
"epoch": 1.3783114160472651,
"grad_norm": 3.0470900535583496,
"learning_rate": 8.67042829040345e-06,
"loss": 0.8822,
"step": 2712
},
{
"epoch": 1.3788196429705863,
"grad_norm": 3.0707991123199463,
"learning_rate": 8.66928641531152e-06,
"loss": 0.8192,
"step": 2713
},
{
"epoch": 1.3793278698939075,
"grad_norm": 3.1534693241119385,
"learning_rate": 8.668144125352814e-06,
"loss": 0.7877,
"step": 2714
},
{
"epoch": 1.379836096817229,
"grad_norm": 3.1589243412017822,
"learning_rate": 8.667001420656482e-06,
"loss": 0.8504,
"step": 2715
},
{
"epoch": 1.3803443237405502,
"grad_norm": 3.279162645339966,
"learning_rate": 8.665858301351728e-06,
"loss": 0.9218,
"step": 2716
},
{
"epoch": 1.3808525506638714,
"grad_norm": 3.084298610687256,
"learning_rate": 8.664714767567796e-06,
"loss": 0.8225,
"step": 2717
},
{
"epoch": 1.3813607775871928,
"grad_norm": 3.2460992336273193,
"learning_rate": 8.66357081943398e-06,
"loss": 0.8463,
"step": 2718
},
{
"epoch": 1.381869004510514,
"grad_norm": 3.2598676681518555,
"learning_rate": 8.662426457079622e-06,
"loss": 0.9005,
"step": 2719
},
{
"epoch": 1.3823772314338352,
"grad_norm": 3.0160598754882812,
"learning_rate": 8.661281680634103e-06,
"loss": 0.8236,
"step": 2720
},
{
"epoch": 1.3828854583571566,
"grad_norm": 3.1025872230529785,
"learning_rate": 8.660136490226863e-06,
"loss": 0.8245,
"step": 2721
},
{
"epoch": 1.3833936852804778,
"grad_norm": 3.3537919521331787,
"learning_rate": 8.65899088598738e-06,
"loss": 0.9065,
"step": 2722
},
{
"epoch": 1.383901912203799,
"grad_norm": 3.2307286262512207,
"learning_rate": 8.657844868045182e-06,
"loss": 0.7384,
"step": 2723
},
{
"epoch": 1.3844101391271202,
"grad_norm": 3.2937235832214355,
"learning_rate": 8.656698436529843e-06,
"loss": 0.8946,
"step": 2724
},
{
"epoch": 1.3849183660504414,
"grad_norm": 3.5228772163391113,
"learning_rate": 8.655551591570983e-06,
"loss": 0.97,
"step": 2725
},
{
"epoch": 1.3854265929737628,
"grad_norm": 3.1984856128692627,
"learning_rate": 8.65440433329827e-06,
"loss": 0.8,
"step": 2726
},
{
"epoch": 1.385934819897084,
"grad_norm": 3.3704750537872314,
"learning_rate": 8.65325666184142e-06,
"loss": 0.9496,
"step": 2727
},
{
"epoch": 1.3864430468204052,
"grad_norm": 3.2403101921081543,
"learning_rate": 8.652108577330194e-06,
"loss": 0.7782,
"step": 2728
},
{
"epoch": 1.3869512737437266,
"grad_norm": 3.0873589515686035,
"learning_rate": 8.650960079894397e-06,
"loss": 0.7821,
"step": 2729
},
{
"epoch": 1.3874595006670478,
"grad_norm": 3.159641742706299,
"learning_rate": 8.649811169663886e-06,
"loss": 0.8486,
"step": 2730
},
{
"epoch": 1.387967727590369,
"grad_norm": 3.6541502475738525,
"learning_rate": 8.648661846768562e-06,
"loss": 0.8905,
"step": 2731
},
{
"epoch": 1.3884759545136904,
"grad_norm": 2.725341558456421,
"learning_rate": 8.647512111338374e-06,
"loss": 0.7955,
"step": 2732
},
{
"epoch": 1.3889841814370116,
"grad_norm": 3.1985182762145996,
"learning_rate": 8.646361963503312e-06,
"loss": 0.7561,
"step": 2733
},
{
"epoch": 1.3894924083603328,
"grad_norm": 2.953597068786621,
"learning_rate": 8.645211403393422e-06,
"loss": 0.9021,
"step": 2734
},
{
"epoch": 1.3900006352836543,
"grad_norm": 3.17386794090271,
"learning_rate": 8.644060431138789e-06,
"loss": 0.8701,
"step": 2735
},
{
"epoch": 1.3905088622069754,
"grad_norm": 3.1918575763702393,
"learning_rate": 8.64290904686955e-06,
"loss": 0.7802,
"step": 2736
},
{
"epoch": 1.3910170891302966,
"grad_norm": 3.179152488708496,
"learning_rate": 8.64175725071588e-06,
"loss": 0.826,
"step": 2737
},
{
"epoch": 1.391525316053618,
"grad_norm": 3.167999505996704,
"learning_rate": 8.640605042808015e-06,
"loss": 0.9195,
"step": 2738
},
{
"epoch": 1.3920335429769393,
"grad_norm": 3.178011655807495,
"learning_rate": 8.639452423276222e-06,
"loss": 0.8234,
"step": 2739
},
{
"epoch": 1.3925417699002605,
"grad_norm": 3.097113609313965,
"learning_rate": 8.638299392250825e-06,
"loss": 0.8382,
"step": 2740
},
{
"epoch": 1.3930499968235817,
"grad_norm": 2.9893417358398438,
"learning_rate": 8.63714594986219e-06,
"loss": 0.822,
"step": 2741
},
{
"epoch": 1.3935582237469029,
"grad_norm": 3.445077419281006,
"learning_rate": 8.63599209624073e-06,
"loss": 0.8855,
"step": 2742
},
{
"epoch": 1.3940664506702243,
"grad_norm": 3.340830087661743,
"learning_rate": 8.634837831516908e-06,
"loss": 0.8562,
"step": 2743
},
{
"epoch": 1.3945746775935455,
"grad_norm": 3.0364067554473877,
"learning_rate": 8.633683155821228e-06,
"loss": 0.836,
"step": 2744
},
{
"epoch": 1.3950829045168667,
"grad_norm": 3.1018741130828857,
"learning_rate": 8.632528069284243e-06,
"loss": 0.8154,
"step": 2745
},
{
"epoch": 1.395591131440188,
"grad_norm": 3.1715431213378906,
"learning_rate": 8.631372572036554e-06,
"loss": 0.9054,
"step": 2746
},
{
"epoch": 1.3960993583635093,
"grad_norm": 3.1135804653167725,
"learning_rate": 8.630216664208807e-06,
"loss": 0.7402,
"step": 2747
},
{
"epoch": 1.3966075852868305,
"grad_norm": 3.0619115829467773,
"learning_rate": 8.629060345931692e-06,
"loss": 0.8012,
"step": 2748
},
{
"epoch": 1.397115812210152,
"grad_norm": 3.196671962738037,
"learning_rate": 8.62790361733595e-06,
"loss": 1.0199,
"step": 2749
},
{
"epoch": 1.397624039133473,
"grad_norm": 3.023580312728882,
"learning_rate": 8.626746478552364e-06,
"loss": 0.8694,
"step": 2750
},
{
"epoch": 1.3981322660567943,
"grad_norm": 3.1226820945739746,
"learning_rate": 8.625588929711769e-06,
"loss": 0.8368,
"step": 2751
},
{
"epoch": 1.3986404929801157,
"grad_norm": 3.6180248260498047,
"learning_rate": 8.624430970945042e-06,
"loss": 0.8729,
"step": 2752
},
{
"epoch": 1.399148719903437,
"grad_norm": 3.0566389560699463,
"learning_rate": 8.623272602383104e-06,
"loss": 0.8592,
"step": 2753
},
{
"epoch": 1.3996569468267581,
"grad_norm": 2.938758373260498,
"learning_rate": 8.622113824156927e-06,
"loss": 0.7979,
"step": 2754
},
{
"epoch": 1.4001651737500795,
"grad_norm": 3.0424911975860596,
"learning_rate": 8.62095463639753e-06,
"loss": 0.8087,
"step": 2755
},
{
"epoch": 1.4006734006734007,
"grad_norm": 3.3442065715789795,
"learning_rate": 8.619795039235977e-06,
"loss": 0.8459,
"step": 2756
},
{
"epoch": 1.401181627596722,
"grad_norm": 3.2160093784332275,
"learning_rate": 8.618635032803373e-06,
"loss": 0.9036,
"step": 2757
},
{
"epoch": 1.4016898545200431,
"grad_norm": 3.39898681640625,
"learning_rate": 8.617474617230876e-06,
"loss": 0.9047,
"step": 2758
},
{
"epoch": 1.4021980814433643,
"grad_norm": 2.9836056232452393,
"learning_rate": 8.61631379264969e-06,
"loss": 0.8554,
"step": 2759
},
{
"epoch": 1.4027063083666858,
"grad_norm": 3.0101606845855713,
"learning_rate": 8.61515255919106e-06,
"loss": 0.8432,
"step": 2760
},
{
"epoch": 1.403214535290007,
"grad_norm": 3.043668270111084,
"learning_rate": 8.613990916986283e-06,
"loss": 0.8153,
"step": 2761
},
{
"epoch": 1.4037227622133281,
"grad_norm": 3.441566228866577,
"learning_rate": 8.6128288661667e-06,
"loss": 0.9139,
"step": 2762
},
{
"epoch": 1.4042309891366496,
"grad_norm": 3.1094048023223877,
"learning_rate": 8.611666406863695e-06,
"loss": 0.8962,
"step": 2763
},
{
"epoch": 1.4047392160599708,
"grad_norm": 3.3947198390960693,
"learning_rate": 8.610503539208704e-06,
"loss": 0.8963,
"step": 2764
},
{
"epoch": 1.405247442983292,
"grad_norm": 3.0119621753692627,
"learning_rate": 8.609340263333204e-06,
"loss": 0.7885,
"step": 2765
},
{
"epoch": 1.4057556699066134,
"grad_norm": 3.0325357913970947,
"learning_rate": 8.608176579368721e-06,
"loss": 0.8552,
"step": 2766
},
{
"epoch": 1.4062638968299346,
"grad_norm": 3.492356300354004,
"learning_rate": 8.60701248744683e-06,
"loss": 0.8615,
"step": 2767
},
{
"epoch": 1.4067721237532558,
"grad_norm": 3.209897756576538,
"learning_rate": 8.605847987699143e-06,
"loss": 0.8475,
"step": 2768
},
{
"epoch": 1.4072803506765772,
"grad_norm": 3.118128538131714,
"learning_rate": 8.604683080257328e-06,
"loss": 0.8113,
"step": 2769
},
{
"epoch": 1.4077885775998984,
"grad_norm": 3.1163711547851562,
"learning_rate": 8.603517765253093e-06,
"loss": 0.9601,
"step": 2770
},
{
"epoch": 1.4082968045232196,
"grad_norm": 3.1078336238861084,
"learning_rate": 8.602352042818196e-06,
"loss": 0.7957,
"step": 2771
},
{
"epoch": 1.408805031446541,
"grad_norm": 3.149662494659424,
"learning_rate": 8.601185913084435e-06,
"loss": 0.8792,
"step": 2772
},
{
"epoch": 1.4093132583698622,
"grad_norm": 2.814724922180176,
"learning_rate": 8.600019376183664e-06,
"loss": 0.8117,
"step": 2773
},
{
"epoch": 1.4098214852931834,
"grad_norm": 3.325305938720703,
"learning_rate": 8.598852432247773e-06,
"loss": 0.9079,
"step": 2774
},
{
"epoch": 1.4103297122165046,
"grad_norm": 3.1834630966186523,
"learning_rate": 8.597685081408702e-06,
"loss": 0.7996,
"step": 2775
},
{
"epoch": 1.4108379391398258,
"grad_norm": 3.0160608291625977,
"learning_rate": 8.596517323798439e-06,
"loss": 0.8563,
"step": 2776
},
{
"epoch": 1.4113461660631472,
"grad_norm": 3.034503936767578,
"learning_rate": 8.595349159549014e-06,
"loss": 0.8282,
"step": 2777
},
{
"epoch": 1.4118543929864684,
"grad_norm": 3.2270278930664062,
"learning_rate": 8.594180588792509e-06,
"loss": 0.8111,
"step": 2778
},
{
"epoch": 1.4123626199097896,
"grad_norm": 3.277219772338867,
"learning_rate": 8.593011611661044e-06,
"loss": 0.7967,
"step": 2779
},
{
"epoch": 1.412870846833111,
"grad_norm": 3.335444211959839,
"learning_rate": 8.59184222828679e-06,
"loss": 0.8529,
"step": 2780
},
{
"epoch": 1.4133790737564322,
"grad_norm": 3.420228958129883,
"learning_rate": 8.590672438801966e-06,
"loss": 0.9701,
"step": 2781
},
{
"epoch": 1.4138873006797534,
"grad_norm": 3.2469561100006104,
"learning_rate": 8.58950224333883e-06,
"loss": 0.8626,
"step": 2782
},
{
"epoch": 1.4143955276030749,
"grad_norm": 3.1776680946350098,
"learning_rate": 8.588331642029693e-06,
"loss": 0.9284,
"step": 2783
},
{
"epoch": 1.414903754526396,
"grad_norm": 3.105638027191162,
"learning_rate": 8.587160635006906e-06,
"loss": 0.8902,
"step": 2784
},
{
"epoch": 1.4154119814497172,
"grad_norm": 3.259697675704956,
"learning_rate": 8.585989222402871e-06,
"loss": 0.814,
"step": 2785
},
{
"epoch": 1.4159202083730387,
"grad_norm": 2.953216791152954,
"learning_rate": 8.58481740435003e-06,
"loss": 0.7898,
"step": 2786
},
{
"epoch": 1.4164284352963599,
"grad_norm": 3.1166532039642334,
"learning_rate": 8.583645180980878e-06,
"loss": 0.7499,
"step": 2787
},
{
"epoch": 1.416936662219681,
"grad_norm": 3.0191895961761475,
"learning_rate": 8.582472552427949e-06,
"loss": 0.7992,
"step": 2788
},
{
"epoch": 1.4174448891430025,
"grad_norm": 3.2020316123962402,
"learning_rate": 8.581299518823829e-06,
"loss": 0.7971,
"step": 2789
},
{
"epoch": 1.4179531160663237,
"grad_norm": 3.126887083053589,
"learning_rate": 8.580126080301143e-06,
"loss": 0.7992,
"step": 2790
},
{
"epoch": 1.4184613429896449,
"grad_norm": 3.4426639080047607,
"learning_rate": 8.578952236992569e-06,
"loss": 0.9443,
"step": 2791
},
{
"epoch": 1.418969569912966,
"grad_norm": 3.0545034408569336,
"learning_rate": 8.577777989030826e-06,
"loss": 0.7823,
"step": 2792
},
{
"epoch": 1.4194777968362873,
"grad_norm": 3.326939821243286,
"learning_rate": 8.576603336548679e-06,
"loss": 0.8822,
"step": 2793
},
{
"epoch": 1.4199860237596087,
"grad_norm": 3.2515408992767334,
"learning_rate": 8.575428279678942e-06,
"loss": 0.9458,
"step": 2794
},
{
"epoch": 1.42049425068293,
"grad_norm": 3.2859838008880615,
"learning_rate": 8.574252818554469e-06,
"loss": 0.8204,
"step": 2795
},
{
"epoch": 1.421002477606251,
"grad_norm": 3.3892626762390137,
"learning_rate": 8.573076953308164e-06,
"loss": 0.9016,
"step": 2796
},
{
"epoch": 1.4215107045295725,
"grad_norm": 3.129750967025757,
"learning_rate": 8.57190068407298e-06,
"loss": 0.7464,
"step": 2797
},
{
"epoch": 1.4220189314528937,
"grad_norm": 3.18557071685791,
"learning_rate": 8.570724010981907e-06,
"loss": 0.8757,
"step": 2798
},
{
"epoch": 1.422527158376215,
"grad_norm": 3.095346450805664,
"learning_rate": 8.569546934167986e-06,
"loss": 0.7698,
"step": 2799
},
{
"epoch": 1.4230353852995363,
"grad_norm": 3.1986424922943115,
"learning_rate": 8.568369453764304e-06,
"loss": 0.8281,
"step": 2800
},
{
"epoch": 1.4235436122228575,
"grad_norm": 3.0349645614624023,
"learning_rate": 8.567191569903993e-06,
"loss": 0.8225,
"step": 2801
},
{
"epoch": 1.4240518391461787,
"grad_norm": 3.03617000579834,
"learning_rate": 8.566013282720227e-06,
"loss": 0.8585,
"step": 2802
},
{
"epoch": 1.4245600660695001,
"grad_norm": 2.9680211544036865,
"learning_rate": 8.564834592346235e-06,
"loss": 0.7789,
"step": 2803
},
{
"epoch": 1.4250682929928213,
"grad_norm": 2.939490795135498,
"learning_rate": 8.563655498915277e-06,
"loss": 0.8843,
"step": 2804
},
{
"epoch": 1.4255765199161425,
"grad_norm": 3.2486467361450195,
"learning_rate": 8.562476002560671e-06,
"loss": 0.8049,
"step": 2805
},
{
"epoch": 1.426084746839464,
"grad_norm": 2.8949148654937744,
"learning_rate": 8.561296103415777e-06,
"loss": 0.7904,
"step": 2806
},
{
"epoch": 1.4265929737627852,
"grad_norm": 3.06335711479187,
"learning_rate": 8.560115801614e-06,
"loss": 0.8296,
"step": 2807
},
{
"epoch": 1.4271012006861064,
"grad_norm": 3.0824975967407227,
"learning_rate": 8.55893509728879e-06,
"loss": 0.8573,
"step": 2808
},
{
"epoch": 1.4276094276094276,
"grad_norm": 3.0061516761779785,
"learning_rate": 8.557753990573642e-06,
"loss": 0.7923,
"step": 2809
},
{
"epoch": 1.4281176545327487,
"grad_norm": 3.269150495529175,
"learning_rate": 8.556572481602097e-06,
"loss": 0.939,
"step": 2810
},
{
"epoch": 1.4286258814560702,
"grad_norm": 3.064577102661133,
"learning_rate": 8.555390570507746e-06,
"loss": 0.8354,
"step": 2811
},
{
"epoch": 1.4291341083793914,
"grad_norm": 3.408207416534424,
"learning_rate": 8.554208257424216e-06,
"loss": 0.861,
"step": 2812
},
{
"epoch": 1.4296423353027126,
"grad_norm": 3.1423888206481934,
"learning_rate": 8.553025542485188e-06,
"loss": 0.8399,
"step": 2813
},
{
"epoch": 1.430150562226034,
"grad_norm": 3.00049090385437,
"learning_rate": 8.551842425824386e-06,
"loss": 0.8831,
"step": 2814
},
{
"epoch": 1.4306587891493552,
"grad_norm": 3.9325108528137207,
"learning_rate": 8.550658907575575e-06,
"loss": 0.871,
"step": 2815
},
{
"epoch": 1.4311670160726764,
"grad_norm": 3.3278439044952393,
"learning_rate": 8.549474987872575e-06,
"loss": 0.8385,
"step": 2816
},
{
"epoch": 1.4316752429959978,
"grad_norm": 3.1003921031951904,
"learning_rate": 8.54829066684924e-06,
"loss": 0.7442,
"step": 2817
},
{
"epoch": 1.432183469919319,
"grad_norm": 3.381220579147339,
"learning_rate": 8.547105944639476e-06,
"loss": 0.8432,
"step": 2818
},
{
"epoch": 1.4326916968426402,
"grad_norm": 3.1350619792938232,
"learning_rate": 8.545920821377236e-06,
"loss": 0.8929,
"step": 2819
},
{
"epoch": 1.4331999237659616,
"grad_norm": 3.075319766998291,
"learning_rate": 8.544735297196514e-06,
"loss": 0.8004,
"step": 2820
},
{
"epoch": 1.4337081506892828,
"grad_norm": 3.096254348754883,
"learning_rate": 8.54354937223135e-06,
"loss": 0.8188,
"step": 2821
},
{
"epoch": 1.434216377612604,
"grad_norm": 3.446495532989502,
"learning_rate": 8.542363046615832e-06,
"loss": 0.8236,
"step": 2822
},
{
"epoch": 1.4347246045359252,
"grad_norm": 3.2281386852264404,
"learning_rate": 8.54117632048409e-06,
"loss": 0.8753,
"step": 2823
},
{
"epoch": 1.4352328314592466,
"grad_norm": 3.3451106548309326,
"learning_rate": 8.539989193970302e-06,
"loss": 0.8476,
"step": 2824
},
{
"epoch": 1.4357410583825678,
"grad_norm": 3.919847011566162,
"learning_rate": 8.538801667208689e-06,
"loss": 0.8938,
"step": 2825
},
{
"epoch": 1.436249285305889,
"grad_norm": 3.22807240486145,
"learning_rate": 8.53761374033352e-06,
"loss": 0.8215,
"step": 2826
},
{
"epoch": 1.4367575122292102,
"grad_norm": 3.2741971015930176,
"learning_rate": 8.536425413479106e-06,
"loss": 0.9306,
"step": 2827
},
{
"epoch": 1.4372657391525316,
"grad_norm": 3.3959178924560547,
"learning_rate": 8.535236686779803e-06,
"loss": 0.8611,
"step": 2828
},
{
"epoch": 1.4377739660758528,
"grad_norm": 3.349571943283081,
"learning_rate": 8.53404756037002e-06,
"loss": 0.8705,
"step": 2829
},
{
"epoch": 1.438282192999174,
"grad_norm": 3.0857625007629395,
"learning_rate": 8.5328580343842e-06,
"loss": 0.8817,
"step": 2830
},
{
"epoch": 1.4387904199224955,
"grad_norm": 3.328871965408325,
"learning_rate": 8.531668108956839e-06,
"loss": 0.8801,
"step": 2831
},
{
"epoch": 1.4392986468458167,
"grad_norm": 3.0159804821014404,
"learning_rate": 8.530477784222474e-06,
"loss": 0.8405,
"step": 2832
},
{
"epoch": 1.4398068737691379,
"grad_norm": 3.806766986846924,
"learning_rate": 8.529287060315689e-06,
"loss": 0.7828,
"step": 2833
},
{
"epoch": 1.4403151006924593,
"grad_norm": 3.1105751991271973,
"learning_rate": 8.528095937371114e-06,
"loss": 0.8531,
"step": 2834
},
{
"epoch": 1.4408233276157805,
"grad_norm": 3.2140769958496094,
"learning_rate": 8.52690441552342e-06,
"loss": 0.9142,
"step": 2835
},
{
"epoch": 1.4413315545391017,
"grad_norm": 3.303377151489258,
"learning_rate": 8.525712494907331e-06,
"loss": 0.8428,
"step": 2836
},
{
"epoch": 1.441839781462423,
"grad_norm": 3.3976967334747314,
"learning_rate": 8.524520175657607e-06,
"loss": 0.9415,
"step": 2837
},
{
"epoch": 1.4423480083857443,
"grad_norm": 3.5745909214019775,
"learning_rate": 8.52332745790906e-06,
"loss": 0.8693,
"step": 2838
},
{
"epoch": 1.4428562353090655,
"grad_norm": 3.0088138580322266,
"learning_rate": 8.522134341796541e-06,
"loss": 0.7789,
"step": 2839
},
{
"epoch": 1.4433644622323867,
"grad_norm": 3.2750589847564697,
"learning_rate": 8.52094082745495e-06,
"loss": 0.8578,
"step": 2840
},
{
"epoch": 1.443872689155708,
"grad_norm": 3.0049092769622803,
"learning_rate": 8.519746915019235e-06,
"loss": 0.8935,
"step": 2841
},
{
"epoch": 1.4443809160790293,
"grad_norm": 3.0418643951416016,
"learning_rate": 8.518552604624383e-06,
"loss": 0.8245,
"step": 2842
},
{
"epoch": 1.4448891430023505,
"grad_norm": 3.2596395015716553,
"learning_rate": 8.517357896405427e-06,
"loss": 0.8868,
"step": 2843
},
{
"epoch": 1.4453973699256717,
"grad_norm": 2.954144239425659,
"learning_rate": 8.516162790497448e-06,
"loss": 0.8098,
"step": 2844
},
{
"epoch": 1.4459055968489931,
"grad_norm": 3.078198194503784,
"learning_rate": 8.51496728703557e-06,
"loss": 0.9043,
"step": 2845
},
{
"epoch": 1.4464138237723143,
"grad_norm": 3.0612032413482666,
"learning_rate": 8.51377138615496e-06,
"loss": 0.7907,
"step": 2846
},
{
"epoch": 1.4469220506956355,
"grad_norm": 3.0762479305267334,
"learning_rate": 8.512575087990838e-06,
"loss": 0.8781,
"step": 2847
},
{
"epoch": 1.447430277618957,
"grad_norm": 3.2731642723083496,
"learning_rate": 8.511378392678456e-06,
"loss": 0.8208,
"step": 2848
},
{
"epoch": 1.4479385045422781,
"grad_norm": 2.9340736865997314,
"learning_rate": 8.510181300353123e-06,
"loss": 0.7683,
"step": 2849
},
{
"epoch": 1.4484467314655993,
"grad_norm": 3.1629176139831543,
"learning_rate": 8.508983811150187e-06,
"loss": 0.8628,
"step": 2850
},
{
"epoch": 1.4489549583889207,
"grad_norm": 3.1435041427612305,
"learning_rate": 8.50778592520504e-06,
"loss": 0.8533,
"step": 2851
},
{
"epoch": 1.449463185312242,
"grad_norm": 3.251697063446045,
"learning_rate": 8.506587642653122e-06,
"loss": 0.8611,
"step": 2852
},
{
"epoch": 1.4499714122355631,
"grad_norm": 3.0637731552124023,
"learning_rate": 8.505388963629914e-06,
"loss": 0.7843,
"step": 2853
},
{
"epoch": 1.4504796391588846,
"grad_norm": 3.6621084213256836,
"learning_rate": 8.504189888270948e-06,
"loss": 0.8674,
"step": 2854
},
{
"epoch": 1.4509878660822058,
"grad_norm": 3.443359851837158,
"learning_rate": 8.502990416711796e-06,
"loss": 0.778,
"step": 2855
},
{
"epoch": 1.451496093005527,
"grad_norm": 3.2870068550109863,
"learning_rate": 8.501790549088074e-06,
"loss": 0.8024,
"step": 2856
},
{
"epoch": 1.4520043199288482,
"grad_norm": 3.1077282428741455,
"learning_rate": 8.500590285535447e-06,
"loss": 0.8335,
"step": 2857
},
{
"epoch": 1.4525125468521696,
"grad_norm": 3.2536587715148926,
"learning_rate": 8.499389626189622e-06,
"loss": 0.8781,
"step": 2858
},
{
"epoch": 1.4530207737754908,
"grad_norm": 3.109429359436035,
"learning_rate": 8.49818857118635e-06,
"loss": 0.8489,
"step": 2859
},
{
"epoch": 1.453529000698812,
"grad_norm": 3.064183235168457,
"learning_rate": 8.496987120661429e-06,
"loss": 0.8095,
"step": 2860
},
{
"epoch": 1.4540372276221332,
"grad_norm": 3.017165422439575,
"learning_rate": 8.495785274750698e-06,
"loss": 0.8582,
"step": 2861
},
{
"epoch": 1.4545454545454546,
"grad_norm": 3.174152374267578,
"learning_rate": 8.494583033590047e-06,
"loss": 0.7484,
"step": 2862
},
{
"epoch": 1.4550536814687758,
"grad_norm": 3.0165398120880127,
"learning_rate": 8.493380397315408e-06,
"loss": 0.8425,
"step": 2863
},
{
"epoch": 1.455561908392097,
"grad_norm": 3.5248165130615234,
"learning_rate": 8.49217736606275e-06,
"loss": 0.83,
"step": 2864
},
{
"epoch": 1.4560701353154184,
"grad_norm": 3.3429296016693115,
"learning_rate": 8.490973939968101e-06,
"loss": 0.8659,
"step": 2865
},
{
"epoch": 1.4565783622387396,
"grad_norm": 3.2521004676818848,
"learning_rate": 8.489770119167521e-06,
"loss": 0.8644,
"step": 2866
},
{
"epoch": 1.4570865891620608,
"grad_norm": 3.1303560733795166,
"learning_rate": 8.488565903797122e-06,
"loss": 0.9001,
"step": 2867
},
{
"epoch": 1.4575948160853822,
"grad_norm": 2.9541337490081787,
"learning_rate": 8.487361293993057e-06,
"loss": 0.8452,
"step": 2868
},
{
"epoch": 1.4581030430087034,
"grad_norm": 2.9469094276428223,
"learning_rate": 8.486156289891527e-06,
"loss": 0.804,
"step": 2869
},
{
"epoch": 1.4586112699320246,
"grad_norm": 3.3827242851257324,
"learning_rate": 8.484950891628774e-06,
"loss": 0.8085,
"step": 2870
},
{
"epoch": 1.459119496855346,
"grad_norm": 3.1991117000579834,
"learning_rate": 8.483745099341082e-06,
"loss": 0.8154,
"step": 2871
},
{
"epoch": 1.4596277237786672,
"grad_norm": 3.126009941101074,
"learning_rate": 8.482538913164792e-06,
"loss": 0.8419,
"step": 2872
},
{
"epoch": 1.4601359507019884,
"grad_norm": 3.3102211952209473,
"learning_rate": 8.481332333236275e-06,
"loss": 0.8628,
"step": 2873
},
{
"epoch": 1.4606441776253096,
"grad_norm": 3.188005208969116,
"learning_rate": 8.480125359691954e-06,
"loss": 0.9521,
"step": 2874
},
{
"epoch": 1.461152404548631,
"grad_norm": 3.1601901054382324,
"learning_rate": 8.478917992668295e-06,
"loss": 0.7734,
"step": 2875
},
{
"epoch": 1.4616606314719522,
"grad_norm": 3.1462960243225098,
"learning_rate": 8.477710232301809e-06,
"loss": 0.8857,
"step": 2876
},
{
"epoch": 1.4621688583952734,
"grad_norm": 3.0840206146240234,
"learning_rate": 8.476502078729049e-06,
"loss": 0.8253,
"step": 2877
},
{
"epoch": 1.4626770853185946,
"grad_norm": 3.2918813228607178,
"learning_rate": 8.47529353208662e-06,
"loss": 0.7815,
"step": 2878
},
{
"epoch": 1.463185312241916,
"grad_norm": 3.0587096214294434,
"learning_rate": 8.47408459251116e-06,
"loss": 0.8291,
"step": 2879
},
{
"epoch": 1.4636935391652373,
"grad_norm": 2.9685184955596924,
"learning_rate": 8.472875260139361e-06,
"loss": 0.8308,
"step": 2880
},
{
"epoch": 1.4642017660885585,
"grad_norm": 3.0110650062561035,
"learning_rate": 8.471665535107953e-06,
"loss": 0.8293,
"step": 2881
},
{
"epoch": 1.4647099930118799,
"grad_norm": 3.130685329437256,
"learning_rate": 8.470455417553716e-06,
"loss": 0.8487,
"step": 2882
},
{
"epoch": 1.465218219935201,
"grad_norm": 3.396280527114868,
"learning_rate": 8.46924490761347e-06,
"loss": 0.9272,
"step": 2883
},
{
"epoch": 1.4657264468585223,
"grad_norm": 3.0790679454803467,
"learning_rate": 8.468034005424081e-06,
"loss": 0.8587,
"step": 2884
},
{
"epoch": 1.4662346737818437,
"grad_norm": 3.0198047161102295,
"learning_rate": 8.46682271112246e-06,
"loss": 0.8687,
"step": 2885
},
{
"epoch": 1.4667429007051649,
"grad_norm": 3.0898425579071045,
"learning_rate": 8.465611024845561e-06,
"loss": 0.8936,
"step": 2886
},
{
"epoch": 1.467251127628486,
"grad_norm": 3.215315818786621,
"learning_rate": 8.464398946730383e-06,
"loss": 0.8631,
"step": 2887
},
{
"epoch": 1.4677593545518075,
"grad_norm": 3.161775827407837,
"learning_rate": 8.46318647691397e-06,
"loss": 0.8432,
"step": 2888
},
{
"epoch": 1.4682675814751287,
"grad_norm": 3.053117513656616,
"learning_rate": 8.461973615533409e-06,
"loss": 0.9322,
"step": 2889
},
{
"epoch": 1.46877580839845,
"grad_norm": 3.3006246089935303,
"learning_rate": 8.460760362725831e-06,
"loss": 0.8339,
"step": 2890
},
{
"epoch": 1.469284035321771,
"grad_norm": 3.0707836151123047,
"learning_rate": 8.459546718628412e-06,
"loss": 0.8493,
"step": 2891
},
{
"epoch": 1.4697922622450923,
"grad_norm": 3.0935218334198,
"learning_rate": 8.458332683378375e-06,
"loss": 0.8258,
"step": 2892
},
{
"epoch": 1.4703004891684137,
"grad_norm": 3.4484004974365234,
"learning_rate": 8.457118257112982e-06,
"loss": 0.8924,
"step": 2893
},
{
"epoch": 1.470808716091735,
"grad_norm": 3.459404706954956,
"learning_rate": 8.455903439969543e-06,
"loss": 0.8267,
"step": 2894
},
{
"epoch": 1.4713169430150561,
"grad_norm": 3.255765914916992,
"learning_rate": 8.454688232085409e-06,
"loss": 0.9236,
"step": 2895
},
{
"epoch": 1.4718251699383775,
"grad_norm": 3.0659914016723633,
"learning_rate": 8.45347263359798e-06,
"loss": 0.8843,
"step": 2896
},
{
"epoch": 1.4723333968616987,
"grad_norm": 2.9841461181640625,
"learning_rate": 8.452256644644694e-06,
"loss": 0.7879,
"step": 2897
},
{
"epoch": 1.47284162378502,
"grad_norm": 3.225430488586426,
"learning_rate": 8.451040265363039e-06,
"loss": 0.8594,
"step": 2898
},
{
"epoch": 1.4733498507083413,
"grad_norm": 3.0873258113861084,
"learning_rate": 8.449823495890546e-06,
"loss": 0.8681,
"step": 2899
},
{
"epoch": 1.4738580776316625,
"grad_norm": 2.978499174118042,
"learning_rate": 8.448606336364783e-06,
"loss": 0.8227,
"step": 2900
},
{
"epoch": 1.4743663045549837,
"grad_norm": 3.4347798824310303,
"learning_rate": 8.447388786923371e-06,
"loss": 0.9436,
"step": 2901
},
{
"epoch": 1.4748745314783052,
"grad_norm": 3.1734769344329834,
"learning_rate": 8.446170847703975e-06,
"loss": 0.8,
"step": 2902
},
{
"epoch": 1.4753827584016264,
"grad_norm": 2.9005730152130127,
"learning_rate": 8.444952518844297e-06,
"loss": 0.879,
"step": 2903
},
{
"epoch": 1.4758909853249476,
"grad_norm": 3.3382294178009033,
"learning_rate": 8.443733800482089e-06,
"loss": 0.9734,
"step": 2904
},
{
"epoch": 1.476399212248269,
"grad_norm": 2.981613874435425,
"learning_rate": 8.442514692755141e-06,
"loss": 0.9232,
"step": 2905
},
{
"epoch": 1.4769074391715902,
"grad_norm": 3.060418128967285,
"learning_rate": 8.441295195801296e-06,
"loss": 0.8169,
"step": 2906
},
{
"epoch": 1.4774156660949114,
"grad_norm": 3.258392095565796,
"learning_rate": 8.440075309758433e-06,
"loss": 0.7951,
"step": 2907
},
{
"epoch": 1.4779238930182326,
"grad_norm": 3.1214146614074707,
"learning_rate": 8.438855034764482e-06,
"loss": 0.8439,
"step": 2908
},
{
"epoch": 1.4784321199415538,
"grad_norm": 3.0851261615753174,
"learning_rate": 8.437634370957407e-06,
"loss": 0.9226,
"step": 2909
},
{
"epoch": 1.4789403468648752,
"grad_norm": 3.002401351928711,
"learning_rate": 8.436413318475227e-06,
"loss": 0.7845,
"step": 2910
},
{
"epoch": 1.4794485737881964,
"grad_norm": 2.99877667427063,
"learning_rate": 8.435191877455998e-06,
"loss": 0.8346,
"step": 2911
},
{
"epoch": 1.4799568007115176,
"grad_norm": 3.067758321762085,
"learning_rate": 8.43397004803782e-06,
"loss": 0.8056,
"step": 2912
},
{
"epoch": 1.480465027634839,
"grad_norm": 3.270920515060425,
"learning_rate": 8.432747830358843e-06,
"loss": 0.8406,
"step": 2913
},
{
"epoch": 1.4809732545581602,
"grad_norm": 3.130580186843872,
"learning_rate": 8.431525224557252e-06,
"loss": 0.8509,
"step": 2914
},
{
"epoch": 1.4814814814814814,
"grad_norm": 3.3330612182617188,
"learning_rate": 8.430302230771287e-06,
"loss": 0.8677,
"step": 2915
},
{
"epoch": 1.4819897084048028,
"grad_norm": 3.016632318496704,
"learning_rate": 8.42907884913922e-06,
"loss": 0.7927,
"step": 2916
},
{
"epoch": 1.482497935328124,
"grad_norm": 3.3111484050750732,
"learning_rate": 8.427855079799372e-06,
"loss": 0.8822,
"step": 2917
},
{
"epoch": 1.4830061622514452,
"grad_norm": 3.247408628463745,
"learning_rate": 8.426630922890111e-06,
"loss": 0.905,
"step": 2918
},
{
"epoch": 1.4835143891747666,
"grad_norm": 2.9573397636413574,
"learning_rate": 8.425406378549845e-06,
"loss": 0.8445,
"step": 2919
},
{
"epoch": 1.4840226160980878,
"grad_norm": 3.0608110427856445,
"learning_rate": 8.424181446917025e-06,
"loss": 0.7899,
"step": 2920
},
{
"epoch": 1.484530843021409,
"grad_norm": 3.070166826248169,
"learning_rate": 8.422956128130152e-06,
"loss": 0.8312,
"step": 2921
},
{
"epoch": 1.4850390699447304,
"grad_norm": 3.365817070007324,
"learning_rate": 8.421730422327761e-06,
"loss": 0.8399,
"step": 2922
},
{
"epoch": 1.4855472968680516,
"grad_norm": 3.1153318881988525,
"learning_rate": 8.42050432964844e-06,
"loss": 0.8013,
"step": 2923
},
{
"epoch": 1.4860555237913728,
"grad_norm": 3.2523930072784424,
"learning_rate": 8.419277850230813e-06,
"loss": 0.8811,
"step": 2924
},
{
"epoch": 1.486563750714694,
"grad_norm": 3.05375599861145,
"learning_rate": 8.418050984213556e-06,
"loss": 0.882,
"step": 2925
},
{
"epoch": 1.4870719776380152,
"grad_norm": 3.3024351596832275,
"learning_rate": 8.41682373173538e-06,
"loss": 0.9168,
"step": 2926
},
{
"epoch": 1.4875802045613367,
"grad_norm": 3.0616862773895264,
"learning_rate": 8.415596092935047e-06,
"loss": 0.841,
"step": 2927
},
{
"epoch": 1.4880884314846579,
"grad_norm": 3.1600990295410156,
"learning_rate": 8.41436806795136e-06,
"loss": 0.8187,
"step": 2928
},
{
"epoch": 1.488596658407979,
"grad_norm": 3.2013626098632812,
"learning_rate": 8.413139656923162e-06,
"loss": 0.8933,
"step": 2929
},
{
"epoch": 1.4891048853313005,
"grad_norm": 3.221249580383301,
"learning_rate": 8.411910859989345e-06,
"loss": 0.8945,
"step": 2930
},
{
"epoch": 1.4896131122546217,
"grad_norm": 3.0507285594940186,
"learning_rate": 8.410681677288843e-06,
"loss": 0.934,
"step": 2931
},
{
"epoch": 1.4901213391779429,
"grad_norm": 3.444394111633301,
"learning_rate": 8.409452108960631e-06,
"loss": 0.8934,
"step": 2932
},
{
"epoch": 1.4906295661012643,
"grad_norm": 3.080002546310425,
"learning_rate": 8.408222155143732e-06,
"loss": 0.7693,
"step": 2933
},
{
"epoch": 1.4911377930245855,
"grad_norm": 3.0022099018096924,
"learning_rate": 8.40699181597721e-06,
"loss": 0.8172,
"step": 2934
},
{
"epoch": 1.4916460199479067,
"grad_norm": 2.9647133350372314,
"learning_rate": 8.405761091600172e-06,
"loss": 0.9459,
"step": 2935
},
{
"epoch": 1.492154246871228,
"grad_norm": 2.958550453186035,
"learning_rate": 8.404529982151772e-06,
"loss": 0.8155,
"step": 2936
},
{
"epoch": 1.4926624737945493,
"grad_norm": 2.8132691383361816,
"learning_rate": 8.403298487771201e-06,
"loss": 0.7531,
"step": 2937
},
{
"epoch": 1.4931707007178705,
"grad_norm": 3.3202908039093018,
"learning_rate": 8.4020666085977e-06,
"loss": 0.9386,
"step": 2938
},
{
"epoch": 1.493678927641192,
"grad_norm": 3.345435857772827,
"learning_rate": 8.40083434477055e-06,
"loss": 0.9833,
"step": 2939
},
{
"epoch": 1.4941871545645131,
"grad_norm": 3.2024502754211426,
"learning_rate": 8.399601696429077e-06,
"loss": 0.8559,
"step": 2940
},
{
"epoch": 1.4946953814878343,
"grad_norm": 3.3189926147460938,
"learning_rate": 8.398368663712652e-06,
"loss": 0.8808,
"step": 2941
},
{
"epoch": 1.4952036084111555,
"grad_norm": 3.0005111694335938,
"learning_rate": 8.397135246760686e-06,
"loss": 0.8676,
"step": 2942
},
{
"epoch": 1.4957118353344767,
"grad_norm": 2.9679107666015625,
"learning_rate": 8.395901445712635e-06,
"loss": 0.7782,
"step": 2943
},
{
"epoch": 1.4962200622577981,
"grad_norm": 3.023895263671875,
"learning_rate": 8.394667260707996e-06,
"loss": 0.8329,
"step": 2944
},
{
"epoch": 1.4967282891811193,
"grad_norm": 2.946505069732666,
"learning_rate": 8.393432691886314e-06,
"loss": 0.7313,
"step": 2945
},
{
"epoch": 1.4972365161044405,
"grad_norm": 2.7999486923217773,
"learning_rate": 8.392197739387175e-06,
"loss": 0.8184,
"step": 2946
},
{
"epoch": 1.497744743027762,
"grad_norm": 3.1402924060821533,
"learning_rate": 8.390962403350209e-06,
"loss": 0.843,
"step": 2947
},
{
"epoch": 1.4982529699510831,
"grad_norm": 3.1389057636260986,
"learning_rate": 8.389726683915088e-06,
"loss": 0.9186,
"step": 2948
},
{
"epoch": 1.4987611968744043,
"grad_norm": 2.9966344833374023,
"learning_rate": 8.388490581221529e-06,
"loss": 0.8748,
"step": 2949
},
{
"epoch": 1.4992694237977258,
"grad_norm": 3.105550527572632,
"learning_rate": 8.387254095409289e-06,
"loss": 0.8893,
"step": 2950
},
{
"epoch": 1.499777650721047,
"grad_norm": 3.089803695678711,
"learning_rate": 8.386017226618175e-06,
"loss": 0.8809,
"step": 2951
},
{
"epoch": 1.5002858776443682,
"grad_norm": 3.3688395023345947,
"learning_rate": 8.38477997498803e-06,
"loss": 0.8093,
"step": 2952
},
{
"epoch": 1.5007941045676896,
"grad_norm": 3.1366262435913086,
"learning_rate": 8.383542340658749e-06,
"loss": 0.9673,
"step": 2953
},
{
"epoch": 1.5013023314910108,
"grad_norm": 3.131044387817383,
"learning_rate": 8.382304323770257e-06,
"loss": 0.9301,
"step": 2954
},
{
"epoch": 1.501810558414332,
"grad_norm": 3.0539796352386475,
"learning_rate": 8.381065924462532e-06,
"loss": 0.9085,
"step": 2955
},
{
"epoch": 1.5023187853376534,
"grad_norm": 3.356163263320923,
"learning_rate": 8.379827142875598e-06,
"loss": 0.8581,
"step": 2956
},
{
"epoch": 1.5028270122609744,
"grad_norm": 3.249194622039795,
"learning_rate": 8.378587979149512e-06,
"loss": 0.8807,
"step": 2957
},
{
"epoch": 1.5033352391842958,
"grad_norm": 3.210223913192749,
"learning_rate": 8.377348433424382e-06,
"loss": 0.875,
"step": 2958
},
{
"epoch": 1.5038434661076172,
"grad_norm": 2.936296224594116,
"learning_rate": 8.37610850584036e-06,
"loss": 0.7714,
"step": 2959
},
{
"epoch": 1.5043516930309382,
"grad_norm": 3.063220262527466,
"learning_rate": 8.374868196537632e-06,
"loss": 0.8493,
"step": 2960
},
{
"epoch": 1.5048599199542596,
"grad_norm": 2.9019317626953125,
"learning_rate": 8.373627505656434e-06,
"loss": 0.8043,
"step": 2961
},
{
"epoch": 1.5053681468775808,
"grad_norm": 3.295156717300415,
"learning_rate": 8.37238643333705e-06,
"loss": 0.9071,
"step": 2962
},
{
"epoch": 1.505876373800902,
"grad_norm": 3.10031795501709,
"learning_rate": 8.371144979719797e-06,
"loss": 0.8211,
"step": 2963
},
{
"epoch": 1.5063846007242234,
"grad_norm": 3.311487913131714,
"learning_rate": 8.36990314494504e-06,
"loss": 0.9032,
"step": 2964
},
{
"epoch": 1.5068928276475446,
"grad_norm": 3.106748580932617,
"learning_rate": 8.368660929153187e-06,
"loss": 0.8927,
"step": 2965
},
{
"epoch": 1.5074010545708658,
"grad_norm": 3.0898537635803223,
"learning_rate": 8.367418332484689e-06,
"loss": 0.8918,
"step": 2966
},
{
"epoch": 1.5079092814941872,
"grad_norm": 3.2117109298706055,
"learning_rate": 8.36617535508004e-06,
"loss": 0.8505,
"step": 2967
},
{
"epoch": 1.5084175084175084,
"grad_norm": 3.125581979751587,
"learning_rate": 8.364931997079775e-06,
"loss": 0.9883,
"step": 2968
},
{
"epoch": 1.5089257353408296,
"grad_norm": 3.275686502456665,
"learning_rate": 8.363688258624478e-06,
"loss": 0.8197,
"step": 2969
},
{
"epoch": 1.509433962264151,
"grad_norm": 3.1875977516174316,
"learning_rate": 8.362444139854767e-06,
"loss": 0.8912,
"step": 2970
},
{
"epoch": 1.5099421891874723,
"grad_norm": 3.183387279510498,
"learning_rate": 8.361199640911311e-06,
"loss": 0.8201,
"step": 2971
},
{
"epoch": 1.5104504161107934,
"grad_norm": 3.1798882484436035,
"learning_rate": 8.35995476193482e-06,
"loss": 0.8789,
"step": 2972
},
{
"epoch": 1.5109586430341149,
"grad_norm": 3.138533353805542,
"learning_rate": 8.358709503066042e-06,
"loss": 0.8732,
"step": 2973
},
{
"epoch": 1.5114668699574358,
"grad_norm": 3.4618101119995117,
"learning_rate": 8.357463864445774e-06,
"loss": 0.8354,
"step": 2974
},
{
"epoch": 1.5119750968807573,
"grad_norm": 3.1000592708587646,
"learning_rate": 8.356217846214855e-06,
"loss": 0.7872,
"step": 2975
},
{
"epoch": 1.5124833238040787,
"grad_norm": 3.0090417861938477,
"learning_rate": 8.354971448514164e-06,
"loss": 0.8379,
"step": 2976
},
{
"epoch": 1.5129915507273997,
"grad_norm": 2.9547312259674072,
"learning_rate": 8.353724671484624e-06,
"loss": 0.7905,
"step": 2977
},
{
"epoch": 1.513499777650721,
"grad_norm": 3.2382640838623047,
"learning_rate": 8.352477515267203e-06,
"loss": 0.8356,
"step": 2978
},
{
"epoch": 1.5140080045740423,
"grad_norm": 2.9780771732330322,
"learning_rate": 8.35122998000291e-06,
"loss": 0.8185,
"step": 2979
},
{
"epoch": 1.5145162314973635,
"grad_norm": 3.149280309677124,
"learning_rate": 8.349982065832797e-06,
"loss": 0.7817,
"step": 2980
},
{
"epoch": 1.515024458420685,
"grad_norm": 3.0026772022247314,
"learning_rate": 8.34873377289796e-06,
"loss": 0.8866,
"step": 2981
},
{
"epoch": 1.515532685344006,
"grad_norm": 3.194310188293457,
"learning_rate": 8.347485101339533e-06,
"loss": 0.8655,
"step": 2982
},
{
"epoch": 1.5160409122673273,
"grad_norm": 3.2000746726989746,
"learning_rate": 8.3462360512987e-06,
"loss": 0.8921,
"step": 2983
},
{
"epoch": 1.5165491391906487,
"grad_norm": 3.400982141494751,
"learning_rate": 8.344986622916685e-06,
"loss": 0.8467,
"step": 2984
},
{
"epoch": 1.51705736611397,
"grad_norm": 2.931072235107422,
"learning_rate": 8.343736816334755e-06,
"loss": 0.834,
"step": 2985
},
{
"epoch": 1.517565593037291,
"grad_norm": 3.178807497024536,
"learning_rate": 8.342486631694216e-06,
"loss": 0.9266,
"step": 2986
},
{
"epoch": 1.5180738199606125,
"grad_norm": 3.3427088260650635,
"learning_rate": 8.341236069136419e-06,
"loss": 0.8043,
"step": 2987
},
{
"epoch": 1.5185820468839337,
"grad_norm": 3.239030599594116,
"learning_rate": 8.339985128802763e-06,
"loss": 0.945,
"step": 2988
},
{
"epoch": 1.519090273807255,
"grad_norm": 3.4419260025024414,
"learning_rate": 8.33873381083468e-06,
"loss": 0.8667,
"step": 2989
},
{
"epoch": 1.5195985007305763,
"grad_norm": 3.0164976119995117,
"learning_rate": 8.337482115373655e-06,
"loss": 0.839,
"step": 2990
},
{
"epoch": 1.5201067276538973,
"grad_norm": 2.8095803260803223,
"learning_rate": 8.336230042561209e-06,
"loss": 0.7806,
"step": 2991
},
{
"epoch": 1.5206149545772187,
"grad_norm": 3.120523452758789,
"learning_rate": 8.334977592538904e-06,
"loss": 0.8523,
"step": 2992
},
{
"epoch": 1.5211231815005402,
"grad_norm": 3.2824933528900146,
"learning_rate": 8.333724765448352e-06,
"loss": 0.8601,
"step": 2993
},
{
"epoch": 1.5216314084238611,
"grad_norm": 3.133676767349243,
"learning_rate": 8.3324715614312e-06,
"loss": 0.8269,
"step": 2994
},
{
"epoch": 1.5221396353471826,
"grad_norm": 3.2283775806427,
"learning_rate": 8.331217980629144e-06,
"loss": 0.9106,
"step": 2995
},
{
"epoch": 1.5226478622705037,
"grad_norm": 3.171283483505249,
"learning_rate": 8.329964023183918e-06,
"loss": 0.8629,
"step": 2996
},
{
"epoch": 1.523156089193825,
"grad_norm": 3.0246291160583496,
"learning_rate": 8.328709689237303e-06,
"loss": 0.8226,
"step": 2997
},
{
"epoch": 1.5236643161171464,
"grad_norm": 3.2844457626342773,
"learning_rate": 8.327454978931117e-06,
"loss": 0.8238,
"step": 2998
},
{
"epoch": 1.5241725430404676,
"grad_norm": 3.1582090854644775,
"learning_rate": 8.326199892407222e-06,
"loss": 0.8133,
"step": 2999
},
{
"epoch": 1.5246807699637888,
"grad_norm": 2.971914768218994,
"learning_rate": 8.32494442980753e-06,
"loss": 0.7771,
"step": 3000
},
{
"epoch": 1.5246807699637888,
"eval_loss": 1.2642887830734253,
"eval_runtime": 14.481,
"eval_samples_per_second": 27.622,
"eval_steps_per_second": 3.453,
"step": 3000
},
{
"epoch": 1.5251889968871102,
"grad_norm": 3.0078349113464355,
"learning_rate": 8.323688591273983e-06,
"loss": 0.8273,
"step": 3001
},
{
"epoch": 1.5256972238104314,
"grad_norm": 2.915525436401367,
"learning_rate": 8.322432376948577e-06,
"loss": 0.8111,
"step": 3002
},
{
"epoch": 1.5262054507337526,
"grad_norm": 3.245734930038452,
"learning_rate": 8.321175786973343e-06,
"loss": 0.8522,
"step": 3003
},
{
"epoch": 1.526713677657074,
"grad_norm": 3.0924389362335205,
"learning_rate": 8.319918821490358e-06,
"loss": 0.9071,
"step": 3004
},
{
"epoch": 1.5272219045803952,
"grad_norm": 3.2382094860076904,
"learning_rate": 8.318661480641738e-06,
"loss": 0.7896,
"step": 3005
},
{
"epoch": 1.5277301315037164,
"grad_norm": 3.118859052658081,
"learning_rate": 8.317403764569646e-06,
"loss": 0.841,
"step": 3006
},
{
"epoch": 1.5282383584270378,
"grad_norm": 3.158026695251465,
"learning_rate": 8.316145673416285e-06,
"loss": 0.862,
"step": 3007
},
{
"epoch": 1.5287465853503588,
"grad_norm": 3.2535459995269775,
"learning_rate": 8.3148872073239e-06,
"loss": 0.8305,
"step": 3008
},
{
"epoch": 1.5292548122736802,
"grad_norm": 2.9503650665283203,
"learning_rate": 8.31362836643478e-06,
"loss": 0.911,
"step": 3009
},
{
"epoch": 1.5297630391970014,
"grad_norm": 3.5011672973632812,
"learning_rate": 8.312369150891256e-06,
"loss": 0.8192,
"step": 3010
},
{
"epoch": 1.5302712661203226,
"grad_norm": 3.1151344776153564,
"learning_rate": 8.3111095608357e-06,
"loss": 0.8384,
"step": 3011
},
{
"epoch": 1.530779493043644,
"grad_norm": 3.046571731567383,
"learning_rate": 8.309849596410527e-06,
"loss": 0.7742,
"step": 3012
},
{
"epoch": 1.5312877199669652,
"grad_norm": 3.1235508918762207,
"learning_rate": 8.308589257758194e-06,
"loss": 0.8431,
"step": 3013
},
{
"epoch": 1.5317959468902864,
"grad_norm": 3.450984477996826,
"learning_rate": 8.307328545021203e-06,
"loss": 0.8558,
"step": 3014
},
{
"epoch": 1.5323041738136078,
"grad_norm": 3.317640542984009,
"learning_rate": 8.306067458342092e-06,
"loss": 0.7204,
"step": 3015
},
{
"epoch": 1.532812400736929,
"grad_norm": 3.245126247406006,
"learning_rate": 8.304805997863453e-06,
"loss": 0.8786,
"step": 3016
},
{
"epoch": 1.5333206276602502,
"grad_norm": 3.327097177505493,
"learning_rate": 8.303544163727904e-06,
"loss": 0.8458,
"step": 3017
},
{
"epoch": 1.5338288545835717,
"grad_norm": 3.1399662494659424,
"learning_rate": 8.302281956078117e-06,
"loss": 0.7665,
"step": 3018
},
{
"epoch": 1.5343370815068929,
"grad_norm": 3.164243698120117,
"learning_rate": 8.301019375056805e-06,
"loss": 0.7948,
"step": 3019
},
{
"epoch": 1.534845308430214,
"grad_norm": 3.5101428031921387,
"learning_rate": 8.29975642080672e-06,
"loss": 0.9736,
"step": 3020
},
{
"epoch": 1.5353535353535355,
"grad_norm": 3.018258810043335,
"learning_rate": 8.298493093470656e-06,
"loss": 0.8181,
"step": 3021
},
{
"epoch": 1.5358617622768567,
"grad_norm": 3.4201853275299072,
"learning_rate": 8.297229393191454e-06,
"loss": 0.8984,
"step": 3022
},
{
"epoch": 1.5363699892001779,
"grad_norm": 2.9878666400909424,
"learning_rate": 8.295965320111993e-06,
"loss": 0.8458,
"step": 3023
},
{
"epoch": 1.5368782161234993,
"grad_norm": 3.21189022064209,
"learning_rate": 8.294700874375192e-06,
"loss": 0.803,
"step": 3024
},
{
"epoch": 1.5373864430468203,
"grad_norm": 3.2621307373046875,
"learning_rate": 8.29343605612402e-06,
"loss": 0.9049,
"step": 3025
},
{
"epoch": 1.5378946699701417,
"grad_norm": 3.1909806728363037,
"learning_rate": 8.292170865501479e-06,
"loss": 0.9027,
"step": 3026
},
{
"epoch": 1.5384028968934629,
"grad_norm": 2.886561870574951,
"learning_rate": 8.29090530265062e-06,
"loss": 0.8333,
"step": 3027
},
{
"epoch": 1.538911123816784,
"grad_norm": 3.039076566696167,
"learning_rate": 8.28963936771453e-06,
"loss": 0.8114,
"step": 3028
},
{
"epoch": 1.5394193507401055,
"grad_norm": 3.1542789936065674,
"learning_rate": 8.288373060836347e-06,
"loss": 0.8028,
"step": 3029
},
{
"epoch": 1.5399275776634267,
"grad_norm": 3.1072874069213867,
"learning_rate": 8.287106382159242e-06,
"loss": 0.8745,
"step": 3030
},
{
"epoch": 1.540435804586748,
"grad_norm": 3.9167263507843018,
"learning_rate": 8.285839331826432e-06,
"loss": 0.9285,
"step": 3031
},
{
"epoch": 1.5409440315100693,
"grad_norm": 3.416506290435791,
"learning_rate": 8.28457190998118e-06,
"loss": 0.9308,
"step": 3032
},
{
"epoch": 1.5414522584333905,
"grad_norm": 3.403721332550049,
"learning_rate": 8.283304116766777e-06,
"loss": 0.8827,
"step": 3033
},
{
"epoch": 1.5419604853567117,
"grad_norm": 2.909219264984131,
"learning_rate": 8.282035952326575e-06,
"loss": 0.7463,
"step": 3034
},
{
"epoch": 1.5424687122800331,
"grad_norm": 3.1260173320770264,
"learning_rate": 8.280767416803953e-06,
"loss": 0.8301,
"step": 3035
},
{
"epoch": 1.5429769392033543,
"grad_norm": 3.044611692428589,
"learning_rate": 8.27949851034234e-06,
"loss": 0.8554,
"step": 3036
},
{
"epoch": 1.5434851661266755,
"grad_norm": 3.3264572620391846,
"learning_rate": 8.278229233085206e-06,
"loss": 0.9276,
"step": 3037
},
{
"epoch": 1.543993393049997,
"grad_norm": 3.1489923000335693,
"learning_rate": 8.276959585176059e-06,
"loss": 0.8785,
"step": 3038
},
{
"epoch": 1.5445016199733181,
"grad_norm": 3.221567153930664,
"learning_rate": 8.275689566758452e-06,
"loss": 0.9196,
"step": 3039
},
{
"epoch": 1.5450098468966393,
"grad_norm": 2.85846209526062,
"learning_rate": 8.274419177975978e-06,
"loss": 0.7357,
"step": 3040
},
{
"epoch": 1.5455180738199608,
"grad_norm": 3.177860975265503,
"learning_rate": 8.273148418972276e-06,
"loss": 0.8897,
"step": 3041
},
{
"epoch": 1.5460263007432817,
"grad_norm": 2.943847894668579,
"learning_rate": 8.271877289891022e-06,
"loss": 0.8209,
"step": 3042
},
{
"epoch": 1.5465345276666032,
"grad_norm": 2.898120164871216,
"learning_rate": 8.270605790875936e-06,
"loss": 0.849,
"step": 3043
},
{
"epoch": 1.5470427545899244,
"grad_norm": 3.1277554035186768,
"learning_rate": 8.269333922070779e-06,
"loss": 0.8751,
"step": 3044
},
{
"epoch": 1.5475509815132455,
"grad_norm": 3.0100021362304688,
"learning_rate": 8.268061683619354e-06,
"loss": 0.7681,
"step": 3045
},
{
"epoch": 1.548059208436567,
"grad_norm": 3.272531509399414,
"learning_rate": 8.266789075665513e-06,
"loss": 0.9174,
"step": 3046
},
{
"epoch": 1.5485674353598882,
"grad_norm": 3.1157844066619873,
"learning_rate": 8.265516098353134e-06,
"loss": 0.8402,
"step": 3047
},
{
"epoch": 1.5490756622832094,
"grad_norm": 3.2872796058654785,
"learning_rate": 8.264242751826149e-06,
"loss": 0.8969,
"step": 3048
},
{
"epoch": 1.5495838892065308,
"grad_norm": 2.835674285888672,
"learning_rate": 8.26296903622853e-06,
"loss": 0.8268,
"step": 3049
},
{
"epoch": 1.550092116129852,
"grad_norm": 3.2123286724090576,
"learning_rate": 8.26169495170429e-06,
"loss": 0.871,
"step": 3050
},
{
"epoch": 1.5506003430531732,
"grad_norm": 3.2385337352752686,
"learning_rate": 8.260420498397477e-06,
"loss": 0.95,
"step": 3051
},
{
"epoch": 1.5511085699764946,
"grad_norm": 3.034102439880371,
"learning_rate": 8.259145676452196e-06,
"loss": 0.8378,
"step": 3052
},
{
"epoch": 1.5516167968998158,
"grad_norm": 3.435119867324829,
"learning_rate": 8.257870486012574e-06,
"loss": 0.9189,
"step": 3053
},
{
"epoch": 1.552125023823137,
"grad_norm": 2.852510929107666,
"learning_rate": 8.256594927222798e-06,
"loss": 0.7759,
"step": 3054
},
{
"epoch": 1.5526332507464584,
"grad_norm": 3.141561269760132,
"learning_rate": 8.255319000227087e-06,
"loss": 0.8407,
"step": 3055
},
{
"epoch": 1.5531414776697794,
"grad_norm": 3.120166778564453,
"learning_rate": 8.254042705169702e-06,
"loss": 0.8263,
"step": 3056
},
{
"epoch": 1.5536497045931008,
"grad_norm": 3.157909393310547,
"learning_rate": 8.252766042194947e-06,
"loss": 0.8824,
"step": 3057
},
{
"epoch": 1.5541579315164222,
"grad_norm": 3.0600900650024414,
"learning_rate": 8.251489011447166e-06,
"loss": 0.7545,
"step": 3058
},
{
"epoch": 1.5546661584397432,
"grad_norm": 3.2997310161590576,
"learning_rate": 8.25021161307075e-06,
"loss": 0.9094,
"step": 3059
},
{
"epoch": 1.5551743853630646,
"grad_norm": 3.1490283012390137,
"learning_rate": 8.248933847210125e-06,
"loss": 0.7762,
"step": 3060
},
{
"epoch": 1.5556826122863858,
"grad_norm": 3.1866819858551025,
"learning_rate": 8.247655714009761e-06,
"loss": 0.77,
"step": 3061
},
{
"epoch": 1.556190839209707,
"grad_norm": 3.3561694622039795,
"learning_rate": 8.246377213614172e-06,
"loss": 0.8339,
"step": 3062
},
{
"epoch": 1.5566990661330284,
"grad_norm": 3.224182605743408,
"learning_rate": 8.245098346167908e-06,
"loss": 0.9327,
"step": 3063
},
{
"epoch": 1.5572072930563496,
"grad_norm": 3.1291093826293945,
"learning_rate": 8.243819111815567e-06,
"loss": 0.8927,
"step": 3064
},
{
"epoch": 1.5577155199796708,
"grad_norm": 5.050314426422119,
"learning_rate": 8.242539510701784e-06,
"loss": 0.8154,
"step": 3065
},
{
"epoch": 1.5582237469029923,
"grad_norm": 3.3334028720855713,
"learning_rate": 8.241259542971234e-06,
"loss": 0.8359,
"step": 3066
},
{
"epoch": 1.5587319738263135,
"grad_norm": 3.098841428756714,
"learning_rate": 8.23997920876864e-06,
"loss": 0.8848,
"step": 3067
},
{
"epoch": 1.5592402007496347,
"grad_norm": 3.003560781478882,
"learning_rate": 8.238698508238763e-06,
"loss": 0.8935,
"step": 3068
},
{
"epoch": 1.559748427672956,
"grad_norm": 4.89196252822876,
"learning_rate": 8.237417441526401e-06,
"loss": 0.8448,
"step": 3069
},
{
"epoch": 1.5602566545962773,
"grad_norm": 3.1076719760894775,
"learning_rate": 8.2361360087764e-06,
"loss": 0.7736,
"step": 3070
},
{
"epoch": 1.5607648815195985,
"grad_norm": 3.310075521469116,
"learning_rate": 8.234854210133647e-06,
"loss": 0.8718,
"step": 3071
},
{
"epoch": 1.5612731084429199,
"grad_norm": 3.2055442333221436,
"learning_rate": 8.233572045743064e-06,
"loss": 0.8538,
"step": 3072
},
{
"epoch": 1.5617813353662409,
"grad_norm": 3.108445644378662,
"learning_rate": 8.23228951574962e-06,
"loss": 0.863,
"step": 3073
},
{
"epoch": 1.5622895622895623,
"grad_norm": 3.3221216201782227,
"learning_rate": 8.231006620298324e-06,
"loss": 0.8715,
"step": 3074
},
{
"epoch": 1.5627977892128837,
"grad_norm": 3.3187458515167236,
"learning_rate": 8.229723359534227e-06,
"loss": 0.8981,
"step": 3075
},
{
"epoch": 1.5633060161362047,
"grad_norm": 3.0759851932525635,
"learning_rate": 8.228439733602417e-06,
"loss": 0.7856,
"step": 3076
},
{
"epoch": 1.563814243059526,
"grad_norm": 3.011303186416626,
"learning_rate": 8.227155742648034e-06,
"loss": 0.8163,
"step": 3077
},
{
"epoch": 1.5643224699828473,
"grad_norm": 3.2420897483825684,
"learning_rate": 8.225871386816246e-06,
"loss": 0.8399,
"step": 3078
},
{
"epoch": 1.5648306969061685,
"grad_norm": 3.1554501056671143,
"learning_rate": 8.22458666625227e-06,
"loss": 0.8572,
"step": 3079
},
{
"epoch": 1.56533892382949,
"grad_norm": 3.1208579540252686,
"learning_rate": 8.223301581101362e-06,
"loss": 0.894,
"step": 3080
},
{
"epoch": 1.5658471507528111,
"grad_norm": 3.216609001159668,
"learning_rate": 8.222016131508822e-06,
"loss": 0.7723,
"step": 3081
},
{
"epoch": 1.5663553776761323,
"grad_norm": 3.1499931812286377,
"learning_rate": 8.220730317619984e-06,
"loss": 0.7767,
"step": 3082
},
{
"epoch": 1.5668636045994537,
"grad_norm": 3.308377742767334,
"learning_rate": 8.219444139580233e-06,
"loss": 0.8795,
"step": 3083
},
{
"epoch": 1.567371831522775,
"grad_norm": 3.081089735031128,
"learning_rate": 8.218157597534989e-06,
"loss": 0.7532,
"step": 3084
},
{
"epoch": 1.5678800584460961,
"grad_norm": 3.2779386043548584,
"learning_rate": 8.216870691629715e-06,
"loss": 0.8305,
"step": 3085
},
{
"epoch": 1.5683882853694175,
"grad_norm": 3.1625919342041016,
"learning_rate": 8.215583422009912e-06,
"loss": 0.8548,
"step": 3086
},
{
"epoch": 1.5688965122927387,
"grad_norm": 3.231231451034546,
"learning_rate": 8.214295788821128e-06,
"loss": 0.8647,
"step": 3087
},
{
"epoch": 1.56940473921606,
"grad_norm": 3.0235724449157715,
"learning_rate": 8.213007792208946e-06,
"loss": 0.8357,
"step": 3088
},
{
"epoch": 1.5699129661393814,
"grad_norm": 3.2855448722839355,
"learning_rate": 8.211719432318996e-06,
"loss": 0.8629,
"step": 3089
},
{
"epoch": 1.5704211930627023,
"grad_norm": 3.349738121032715,
"learning_rate": 8.210430709296946e-06,
"loss": 0.8685,
"step": 3090
},
{
"epoch": 1.5709294199860238,
"grad_norm": 3.026463031768799,
"learning_rate": 8.209141623288501e-06,
"loss": 0.8174,
"step": 3091
},
{
"epoch": 1.5714376469093452,
"grad_norm": 3.2298712730407715,
"learning_rate": 8.207852174439415e-06,
"loss": 0.8269,
"step": 3092
},
{
"epoch": 1.5719458738326662,
"grad_norm": 3.0465500354766846,
"learning_rate": 8.206562362895476e-06,
"loss": 0.8116,
"step": 3093
},
{
"epoch": 1.5724541007559876,
"grad_norm": 3.303372859954834,
"learning_rate": 8.20527218880252e-06,
"loss": 0.8121,
"step": 3094
},
{
"epoch": 1.5729623276793088,
"grad_norm": 3.1203267574310303,
"learning_rate": 8.203981652306418e-06,
"loss": 0.7643,
"step": 3095
},
{
"epoch": 1.57347055460263,
"grad_norm": 3.2606565952301025,
"learning_rate": 8.202690753553083e-06,
"loss": 0.8244,
"step": 3096
},
{
"epoch": 1.5739787815259514,
"grad_norm": 3.0706636905670166,
"learning_rate": 8.201399492688474e-06,
"loss": 0.8284,
"step": 3097
},
{
"epoch": 1.5744870084492726,
"grad_norm": 3.146022081375122,
"learning_rate": 8.20010786985858e-06,
"loss": 0.9854,
"step": 3098
},
{
"epoch": 1.5749952353725938,
"grad_norm": 3.0561680793762207,
"learning_rate": 8.198815885209445e-06,
"loss": 0.8211,
"step": 3099
},
{
"epoch": 1.5755034622959152,
"grad_norm": 3.139600992202759,
"learning_rate": 8.197523538887144e-06,
"loss": 0.7939,
"step": 3100
},
{
"epoch": 1.5760116892192364,
"grad_norm": 3.0058977603912354,
"learning_rate": 8.196230831037797e-06,
"loss": 0.7286,
"step": 3101
},
{
"epoch": 1.5765199161425576,
"grad_norm": 3.07700777053833,
"learning_rate": 8.194937761807561e-06,
"loss": 0.7964,
"step": 3102
},
{
"epoch": 1.577028143065879,
"grad_norm": 2.9995245933532715,
"learning_rate": 8.193644331342639e-06,
"loss": 0.8075,
"step": 3103
},
{
"epoch": 1.5775363699892002,
"grad_norm": 3.1165170669555664,
"learning_rate": 8.19235053978927e-06,
"loss": 0.8286,
"step": 3104
},
{
"epoch": 1.5780445969125214,
"grad_norm": 3.026459217071533,
"learning_rate": 8.19105638729374e-06,
"loss": 0.7808,
"step": 3105
},
{
"epoch": 1.5785528238358428,
"grad_norm": 3.1128146648406982,
"learning_rate": 8.189761874002369e-06,
"loss": 0.7671,
"step": 3106
},
{
"epoch": 1.5790610507591638,
"grad_norm": 3.3012728691101074,
"learning_rate": 8.18846700006152e-06,
"loss": 0.8824,
"step": 3107
},
{
"epoch": 1.5795692776824852,
"grad_norm": 3.106581211090088,
"learning_rate": 8.187171765617598e-06,
"loss": 0.8511,
"step": 3108
},
{
"epoch": 1.5800775046058066,
"grad_norm": 3.08072566986084,
"learning_rate": 8.18587617081705e-06,
"loss": 0.8367,
"step": 3109
},
{
"epoch": 1.5805857315291276,
"grad_norm": 3.067379951477051,
"learning_rate": 8.184580215806363e-06,
"loss": 0.7869,
"step": 3110
},
{
"epoch": 1.581093958452449,
"grad_norm": 3.0315959453582764,
"learning_rate": 8.18328390073206e-06,
"loss": 0.8744,
"step": 3111
},
{
"epoch": 1.5816021853757702,
"grad_norm": 2.9520187377929688,
"learning_rate": 8.181987225740711e-06,
"loss": 0.7672,
"step": 3112
},
{
"epoch": 1.5821104122990914,
"grad_norm": 2.9568943977355957,
"learning_rate": 8.180690190978923e-06,
"loss": 0.8574,
"step": 3113
},
{
"epoch": 1.5826186392224129,
"grad_norm": 3.3284239768981934,
"learning_rate": 8.179392796593346e-06,
"loss": 0.8003,
"step": 3114
},
{
"epoch": 1.583126866145734,
"grad_norm": 3.1131980419158936,
"learning_rate": 8.17809504273067e-06,
"loss": 0.8305,
"step": 3115
},
{
"epoch": 1.5836350930690553,
"grad_norm": 3.2879278659820557,
"learning_rate": 8.176796929537622e-06,
"loss": 0.8894,
"step": 3116
},
{
"epoch": 1.5841433199923767,
"grad_norm": 3.3802006244659424,
"learning_rate": 8.175498457160976e-06,
"loss": 0.846,
"step": 3117
},
{
"epoch": 1.5846515469156979,
"grad_norm": 3.263233184814453,
"learning_rate": 8.174199625747542e-06,
"loss": 0.8689,
"step": 3118
},
{
"epoch": 1.585159773839019,
"grad_norm": 3.2811408042907715,
"learning_rate": 8.172900435444174e-06,
"loss": 0.8363,
"step": 3119
},
{
"epoch": 1.5856680007623405,
"grad_norm": 3.4866831302642822,
"learning_rate": 8.17160088639776e-06,
"loss": 0.8864,
"step": 3120
},
{
"epoch": 1.5861762276856617,
"grad_norm": 3.2428488731384277,
"learning_rate": 8.170300978755236e-06,
"loss": 0.8778,
"step": 3121
},
{
"epoch": 1.5866844546089829,
"grad_norm": 3.249417543411255,
"learning_rate": 8.169000712663577e-06,
"loss": 0.8464,
"step": 3122
},
{
"epoch": 1.5871926815323043,
"grad_norm": 3.479041576385498,
"learning_rate": 8.167700088269796e-06,
"loss": 0.8951,
"step": 3123
},
{
"epoch": 1.5877009084556253,
"grad_norm": 3.032106637954712,
"learning_rate": 8.166399105720946e-06,
"loss": 0.8026,
"step": 3124
},
{
"epoch": 1.5882091353789467,
"grad_norm": 2.91414737701416,
"learning_rate": 8.165097765164126e-06,
"loss": 0.8015,
"step": 3125
},
{
"epoch": 1.5887173623022681,
"grad_norm": 2.9475793838500977,
"learning_rate": 8.163796066746468e-06,
"loss": 0.7377,
"step": 3126
},
{
"epoch": 1.589225589225589,
"grad_norm": 3.372371196746826,
"learning_rate": 8.16249401061515e-06,
"loss": 0.86,
"step": 3127
},
{
"epoch": 1.5897338161489105,
"grad_norm": 3.2583720684051514,
"learning_rate": 8.161191596917385e-06,
"loss": 0.9854,
"step": 3128
},
{
"epoch": 1.5902420430722317,
"grad_norm": 3.0136237144470215,
"learning_rate": 8.159888825800439e-06,
"loss": 0.8749,
"step": 3129
},
{
"epoch": 1.590750269995553,
"grad_norm": 2.987494707107544,
"learning_rate": 8.158585697411601e-06,
"loss": 0.8088,
"step": 3130
},
{
"epoch": 1.5912584969188743,
"grad_norm": 3.117647409439087,
"learning_rate": 8.15728221189821e-06,
"loss": 0.8514,
"step": 3131
},
{
"epoch": 1.5917667238421955,
"grad_norm": 3.0407848358154297,
"learning_rate": 8.155978369407647e-06,
"loss": 0.9176,
"step": 3132
},
{
"epoch": 1.5922749507655167,
"grad_norm": 3.0892696380615234,
"learning_rate": 8.154674170087328e-06,
"loss": 0.8179,
"step": 3133
},
{
"epoch": 1.5927831776888381,
"grad_norm": 2.9991137981414795,
"learning_rate": 8.153369614084713e-06,
"loss": 0.8015,
"step": 3134
},
{
"epoch": 1.5932914046121593,
"grad_norm": 3.2096457481384277,
"learning_rate": 8.152064701547304e-06,
"loss": 0.932,
"step": 3135
},
{
"epoch": 1.5937996315354805,
"grad_norm": 3.3632469177246094,
"learning_rate": 8.150759432622635e-06,
"loss": 0.8488,
"step": 3136
},
{
"epoch": 1.594307858458802,
"grad_norm": 3.230520009994507,
"learning_rate": 8.14945380745829e-06,
"loss": 0.8742,
"step": 3137
},
{
"epoch": 1.5948160853821232,
"grad_norm": 3.2006702423095703,
"learning_rate": 8.148147826201887e-06,
"loss": 0.8101,
"step": 3138
},
{
"epoch": 1.5953243123054444,
"grad_norm": 3.0946967601776123,
"learning_rate": 8.146841489001089e-06,
"loss": 0.885,
"step": 3139
},
{
"epoch": 1.5958325392287658,
"grad_norm": 3.1396210193634033,
"learning_rate": 8.145534796003593e-06,
"loss": 0.8769,
"step": 3140
},
{
"epoch": 1.5963407661520868,
"grad_norm": 3.229386329650879,
"learning_rate": 8.144227747357142e-06,
"loss": 0.846,
"step": 3141
},
{
"epoch": 1.5968489930754082,
"grad_norm": 3.0499179363250732,
"learning_rate": 8.142920343209516e-06,
"loss": 0.8342,
"step": 3142
},
{
"epoch": 1.5973572199987296,
"grad_norm": 2.994961738586426,
"learning_rate": 8.141612583708539e-06,
"loss": 0.8829,
"step": 3143
},
{
"epoch": 1.5978654469220506,
"grad_norm": 2.935119390487671,
"learning_rate": 8.14030446900207e-06,
"loss": 0.8191,
"step": 3144
},
{
"epoch": 1.598373673845372,
"grad_norm": 3.3414881229400635,
"learning_rate": 8.138995999238011e-06,
"loss": 0.8305,
"step": 3145
},
{
"epoch": 1.5988819007686932,
"grad_norm": 3.234374761581421,
"learning_rate": 8.137687174564303e-06,
"loss": 0.9135,
"step": 3146
},
{
"epoch": 1.5993901276920144,
"grad_norm": 3.135486602783203,
"learning_rate": 8.136377995128929e-06,
"loss": 0.8391,
"step": 3147
},
{
"epoch": 1.5998983546153358,
"grad_norm": 2.8271825313568115,
"learning_rate": 8.135068461079912e-06,
"loss": 0.8114,
"step": 3148
},
{
"epoch": 1.600406581538657,
"grad_norm": 3.3534281253814697,
"learning_rate": 8.13375857256531e-06,
"loss": 0.8856,
"step": 3149
},
{
"epoch": 1.6009148084619782,
"grad_norm": 2.902682065963745,
"learning_rate": 8.13244832973323e-06,
"loss": 0.8384,
"step": 3150
},
{
"epoch": 1.6014230353852996,
"grad_norm": 3.036695718765259,
"learning_rate": 8.131137732731811e-06,
"loss": 0.9197,
"step": 3151
},
{
"epoch": 1.6019312623086208,
"grad_norm": 2.823070526123047,
"learning_rate": 8.129826781709239e-06,
"loss": 0.8652,
"step": 3152
},
{
"epoch": 1.602439489231942,
"grad_norm": 3.1444478034973145,
"learning_rate": 8.12851547681373e-06,
"loss": 0.783,
"step": 3153
},
{
"epoch": 1.6029477161552634,
"grad_norm": 2.9253718852996826,
"learning_rate": 8.127203818193551e-06,
"loss": 0.8148,
"step": 3154
},
{
"epoch": 1.6034559430785846,
"grad_norm": 3.1179044246673584,
"learning_rate": 8.125891805997005e-06,
"loss": 0.8942,
"step": 3155
},
{
"epoch": 1.6039641700019058,
"grad_norm": 3.1189663410186768,
"learning_rate": 8.12457944037243e-06,
"loss": 0.8296,
"step": 3156
},
{
"epoch": 1.6044723969252273,
"grad_norm": 3.124115228652954,
"learning_rate": 8.123266721468212e-06,
"loss": 0.8175,
"step": 3157
},
{
"epoch": 1.6049806238485482,
"grad_norm": 3.2029671669006348,
"learning_rate": 8.121953649432772e-06,
"loss": 0.8313,
"step": 3158
},
{
"epoch": 1.6054888507718696,
"grad_norm": 3.090684175491333,
"learning_rate": 8.120640224414572e-06,
"loss": 0.7608,
"step": 3159
},
{
"epoch": 1.605997077695191,
"grad_norm": 3.2074391841888428,
"learning_rate": 8.119326446562112e-06,
"loss": 0.864,
"step": 3160
},
{
"epoch": 1.606505304618512,
"grad_norm": 3.1305856704711914,
"learning_rate": 8.118012316023939e-06,
"loss": 0.8679,
"step": 3161
},
{
"epoch": 1.6070135315418335,
"grad_norm": 3.354135274887085,
"learning_rate": 8.11669783294863e-06,
"loss": 0.9898,
"step": 3162
},
{
"epoch": 1.6075217584651547,
"grad_norm": 3.194979190826416,
"learning_rate": 8.115382997484809e-06,
"loss": 0.7727,
"step": 3163
},
{
"epoch": 1.6080299853884759,
"grad_norm": 3.311617374420166,
"learning_rate": 8.114067809781137e-06,
"loss": 0.9731,
"step": 3164
},
{
"epoch": 1.6085382123117973,
"grad_norm": 3.249483585357666,
"learning_rate": 8.112752269986314e-06,
"loss": 0.8348,
"step": 3165
},
{
"epoch": 1.6090464392351185,
"grad_norm": 2.845046043395996,
"learning_rate": 8.111436378249085e-06,
"loss": 0.7932,
"step": 3166
},
{
"epoch": 1.6095546661584397,
"grad_norm": 3.1641786098480225,
"learning_rate": 8.110120134718224e-06,
"loss": 0.8059,
"step": 3167
},
{
"epoch": 1.610062893081761,
"grad_norm": 3.048527479171753,
"learning_rate": 8.10880353954256e-06,
"loss": 0.7602,
"step": 3168
},
{
"epoch": 1.6105711200050823,
"grad_norm": 3.179840564727783,
"learning_rate": 8.107486592870945e-06,
"loss": 0.9068,
"step": 3169
},
{
"epoch": 1.6110793469284035,
"grad_norm": 3.40436053276062,
"learning_rate": 8.106169294852288e-06,
"loss": 0.8295,
"step": 3170
},
{
"epoch": 1.611587573851725,
"grad_norm": 3.0481929779052734,
"learning_rate": 8.104851645635521e-06,
"loss": 0.7796,
"step": 3171
},
{
"epoch": 1.612095800775046,
"grad_norm": 2.995546817779541,
"learning_rate": 8.103533645369629e-06,
"loss": 0.873,
"step": 3172
},
{
"epoch": 1.6126040276983673,
"grad_norm": 3.2442634105682373,
"learning_rate": 8.102215294203627e-06,
"loss": 1.0155,
"step": 3173
},
{
"epoch": 1.6131122546216887,
"grad_norm": 3.1833958625793457,
"learning_rate": 8.100896592286579e-06,
"loss": 0.8552,
"step": 3174
},
{
"epoch": 1.6136204815450097,
"grad_norm": 3.268798351287842,
"learning_rate": 8.099577539767578e-06,
"loss": 0.8518,
"step": 3175
},
{
"epoch": 1.6141287084683311,
"grad_norm": 3.209165334701538,
"learning_rate": 8.098258136795767e-06,
"loss": 0.8605,
"step": 3176
},
{
"epoch": 1.6146369353916525,
"grad_norm": 3.4300894737243652,
"learning_rate": 8.096938383520323e-06,
"loss": 0.8265,
"step": 3177
},
{
"epoch": 1.6151451623149735,
"grad_norm": 3.218397378921509,
"learning_rate": 8.09561828009046e-06,
"loss": 0.8257,
"step": 3178
},
{
"epoch": 1.615653389238295,
"grad_norm": 3.162224292755127,
"learning_rate": 8.09429782665544e-06,
"loss": 0.8665,
"step": 3179
},
{
"epoch": 1.6161616161616161,
"grad_norm": 3.493285894393921,
"learning_rate": 8.092977023364556e-06,
"loss": 0.7889,
"step": 3180
},
{
"epoch": 1.6166698430849373,
"grad_norm": 3.060194492340088,
"learning_rate": 8.091655870367146e-06,
"loss": 0.8791,
"step": 3181
},
{
"epoch": 1.6171780700082588,
"grad_norm": 2.9806981086730957,
"learning_rate": 8.090334367812584e-06,
"loss": 0.7623,
"step": 3182
},
{
"epoch": 1.61768629693158,
"grad_norm": 3.182471513748169,
"learning_rate": 8.08901251585029e-06,
"loss": 0.9179,
"step": 3183
},
{
"epoch": 1.6181945238549011,
"grad_norm": 2.998816728591919,
"learning_rate": 8.087690314629712e-06,
"loss": 0.8197,
"step": 3184
},
{
"epoch": 1.6187027507782226,
"grad_norm": 2.9710581302642822,
"learning_rate": 8.086367764300352e-06,
"loss": 0.8487,
"step": 3185
},
{
"epoch": 1.6192109777015438,
"grad_norm": 3.1282782554626465,
"learning_rate": 8.085044865011735e-06,
"loss": 0.7931,
"step": 3186
},
{
"epoch": 1.619719204624865,
"grad_norm": 3.08868408203125,
"learning_rate": 8.083721616913441e-06,
"loss": 0.8249,
"step": 3187
},
{
"epoch": 1.6202274315481864,
"grad_norm": 3.246670722961426,
"learning_rate": 8.08239802015508e-06,
"loss": 0.7897,
"step": 3188
},
{
"epoch": 1.6207356584715076,
"grad_norm": 3.136277437210083,
"learning_rate": 8.081074074886303e-06,
"loss": 0.8653,
"step": 3189
},
{
"epoch": 1.6212438853948288,
"grad_norm": 3.2505767345428467,
"learning_rate": 8.079749781256806e-06,
"loss": 0.8833,
"step": 3190
},
{
"epoch": 1.6217521123181502,
"grad_norm": 3.1759870052337646,
"learning_rate": 8.078425139416314e-06,
"loss": 0.8268,
"step": 3191
},
{
"epoch": 1.6222603392414712,
"grad_norm": 3.146894931793213,
"learning_rate": 8.077100149514601e-06,
"loss": 0.7529,
"step": 3192
},
{
"epoch": 1.6227685661647926,
"grad_norm": 3.2407381534576416,
"learning_rate": 8.075774811701477e-06,
"loss": 0.8144,
"step": 3193
},
{
"epoch": 1.6232767930881138,
"grad_norm": 3.037705421447754,
"learning_rate": 8.074449126126788e-06,
"loss": 0.8034,
"step": 3194
},
{
"epoch": 1.623785020011435,
"grad_norm": 3.1687588691711426,
"learning_rate": 8.073123092940424e-06,
"loss": 0.8729,
"step": 3195
},
{
"epoch": 1.6242932469347564,
"grad_norm": 3.1716537475585938,
"learning_rate": 8.071796712292313e-06,
"loss": 0.8498,
"step": 3196
},
{
"epoch": 1.6248014738580776,
"grad_norm": 3.520030975341797,
"learning_rate": 8.070469984332421e-06,
"loss": 0.9367,
"step": 3197
},
{
"epoch": 1.6253097007813988,
"grad_norm": 3.2190101146698,
"learning_rate": 8.069142909210755e-06,
"loss": 0.7717,
"step": 3198
},
{
"epoch": 1.6258179277047202,
"grad_norm": 3.204716205596924,
"learning_rate": 8.067815487077357e-06,
"loss": 0.9277,
"step": 3199
},
{
"epoch": 1.6263261546280414,
"grad_norm": 2.906593084335327,
"learning_rate": 8.066487718082316e-06,
"loss": 0.8637,
"step": 3200
},
{
"epoch": 1.6268343815513626,
"grad_norm": 3.077334403991699,
"learning_rate": 8.065159602375754e-06,
"loss": 0.8172,
"step": 3201
},
{
"epoch": 1.627342608474684,
"grad_norm": 3.0299875736236572,
"learning_rate": 8.063831140107834e-06,
"loss": 0.8891,
"step": 3202
},
{
"epoch": 1.6278508353980052,
"grad_norm": 3.038489580154419,
"learning_rate": 8.06250233142876e-06,
"loss": 0.8571,
"step": 3203
},
{
"epoch": 1.6283590623213264,
"grad_norm": 3.1936428546905518,
"learning_rate": 8.061173176488769e-06,
"loss": 0.8191,
"step": 3204
},
{
"epoch": 1.6288672892446479,
"grad_norm": 2.9855539798736572,
"learning_rate": 8.059843675438144e-06,
"loss": 0.8109,
"step": 3205
},
{
"epoch": 1.629375516167969,
"grad_norm": 3.1044156551361084,
"learning_rate": 8.058513828427206e-06,
"loss": 0.8533,
"step": 3206
},
{
"epoch": 1.6298837430912902,
"grad_norm": 3.6107935905456543,
"learning_rate": 8.057183635606312e-06,
"loss": 0.9247,
"step": 3207
},
{
"epoch": 1.6303919700146117,
"grad_norm": 3.2537338733673096,
"learning_rate": 8.055853097125858e-06,
"loss": 0.8406,
"step": 3208
},
{
"epoch": 1.6309001969379326,
"grad_norm": 3.1173675060272217,
"learning_rate": 8.054522213136287e-06,
"loss": 0.7766,
"step": 3209
},
{
"epoch": 1.631408423861254,
"grad_norm": 3.2477848529815674,
"learning_rate": 8.05319098378807e-06,
"loss": 0.8719,
"step": 3210
},
{
"epoch": 1.6319166507845753,
"grad_norm": 3.6281533241271973,
"learning_rate": 8.051859409231723e-06,
"loss": 0.8705,
"step": 3211
},
{
"epoch": 1.6324248777078965,
"grad_norm": 3.104458808898926,
"learning_rate": 8.0505274896178e-06,
"loss": 0.8385,
"step": 3212
},
{
"epoch": 1.6329331046312179,
"grad_norm": 3.092541456222534,
"learning_rate": 8.049195225096897e-06,
"loss": 0.9495,
"step": 3213
},
{
"epoch": 1.633441331554539,
"grad_norm": 3.2451331615448,
"learning_rate": 8.047862615819642e-06,
"loss": 0.8221,
"step": 3214
},
{
"epoch": 1.6339495584778603,
"grad_norm": 3.119635820388794,
"learning_rate": 8.046529661936707e-06,
"loss": 0.8372,
"step": 3215
},
{
"epoch": 1.6344577854011817,
"grad_norm": 3.5131008625030518,
"learning_rate": 8.045196363598802e-06,
"loss": 0.897,
"step": 3216
},
{
"epoch": 1.634966012324503,
"grad_norm": 3.041543960571289,
"learning_rate": 8.04386272095668e-06,
"loss": 0.8203,
"step": 3217
},
{
"epoch": 1.635474239247824,
"grad_norm": 4.3333587646484375,
"learning_rate": 8.042528734161123e-06,
"loss": 0.8801,
"step": 3218
},
{
"epoch": 1.6359824661711455,
"grad_norm": 3.03456974029541,
"learning_rate": 8.04119440336296e-06,
"loss": 0.8274,
"step": 3219
},
{
"epoch": 1.6364906930944667,
"grad_norm": 3.17861008644104,
"learning_rate": 8.039859728713058e-06,
"loss": 0.8268,
"step": 3220
},
{
"epoch": 1.636998920017788,
"grad_norm": 3.216559648513794,
"learning_rate": 8.038524710362321e-06,
"loss": 0.8748,
"step": 3221
},
{
"epoch": 1.6375071469411093,
"grad_norm": 2.9259185791015625,
"learning_rate": 8.037189348461692e-06,
"loss": 0.8382,
"step": 3222
},
{
"epoch": 1.6380153738644303,
"grad_norm": 2.898538589477539,
"learning_rate": 8.035853643162153e-06,
"loss": 0.7463,
"step": 3223
},
{
"epoch": 1.6385236007877517,
"grad_norm": 3.110093593597412,
"learning_rate": 8.034517594614726e-06,
"loss": 0.8093,
"step": 3224
},
{
"epoch": 1.6390318277110731,
"grad_norm": 3.151292085647583,
"learning_rate": 8.033181202970471e-06,
"loss": 0.8397,
"step": 3225
},
{
"epoch": 1.6395400546343941,
"grad_norm": 3.235694408416748,
"learning_rate": 8.031844468380485e-06,
"loss": 0.9665,
"step": 3226
},
{
"epoch": 1.6400482815577155,
"grad_norm": 3.0993845462799072,
"learning_rate": 8.030507390995907e-06,
"loss": 0.8412,
"step": 3227
},
{
"epoch": 1.6405565084810367,
"grad_norm": 3.3848185539245605,
"learning_rate": 8.029169970967914e-06,
"loss": 0.9206,
"step": 3228
},
{
"epoch": 1.641064735404358,
"grad_norm": 3.3129689693450928,
"learning_rate": 8.027832208447719e-06,
"loss": 0.8809,
"step": 3229
},
{
"epoch": 1.6415729623276794,
"grad_norm": 3.0754380226135254,
"learning_rate": 8.026494103586577e-06,
"loss": 0.804,
"step": 3230
},
{
"epoch": 1.6420811892510006,
"grad_norm": 3.0243043899536133,
"learning_rate": 8.025155656535782e-06,
"loss": 0.7182,
"step": 3231
},
{
"epoch": 1.6425894161743217,
"grad_norm": 3.0670719146728516,
"learning_rate": 8.02381686744666e-06,
"loss": 0.8181,
"step": 3232
},
{
"epoch": 1.6430976430976432,
"grad_norm": 3.205423355102539,
"learning_rate": 8.022477736470584e-06,
"loss": 0.8251,
"step": 3233
},
{
"epoch": 1.6436058700209644,
"grad_norm": 3.2314603328704834,
"learning_rate": 8.021138263758966e-06,
"loss": 0.8689,
"step": 3234
},
{
"epoch": 1.6441140969442856,
"grad_norm": 3.0328774452209473,
"learning_rate": 8.019798449463248e-06,
"loss": 0.7866,
"step": 3235
},
{
"epoch": 1.644622323867607,
"grad_norm": 3.1050779819488525,
"learning_rate": 8.018458293734917e-06,
"loss": 0.8379,
"step": 3236
},
{
"epoch": 1.6451305507909282,
"grad_norm": 3.1296982765197754,
"learning_rate": 8.017117796725495e-06,
"loss": 0.7903,
"step": 3237
},
{
"epoch": 1.6456387777142494,
"grad_norm": 3.1918692588806152,
"learning_rate": 8.015776958586553e-06,
"loss": 0.8031,
"step": 3238
},
{
"epoch": 1.6461470046375708,
"grad_norm": 3.2104053497314453,
"learning_rate": 8.014435779469682e-06,
"loss": 0.866,
"step": 3239
},
{
"epoch": 1.6466552315608918,
"grad_norm": 3.264033079147339,
"learning_rate": 8.013094259526528e-06,
"loss": 0.824,
"step": 3240
},
{
"epoch": 1.6471634584842132,
"grad_norm": 3.0460946559906006,
"learning_rate": 8.011752398908771e-06,
"loss": 0.824,
"step": 3241
},
{
"epoch": 1.6476716854075346,
"grad_norm": 3.3134658336639404,
"learning_rate": 8.010410197768123e-06,
"loss": 0.8077,
"step": 3242
},
{
"epoch": 1.6481799123308556,
"grad_norm": 3.2771031856536865,
"learning_rate": 8.009067656256344e-06,
"loss": 0.8466,
"step": 3243
},
{
"epoch": 1.648688139254177,
"grad_norm": 3.121896982192993,
"learning_rate": 8.007724774525225e-06,
"loss": 0.7764,
"step": 3244
},
{
"epoch": 1.6491963661774982,
"grad_norm": 3.2331111431121826,
"learning_rate": 8.006381552726601e-06,
"loss": 0.7678,
"step": 3245
},
{
"epoch": 1.6497045931008194,
"grad_norm": 3.142518997192383,
"learning_rate": 8.005037991012341e-06,
"loss": 0.8648,
"step": 3246
},
{
"epoch": 1.6502128200241408,
"grad_norm": 3.501854181289673,
"learning_rate": 8.003694089534355e-06,
"loss": 0.7738,
"step": 3247
},
{
"epoch": 1.650721046947462,
"grad_norm": 3.3636884689331055,
"learning_rate": 8.00234984844459e-06,
"loss": 0.8262,
"step": 3248
},
{
"epoch": 1.6512292738707832,
"grad_norm": 3.1698949337005615,
"learning_rate": 8.001005267895034e-06,
"loss": 0.8882,
"step": 3249
},
{
"epoch": 1.6517375007941046,
"grad_norm": 3.1779544353485107,
"learning_rate": 7.999660348037713e-06,
"loss": 0.9491,
"step": 3250
},
{
"epoch": 1.6522457277174258,
"grad_norm": 3.0099754333496094,
"learning_rate": 7.998315089024684e-06,
"loss": 0.7621,
"step": 3251
},
{
"epoch": 1.652753954640747,
"grad_norm": 3.006117582321167,
"learning_rate": 7.996969491008054e-06,
"loss": 0.7613,
"step": 3252
},
{
"epoch": 1.6532621815640685,
"grad_norm": 3.1954116821289062,
"learning_rate": 7.99562355413996e-06,
"loss": 0.9564,
"step": 3253
},
{
"epoch": 1.6537704084873897,
"grad_norm": 3.165761947631836,
"learning_rate": 7.994277278572581e-06,
"loss": 0.8525,
"step": 3254
},
{
"epoch": 1.6542786354107109,
"grad_norm": 2.9796812534332275,
"learning_rate": 7.992930664458131e-06,
"loss": 0.7416,
"step": 3255
},
{
"epoch": 1.6547868623340323,
"grad_norm": 3.133790969848633,
"learning_rate": 7.99158371194887e-06,
"loss": 0.8482,
"step": 3256
},
{
"epoch": 1.6552950892573532,
"grad_norm": 3.0982847213745117,
"learning_rate": 7.990236421197084e-06,
"loss": 0.8582,
"step": 3257
},
{
"epoch": 1.6558033161806747,
"grad_norm": 3.39365816116333,
"learning_rate": 7.98888879235511e-06,
"loss": 0.8901,
"step": 3258
},
{
"epoch": 1.656311543103996,
"grad_norm": 3.165888547897339,
"learning_rate": 7.987540825575313e-06,
"loss": 0.9455,
"step": 3259
},
{
"epoch": 1.656819770027317,
"grad_norm": 3.2440237998962402,
"learning_rate": 7.986192521010103e-06,
"loss": 0.7762,
"step": 3260
},
{
"epoch": 1.6573279969506385,
"grad_norm": 3.042271375656128,
"learning_rate": 7.984843878811924e-06,
"loss": 0.8588,
"step": 3261
},
{
"epoch": 1.6578362238739597,
"grad_norm": 3.1160874366760254,
"learning_rate": 7.983494899133259e-06,
"loss": 0.8799,
"step": 3262
},
{
"epoch": 1.6583444507972809,
"grad_norm": 3.0635807514190674,
"learning_rate": 7.982145582126633e-06,
"loss": 0.817,
"step": 3263
},
{
"epoch": 1.6588526777206023,
"grad_norm": 3.40155029296875,
"learning_rate": 7.980795927944602e-06,
"loss": 0.9681,
"step": 3264
},
{
"epoch": 1.6593609046439235,
"grad_norm": 3.1403932571411133,
"learning_rate": 7.979445936739769e-06,
"loss": 0.833,
"step": 3265
},
{
"epoch": 1.6598691315672447,
"grad_norm": 3.3115484714508057,
"learning_rate": 7.97809560866477e-06,
"loss": 0.8623,
"step": 3266
},
{
"epoch": 1.6603773584905661,
"grad_norm": 3.2069787979125977,
"learning_rate": 7.976744943872274e-06,
"loss": 0.821,
"step": 3267
},
{
"epoch": 1.6608855854138873,
"grad_norm": 3.360119581222534,
"learning_rate": 7.975393942514998e-06,
"loss": 0.8245,
"step": 3268
},
{
"epoch": 1.6613938123372085,
"grad_norm": 3.2077269554138184,
"learning_rate": 7.974042604745692e-06,
"loss": 0.8357,
"step": 3269
},
{
"epoch": 1.66190203926053,
"grad_norm": 2.924471616744995,
"learning_rate": 7.972690930717145e-06,
"loss": 0.7866,
"step": 3270
},
{
"epoch": 1.6624102661838511,
"grad_norm": 3.392030715942383,
"learning_rate": 7.971338920582182e-06,
"loss": 0.8965,
"step": 3271
},
{
"epoch": 1.6629184931071723,
"grad_norm": 2.932337522506714,
"learning_rate": 7.969986574493667e-06,
"loss": 0.7455,
"step": 3272
},
{
"epoch": 1.6634267200304937,
"grad_norm": 3.115884780883789,
"learning_rate": 7.968633892604508e-06,
"loss": 0.8043,
"step": 3273
},
{
"epoch": 1.6639349469538147,
"grad_norm": 3.195850372314453,
"learning_rate": 7.967280875067638e-06,
"loss": 0.871,
"step": 3274
},
{
"epoch": 1.6644431738771361,
"grad_norm": 3.040839433670044,
"learning_rate": 7.965927522036041e-06,
"loss": 0.867,
"step": 3275
},
{
"epoch": 1.6649514008004576,
"grad_norm": 3.0806403160095215,
"learning_rate": 7.964573833662731e-06,
"loss": 0.8094,
"step": 3276
},
{
"epoch": 1.6654596277237785,
"grad_norm": 2.9806809425354004,
"learning_rate": 7.963219810100765e-06,
"loss": 0.9022,
"step": 3277
},
{
"epoch": 1.6659678546471,
"grad_norm": 3.1467132568359375,
"learning_rate": 7.96186545150323e-06,
"loss": 0.8511,
"step": 3278
},
{
"epoch": 1.6664760815704212,
"grad_norm": 2.929919481277466,
"learning_rate": 7.960510758023261e-06,
"loss": 0.8277,
"step": 3279
},
{
"epoch": 1.6669843084937424,
"grad_norm": 3.274540662765503,
"learning_rate": 7.959155729814025e-06,
"loss": 0.8846,
"step": 3280
},
{
"epoch": 1.6674925354170638,
"grad_norm": 2.9907915592193604,
"learning_rate": 7.957800367028726e-06,
"loss": 0.7783,
"step": 3281
},
{
"epoch": 1.668000762340385,
"grad_norm": 3.237807035446167,
"learning_rate": 7.956444669820611e-06,
"loss": 0.7738,
"step": 3282
},
{
"epoch": 1.6685089892637062,
"grad_norm": 2.7499542236328125,
"learning_rate": 7.955088638342959e-06,
"loss": 0.7801,
"step": 3283
},
{
"epoch": 1.6690172161870276,
"grad_norm": 3.229651927947998,
"learning_rate": 7.953732272749089e-06,
"loss": 0.8682,
"step": 3284
},
{
"epoch": 1.6695254431103488,
"grad_norm": 2.972989320755005,
"learning_rate": 7.95237557319236e-06,
"loss": 0.791,
"step": 3285
},
{
"epoch": 1.67003367003367,
"grad_norm": 3.0465450286865234,
"learning_rate": 7.951018539826162e-06,
"loss": 0.7577,
"step": 3286
},
{
"epoch": 1.6705418969569914,
"grad_norm": 3.4167490005493164,
"learning_rate": 7.949661172803935e-06,
"loss": 0.9066,
"step": 3287
},
{
"epoch": 1.6710501238803126,
"grad_norm": 3.232654094696045,
"learning_rate": 7.948303472279144e-06,
"loss": 0.8161,
"step": 3288
},
{
"epoch": 1.6715583508036338,
"grad_norm": 3.0992579460144043,
"learning_rate": 7.9469454384053e-06,
"loss": 0.8447,
"step": 3289
},
{
"epoch": 1.6720665777269552,
"grad_norm": 3.0505714416503906,
"learning_rate": 7.945587071335948e-06,
"loss": 0.7353,
"step": 3290
},
{
"epoch": 1.6725748046502762,
"grad_norm": 2.9668524265289307,
"learning_rate": 7.944228371224667e-06,
"loss": 0.7479,
"step": 3291
},
{
"epoch": 1.6730830315735976,
"grad_norm": 3.2085092067718506,
"learning_rate": 7.942869338225086e-06,
"loss": 0.9215,
"step": 3292
},
{
"epoch": 1.673591258496919,
"grad_norm": 3.120911121368408,
"learning_rate": 7.941509972490856e-06,
"loss": 0.852,
"step": 3293
},
{
"epoch": 1.67409948542024,
"grad_norm": 3.314965009689331,
"learning_rate": 7.940150274175677e-06,
"loss": 0.8492,
"step": 3294
},
{
"epoch": 1.6746077123435614,
"grad_norm": 3.2626428604125977,
"learning_rate": 7.938790243433285e-06,
"loss": 0.922,
"step": 3295
},
{
"epoch": 1.6751159392668826,
"grad_norm": 3.409306049346924,
"learning_rate": 7.937429880417447e-06,
"loss": 0.8554,
"step": 3296
},
{
"epoch": 1.6756241661902038,
"grad_norm": 3.1044716835021973,
"learning_rate": 7.936069185281974e-06,
"loss": 0.8706,
"step": 3297
},
{
"epoch": 1.6761323931135252,
"grad_norm": 3.5342752933502197,
"learning_rate": 7.934708158180713e-06,
"loss": 0.8668,
"step": 3298
},
{
"epoch": 1.6766406200368464,
"grad_norm": 3.315814971923828,
"learning_rate": 7.933346799267548e-06,
"loss": 0.7991,
"step": 3299
},
{
"epoch": 1.6771488469601676,
"grad_norm": 2.979701280593872,
"learning_rate": 7.931985108696401e-06,
"loss": 0.8347,
"step": 3300
},
{
"epoch": 1.677657073883489,
"grad_norm": 3.1003923416137695,
"learning_rate": 7.93062308662123e-06,
"loss": 0.8468,
"step": 3301
},
{
"epoch": 1.6781653008068103,
"grad_norm": 3.2387659549713135,
"learning_rate": 7.929260733196032e-06,
"loss": 0.9182,
"step": 3302
},
{
"epoch": 1.6786735277301315,
"grad_norm": 3.1733248233795166,
"learning_rate": 7.927898048574841e-06,
"loss": 0.8444,
"step": 3303
},
{
"epoch": 1.6791817546534529,
"grad_norm": 3.23020076751709,
"learning_rate": 7.926535032911728e-06,
"loss": 0.8839,
"step": 3304
},
{
"epoch": 1.679689981576774,
"grad_norm": 3.439688205718994,
"learning_rate": 7.925171686360803e-06,
"loss": 0.8456,
"step": 3305
},
{
"epoch": 1.6801982085000953,
"grad_norm": 3.2128794193267822,
"learning_rate": 7.923808009076213e-06,
"loss": 0.9149,
"step": 3306
},
{
"epoch": 1.6807064354234167,
"grad_norm": 2.9014220237731934,
"learning_rate": 7.922444001212139e-06,
"loss": 0.7875,
"step": 3307
},
{
"epoch": 1.6812146623467377,
"grad_norm": 3.3359878063201904,
"learning_rate": 7.921079662922806e-06,
"loss": 0.858,
"step": 3308
},
{
"epoch": 1.681722889270059,
"grad_norm": 2.9604530334472656,
"learning_rate": 7.919714994362471e-06,
"loss": 0.7724,
"step": 3309
},
{
"epoch": 1.6822311161933805,
"grad_norm": 3.2349345684051514,
"learning_rate": 7.918349995685428e-06,
"loss": 0.8352,
"step": 3310
},
{
"epoch": 1.6827393431167015,
"grad_norm": 2.8869545459747314,
"learning_rate": 7.916984667046012e-06,
"loss": 0.7956,
"step": 3311
},
{
"epoch": 1.683247570040023,
"grad_norm": 3.074676036834717,
"learning_rate": 7.915619008598592e-06,
"loss": 0.8504,
"step": 3312
},
{
"epoch": 1.683755796963344,
"grad_norm": 3.1231634616851807,
"learning_rate": 7.914253020497577e-06,
"loss": 0.7753,
"step": 3313
},
{
"epoch": 1.6842640238866653,
"grad_norm": 3.1155753135681152,
"learning_rate": 7.912886702897413e-06,
"loss": 0.8855,
"step": 3314
},
{
"epoch": 1.6847722508099867,
"grad_norm": 3.1568148136138916,
"learning_rate": 7.911520055952581e-06,
"loss": 0.8406,
"step": 3315
},
{
"epoch": 1.685280477733308,
"grad_norm": 3.1358795166015625,
"learning_rate": 7.9101530798176e-06,
"loss": 0.8323,
"step": 3316
},
{
"epoch": 1.6857887046566291,
"grad_norm": 3.40761661529541,
"learning_rate": 7.908785774647028e-06,
"loss": 0.8595,
"step": 3317
},
{
"epoch": 1.6862969315799505,
"grad_norm": 3.5222272872924805,
"learning_rate": 7.907418140595456e-06,
"loss": 0.9113,
"step": 3318
},
{
"epoch": 1.6868051585032717,
"grad_norm": 3.2144367694854736,
"learning_rate": 7.906050177817519e-06,
"loss": 0.8071,
"step": 3319
},
{
"epoch": 1.687313385426593,
"grad_norm": 3.3410897254943848,
"learning_rate": 7.904681886467885e-06,
"loss": 0.8993,
"step": 3320
},
{
"epoch": 1.6878216123499143,
"grad_norm": 2.950131416320801,
"learning_rate": 7.903313266701256e-06,
"loss": 0.8409,
"step": 3321
},
{
"epoch": 1.6883298392732355,
"grad_norm": 3.1286795139312744,
"learning_rate": 7.901944318672377e-06,
"loss": 0.7937,
"step": 3322
},
{
"epoch": 1.6888380661965567,
"grad_norm": 3.1939430236816406,
"learning_rate": 7.90057504253603e-06,
"loss": 0.8466,
"step": 3323
},
{
"epoch": 1.6893462931198782,
"grad_norm": 3.1400716304779053,
"learning_rate": 7.899205438447028e-06,
"loss": 0.8976,
"step": 3324
},
{
"epoch": 1.6898545200431991,
"grad_norm": 3.1489381790161133,
"learning_rate": 7.897835506560226e-06,
"loss": 0.8472,
"step": 3325
},
{
"epoch": 1.6903627469665206,
"grad_norm": 3.195754289627075,
"learning_rate": 7.896465247030514e-06,
"loss": 0.8202,
"step": 3326
},
{
"epoch": 1.690870973889842,
"grad_norm": 3.4317686557769775,
"learning_rate": 7.895094660012821e-06,
"loss": 0.9097,
"step": 3327
},
{
"epoch": 1.691379200813163,
"grad_norm": 3.1709091663360596,
"learning_rate": 7.893723745662114e-06,
"loss": 0.855,
"step": 3328
},
{
"epoch": 1.6918874277364844,
"grad_norm": 3.0010886192321777,
"learning_rate": 7.892352504133391e-06,
"loss": 0.8307,
"step": 3329
},
{
"epoch": 1.6923956546598056,
"grad_norm": 3.4652211666107178,
"learning_rate": 7.890980935581695e-06,
"loss": 0.8842,
"step": 3330
},
{
"epoch": 1.6929038815831268,
"grad_norm": 3.257430076599121,
"learning_rate": 7.8896090401621e-06,
"loss": 0.8528,
"step": 3331
},
{
"epoch": 1.6934121085064482,
"grad_norm": 3.176788806915283,
"learning_rate": 7.88823681802972e-06,
"loss": 0.8534,
"step": 3332
},
{
"epoch": 1.6939203354297694,
"grad_norm": 3.334630250930786,
"learning_rate": 7.886864269339703e-06,
"loss": 0.9219,
"step": 3333
},
{
"epoch": 1.6944285623530906,
"grad_norm": 3.25536847114563,
"learning_rate": 7.885491394247236e-06,
"loss": 0.9077,
"step": 3334
},
{
"epoch": 1.694936789276412,
"grad_norm": 3.5795812606811523,
"learning_rate": 7.884118192907543e-06,
"loss": 0.8206,
"step": 3335
},
{
"epoch": 1.6954450161997332,
"grad_norm": 3.35133957862854,
"learning_rate": 7.882744665475886e-06,
"loss": 0.8804,
"step": 3336
},
{
"epoch": 1.6959532431230544,
"grad_norm": 3.3669703006744385,
"learning_rate": 7.881370812107563e-06,
"loss": 0.7694,
"step": 3337
},
{
"epoch": 1.6964614700463758,
"grad_norm": 3.38563871383667,
"learning_rate": 7.879996632957904e-06,
"loss": 0.7634,
"step": 3338
},
{
"epoch": 1.696969696969697,
"grad_norm": 3.5372822284698486,
"learning_rate": 7.878622128182285e-06,
"loss": 0.929,
"step": 3339
},
{
"epoch": 1.6974779238930182,
"grad_norm": 3.052685022354126,
"learning_rate": 7.87724729793611e-06,
"loss": 0.9244,
"step": 3340
},
{
"epoch": 1.6979861508163396,
"grad_norm": 3.0986926555633545,
"learning_rate": 7.87587214237483e-06,
"loss": 0.9117,
"step": 3341
},
{
"epoch": 1.6984943777396606,
"grad_norm": 3.1174423694610596,
"learning_rate": 7.874496661653918e-06,
"loss": 0.8043,
"step": 3342
},
{
"epoch": 1.699002604662982,
"grad_norm": 3.176779270172119,
"learning_rate": 7.8731208559289e-06,
"loss": 0.838,
"step": 3343
},
{
"epoch": 1.6995108315863034,
"grad_norm": 3.2106759548187256,
"learning_rate": 7.871744725355324e-06,
"loss": 0.8462,
"step": 3344
},
{
"epoch": 1.7000190585096244,
"grad_norm": 3.2538504600524902,
"learning_rate": 7.870368270088789e-06,
"loss": 0.8153,
"step": 3345
},
{
"epoch": 1.7005272854329458,
"grad_norm": 2.95824933052063,
"learning_rate": 7.868991490284919e-06,
"loss": 0.8539,
"step": 3346
},
{
"epoch": 1.701035512356267,
"grad_norm": 3.3431808948516846,
"learning_rate": 7.86761438609938e-06,
"loss": 0.841,
"step": 3347
},
{
"epoch": 1.7015437392795882,
"grad_norm": 2.930280923843384,
"learning_rate": 7.866236957687874e-06,
"loss": 0.7645,
"step": 3348
},
{
"epoch": 1.7020519662029097,
"grad_norm": 3.450204610824585,
"learning_rate": 7.864859205206138e-06,
"loss": 0.8391,
"step": 3349
},
{
"epoch": 1.7025601931262309,
"grad_norm": 3.246631383895874,
"learning_rate": 7.863481128809952e-06,
"loss": 0.9022,
"step": 3350
},
{
"epoch": 1.703068420049552,
"grad_norm": 3.306354284286499,
"learning_rate": 7.862102728655122e-06,
"loss": 0.8004,
"step": 3351
},
{
"epoch": 1.7035766469728735,
"grad_norm": 3.3001654148101807,
"learning_rate": 7.8607240048975e-06,
"loss": 0.7788,
"step": 3352
},
{
"epoch": 1.7040848738961947,
"grad_norm": 2.9877474308013916,
"learning_rate": 7.859344957692972e-06,
"loss": 0.7975,
"step": 3353
},
{
"epoch": 1.7045931008195159,
"grad_norm": 3.221864938735962,
"learning_rate": 7.857965587197453e-06,
"loss": 0.9618,
"step": 3354
},
{
"epoch": 1.7051013277428373,
"grad_norm": 3.1139442920684814,
"learning_rate": 7.856585893566909e-06,
"loss": 0.7589,
"step": 3355
},
{
"epoch": 1.7056095546661585,
"grad_norm": 3.3803412914276123,
"learning_rate": 7.855205876957331e-06,
"loss": 0.8664,
"step": 3356
},
{
"epoch": 1.7061177815894797,
"grad_norm": 3.4577863216400146,
"learning_rate": 7.853825537524751e-06,
"loss": 0.9056,
"step": 3357
},
{
"epoch": 1.706626008512801,
"grad_norm": 2.8583593368530273,
"learning_rate": 7.852444875425234e-06,
"loss": 0.816,
"step": 3358
},
{
"epoch": 1.707134235436122,
"grad_norm": 2.987264394760132,
"learning_rate": 7.851063890814888e-06,
"loss": 0.8476,
"step": 3359
},
{
"epoch": 1.7076424623594435,
"grad_norm": 3.083441972732544,
"learning_rate": 7.84968258384985e-06,
"loss": 0.8287,
"step": 3360
},
{
"epoch": 1.7081506892827647,
"grad_norm": 3.007948160171509,
"learning_rate": 7.848300954686302e-06,
"loss": 0.8696,
"step": 3361
},
{
"epoch": 1.708658916206086,
"grad_norm": 3.2169318199157715,
"learning_rate": 7.846919003480453e-06,
"loss": 0.8461,
"step": 3362
},
{
"epoch": 1.7091671431294073,
"grad_norm": 3.058051586151123,
"learning_rate": 7.845536730388555e-06,
"loss": 0.7913,
"step": 3363
},
{
"epoch": 1.7096753700527285,
"grad_norm": 3.4415318965911865,
"learning_rate": 7.844154135566892e-06,
"loss": 0.8106,
"step": 3364
},
{
"epoch": 1.7101835969760497,
"grad_norm": 3.4176740646362305,
"learning_rate": 7.84277121917179e-06,
"loss": 0.8788,
"step": 3365
},
{
"epoch": 1.7106918238993711,
"grad_norm": 3.1206893920898438,
"learning_rate": 7.841387981359606e-06,
"loss": 0.8288,
"step": 3366
},
{
"epoch": 1.7112000508226923,
"grad_norm": 3.396747589111328,
"learning_rate": 7.840004422286735e-06,
"loss": 0.8438,
"step": 3367
},
{
"epoch": 1.7117082777460135,
"grad_norm": 3.080991744995117,
"learning_rate": 7.83862054210961e-06,
"loss": 0.7587,
"step": 3368
},
{
"epoch": 1.712216504669335,
"grad_norm": 3.3959927558898926,
"learning_rate": 7.837236340984699e-06,
"loss": 0.8476,
"step": 3369
},
{
"epoch": 1.7127247315926561,
"grad_norm": 3.123796224594116,
"learning_rate": 7.835851819068505e-06,
"loss": 0.8816,
"step": 3370
},
{
"epoch": 1.7132329585159773,
"grad_norm": 3.062106132507324,
"learning_rate": 7.834466976517569e-06,
"loss": 0.91,
"step": 3371
},
{
"epoch": 1.7137411854392988,
"grad_norm": 3.0195693969726562,
"learning_rate": 7.833081813488468e-06,
"loss": 0.7959,
"step": 3372
},
{
"epoch": 1.71424941236262,
"grad_norm": 3.146942377090454,
"learning_rate": 7.831696330137817e-06,
"loss": 0.882,
"step": 3373
},
{
"epoch": 1.7147576392859412,
"grad_norm": 3.1216883659362793,
"learning_rate": 7.830310526622261e-06,
"loss": 0.8257,
"step": 3374
},
{
"epoch": 1.7152658662092626,
"grad_norm": 3.703882932662964,
"learning_rate": 7.82892440309849e-06,
"loss": 0.818,
"step": 3375
},
{
"epoch": 1.7157740931325836,
"grad_norm": 3.1644270420074463,
"learning_rate": 7.827537959723222e-06,
"loss": 0.8017,
"step": 3376
},
{
"epoch": 1.716282320055905,
"grad_norm": 3.155853033065796,
"learning_rate": 7.826151196653216e-06,
"loss": 0.8255,
"step": 3377
},
{
"epoch": 1.7167905469792262,
"grad_norm": 3.078758716583252,
"learning_rate": 7.82476411404527e-06,
"loss": 0.7639,
"step": 3378
},
{
"epoch": 1.7172987739025474,
"grad_norm": 2.952954053878784,
"learning_rate": 7.823376712056205e-06,
"loss": 0.8544,
"step": 3379
},
{
"epoch": 1.7178070008258688,
"grad_norm": 3.054943323135376,
"learning_rate": 7.821988990842895e-06,
"loss": 0.8404,
"step": 3380
},
{
"epoch": 1.71831522774919,
"grad_norm": 2.981538772583008,
"learning_rate": 7.82060095056224e-06,
"loss": 0.9251,
"step": 3381
},
{
"epoch": 1.7188234546725112,
"grad_norm": 3.3590853214263916,
"learning_rate": 7.819212591371178e-06,
"loss": 0.9167,
"step": 3382
},
{
"epoch": 1.7193316815958326,
"grad_norm": 3.1496026515960693,
"learning_rate": 7.817823913426682e-06,
"loss": 0.898,
"step": 3383
},
{
"epoch": 1.7198399085191538,
"grad_norm": 3.1727194786071777,
"learning_rate": 7.816434916885767e-06,
"loss": 0.876,
"step": 3384
},
{
"epoch": 1.720348135442475,
"grad_norm": 3.156569004058838,
"learning_rate": 7.815045601905475e-06,
"loss": 0.8331,
"step": 3385
},
{
"epoch": 1.7208563623657964,
"grad_norm": 2.845827341079712,
"learning_rate": 7.81365596864289e-06,
"loss": 0.8177,
"step": 3386
},
{
"epoch": 1.7213645892891176,
"grad_norm": 3.048043966293335,
"learning_rate": 7.812266017255132e-06,
"loss": 0.8451,
"step": 3387
},
{
"epoch": 1.7218728162124388,
"grad_norm": 3.1950175762176514,
"learning_rate": 7.810875747899352e-06,
"loss": 0.8593,
"step": 3388
},
{
"epoch": 1.7223810431357602,
"grad_norm": 3.315939426422119,
"learning_rate": 7.809485160732744e-06,
"loss": 0.8856,
"step": 3389
},
{
"epoch": 1.7228892700590814,
"grad_norm": 3.349729299545288,
"learning_rate": 7.80809425591253e-06,
"loss": 0.8321,
"step": 3390
},
{
"epoch": 1.7233974969824026,
"grad_norm": 3.1980979442596436,
"learning_rate": 7.806703033595979e-06,
"loss": 0.851,
"step": 3391
},
{
"epoch": 1.723905723905724,
"grad_norm": 3.113279342651367,
"learning_rate": 7.805311493940382e-06,
"loss": 0.8821,
"step": 3392
},
{
"epoch": 1.724413950829045,
"grad_norm": 3.150865316390991,
"learning_rate": 7.803919637103077e-06,
"loss": 0.8508,
"step": 3393
},
{
"epoch": 1.7249221777523664,
"grad_norm": 3.0096330642700195,
"learning_rate": 7.802527463241432e-06,
"loss": 0.7343,
"step": 3394
},
{
"epoch": 1.7254304046756876,
"grad_norm": 3.2845497131347656,
"learning_rate": 7.801134972512856e-06,
"loss": 0.8722,
"step": 3395
},
{
"epoch": 1.7259386315990088,
"grad_norm": 2.9541282653808594,
"learning_rate": 7.799742165074784e-06,
"loss": 0.7932,
"step": 3396
},
{
"epoch": 1.7264468585223303,
"grad_norm": 3.258234977722168,
"learning_rate": 7.798349041084701e-06,
"loss": 0.9281,
"step": 3397
},
{
"epoch": 1.7269550854456515,
"grad_norm": 2.8658859729766846,
"learning_rate": 7.796955600700115e-06,
"loss": 0.8579,
"step": 3398
},
{
"epoch": 1.7274633123689727,
"grad_norm": 3.0659303665161133,
"learning_rate": 7.795561844078578e-06,
"loss": 0.8582,
"step": 3399
},
{
"epoch": 1.727971539292294,
"grad_norm": 3.235898733139038,
"learning_rate": 7.794167771377672e-06,
"loss": 0.8627,
"step": 3400
},
{
"epoch": 1.7284797662156153,
"grad_norm": 3.0602004528045654,
"learning_rate": 7.792773382755021e-06,
"loss": 0.849,
"step": 3401
},
{
"epoch": 1.7289879931389365,
"grad_norm": 3.159080743789673,
"learning_rate": 7.791378678368278e-06,
"loss": 0.8391,
"step": 3402
},
{
"epoch": 1.729496220062258,
"grad_norm": 3.1424660682678223,
"learning_rate": 7.789983658375134e-06,
"loss": 0.9017,
"step": 3403
},
{
"epoch": 1.730004446985579,
"grad_norm": 3.1947531700134277,
"learning_rate": 7.78858832293332e-06,
"loss": 0.7713,
"step": 3404
},
{
"epoch": 1.7305126739089003,
"grad_norm": 3.207350969314575,
"learning_rate": 7.787192672200597e-06,
"loss": 0.8945,
"step": 3405
},
{
"epoch": 1.7310209008322217,
"grad_norm": 3.4544808864593506,
"learning_rate": 7.785796706334762e-06,
"loss": 0.8222,
"step": 3406
},
{
"epoch": 1.7315291277555427,
"grad_norm": 3.1704776287078857,
"learning_rate": 7.784400425493656e-06,
"loss": 0.8524,
"step": 3407
},
{
"epoch": 1.732037354678864,
"grad_norm": 3.2776436805725098,
"learning_rate": 7.783003829835142e-06,
"loss": 0.8799,
"step": 3408
},
{
"epoch": 1.7325455816021855,
"grad_norm": 3.104471206665039,
"learning_rate": 7.78160691951713e-06,
"loss": 0.7841,
"step": 3409
},
{
"epoch": 1.7330538085255065,
"grad_norm": 3.2252237796783447,
"learning_rate": 7.780209694697558e-06,
"loss": 0.8334,
"step": 3410
},
{
"epoch": 1.733562035448828,
"grad_norm": 2.9332568645477295,
"learning_rate": 7.778812155534406e-06,
"loss": 0.8084,
"step": 3411
},
{
"epoch": 1.7340702623721491,
"grad_norm": 3.004978895187378,
"learning_rate": 7.777414302185683e-06,
"loss": 0.8543,
"step": 3412
},
{
"epoch": 1.7345784892954703,
"grad_norm": 3.2775914669036865,
"learning_rate": 7.776016134809439e-06,
"loss": 0.8399,
"step": 3413
},
{
"epoch": 1.7350867162187917,
"grad_norm": 2.82473087310791,
"learning_rate": 7.77461765356376e-06,
"loss": 0.7478,
"step": 3414
},
{
"epoch": 1.735594943142113,
"grad_norm": 3.2043254375457764,
"learning_rate": 7.77321885860676e-06,
"loss": 0.8112,
"step": 3415
},
{
"epoch": 1.7361031700654341,
"grad_norm": 3.1789662837982178,
"learning_rate": 7.771819750096594e-06,
"loss": 0.7874,
"step": 3416
},
{
"epoch": 1.7366113969887556,
"grad_norm": 3.2129077911376953,
"learning_rate": 7.770420328191454e-06,
"loss": 0.8202,
"step": 3417
},
{
"epoch": 1.7371196239120767,
"grad_norm": 3.1670689582824707,
"learning_rate": 7.769020593049565e-06,
"loss": 0.8352,
"step": 3418
},
{
"epoch": 1.737627850835398,
"grad_norm": 3.1509406566619873,
"learning_rate": 7.767620544829186e-06,
"loss": 0.7717,
"step": 3419
},
{
"epoch": 1.7381360777587194,
"grad_norm": 3.288256883621216,
"learning_rate": 7.766220183688615e-06,
"loss": 0.909,
"step": 3420
},
{
"epoch": 1.7386443046820406,
"grad_norm": 3.142703056335449,
"learning_rate": 7.76481950978618e-06,
"loss": 0.9001,
"step": 3421
},
{
"epoch": 1.7391525316053618,
"grad_norm": 3.0902483463287354,
"learning_rate": 7.763418523280253e-06,
"loss": 0.8006,
"step": 3422
},
{
"epoch": 1.7396607585286832,
"grad_norm": 3.544025421142578,
"learning_rate": 7.762017224329233e-06,
"loss": 0.8711,
"step": 3423
},
{
"epoch": 1.7401689854520042,
"grad_norm": 3.133329153060913,
"learning_rate": 7.760615613091557e-06,
"loss": 0.8377,
"step": 3424
},
{
"epoch": 1.7406772123753256,
"grad_norm": 3.357799530029297,
"learning_rate": 7.759213689725699e-06,
"loss": 0.8351,
"step": 3425
},
{
"epoch": 1.741185439298647,
"grad_norm": 2.8933751583099365,
"learning_rate": 7.757811454390168e-06,
"loss": 0.8533,
"step": 3426
},
{
"epoch": 1.741693666221968,
"grad_norm": 2.9360575675964355,
"learning_rate": 7.756408907243503e-06,
"loss": 0.8728,
"step": 3427
},
{
"epoch": 1.7422018931452894,
"grad_norm": 3.189209461212158,
"learning_rate": 7.755006048444287e-06,
"loss": 0.911,
"step": 3428
},
{
"epoch": 1.7427101200686106,
"grad_norm": 3.846020460128784,
"learning_rate": 7.753602878151132e-06,
"loss": 0.9189,
"step": 3429
},
{
"epoch": 1.7432183469919318,
"grad_norm": 2.9996988773345947,
"learning_rate": 7.752199396522688e-06,
"loss": 0.7928,
"step": 3430
},
{
"epoch": 1.7437265739152532,
"grad_norm": 3.2458527088165283,
"learning_rate": 7.750795603717637e-06,
"loss": 0.8081,
"step": 3431
},
{
"epoch": 1.7442348008385744,
"grad_norm": 3.339367151260376,
"learning_rate": 7.749391499894701e-06,
"loss": 0.8459,
"step": 3432
},
{
"epoch": 1.7447430277618956,
"grad_norm": 3.1276707649230957,
"learning_rate": 7.747987085212633e-06,
"loss": 0.8501,
"step": 3433
},
{
"epoch": 1.745251254685217,
"grad_norm": 3.230774164199829,
"learning_rate": 7.746582359830223e-06,
"loss": 0.9113,
"step": 3434
},
{
"epoch": 1.7457594816085382,
"grad_norm": 2.9944803714752197,
"learning_rate": 7.745177323906297e-06,
"loss": 0.815,
"step": 3435
},
{
"epoch": 1.7462677085318594,
"grad_norm": 3.396505117416382,
"learning_rate": 7.743771977599714e-06,
"loss": 0.8726,
"step": 3436
},
{
"epoch": 1.7467759354551808,
"grad_norm": 3.2798049449920654,
"learning_rate": 7.74236632106937e-06,
"loss": 0.8442,
"step": 3437
},
{
"epoch": 1.747284162378502,
"grad_norm": 3.106595039367676,
"learning_rate": 7.740960354474191e-06,
"loss": 0.8201,
"step": 3438
},
{
"epoch": 1.7477923893018232,
"grad_norm": 3.378309726715088,
"learning_rate": 7.73955407797315e-06,
"loss": 0.8769,
"step": 3439
},
{
"epoch": 1.7483006162251447,
"grad_norm": 3.187196731567383,
"learning_rate": 7.73814749172524e-06,
"loss": 0.8586,
"step": 3440
},
{
"epoch": 1.7488088431484656,
"grad_norm": 3.2755212783813477,
"learning_rate": 7.736740595889499e-06,
"loss": 0.7788,
"step": 3441
},
{
"epoch": 1.749317070071787,
"grad_norm": 3.3902981281280518,
"learning_rate": 7.735333390624999e-06,
"loss": 0.9026,
"step": 3442
},
{
"epoch": 1.7498252969951085,
"grad_norm": 3.0064620971679688,
"learning_rate": 7.733925876090842e-06,
"loss": 0.8739,
"step": 3443
},
{
"epoch": 1.7503335239184294,
"grad_norm": 3.1249990463256836,
"learning_rate": 7.73251805244617e-06,
"loss": 0.893,
"step": 3444
},
{
"epoch": 1.7508417508417509,
"grad_norm": 3.122293710708618,
"learning_rate": 7.731109919850156e-06,
"loss": 0.7919,
"step": 3445
},
{
"epoch": 1.751349977765072,
"grad_norm": 3.1727752685546875,
"learning_rate": 7.729701478462014e-06,
"loss": 0.8264,
"step": 3446
},
{
"epoch": 1.7518582046883933,
"grad_norm": 3.2961251735687256,
"learning_rate": 7.728292728440987e-06,
"loss": 0.7647,
"step": 3447
},
{
"epoch": 1.7523664316117147,
"grad_norm": 3.3101634979248047,
"learning_rate": 7.726883669946355e-06,
"loss": 0.9487,
"step": 3448
},
{
"epoch": 1.7528746585350359,
"grad_norm": 3.055027484893799,
"learning_rate": 7.725474303137432e-06,
"loss": 0.8389,
"step": 3449
},
{
"epoch": 1.753382885458357,
"grad_norm": 3.1277880668640137,
"learning_rate": 7.724064628173568e-06,
"loss": 0.8013,
"step": 3450
},
{
"epoch": 1.7538911123816785,
"grad_norm": 3.3328499794006348,
"learning_rate": 7.722654645214148e-06,
"loss": 0.9683,
"step": 3451
},
{
"epoch": 1.7543993393049997,
"grad_norm": 3.0421502590179443,
"learning_rate": 7.72124435441859e-06,
"loss": 0.8509,
"step": 3452
},
{
"epoch": 1.754907566228321,
"grad_norm": 3.255542516708374,
"learning_rate": 7.719833755946352e-06,
"loss": 0.8878,
"step": 3453
},
{
"epoch": 1.7554157931516423,
"grad_norm": 3.13769268989563,
"learning_rate": 7.718422849956918e-06,
"loss": 0.8319,
"step": 3454
},
{
"epoch": 1.7559240200749635,
"grad_norm": 3.3945152759552,
"learning_rate": 7.717011636609815e-06,
"loss": 0.8114,
"step": 3455
},
{
"epoch": 1.7564322469982847,
"grad_norm": 3.2403454780578613,
"learning_rate": 7.7156001160646e-06,
"loss": 0.8258,
"step": 3456
},
{
"epoch": 1.7569404739216061,
"grad_norm": 3.01177978515625,
"learning_rate": 7.714188288480864e-06,
"loss": 0.7997,
"step": 3457
},
{
"epoch": 1.757448700844927,
"grad_norm": 3.2744243144989014,
"learning_rate": 7.712776154018238e-06,
"loss": 0.897,
"step": 3458
},
{
"epoch": 1.7579569277682485,
"grad_norm": 3.0223116874694824,
"learning_rate": 7.711363712836387e-06,
"loss": 0.8106,
"step": 3459
},
{
"epoch": 1.75846515469157,
"grad_norm": 3.2434840202331543,
"learning_rate": 7.709950965095e-06,
"loss": 0.8571,
"step": 3460
},
{
"epoch": 1.758973381614891,
"grad_norm": 3.1417956352233887,
"learning_rate": 7.708537910953818e-06,
"loss": 0.9404,
"step": 3461
},
{
"epoch": 1.7594816085382123,
"grad_norm": 3.2836475372314453,
"learning_rate": 7.7071245505726e-06,
"loss": 0.8172,
"step": 3462
},
{
"epoch": 1.7599898354615335,
"grad_norm": 3.0664286613464355,
"learning_rate": 7.705710884111153e-06,
"loss": 0.8509,
"step": 3463
},
{
"epoch": 1.7604980623848547,
"grad_norm": 2.844975233078003,
"learning_rate": 7.70429691172931e-06,
"loss": 0.7531,
"step": 3464
},
{
"epoch": 1.7610062893081762,
"grad_norm": 3.3454537391662598,
"learning_rate": 7.702882633586941e-06,
"loss": 0.8593,
"step": 3465
},
{
"epoch": 1.7615145162314974,
"grad_norm": 3.070310115814209,
"learning_rate": 7.701468049843952e-06,
"loss": 0.9028,
"step": 3466
},
{
"epoch": 1.7620227431548185,
"grad_norm": 3.2803428173065186,
"learning_rate": 7.70005316066028e-06,
"loss": 0.7379,
"step": 3467
},
{
"epoch": 1.76253097007814,
"grad_norm": 3.622762680053711,
"learning_rate": 7.698637966195906e-06,
"loss": 0.9147,
"step": 3468
},
{
"epoch": 1.7630391970014612,
"grad_norm": 2.88554048538208,
"learning_rate": 7.69722246661083e-06,
"loss": 0.7526,
"step": 3469
},
{
"epoch": 1.7635474239247824,
"grad_norm": 3.2611470222473145,
"learning_rate": 7.6958066620651e-06,
"loss": 0.838,
"step": 3470
},
{
"epoch": 1.7640556508481038,
"grad_norm": 3.031313896179199,
"learning_rate": 7.694390552718791e-06,
"loss": 0.8521,
"step": 3471
},
{
"epoch": 1.764563877771425,
"grad_norm": 3.072566509246826,
"learning_rate": 7.692974138732018e-06,
"loss": 0.8519,
"step": 3472
},
{
"epoch": 1.7650721046947462,
"grad_norm": 3.1689980030059814,
"learning_rate": 7.691557420264926e-06,
"loss": 0.793,
"step": 3473
},
{
"epoch": 1.7655803316180676,
"grad_norm": 3.405853033065796,
"learning_rate": 7.690140397477694e-06,
"loss": 0.792,
"step": 3474
},
{
"epoch": 1.7660885585413886,
"grad_norm": 3.279622793197632,
"learning_rate": 7.688723070530539e-06,
"loss": 0.8657,
"step": 3475
},
{
"epoch": 1.76659678546471,
"grad_norm": 3.1858105659484863,
"learning_rate": 7.68730543958371e-06,
"loss": 0.8702,
"step": 3476
},
{
"epoch": 1.7671050123880314,
"grad_norm": 3.201594114303589,
"learning_rate": 7.685887504797494e-06,
"loss": 0.8724,
"step": 3477
},
{
"epoch": 1.7676132393113524,
"grad_norm": 3.152366876602173,
"learning_rate": 7.684469266332205e-06,
"loss": 0.7965,
"step": 3478
},
{
"epoch": 1.7681214662346738,
"grad_norm": 3.1901934146881104,
"learning_rate": 7.683050724348196e-06,
"loss": 0.8763,
"step": 3479
},
{
"epoch": 1.768629693157995,
"grad_norm": 3.3099849224090576,
"learning_rate": 7.681631879005857e-06,
"loss": 0.8521,
"step": 3480
},
{
"epoch": 1.7691379200813162,
"grad_norm": 3.154052257537842,
"learning_rate": 7.680212730465609e-06,
"loss": 0.9154,
"step": 3481
},
{
"epoch": 1.7696461470046376,
"grad_norm": 3.3573923110961914,
"learning_rate": 7.678793278887906e-06,
"loss": 0.8304,
"step": 3482
},
{
"epoch": 1.7701543739279588,
"grad_norm": 3.297215461730957,
"learning_rate": 7.677373524433238e-06,
"loss": 0.8368,
"step": 3483
},
{
"epoch": 1.77066260085128,
"grad_norm": 3.335425853729248,
"learning_rate": 7.67595346726213e-06,
"loss": 0.8798,
"step": 3484
},
{
"epoch": 1.7711708277746014,
"grad_norm": 2.9975199699401855,
"learning_rate": 7.674533107535138e-06,
"loss": 0.8346,
"step": 3485
},
{
"epoch": 1.7716790546979226,
"grad_norm": 3.0628726482391357,
"learning_rate": 7.673112445412859e-06,
"loss": 0.8318,
"step": 3486
},
{
"epoch": 1.7721872816212438,
"grad_norm": 3.0613350868225098,
"learning_rate": 7.671691481055915e-06,
"loss": 0.8484,
"step": 3487
},
{
"epoch": 1.7726955085445653,
"grad_norm": 3.252533435821533,
"learning_rate": 7.67027021462497e-06,
"loss": 0.8594,
"step": 3488
},
{
"epoch": 1.7732037354678865,
"grad_norm": 3.155071496963501,
"learning_rate": 7.668848646280718e-06,
"loss": 0.7437,
"step": 3489
},
{
"epoch": 1.7737119623912077,
"grad_norm": 3.096879005432129,
"learning_rate": 7.667426776183888e-06,
"loss": 0.7902,
"step": 3490
},
{
"epoch": 1.774220189314529,
"grad_norm": 3.074460744857788,
"learning_rate": 7.666004604495243e-06,
"loss": 0.8088,
"step": 3491
},
{
"epoch": 1.77472841623785,
"grad_norm": 3.132429599761963,
"learning_rate": 7.664582131375581e-06,
"loss": 0.81,
"step": 3492
},
{
"epoch": 1.7752366431611715,
"grad_norm": 3.136418581008911,
"learning_rate": 7.663159356985736e-06,
"loss": 0.9542,
"step": 3493
},
{
"epoch": 1.7757448700844929,
"grad_norm": 3.1513595581054688,
"learning_rate": 7.661736281486568e-06,
"loss": 0.8895,
"step": 3494
},
{
"epoch": 1.7762530970078139,
"grad_norm": 3.2499263286590576,
"learning_rate": 7.660312905038983e-06,
"loss": 0.9252,
"step": 3495
},
{
"epoch": 1.7767613239311353,
"grad_norm": 3.060739040374756,
"learning_rate": 7.65888922780391e-06,
"loss": 0.8141,
"step": 3496
},
{
"epoch": 1.7772695508544565,
"grad_norm": 3.1161351203918457,
"learning_rate": 7.657465249942318e-06,
"loss": 0.9581,
"step": 3497
},
{
"epoch": 1.7777777777777777,
"grad_norm": 3.0054283142089844,
"learning_rate": 7.656040971615209e-06,
"loss": 0.8671,
"step": 3498
},
{
"epoch": 1.778286004701099,
"grad_norm": 3.2062299251556396,
"learning_rate": 7.654616392983616e-06,
"loss": 0.8475,
"step": 3499
},
{
"epoch": 1.7787942316244203,
"grad_norm": 3.0939881801605225,
"learning_rate": 7.653191514208612e-06,
"loss": 0.8605,
"step": 3500
},
{
"epoch": 1.7787942316244203,
"eval_loss": 1.2510522603988647,
"eval_runtime": 14.8176,
"eval_samples_per_second": 26.995,
"eval_steps_per_second": 3.374,
"step": 3500
}
],
"logging_steps": 1.0,
"max_steps": 9835,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8214743984989798e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}