Files
SmolLM2-MagpieUltraPlus-OH/trainer_state.json
ModelHub XC a89d361171 初始化项目,由ModelHub XC社区提供模型
Model: HuggingFaceTB/SmolLM2-MagpieUltraPlus-OH
Source: Original Platform
2026-06-18 21:38:13 +08:00

7443 lines
182 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000946969696969697,
"grad_norm": 49.75650598702225,
"learning_rate": 2.8409090909090907e-06,
"loss": 3.9178,
"step": 5
},
{
"epoch": 0.001893939393939394,
"grad_norm": 39.27361859039198,
"learning_rate": 5.6818181818181815e-06,
"loss": 3.7038,
"step": 10
},
{
"epoch": 0.002840909090909091,
"grad_norm": 22.806730377223552,
"learning_rate": 8.522727272727271e-06,
"loss": 2.8608,
"step": 15
},
{
"epoch": 0.003787878787878788,
"grad_norm": 9.34674382037142,
"learning_rate": 1.1363636363636363e-05,
"loss": 2.207,
"step": 20
},
{
"epoch": 0.004734848484848485,
"grad_norm": 4.014713370693969,
"learning_rate": 1.4204545454545453e-05,
"loss": 1.74,
"step": 25
},
{
"epoch": 0.005681818181818182,
"grad_norm": 2.1906667080696756,
"learning_rate": 1.7045454545454543e-05,
"loss": 1.4441,
"step": 30
},
{
"epoch": 0.006628787878787879,
"grad_norm": 1.0752279902100068,
"learning_rate": 1.9886363636363634e-05,
"loss": 1.2429,
"step": 35
},
{
"epoch": 0.007575757575757576,
"grad_norm": 0.7187175898896523,
"learning_rate": 2.2727272727272726e-05,
"loss": 1.1147,
"step": 40
},
{
"epoch": 0.008522727272727272,
"grad_norm": 0.49281093084047145,
"learning_rate": 2.5568181818181814e-05,
"loss": 1.1034,
"step": 45
},
{
"epoch": 0.00946969696969697,
"grad_norm": 0.3269633043103454,
"learning_rate": 2.8409090909090906e-05,
"loss": 1.058,
"step": 50
},
{
"epoch": 0.010416666666666666,
"grad_norm": 0.33257710673863067,
"learning_rate": 3.125e-05,
"loss": 1.0722,
"step": 55
},
{
"epoch": 0.011363636363636364,
"grad_norm": 0.25938022961715196,
"learning_rate": 3.4090909090909085e-05,
"loss": 1.0246,
"step": 60
},
{
"epoch": 0.01231060606060606,
"grad_norm": 0.23813343449521693,
"learning_rate": 3.693181818181818e-05,
"loss": 1.0074,
"step": 65
},
{
"epoch": 0.013257575757575758,
"grad_norm": 0.2200079499271612,
"learning_rate": 3.977272727272727e-05,
"loss": 1.0025,
"step": 70
},
{
"epoch": 0.014204545454545454,
"grad_norm": 0.20277353873378182,
"learning_rate": 4.261363636363637e-05,
"loss": 0.9877,
"step": 75
},
{
"epoch": 0.015151515151515152,
"grad_norm": 0.20338107431083782,
"learning_rate": 4.545454545454545e-05,
"loss": 0.9579,
"step": 80
},
{
"epoch": 0.016098484848484848,
"grad_norm": 0.20166600929973375,
"learning_rate": 4.8295454545454537e-05,
"loss": 1.0121,
"step": 85
},
{
"epoch": 0.017045454545454544,
"grad_norm": 0.14601915731061527,
"learning_rate": 5.113636363636363e-05,
"loss": 0.9809,
"step": 90
},
{
"epoch": 0.017992424242424244,
"grad_norm": 0.1910575503845415,
"learning_rate": 5.3977272727272727e-05,
"loss": 0.9687,
"step": 95
},
{
"epoch": 0.01893939393939394,
"grad_norm": 0.1710125261741899,
"learning_rate": 5.681818181818181e-05,
"loss": 0.9759,
"step": 100
},
{
"epoch": 0.019886363636363636,
"grad_norm": 0.1546310725877226,
"learning_rate": 5.96590909090909e-05,
"loss": 0.9449,
"step": 105
},
{
"epoch": 0.020833333333333332,
"grad_norm": 0.15669899681499375,
"learning_rate": 6.25e-05,
"loss": 0.9379,
"step": 110
},
{
"epoch": 0.021780303030303032,
"grad_norm": 0.10549598423376465,
"learning_rate": 6.534090909090909e-05,
"loss": 0.9521,
"step": 115
},
{
"epoch": 0.022727272727272728,
"grad_norm": 0.09265444874772286,
"learning_rate": 6.818181818181817e-05,
"loss": 0.9402,
"step": 120
},
{
"epoch": 0.023674242424242424,
"grad_norm": 0.10012187642369699,
"learning_rate": 7.102272727272727e-05,
"loss": 0.9509,
"step": 125
},
{
"epoch": 0.02462121212121212,
"grad_norm": 0.10405267547853224,
"learning_rate": 7.386363636363635e-05,
"loss": 0.943,
"step": 130
},
{
"epoch": 0.02556818181818182,
"grad_norm": 0.0932862459532729,
"learning_rate": 7.670454545454545e-05,
"loss": 0.923,
"step": 135
},
{
"epoch": 0.026515151515151516,
"grad_norm": 0.08799539522039788,
"learning_rate": 7.954545454545454e-05,
"loss": 0.9221,
"step": 140
},
{
"epoch": 0.027462121212121212,
"grad_norm": 0.07936971459661492,
"learning_rate": 8.238636363636362e-05,
"loss": 0.9385,
"step": 145
},
{
"epoch": 0.028409090909090908,
"grad_norm": 0.09395245331857886,
"learning_rate": 8.522727272727273e-05,
"loss": 0.9098,
"step": 150
},
{
"epoch": 0.029356060606060608,
"grad_norm": 0.08346381824724323,
"learning_rate": 8.806818181818182e-05,
"loss": 0.9592,
"step": 155
},
{
"epoch": 0.030303030303030304,
"grad_norm": 0.0683897969731906,
"learning_rate": 9.09090909090909e-05,
"loss": 0.9162,
"step": 160
},
{
"epoch": 0.03125,
"grad_norm": 0.07364507428375824,
"learning_rate": 9.374999999999999e-05,
"loss": 0.905,
"step": 165
},
{
"epoch": 0.032196969696969696,
"grad_norm": 0.06957507038154116,
"learning_rate": 9.659090909090907e-05,
"loss": 0.9277,
"step": 170
},
{
"epoch": 0.03314393939393939,
"grad_norm": 0.07231783801209996,
"learning_rate": 9.943181818181817e-05,
"loss": 0.8865,
"step": 175
},
{
"epoch": 0.03409090909090909,
"grad_norm": 0.08108886238015861,
"learning_rate": 0.00010227272727272726,
"loss": 0.9221,
"step": 180
},
{
"epoch": 0.035037878787878785,
"grad_norm": 0.06746334152643936,
"learning_rate": 0.00010511363636363635,
"loss": 0.8921,
"step": 185
},
{
"epoch": 0.03598484848484849,
"grad_norm": 0.07474975292416153,
"learning_rate": 0.00010795454545454545,
"loss": 0.9067,
"step": 190
},
{
"epoch": 0.036931818181818184,
"grad_norm": 0.06954632694727424,
"learning_rate": 0.00011079545454545454,
"loss": 0.9274,
"step": 195
},
{
"epoch": 0.03787878787878788,
"grad_norm": 0.1071194914420164,
"learning_rate": 0.00011363636363636362,
"loss": 0.9174,
"step": 200
},
{
"epoch": 0.038825757575757576,
"grad_norm": 0.08047063933324308,
"learning_rate": 0.00011647727272727271,
"loss": 0.8853,
"step": 205
},
{
"epoch": 0.03977272727272727,
"grad_norm": 0.06720262982444847,
"learning_rate": 0.0001193181818181818,
"loss": 0.8936,
"step": 210
},
{
"epoch": 0.04071969696969697,
"grad_norm": 0.06874990083102131,
"learning_rate": 0.0001221590909090909,
"loss": 0.8966,
"step": 215
},
{
"epoch": 0.041666666666666664,
"grad_norm": 0.09031821629007566,
"learning_rate": 0.000125,
"loss": 0.9002,
"step": 220
},
{
"epoch": 0.04261363636363636,
"grad_norm": 0.08250031333079004,
"learning_rate": 0.00012784090909090907,
"loss": 0.9314,
"step": 225
},
{
"epoch": 0.043560606060606064,
"grad_norm": 0.06517851552105172,
"learning_rate": 0.00013068181818181817,
"loss": 0.9264,
"step": 230
},
{
"epoch": 0.04450757575757576,
"grad_norm": 0.0682659812110987,
"learning_rate": 0.00013352272727272727,
"loss": 0.8933,
"step": 235
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.07147589587336683,
"learning_rate": 0.00013636363636363634,
"loss": 0.9181,
"step": 240
},
{
"epoch": 0.04640151515151515,
"grad_norm": 0.06876166962712452,
"learning_rate": 0.00013920454545454544,
"loss": 0.9221,
"step": 245
},
{
"epoch": 0.04734848484848485,
"grad_norm": 0.06370293403172177,
"learning_rate": 0.00014204545454545454,
"loss": 0.8685,
"step": 250
},
{
"epoch": 0.048295454545454544,
"grad_norm": 0.06615942930120759,
"learning_rate": 0.00014488636363636364,
"loss": 0.8895,
"step": 255
},
{
"epoch": 0.04924242424242424,
"grad_norm": 0.06664909205706883,
"learning_rate": 0.0001477272727272727,
"loss": 0.9207,
"step": 260
},
{
"epoch": 0.050189393939393936,
"grad_norm": 0.07595494362406784,
"learning_rate": 0.00015056818181818183,
"loss": 0.8644,
"step": 265
},
{
"epoch": 0.05113636363636364,
"grad_norm": 0.07408826723348173,
"learning_rate": 0.0001534090909090909,
"loss": 0.9062,
"step": 270
},
{
"epoch": 0.052083333333333336,
"grad_norm": 0.06808841073565555,
"learning_rate": 0.00015625,
"loss": 0.9151,
"step": 275
},
{
"epoch": 0.05303030303030303,
"grad_norm": 0.07953899365614112,
"learning_rate": 0.00015909090909090907,
"loss": 0.8909,
"step": 280
},
{
"epoch": 0.05397727272727273,
"grad_norm": 0.07167080509602292,
"learning_rate": 0.00016193181818181817,
"loss": 0.9009,
"step": 285
},
{
"epoch": 0.054924242424242424,
"grad_norm": 0.07064676898002652,
"learning_rate": 0.00016477272727272724,
"loss": 0.8908,
"step": 290
},
{
"epoch": 0.05587121212121212,
"grad_norm": 0.07214540293164669,
"learning_rate": 0.00016761363636363634,
"loss": 0.9244,
"step": 295
},
{
"epoch": 0.056818181818181816,
"grad_norm": 0.06617148307042509,
"learning_rate": 0.00017045454545454547,
"loss": 0.8841,
"step": 300
},
{
"epoch": 0.05776515151515151,
"grad_norm": 0.06536482732681019,
"learning_rate": 0.00017329545454545454,
"loss": 0.8926,
"step": 305
},
{
"epoch": 0.058712121212121215,
"grad_norm": 0.0770022545469697,
"learning_rate": 0.00017613636363636364,
"loss": 0.8918,
"step": 310
},
{
"epoch": 0.05965909090909091,
"grad_norm": 0.07796650440153677,
"learning_rate": 0.0001789772727272727,
"loss": 0.8861,
"step": 315
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.06664140681353005,
"learning_rate": 0.0001818181818181818,
"loss": 0.883,
"step": 320
},
{
"epoch": 0.061553030303030304,
"grad_norm": 0.06505572579245275,
"learning_rate": 0.00018465909090909088,
"loss": 0.9046,
"step": 325
},
{
"epoch": 0.0625,
"grad_norm": 0.07411769991572527,
"learning_rate": 0.00018749999999999998,
"loss": 0.8935,
"step": 330
},
{
"epoch": 0.0634469696969697,
"grad_norm": 0.06839084238999557,
"learning_rate": 0.00019034090909090908,
"loss": 0.8994,
"step": 335
},
{
"epoch": 0.06439393939393939,
"grad_norm": 0.06991741218579048,
"learning_rate": 0.00019318181818181815,
"loss": 0.9011,
"step": 340
},
{
"epoch": 0.06534090909090909,
"grad_norm": 0.06765382117363,
"learning_rate": 0.00019602272727272727,
"loss": 0.8757,
"step": 345
},
{
"epoch": 0.06628787878787878,
"grad_norm": 0.07394479834842242,
"learning_rate": 0.00019886363636363634,
"loss": 0.8869,
"step": 350
},
{
"epoch": 0.06723484848484848,
"grad_norm": 0.07779852408072253,
"learning_rate": 0.00020170454545454544,
"loss": 0.8721,
"step": 355
},
{
"epoch": 0.06818181818181818,
"grad_norm": 0.07182147114935328,
"learning_rate": 0.0002045454545454545,
"loss": 0.9053,
"step": 360
},
{
"epoch": 0.06912878787878787,
"grad_norm": 0.07763475442947491,
"learning_rate": 0.0002073863636363636,
"loss": 0.8886,
"step": 365
},
{
"epoch": 0.07007575757575757,
"grad_norm": 0.06639383299470691,
"learning_rate": 0.0002102272727272727,
"loss": 0.9216,
"step": 370
},
{
"epoch": 0.07102272727272728,
"grad_norm": 0.07582408978067692,
"learning_rate": 0.00021306818181818178,
"loss": 0.9187,
"step": 375
},
{
"epoch": 0.07196969696969698,
"grad_norm": 0.067889778321114,
"learning_rate": 0.0002159090909090909,
"loss": 0.8848,
"step": 380
},
{
"epoch": 0.07291666666666667,
"grad_norm": 0.06350237430320019,
"learning_rate": 0.00021874999999999998,
"loss": 0.8991,
"step": 385
},
{
"epoch": 0.07386363636363637,
"grad_norm": 0.06463105152473327,
"learning_rate": 0.00022159090909090908,
"loss": 0.8993,
"step": 390
},
{
"epoch": 0.07481060606060606,
"grad_norm": 0.06289812753072489,
"learning_rate": 0.00022443181818181815,
"loss": 0.8977,
"step": 395
},
{
"epoch": 0.07575757575757576,
"grad_norm": 0.06451182368963407,
"learning_rate": 0.00022727272727272725,
"loss": 0.9109,
"step": 400
},
{
"epoch": 0.07670454545454546,
"grad_norm": 0.06417545375628221,
"learning_rate": 0.00023011363636363634,
"loss": 0.8689,
"step": 405
},
{
"epoch": 0.07765151515151515,
"grad_norm": 0.06624677302997224,
"learning_rate": 0.00023295454545454542,
"loss": 0.9096,
"step": 410
},
{
"epoch": 0.07859848484848485,
"grad_norm": 0.06713767944662469,
"learning_rate": 0.00023579545454545454,
"loss": 0.9128,
"step": 415
},
{
"epoch": 0.07954545454545454,
"grad_norm": 0.06632474263833514,
"learning_rate": 0.0002386363636363636,
"loss": 0.8992,
"step": 420
},
{
"epoch": 0.08049242424242424,
"grad_norm": 0.06326893641363093,
"learning_rate": 0.0002414772727272727,
"loss": 0.8838,
"step": 425
},
{
"epoch": 0.08143939393939394,
"grad_norm": 0.05270584817938461,
"learning_rate": 0.0002443181818181818,
"loss": 0.8604,
"step": 430
},
{
"epoch": 0.08238636363636363,
"grad_norm": 0.06950851335464077,
"learning_rate": 0.0002471590909090909,
"loss": 0.8928,
"step": 435
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.06031142221337703,
"learning_rate": 0.00025,
"loss": 0.8997,
"step": 440
},
{
"epoch": 0.08428030303030302,
"grad_norm": 0.0598802579058441,
"learning_rate": 0.00025284090909090905,
"loss": 0.888,
"step": 445
},
{
"epoch": 0.08522727272727272,
"grad_norm": 0.05979293999494916,
"learning_rate": 0.00025568181818181815,
"loss": 0.914,
"step": 450
},
{
"epoch": 0.08617424242424243,
"grad_norm": 0.06332115337692762,
"learning_rate": 0.00025852272727272725,
"loss": 0.8897,
"step": 455
},
{
"epoch": 0.08712121212121213,
"grad_norm": 0.05664133712393486,
"learning_rate": 0.00026136363636363634,
"loss": 0.8958,
"step": 460
},
{
"epoch": 0.08806818181818182,
"grad_norm": 0.06262104837726735,
"learning_rate": 0.00026420454545454544,
"loss": 0.8773,
"step": 465
},
{
"epoch": 0.08901515151515152,
"grad_norm": 0.06325434933754956,
"learning_rate": 0.00026704545454545454,
"loss": 0.8941,
"step": 470
},
{
"epoch": 0.08996212121212122,
"grad_norm": 0.06454144975644246,
"learning_rate": 0.00026988636363636364,
"loss": 0.9055,
"step": 475
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.058848918660389354,
"learning_rate": 0.0002727272727272727,
"loss": 0.9066,
"step": 480
},
{
"epoch": 0.09185606060606061,
"grad_norm": 0.0643339517437263,
"learning_rate": 0.0002755681818181818,
"loss": 0.9207,
"step": 485
},
{
"epoch": 0.0928030303030303,
"grad_norm": 0.06062790165341026,
"learning_rate": 0.0002784090909090909,
"loss": 0.9096,
"step": 490
},
{
"epoch": 0.09375,
"grad_norm": 0.06483851920476219,
"learning_rate": 0.00028125,
"loss": 0.8924,
"step": 495
},
{
"epoch": 0.0946969696969697,
"grad_norm": 0.06599789444637924,
"learning_rate": 0.0002840909090909091,
"loss": 0.9052,
"step": 500
},
{
"epoch": 0.09564393939393939,
"grad_norm": 0.06622053779375818,
"learning_rate": 0.0002869318181818182,
"loss": 0.9261,
"step": 505
},
{
"epoch": 0.09659090909090909,
"grad_norm": 0.06986338841915192,
"learning_rate": 0.0002897727272727273,
"loss": 0.9147,
"step": 510
},
{
"epoch": 0.09753787878787878,
"grad_norm": 0.0566839679645091,
"learning_rate": 0.0002926136363636363,
"loss": 0.8702,
"step": 515
},
{
"epoch": 0.09848484848484848,
"grad_norm": 0.06286540817635865,
"learning_rate": 0.0002954545454545454,
"loss": 0.9081,
"step": 520
},
{
"epoch": 0.09943181818181818,
"grad_norm": 0.1319474057131131,
"learning_rate": 0.0002982954545454545,
"loss": 0.9121,
"step": 525
},
{
"epoch": 0.10037878787878787,
"grad_norm": 0.05848327672137334,
"learning_rate": 0.0002999998688802619,
"loss": 0.9124,
"step": 530
},
{
"epoch": 0.10132575757575757,
"grad_norm": 0.06169209923879713,
"learning_rate": 0.0002999983937858416,
"loss": 0.9065,
"step": 535
},
{
"epoch": 0.10227272727272728,
"grad_norm": 0.06107286390796437,
"learning_rate": 0.0002999952797134999,
"loss": 0.9061,
"step": 540
},
{
"epoch": 0.10321969696969698,
"grad_norm": 0.051714218670872523,
"learning_rate": 0.00029999052669726326,
"loss": 0.9188,
"step": 545
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.05186568461682868,
"learning_rate": 0.00029998413478906613,
"loss": 0.8956,
"step": 550
},
{
"epoch": 0.10511363636363637,
"grad_norm": 0.06237137033014081,
"learning_rate": 0.00029997610405875047,
"loss": 0.913,
"step": 555
},
{
"epoch": 0.10606060606060606,
"grad_norm": 0.0627745121471118,
"learning_rate": 0.00029996643459406525,
"loss": 0.8781,
"step": 560
},
{
"epoch": 0.10700757575757576,
"grad_norm": 0.05775548150329091,
"learning_rate": 0.00029995512650066516,
"loss": 0.8961,
"step": 565
},
{
"epoch": 0.10795454545454546,
"grad_norm": 0.058742015004762956,
"learning_rate": 0.0002999421799021097,
"loss": 0.9081,
"step": 570
},
{
"epoch": 0.10890151515151515,
"grad_norm": 0.06278642260383162,
"learning_rate": 0.00029992759493986144,
"loss": 0.9065,
"step": 575
},
{
"epoch": 0.10984848484848485,
"grad_norm": 0.05368202960228582,
"learning_rate": 0.0002999113717732852,
"loss": 0.8793,
"step": 580
},
{
"epoch": 0.11079545454545454,
"grad_norm": 0.06412640518837653,
"learning_rate": 0.0002998935105796455,
"loss": 0.8537,
"step": 585
},
{
"epoch": 0.11174242424242424,
"grad_norm": 0.06196513917098741,
"learning_rate": 0.00029987401155410516,
"loss": 0.8954,
"step": 590
},
{
"epoch": 0.11268939393939394,
"grad_norm": 0.0605833197123053,
"learning_rate": 0.00029985287490972293,
"loss": 0.8945,
"step": 595
},
{
"epoch": 0.11363636363636363,
"grad_norm": 0.06504637583052113,
"learning_rate": 0.0002998301008774512,
"loss": 0.9008,
"step": 600
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.05164915733505111,
"learning_rate": 0.0002998056897061335,
"loss": 0.9051,
"step": 605
},
{
"epoch": 0.11553030303030302,
"grad_norm": 0.051685554793978426,
"learning_rate": 0.000299779641662502,
"loss": 0.8529,
"step": 610
},
{
"epoch": 0.11647727272727272,
"grad_norm": 0.062362055131338696,
"learning_rate": 0.00029975195703117405,
"loss": 0.8691,
"step": 615
},
{
"epoch": 0.11742424242424243,
"grad_norm": 0.06117736368428696,
"learning_rate": 0.00029972263611464966,
"loss": 0.8849,
"step": 620
},
{
"epoch": 0.11837121212121213,
"grad_norm": 0.050904058331399764,
"learning_rate": 0.00029969167923330766,
"loss": 0.8576,
"step": 625
},
{
"epoch": 0.11931818181818182,
"grad_norm": 0.053607823287624916,
"learning_rate": 0.0002996590867254028,
"loss": 0.9272,
"step": 630
},
{
"epoch": 0.12026515151515152,
"grad_norm": 0.06037742366776271,
"learning_rate": 0.00029962485894706155,
"loss": 0.882,
"step": 635
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.05143566912627675,
"learning_rate": 0.00029958899627227837,
"loss": 0.8828,
"step": 640
},
{
"epoch": 0.12215909090909091,
"grad_norm": 0.05625636209624713,
"learning_rate": 0.00029955149909291154,
"loss": 0.9344,
"step": 645
},
{
"epoch": 0.12310606060606061,
"grad_norm": 0.056332501852780985,
"learning_rate": 0.00029951236781867937,
"loss": 0.8857,
"step": 650
},
{
"epoch": 0.1240530303030303,
"grad_norm": 0.05582242562634511,
"learning_rate": 0.0002994716028771549,
"loss": 0.8911,
"step": 655
},
{
"epoch": 0.125,
"grad_norm": 0.05726658160474268,
"learning_rate": 0.0002994292047137618,
"loss": 0.9116,
"step": 660
},
{
"epoch": 0.1259469696969697,
"grad_norm": 0.06041167916510802,
"learning_rate": 0.0002993851737917695,
"loss": 0.8898,
"step": 665
},
{
"epoch": 0.1268939393939394,
"grad_norm": 0.05471735073845254,
"learning_rate": 0.00029933951059228777,
"loss": 0.8831,
"step": 670
},
{
"epoch": 0.1278409090909091,
"grad_norm": 0.05947780307997745,
"learning_rate": 0.0002992922156142619,
"loss": 0.8745,
"step": 675
},
{
"epoch": 0.12878787878787878,
"grad_norm": 0.06439001370901883,
"learning_rate": 0.00029924328937446686,
"loss": 0.8786,
"step": 680
},
{
"epoch": 0.12973484848484848,
"grad_norm": 0.05448875777529848,
"learning_rate": 0.0002991927324075019,
"loss": 0.8619,
"step": 685
},
{
"epoch": 0.13068181818181818,
"grad_norm": 0.05664358337517303,
"learning_rate": 0.0002991405452657846,
"loss": 0.8997,
"step": 690
},
{
"epoch": 0.13162878787878787,
"grad_norm": 0.06476323651067244,
"learning_rate": 0.0002990867285195449,
"loss": 0.8965,
"step": 695
},
{
"epoch": 0.13257575757575757,
"grad_norm": 0.052072472586110224,
"learning_rate": 0.0002990312827568188,
"loss": 0.9026,
"step": 700
},
{
"epoch": 0.13352272727272727,
"grad_norm": 0.058261512452499706,
"learning_rate": 0.00029897420858344205,
"loss": 0.8927,
"step": 705
},
{
"epoch": 0.13446969696969696,
"grad_norm": 0.048984858535432614,
"learning_rate": 0.0002989155066230433,
"loss": 0.8755,
"step": 710
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.05290095759260302,
"learning_rate": 0.0002988551775170377,
"loss": 0.8848,
"step": 715
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.05687350078562719,
"learning_rate": 0.00029879322192461925,
"loss": 0.8539,
"step": 720
},
{
"epoch": 0.13731060606060605,
"grad_norm": 0.05103479492140091,
"learning_rate": 0.0002987296405227543,
"loss": 0.8953,
"step": 725
},
{
"epoch": 0.13825757575757575,
"grad_norm": 0.06301978431499398,
"learning_rate": 0.0002986644340061738,
"loss": 0.8679,
"step": 730
},
{
"epoch": 0.13920454545454544,
"grad_norm": 0.06819363260699743,
"learning_rate": 0.0002985976030873655,
"loss": 0.8767,
"step": 735
},
{
"epoch": 0.14015151515151514,
"grad_norm": 0.06229636016637017,
"learning_rate": 0.0002985291484965666,
"loss": 0.8764,
"step": 740
},
{
"epoch": 0.14109848484848486,
"grad_norm": 0.05610530939029699,
"learning_rate": 0.0002984590709817555,
"loss": 0.9009,
"step": 745
},
{
"epoch": 0.14204545454545456,
"grad_norm": 0.04941941919339848,
"learning_rate": 0.0002983873713086439,
"loss": 0.8986,
"step": 750
},
{
"epoch": 0.14299242424242425,
"grad_norm": 0.054105335971047615,
"learning_rate": 0.00029831405026066785,
"loss": 0.9131,
"step": 755
},
{
"epoch": 0.14393939393939395,
"grad_norm": 0.055466909315876986,
"learning_rate": 0.0002982391086389799,
"loss": 0.8663,
"step": 760
},
{
"epoch": 0.14488636363636365,
"grad_norm": 0.05043550339837371,
"learning_rate": 0.00029816254726243983,
"loss": 0.8959,
"step": 765
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.05417610489948402,
"learning_rate": 0.0002980843669676061,
"loss": 0.8616,
"step": 770
},
{
"epoch": 0.14678030303030304,
"grad_norm": 0.05041852142450034,
"learning_rate": 0.0002980045686087262,
"loss": 0.8855,
"step": 775
},
{
"epoch": 0.14772727272727273,
"grad_norm": 0.06901010647152613,
"learning_rate": 0.00029792315305772796,
"loss": 0.9032,
"step": 780
},
{
"epoch": 0.14867424242424243,
"grad_norm": 2.738390350230735,
"learning_rate": 0.00029784012120420944,
"loss": 0.888,
"step": 785
},
{
"epoch": 0.14962121212121213,
"grad_norm": 0.1937615852009521,
"learning_rate": 0.0002977554739554294,
"loss": 1.0592,
"step": 790
},
{
"epoch": 0.15056818181818182,
"grad_norm": 0.11234392360677802,
"learning_rate": 0.00029766921223629774,
"loss": 0.9652,
"step": 795
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.12436979475241608,
"learning_rate": 0.00029758133698936485,
"loss": 0.9394,
"step": 800
},
{
"epoch": 0.15246212121212122,
"grad_norm": 0.06657911701516095,
"learning_rate": 0.00029749184917481157,
"loss": 0.9099,
"step": 805
},
{
"epoch": 0.1534090909090909,
"grad_norm": 0.0824368743598765,
"learning_rate": 0.00029740074977043873,
"loss": 0.8753,
"step": 810
},
{
"epoch": 0.1543560606060606,
"grad_norm": 0.059895919738978086,
"learning_rate": 0.00029730803977165643,
"loss": 0.9159,
"step": 815
},
{
"epoch": 0.1553030303030303,
"grad_norm": 0.04706104790187168,
"learning_rate": 0.00029721372019147314,
"loss": 0.9117,
"step": 820
},
{
"epoch": 0.15625,
"grad_norm": 0.05202762617344226,
"learning_rate": 0.00029711779206048454,
"loss": 0.8807,
"step": 825
},
{
"epoch": 0.1571969696969697,
"grad_norm": 0.050944436154957536,
"learning_rate": 0.0002970202564268625,
"loss": 0.8665,
"step": 830
},
{
"epoch": 0.1581439393939394,
"grad_norm": 0.050530106988524406,
"learning_rate": 0.00029692111435634347,
"loss": 0.853,
"step": 835
},
{
"epoch": 0.1590909090909091,
"grad_norm": 0.047944409481566634,
"learning_rate": 0.0002968203669322168,
"loss": 0.8719,
"step": 840
},
{
"epoch": 0.16003787878787878,
"grad_norm": 0.05621479623321537,
"learning_rate": 0.0002967180152553129,
"loss": 0.8602,
"step": 845
},
{
"epoch": 0.16098484848484848,
"grad_norm": 0.053970797332012134,
"learning_rate": 0.0002966140604439914,
"loss": 0.8804,
"step": 850
},
{
"epoch": 0.16193181818181818,
"grad_norm": 0.05575444277143619,
"learning_rate": 0.0002965085036341287,
"loss": 0.8672,
"step": 855
},
{
"epoch": 0.16287878787878787,
"grad_norm": 0.054447528875280926,
"learning_rate": 0.0002964013459791057,
"loss": 0.8705,
"step": 860
},
{
"epoch": 0.16382575757575757,
"grad_norm": 0.04819722412795564,
"learning_rate": 0.0002962925886497952,
"loss": 0.885,
"step": 865
},
{
"epoch": 0.16477272727272727,
"grad_norm": 0.04927047066910389,
"learning_rate": 0.00029618223283454893,
"loss": 0.8793,
"step": 870
},
{
"epoch": 0.16571969696969696,
"grad_norm": 0.05156180447905288,
"learning_rate": 0.0002960702797391848,
"loss": 0.8697,
"step": 875
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.047498663975991506,
"learning_rate": 0.00029595673058697357,
"loss": 0.8944,
"step": 880
},
{
"epoch": 0.16761363636363635,
"grad_norm": 0.04931121452183828,
"learning_rate": 0.0002958415866186255,
"loss": 0.8708,
"step": 885
},
{
"epoch": 0.16856060606060605,
"grad_norm": 0.04971806922102403,
"learning_rate": 0.000295724849092277,
"loss": 0.886,
"step": 890
},
{
"epoch": 0.16950757575757575,
"grad_norm": 0.04782190553843517,
"learning_rate": 0.0002956065192834765,
"loss": 0.8625,
"step": 895
},
{
"epoch": 0.17045454545454544,
"grad_norm": 0.05985135374827048,
"learning_rate": 0.00029548659848517073,
"loss": 0.8572,
"step": 900
},
{
"epoch": 0.17140151515151514,
"grad_norm": 0.052563040632905934,
"learning_rate": 0.00029536508800769083,
"loss": 0.8527,
"step": 905
},
{
"epoch": 0.17234848484848486,
"grad_norm": 0.04857882149280908,
"learning_rate": 0.0002952419891787375,
"loss": 0.8739,
"step": 910
},
{
"epoch": 0.17329545454545456,
"grad_norm": 0.0479060897399876,
"learning_rate": 0.00029511730334336693,
"loss": 0.8905,
"step": 915
},
{
"epoch": 0.17424242424242425,
"grad_norm": 0.05040950791010248,
"learning_rate": 0.00029499103186397596,
"loss": 0.8738,
"step": 920
},
{
"epoch": 0.17518939393939395,
"grad_norm": 0.04549261460480799,
"learning_rate": 0.00029486317612028705,
"loss": 0.8697,
"step": 925
},
{
"epoch": 0.17613636363636365,
"grad_norm": 0.04640647025353498,
"learning_rate": 0.00029473373750933354,
"loss": 0.8697,
"step": 930
},
{
"epoch": 0.17708333333333334,
"grad_norm": 0.04813238779352585,
"learning_rate": 0.0002946027174454439,
"loss": 0.8691,
"step": 935
},
{
"epoch": 0.17803030303030304,
"grad_norm": 0.04735624926206103,
"learning_rate": 0.0002944701173602269,
"loss": 0.8785,
"step": 940
},
{
"epoch": 0.17897727272727273,
"grad_norm": 0.04918343048516045,
"learning_rate": 0.00029433593870255547,
"loss": 0.8832,
"step": 945
},
{
"epoch": 0.17992424242424243,
"grad_norm": 0.05187324555019995,
"learning_rate": 0.00029420018293855097,
"loss": 0.8931,
"step": 950
},
{
"epoch": 0.18087121212121213,
"grad_norm": 0.051833428541254264,
"learning_rate": 0.0002940628515515673,
"loss": 0.8505,
"step": 955
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.04842441750953903,
"learning_rate": 0.0002939239460421746,
"loss": 0.8619,
"step": 960
},
{
"epoch": 0.18276515151515152,
"grad_norm": 0.04450318082424258,
"learning_rate": 0.00029378346792814284,
"loss": 0.8935,
"step": 965
},
{
"epoch": 0.18371212121212122,
"grad_norm": 0.050265508339128746,
"learning_rate": 0.00029364141874442534,
"loss": 0.8875,
"step": 970
},
{
"epoch": 0.1846590909090909,
"grad_norm": 0.0489312969341679,
"learning_rate": 0.00029349780004314196,
"loss": 0.8707,
"step": 975
},
{
"epoch": 0.1856060606060606,
"grad_norm": 0.0448678465223849,
"learning_rate": 0.0002933526133935619,
"loss": 0.8759,
"step": 980
},
{
"epoch": 0.1865530303030303,
"grad_norm": 0.04933677179150076,
"learning_rate": 0.000293205860382087,
"loss": 0.8761,
"step": 985
},
{
"epoch": 0.1875,
"grad_norm": 0.04538600611093541,
"learning_rate": 0.000293057542612234,
"loss": 0.8683,
"step": 990
},
{
"epoch": 0.1884469696969697,
"grad_norm": 0.04816654163118486,
"learning_rate": 0.00029290766170461733,
"loss": 0.8575,
"step": 995
},
{
"epoch": 0.1893939393939394,
"grad_norm": 0.05162840473736974,
"learning_rate": 0.0002927562192969312,
"loss": 0.8788,
"step": 1000
},
{
"epoch": 0.1903409090909091,
"grad_norm": 0.0526139634089819,
"learning_rate": 0.00029260321704393166,
"loss": 0.8842,
"step": 1005
},
{
"epoch": 0.19128787878787878,
"grad_norm": 0.06020706080827499,
"learning_rate": 0.0002924486566174187,
"loss": 0.8873,
"step": 1010
},
{
"epoch": 0.19223484848484848,
"grad_norm": 0.05225939758166581,
"learning_rate": 0.00029229253970621796,
"loss": 0.8354,
"step": 1015
},
{
"epoch": 0.19318181818181818,
"grad_norm": 0.05284878294440371,
"learning_rate": 0.0002921348680161622,
"loss": 0.9025,
"step": 1020
},
{
"epoch": 0.19412878787878787,
"grad_norm": 0.059173148849731974,
"learning_rate": 0.00029197564327007266,
"loss": 0.8405,
"step": 1025
},
{
"epoch": 0.19507575757575757,
"grad_norm": 0.05723447002828778,
"learning_rate": 0.00029181486720774024,
"loss": 0.9033,
"step": 1030
},
{
"epoch": 0.19602272727272727,
"grad_norm": 0.04961291522370079,
"learning_rate": 0.0002916525415859065,
"loss": 0.8517,
"step": 1035
},
{
"epoch": 0.19696969696969696,
"grad_norm": 0.04405850577071398,
"learning_rate": 0.0002914886681782445,
"loss": 0.8605,
"step": 1040
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.052549109623340674,
"learning_rate": 0.00029132324877533943,
"loss": 0.8903,
"step": 1045
},
{
"epoch": 0.19886363636363635,
"grad_norm": 0.052002448553744814,
"learning_rate": 0.000291156285184669,
"loss": 0.8673,
"step": 1050
},
{
"epoch": 0.19981060606060605,
"grad_norm": 0.057528545450935206,
"learning_rate": 0.0002909877792305836,
"loss": 0.8693,
"step": 1055
},
{
"epoch": 0.20075757575757575,
"grad_norm": 0.05849975352441284,
"learning_rate": 0.0002908177327542866,
"loss": 0.8806,
"step": 1060
},
{
"epoch": 0.20170454545454544,
"grad_norm": 0.0668330093981767,
"learning_rate": 0.00029064614761381395,
"loss": 0.8573,
"step": 1065
},
{
"epoch": 0.20265151515151514,
"grad_norm": 0.08085163002687007,
"learning_rate": 0.0002904730256840142,
"loss": 0.8588,
"step": 1070
},
{
"epoch": 0.20359848484848486,
"grad_norm": 0.06115289967256575,
"learning_rate": 0.0002902983688565276,
"loss": 0.8489,
"step": 1075
},
{
"epoch": 0.20454545454545456,
"grad_norm": 0.04578661935010964,
"learning_rate": 0.000290122179039766,
"loss": 0.8647,
"step": 1080
},
{
"epoch": 0.20549242424242425,
"grad_norm": 0.04738987289722607,
"learning_rate": 0.00028994445815889135,
"loss": 0.8928,
"step": 1085
},
{
"epoch": 0.20643939393939395,
"grad_norm": 0.048922711283470234,
"learning_rate": 0.00028976520815579516,
"loss": 0.8571,
"step": 1090
},
{
"epoch": 0.20738636363636365,
"grad_norm": 0.042842101095754544,
"learning_rate": 0.000289584430989077,
"loss": 0.8994,
"step": 1095
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.049240088817609676,
"learning_rate": 0.0002894021286340233,
"loss": 0.8703,
"step": 1100
},
{
"epoch": 0.20928030303030304,
"grad_norm": 0.04405475980914324,
"learning_rate": 0.0002892183030825857,
"loss": 0.8697,
"step": 1105
},
{
"epoch": 0.21022727272727273,
"grad_norm": 0.052077127621998655,
"learning_rate": 0.00028903295634335904,
"loss": 0.8995,
"step": 1110
},
{
"epoch": 0.21117424242424243,
"grad_norm": 0.059590406683339404,
"learning_rate": 0.00028884609044155983,
"loss": 0.8798,
"step": 1115
},
{
"epoch": 0.21212121212121213,
"grad_norm": 0.044865973163293216,
"learning_rate": 0.0002886577074190038,
"loss": 0.8965,
"step": 1120
},
{
"epoch": 0.21306818181818182,
"grad_norm": 0.05169860969332574,
"learning_rate": 0.0002884678093340838,
"loss": 0.8554,
"step": 1125
},
{
"epoch": 0.21401515151515152,
"grad_norm": 0.05156894746619203,
"learning_rate": 0.00028827639826174716,
"loss": 0.8727,
"step": 1130
},
{
"epoch": 0.21496212121212122,
"grad_norm": 0.04493193351896756,
"learning_rate": 0.0002880834762934731,
"loss": 0.8659,
"step": 1135
},
{
"epoch": 0.2159090909090909,
"grad_norm": 0.04709600714027476,
"learning_rate": 0.0002878890455372498,
"loss": 0.8494,
"step": 1140
},
{
"epoch": 0.2168560606060606,
"grad_norm": 0.040578982776887075,
"learning_rate": 0.00028769310811755153,
"loss": 0.8345,
"step": 1145
},
{
"epoch": 0.2178030303030303,
"grad_norm": 0.0454120547462,
"learning_rate": 0.0002874956661753152,
"loss": 0.893,
"step": 1150
},
{
"epoch": 0.21875,
"grad_norm": 0.0458718950734734,
"learning_rate": 0.00028729672186791704,
"loss": 0.8453,
"step": 1155
},
{
"epoch": 0.2196969696969697,
"grad_norm": 0.04315005418940203,
"learning_rate": 0.0002870962773691493,
"loss": 0.8389,
"step": 1160
},
{
"epoch": 0.2206439393939394,
"grad_norm": 0.04752379784930806,
"learning_rate": 0.00028689433486919617,
"loss": 0.8673,
"step": 1165
},
{
"epoch": 0.2215909090909091,
"grad_norm": 0.05684552525580732,
"learning_rate": 0.00028669089657460984,
"loss": 0.867,
"step": 1170
},
{
"epoch": 0.22253787878787878,
"grad_norm": 0.050983860484211566,
"learning_rate": 0.00028648596470828673,
"loss": 0.8647,
"step": 1175
},
{
"epoch": 0.22348484848484848,
"grad_norm": 0.046503504621597226,
"learning_rate": 0.0002862795415094427,
"loss": 0.8697,
"step": 1180
},
{
"epoch": 0.22443181818181818,
"grad_norm": 0.0450753851181475,
"learning_rate": 0.0002860716292335891,
"loss": 0.8249,
"step": 1185
},
{
"epoch": 0.22537878787878787,
"grad_norm": 0.04485421555009402,
"learning_rate": 0.0002858622301525078,
"loss": 0.8637,
"step": 1190
},
{
"epoch": 0.22632575757575757,
"grad_norm": 0.04710476072245473,
"learning_rate": 0.0002856513465542263,
"loss": 0.8712,
"step": 1195
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.04590515210754422,
"learning_rate": 0.00028543898074299317,
"loss": 0.8899,
"step": 1200
},
{
"epoch": 0.22821969696969696,
"grad_norm": 0.046677806922824555,
"learning_rate": 0.00028522513503925236,
"loss": 0.8331,
"step": 1205
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.05326891301520586,
"learning_rate": 0.00028500981177961816,
"loss": 0.8506,
"step": 1210
},
{
"epoch": 0.23011363636363635,
"grad_norm": 0.04687694995176648,
"learning_rate": 0.0002847930133168495,
"loss": 0.8718,
"step": 1215
},
{
"epoch": 0.23106060606060605,
"grad_norm": 0.043766596329374095,
"learning_rate": 0.0002845747420198245,
"loss": 0.8355,
"step": 1220
},
{
"epoch": 0.23200757575757575,
"grad_norm": 0.051852335183307466,
"learning_rate": 0.00028435500027351415,
"loss": 0.9018,
"step": 1225
},
{
"epoch": 0.23295454545454544,
"grad_norm": 0.046871786280526524,
"learning_rate": 0.00028413379047895665,
"loss": 0.8773,
"step": 1230
},
{
"epoch": 0.23390151515151514,
"grad_norm": 0.05319047340562311,
"learning_rate": 0.0002839111150532311,
"loss": 0.8744,
"step": 1235
},
{
"epoch": 0.23484848484848486,
"grad_norm": 0.0480557659328724,
"learning_rate": 0.0002836869764294308,
"loss": 0.8543,
"step": 1240
},
{
"epoch": 0.23579545454545456,
"grad_norm": 0.045235971797863456,
"learning_rate": 0.0002834613770566371,
"loss": 0.8811,
"step": 1245
},
{
"epoch": 0.23674242424242425,
"grad_norm": 0.051086084811488776,
"learning_rate": 0.0002832343193998923,
"loss": 0.8688,
"step": 1250
},
{
"epoch": 0.23768939393939395,
"grad_norm": 0.04921720651133015,
"learning_rate": 0.00028300580594017296,
"loss": 0.8556,
"step": 1255
},
{
"epoch": 0.23863636363636365,
"grad_norm": 0.046260991109867027,
"learning_rate": 0.00028277583917436246,
"loss": 0.8536,
"step": 1260
},
{
"epoch": 0.23958333333333334,
"grad_norm": 0.04507558602085682,
"learning_rate": 0.00028254442161522415,
"loss": 0.8606,
"step": 1265
},
{
"epoch": 0.24053030303030304,
"grad_norm": 0.04347528822105258,
"learning_rate": 0.00028231155579137347,
"loss": 0.8224,
"step": 1270
},
{
"epoch": 0.24147727272727273,
"grad_norm": 0.044645406213969646,
"learning_rate": 0.00028207724424725067,
"loss": 0.8103,
"step": 1275
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.04886991006139869,
"learning_rate": 0.0002818414895430929,
"loss": 0.8681,
"step": 1280
},
{
"epoch": 0.24337121212121213,
"grad_norm": 0.050211499173204034,
"learning_rate": 0.000281604294254906,
"loss": 0.8465,
"step": 1285
},
{
"epoch": 0.24431818181818182,
"grad_norm": 0.04407450456467826,
"learning_rate": 0.0002813656609744367,
"loss": 0.8587,
"step": 1290
},
{
"epoch": 0.24526515151515152,
"grad_norm": 0.046396078514299374,
"learning_rate": 0.00028112559230914413,
"loss": 0.8836,
"step": 1295
},
{
"epoch": 0.24621212121212122,
"grad_norm": 0.04992457772876899,
"learning_rate": 0.0002808840908821713,
"loss": 0.847,
"step": 1300
},
{
"epoch": 0.2471590909090909,
"grad_norm": 0.06899088573327124,
"learning_rate": 0.00028064115933231653,
"loss": 0.8284,
"step": 1305
},
{
"epoch": 0.2481060606060606,
"grad_norm": 0.047887893856181384,
"learning_rate": 0.00028039680031400455,
"loss": 0.8428,
"step": 1310
},
{
"epoch": 0.2490530303030303,
"grad_norm": 0.04582410718511601,
"learning_rate": 0.00028015101649725747,
"loss": 0.8384,
"step": 1315
},
{
"epoch": 0.25,
"grad_norm": 0.045636060323846026,
"learning_rate": 0.0002799038105676658,
"loss": 0.843,
"step": 1320
},
{
"epoch": 0.2509469696969697,
"grad_norm": 0.04519189300518443,
"learning_rate": 0.0002796551852263588,
"loss": 0.8908,
"step": 1325
},
{
"epoch": 0.2518939393939394,
"grad_norm": 0.045578564236409914,
"learning_rate": 0.00027940514318997516,
"loss": 0.8572,
"step": 1330
},
{
"epoch": 0.2528409090909091,
"grad_norm": 0.051686507213315025,
"learning_rate": 0.0002791536871906334,
"loss": 0.8619,
"step": 1335
},
{
"epoch": 0.2537878787878788,
"grad_norm": 0.04345687899653502,
"learning_rate": 0.0002789008199759018,
"loss": 0.8459,
"step": 1340
},
{
"epoch": 0.2547348484848485,
"grad_norm": 0.046507137153713074,
"learning_rate": 0.0002786465443087685,
"loss": 0.8607,
"step": 1345
},
{
"epoch": 0.2556818181818182,
"grad_norm": 0.042657023216453836,
"learning_rate": 0.0002783908629676112,
"loss": 0.8548,
"step": 1350
},
{
"epoch": 0.2566287878787879,
"grad_norm": 0.040538053747547015,
"learning_rate": 0.00027813377874616707,
"loss": 0.8389,
"step": 1355
},
{
"epoch": 0.25757575757575757,
"grad_norm": 0.04525382597081455,
"learning_rate": 0.0002778752944535019,
"loss": 0.8372,
"step": 1360
},
{
"epoch": 0.2585227272727273,
"grad_norm": 0.04234205633912051,
"learning_rate": 0.00027761541291397964,
"loss": 0.8426,
"step": 1365
},
{
"epoch": 0.25946969696969696,
"grad_norm": 0.04757806675030589,
"learning_rate": 0.00027735413696723123,
"loss": 0.8459,
"step": 1370
},
{
"epoch": 0.2604166666666667,
"grad_norm": 0.043206314979838206,
"learning_rate": 0.00027709146946812413,
"loss": 0.8384,
"step": 1375
},
{
"epoch": 0.26136363636363635,
"grad_norm": 0.04812617395176995,
"learning_rate": 0.00027682741328673063,
"loss": 0.83,
"step": 1380
},
{
"epoch": 0.2623106060606061,
"grad_norm": 0.04595880004567738,
"learning_rate": 0.0002765619713082965,
"loss": 0.8704,
"step": 1385
},
{
"epoch": 0.26325757575757575,
"grad_norm": 0.04342248392045638,
"learning_rate": 0.0002762951464332098,
"loss": 0.8545,
"step": 1390
},
{
"epoch": 0.26420454545454547,
"grad_norm": 0.0468715942875786,
"learning_rate": 0.0002760269415769691,
"loss": 0.854,
"step": 1395
},
{
"epoch": 0.26515151515151514,
"grad_norm": 0.045557679264808,
"learning_rate": 0.0002757573596701511,
"loss": 0.8543,
"step": 1400
},
{
"epoch": 0.26609848484848486,
"grad_norm": 0.045277257320571196,
"learning_rate": 0.0002754864036583795,
"loss": 0.8519,
"step": 1405
},
{
"epoch": 0.26704545454545453,
"grad_norm": 0.04446480155061431,
"learning_rate": 0.000275214076502292,
"loss": 0.852,
"step": 1410
},
{
"epoch": 0.26799242424242425,
"grad_norm": 0.04286461978075112,
"learning_rate": 0.00027494038117750855,
"loss": 0.873,
"step": 1415
},
{
"epoch": 0.2689393939393939,
"grad_norm": 0.04635120751169076,
"learning_rate": 0.0002746653206745984,
"loss": 0.8675,
"step": 1420
},
{
"epoch": 0.26988636363636365,
"grad_norm": 0.04897795119080024,
"learning_rate": 0.0002743888979990477,
"loss": 0.8489,
"step": 1425
},
{
"epoch": 0.2708333333333333,
"grad_norm": 0.045351523544172836,
"learning_rate": 0.00027411111617122656,
"loss": 0.8815,
"step": 1430
},
{
"epoch": 0.27178030303030304,
"grad_norm": 0.046888003925487816,
"learning_rate": 0.00027383197822635597,
"loss": 0.8619,
"step": 1435
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.050747409625094775,
"learning_rate": 0.0002735514872144749,
"loss": 0.877,
"step": 1440
},
{
"epoch": 0.27367424242424243,
"grad_norm": 0.04867163124702627,
"learning_rate": 0.0002732696462004066,
"loss": 0.86,
"step": 1445
},
{
"epoch": 0.2746212121212121,
"grad_norm": 0.053312130647735934,
"learning_rate": 0.00027298645826372527,
"loss": 0.8609,
"step": 1450
},
{
"epoch": 0.2755681818181818,
"grad_norm": 0.04023418732684079,
"learning_rate": 0.0002727019264987227,
"loss": 0.8598,
"step": 1455
},
{
"epoch": 0.2765151515151515,
"grad_norm": 0.04347366104067643,
"learning_rate": 0.000272416054014374,
"loss": 0.8443,
"step": 1460
},
{
"epoch": 0.2774621212121212,
"grad_norm": 0.042854675811405736,
"learning_rate": 0.00027212884393430396,
"loss": 0.8632,
"step": 1465
},
{
"epoch": 0.2784090909090909,
"grad_norm": 0.04461599878281101,
"learning_rate": 0.0002718402993967526,
"loss": 0.8469,
"step": 1470
},
{
"epoch": 0.2793560606060606,
"grad_norm": 0.0458799502796299,
"learning_rate": 0.0002715504235545412,
"loss": 0.8675,
"step": 1475
},
{
"epoch": 0.2803030303030303,
"grad_norm": 0.041761756053765885,
"learning_rate": 0.0002712592195750378,
"loss": 0.8751,
"step": 1480
},
{
"epoch": 0.28125,
"grad_norm": 0.04293009223271159,
"learning_rate": 0.0002709666906401224,
"loss": 0.8591,
"step": 1485
},
{
"epoch": 0.2821969696969697,
"grad_norm": 0.042628404150602366,
"learning_rate": 0.00027067283994615225,
"loss": 0.8314,
"step": 1490
},
{
"epoch": 0.2831439393939394,
"grad_norm": 0.043803929434188336,
"learning_rate": 0.0002703776707039271,
"loss": 0.8515,
"step": 1495
},
{
"epoch": 0.2840909090909091,
"grad_norm": 0.047256485311155456,
"learning_rate": 0.00027008118613865406,
"loss": 0.8376,
"step": 1500
},
{
"epoch": 0.2850378787878788,
"grad_norm": 0.046926959946348615,
"learning_rate": 0.00026978338948991206,
"loss": 0.8423,
"step": 1505
},
{
"epoch": 0.2859848484848485,
"grad_norm": 0.04941952831110132,
"learning_rate": 0.0002694842840116169,
"loss": 0.8564,
"step": 1510
},
{
"epoch": 0.2869318181818182,
"grad_norm": 0.04638823285342314,
"learning_rate": 0.0002691838729719854,
"loss": 0.851,
"step": 1515
},
{
"epoch": 0.2878787878787879,
"grad_norm": 0.051062848616744594,
"learning_rate": 0.0002688821596534997,
"loss": 0.8592,
"step": 1520
},
{
"epoch": 0.28882575757575757,
"grad_norm": 0.048642765971924094,
"learning_rate": 0.00026857914735287173,
"loss": 0.8651,
"step": 1525
},
{
"epoch": 0.2897727272727273,
"grad_norm": 0.041614396540575575,
"learning_rate": 0.0002682748393810066,
"loss": 0.853,
"step": 1530
},
{
"epoch": 0.29071969696969696,
"grad_norm": 0.04037850703898104,
"learning_rate": 0.0002679692390629669,
"loss": 0.8714,
"step": 1535
},
{
"epoch": 0.2916666666666667,
"grad_norm": 0.045919213909734996,
"learning_rate": 0.0002676623497379363,
"loss": 0.8526,
"step": 1540
},
{
"epoch": 0.29261363636363635,
"grad_norm": 0.0435916558717206,
"learning_rate": 0.00026735417475918285,
"loss": 0.8474,
"step": 1545
},
{
"epoch": 0.2935606060606061,
"grad_norm": 0.04829847525687287,
"learning_rate": 0.00026704471749402256,
"loss": 0.8548,
"step": 1550
},
{
"epoch": 0.29450757575757575,
"grad_norm": 0.04805727825764429,
"learning_rate": 0.0002667339813237824,
"loss": 0.8453,
"step": 1555
},
{
"epoch": 0.29545454545454547,
"grad_norm": 0.05010295983510741,
"learning_rate": 0.0002664219696437635,
"loss": 0.8416,
"step": 1560
},
{
"epoch": 0.29640151515151514,
"grad_norm": 0.04388851666661931,
"learning_rate": 0.00026610868586320416,
"loss": 0.8341,
"step": 1565
},
{
"epoch": 0.29734848484848486,
"grad_norm": 0.045305461318018866,
"learning_rate": 0.00026579413340524233,
"loss": 0.8322,
"step": 1570
},
{
"epoch": 0.29829545454545453,
"grad_norm": 0.04178070112825466,
"learning_rate": 0.0002654783157068785,
"loss": 0.8798,
"step": 1575
},
{
"epoch": 0.29924242424242425,
"grad_norm": 0.039660241288771,
"learning_rate": 0.00026516123621893756,
"loss": 0.8512,
"step": 1580
},
{
"epoch": 0.3001893939393939,
"grad_norm": 0.04956139252725399,
"learning_rate": 0.0002648428984060321,
"loss": 0.8531,
"step": 1585
},
{
"epoch": 0.30113636363636365,
"grad_norm": 0.04050711765679051,
"learning_rate": 0.0002645233057465235,
"loss": 0.8714,
"step": 1590
},
{
"epoch": 0.3020833333333333,
"grad_norm": 0.044882658138218526,
"learning_rate": 0.00026420246173248466,
"loss": 0.8576,
"step": 1595
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.0443601837928335,
"learning_rate": 0.00026388036986966146,
"loss": 0.8458,
"step": 1600
},
{
"epoch": 0.3039772727272727,
"grad_norm": 0.04445201847842639,
"learning_rate": 0.00026355703367743463,
"loss": 0.8262,
"step": 1605
},
{
"epoch": 0.30492424242424243,
"grad_norm": 0.04296295815749959,
"learning_rate": 0.0002632324566887811,
"loss": 0.852,
"step": 1610
},
{
"epoch": 0.3058712121212121,
"grad_norm": 0.04595484323544366,
"learning_rate": 0.0002629066424502358,
"loss": 0.8712,
"step": 1615
},
{
"epoch": 0.3068181818181818,
"grad_norm": 0.05135389183761192,
"learning_rate": 0.0002625795945218523,
"loss": 0.8686,
"step": 1620
},
{
"epoch": 0.3077651515151515,
"grad_norm": 0.05402260665782284,
"learning_rate": 0.00026225131647716454,
"loss": 0.8705,
"step": 1625
},
{
"epoch": 0.3087121212121212,
"grad_norm": 0.04647458435053281,
"learning_rate": 0.00026192181190314734,
"loss": 0.8497,
"step": 1630
},
{
"epoch": 0.3096590909090909,
"grad_norm": 0.04501320267899854,
"learning_rate": 0.0002615910844001774,
"loss": 0.8699,
"step": 1635
},
{
"epoch": 0.3106060606060606,
"grad_norm": 0.044016650809303635,
"learning_rate": 0.0002612591375819939,
"loss": 0.8451,
"step": 1640
},
{
"epoch": 0.3115530303030303,
"grad_norm": 0.04270641863707379,
"learning_rate": 0.0002609259750756591,
"loss": 0.8264,
"step": 1645
},
{
"epoch": 0.3125,
"grad_norm": 0.0454677296060317,
"learning_rate": 0.0002605916005215186,
"loss": 0.8344,
"step": 1650
},
{
"epoch": 0.3134469696969697,
"grad_norm": 0.04056658550140151,
"learning_rate": 0.0002602560175731615,
"loss": 0.8187,
"step": 1655
},
{
"epoch": 0.3143939393939394,
"grad_norm": 0.047795906369322495,
"learning_rate": 0.0002599192298973808,
"loss": 0.8596,
"step": 1660
},
{
"epoch": 0.3153409090909091,
"grad_norm": 0.04746275776015859,
"learning_rate": 0.00025958124117413296,
"loss": 0.8373,
"step": 1665
},
{
"epoch": 0.3162878787878788,
"grad_norm": 0.0490167919610274,
"learning_rate": 0.0002592420550964979,
"loss": 0.8605,
"step": 1670
},
{
"epoch": 0.3172348484848485,
"grad_norm": 0.0425174441426416,
"learning_rate": 0.00025890167537063856,
"loss": 0.8466,
"step": 1675
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.04266217832901279,
"learning_rate": 0.0002585601057157605,
"loss": 0.853,
"step": 1680
},
{
"epoch": 0.3191287878787879,
"grad_norm": 0.042298923569418494,
"learning_rate": 0.00025821734986407113,
"loss": 0.852,
"step": 1685
},
{
"epoch": 0.32007575757575757,
"grad_norm": 0.03839109699626175,
"learning_rate": 0.00025787341156073915,
"loss": 0.8079,
"step": 1690
},
{
"epoch": 0.3210227272727273,
"grad_norm": 0.046090433170696436,
"learning_rate": 0.0002575282945638532,
"loss": 0.8622,
"step": 1695
},
{
"epoch": 0.32196969696969696,
"grad_norm": 0.043394806500603955,
"learning_rate": 0.0002571820026443814,
"loss": 0.8569,
"step": 1700
},
{
"epoch": 0.3229166666666667,
"grad_norm": 0.04364474268105156,
"learning_rate": 0.00025683453958612963,
"loss": 0.859,
"step": 1705
},
{
"epoch": 0.32386363636363635,
"grad_norm": 0.04738452037799788,
"learning_rate": 0.0002564859091857004,
"loss": 0.8639,
"step": 1710
},
{
"epoch": 0.3248106060606061,
"grad_norm": 0.046706389086655475,
"learning_rate": 0.0002561361152524513,
"loss": 0.8685,
"step": 1715
},
{
"epoch": 0.32575757575757575,
"grad_norm": 0.04065493744655141,
"learning_rate": 0.0002557851616084536,
"loss": 0.8287,
"step": 1720
},
{
"epoch": 0.32670454545454547,
"grad_norm": 0.042170889184096456,
"learning_rate": 0.00025543305208845015,
"loss": 0.8397,
"step": 1725
},
{
"epoch": 0.32765151515151514,
"grad_norm": 0.057009989738631466,
"learning_rate": 0.0002550797905398136,
"loss": 0.8424,
"step": 1730
},
{
"epoch": 0.32859848484848486,
"grad_norm": 0.054999335357357654,
"learning_rate": 0.0002547253808225045,
"loss": 0.8493,
"step": 1735
},
{
"epoch": 0.32954545454545453,
"grad_norm": 0.04878510537743201,
"learning_rate": 0.0002543698268090291,
"loss": 0.8687,
"step": 1740
},
{
"epoch": 0.33049242424242425,
"grad_norm": 0.04555832298125919,
"learning_rate": 0.0002540131323843968,
"loss": 0.848,
"step": 1745
},
{
"epoch": 0.3314393939393939,
"grad_norm": 0.04559121786828398,
"learning_rate": 0.0002536553014460778,
"loss": 0.8422,
"step": 1750
},
{
"epoch": 0.33238636363636365,
"grad_norm": 0.0412750415651045,
"learning_rate": 0.00025329633790396086,
"loss": 0.8528,
"step": 1755
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.041659064100662745,
"learning_rate": 0.00025293624568031,
"loss": 0.8587,
"step": 1760
},
{
"epoch": 0.33428030303030304,
"grad_norm": 0.04017536408178578,
"learning_rate": 0.0002525750287097221,
"loss": 0.8273,
"step": 1765
},
{
"epoch": 0.3352272727272727,
"grad_norm": 0.043594334725810896,
"learning_rate": 0.00025221269093908365,
"loss": 0.8344,
"step": 1770
},
{
"epoch": 0.33617424242424243,
"grad_norm": 0.03975857698007793,
"learning_rate": 0.00025184923632752776,
"loss": 0.8312,
"step": 1775
},
{
"epoch": 0.3371212121212121,
"grad_norm": 0.043307661902050605,
"learning_rate": 0.0002514846688463909,
"loss": 0.8384,
"step": 1780
},
{
"epoch": 0.3380681818181818,
"grad_norm": 0.041752317967921206,
"learning_rate": 0.00025111899247916926,
"loss": 0.8407,
"step": 1785
},
{
"epoch": 0.3390151515151515,
"grad_norm": 0.042776187451381745,
"learning_rate": 0.0002507522112214758,
"loss": 0.8217,
"step": 1790
},
{
"epoch": 0.3399621212121212,
"grad_norm": 0.04588631965260685,
"learning_rate": 0.0002503843290809958,
"loss": 0.8546,
"step": 1795
},
{
"epoch": 0.3409090909090909,
"grad_norm": 0.043083702336913526,
"learning_rate": 0.00025001535007744373,
"loss": 0.8378,
"step": 1800
},
{
"epoch": 0.3418560606060606,
"grad_norm": 0.04134116740011592,
"learning_rate": 0.00024964527824251903,
"loss": 0.8525,
"step": 1805
},
{
"epoch": 0.3428030303030303,
"grad_norm": 0.04016855550268295,
"learning_rate": 0.00024927411761986216,
"loss": 0.8114,
"step": 1810
},
{
"epoch": 0.34375,
"grad_norm": 0.04440153926149279,
"learning_rate": 0.0002489018722650103,
"loss": 0.8502,
"step": 1815
},
{
"epoch": 0.3446969696969697,
"grad_norm": 0.056819456110476556,
"learning_rate": 0.00024852854624535307,
"loss": 0.8235,
"step": 1820
},
{
"epoch": 0.3456439393939394,
"grad_norm": 0.05226275677923263,
"learning_rate": 0.00024815414364008826,
"loss": 0.8361,
"step": 1825
},
{
"epoch": 0.3465909090909091,
"grad_norm": 0.044209574304730354,
"learning_rate": 0.0002477786685401769,
"loss": 0.8408,
"step": 1830
},
{
"epoch": 0.3475378787878788,
"grad_norm": 0.04524443871766083,
"learning_rate": 0.0002474021250482991,
"loss": 0.837,
"step": 1835
},
{
"epoch": 0.3484848484848485,
"grad_norm": 0.045412892342884655,
"learning_rate": 0.0002470245172788086,
"loss": 0.8386,
"step": 1840
},
{
"epoch": 0.3494318181818182,
"grad_norm": 0.04952839628457116,
"learning_rate": 0.0002466458493576882,
"loss": 0.8396,
"step": 1845
},
{
"epoch": 0.3503787878787879,
"grad_norm": 0.05660754811136995,
"learning_rate": 0.0002462661254225047,
"loss": 0.881,
"step": 1850
},
{
"epoch": 0.35132575757575757,
"grad_norm": 0.04478496122656017,
"learning_rate": 0.00024588534962236344,
"loss": 0.8725,
"step": 1855
},
{
"epoch": 0.3522727272727273,
"grad_norm": 0.047218982842698924,
"learning_rate": 0.0002455035261178632,
"loss": 0.8637,
"step": 1860
},
{
"epoch": 0.35321969696969696,
"grad_norm": 0.041481601312663416,
"learning_rate": 0.0002451206590810506,
"loss": 0.8217,
"step": 1865
},
{
"epoch": 0.3541666666666667,
"grad_norm": 0.04270379380461237,
"learning_rate": 0.0002447367526953746,
"loss": 0.8779,
"step": 1870
},
{
"epoch": 0.35511363636363635,
"grad_norm": 0.03961220993813286,
"learning_rate": 0.0002443518111556407,
"loss": 0.8625,
"step": 1875
},
{
"epoch": 0.3560606060606061,
"grad_norm": 0.042995120300460946,
"learning_rate": 0.00024396583866796517,
"loss": 0.8335,
"step": 1880
},
{
"epoch": 0.35700757575757575,
"grad_norm": 0.04563005281967126,
"learning_rate": 0.00024357883944972904,
"loss": 0.8734,
"step": 1885
},
{
"epoch": 0.35795454545454547,
"grad_norm": 0.046070346094399604,
"learning_rate": 0.00024319081772953213,
"loss": 0.8503,
"step": 1890
},
{
"epoch": 0.35890151515151514,
"grad_norm": 0.046399508495104796,
"learning_rate": 0.0002428017777471467,
"loss": 0.8468,
"step": 1895
},
{
"epoch": 0.35984848484848486,
"grad_norm": 0.04153432929534662,
"learning_rate": 0.0002424117237534712,
"loss": 0.8511,
"step": 1900
},
{
"epoch": 0.36079545454545453,
"grad_norm": 0.04277554441903786,
"learning_rate": 0.0002420206600104839,
"loss": 0.8517,
"step": 1905
},
{
"epoch": 0.36174242424242425,
"grad_norm": 0.041758108039158866,
"learning_rate": 0.0002416285907911961,
"loss": 0.8114,
"step": 1910
},
{
"epoch": 0.3626893939393939,
"grad_norm": 0.05362310318010687,
"learning_rate": 0.0002412355203796056,
"loss": 0.8584,
"step": 1915
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.0415342758524878,
"learning_rate": 0.00024084145307064997,
"loss": 0.8338,
"step": 1920
},
{
"epoch": 0.3645833333333333,
"grad_norm": 0.04660724457587357,
"learning_rate": 0.00024044639317015942,
"loss": 0.8458,
"step": 1925
},
{
"epoch": 0.36553030303030304,
"grad_norm": 0.0420525971980156,
"learning_rate": 0.00024005034499480983,
"loss": 0.8127,
"step": 1930
},
{
"epoch": 0.3664772727272727,
"grad_norm": 0.04142360417103535,
"learning_rate": 0.0002396533128720757,
"loss": 0.8255,
"step": 1935
},
{
"epoch": 0.36742424242424243,
"grad_norm": 0.04258598086280745,
"learning_rate": 0.0002392553011401827,
"loss": 0.8083,
"step": 1940
},
{
"epoch": 0.3683712121212121,
"grad_norm": 0.04277812754465849,
"learning_rate": 0.00023885631414806026,
"loss": 0.8093,
"step": 1945
},
{
"epoch": 0.3693181818181818,
"grad_norm": 0.048152407374910465,
"learning_rate": 0.0002384563562552943,
"loss": 0.8265,
"step": 1950
},
{
"epoch": 0.3702651515151515,
"grad_norm": 0.047713607129726394,
"learning_rate": 0.00023805543183207927,
"loss": 0.8302,
"step": 1955
},
{
"epoch": 0.3712121212121212,
"grad_norm": 0.04316835360005878,
"learning_rate": 0.00023765354525917063,
"loss": 0.8699,
"step": 1960
},
{
"epoch": 0.3721590909090909,
"grad_norm": 0.04318616029335284,
"learning_rate": 0.0002372507009278368,
"loss": 0.8369,
"step": 1965
},
{
"epoch": 0.3731060606060606,
"grad_norm": 0.04234533104631891,
"learning_rate": 0.00023684690323981142,
"loss": 0.8252,
"step": 1970
},
{
"epoch": 0.3740530303030303,
"grad_norm": 0.03731733862303514,
"learning_rate": 0.00023644215660724503,
"loss": 0.8043,
"step": 1975
},
{
"epoch": 0.375,
"grad_norm": 0.04858576668810662,
"learning_rate": 0.00023603646545265687,
"loss": 0.8011,
"step": 1980
},
{
"epoch": 0.3759469696969697,
"grad_norm": 0.03947493260183539,
"learning_rate": 0.00023562983420888684,
"loss": 0.8456,
"step": 1985
},
{
"epoch": 0.3768939393939394,
"grad_norm": 0.03987615045295327,
"learning_rate": 0.00023522226731904664,
"loss": 0.8081,
"step": 1990
},
{
"epoch": 0.3778409090909091,
"grad_norm": 0.04467453055781031,
"learning_rate": 0.0002348137692364715,
"loss": 0.8196,
"step": 1995
},
{
"epoch": 0.3787878787878788,
"grad_norm": 0.04289778568763919,
"learning_rate": 0.00023440434442467152,
"loss": 0.8242,
"step": 2000
},
{
"epoch": 0.3797348484848485,
"grad_norm": 0.03957134697373444,
"learning_rate": 0.00023399399735728277,
"loss": 0.8271,
"step": 2005
},
{
"epoch": 0.3806818181818182,
"grad_norm": 0.04057581593775104,
"learning_rate": 0.00023358273251801847,
"loss": 0.7991,
"step": 2010
},
{
"epoch": 0.3816287878787879,
"grad_norm": 0.044395062218471865,
"learning_rate": 0.00023317055440062,
"loss": 0.8398,
"step": 2015
},
{
"epoch": 0.38257575757575757,
"grad_norm": 0.04570817682106231,
"learning_rate": 0.00023275746750880784,
"loss": 0.8499,
"step": 2020
},
{
"epoch": 0.3835227272727273,
"grad_norm": 0.04410857496355408,
"learning_rate": 0.00023234347635623233,
"loss": 0.8344,
"step": 2025
},
{
"epoch": 0.38446969696969696,
"grad_norm": 0.044494824681638484,
"learning_rate": 0.0002319285854664242,
"loss": 0.8177,
"step": 2030
},
{
"epoch": 0.3854166666666667,
"grad_norm": 0.05291529187135586,
"learning_rate": 0.00023151279937274548,
"loss": 0.8162,
"step": 2035
},
{
"epoch": 0.38636363636363635,
"grad_norm": 0.0475425548888116,
"learning_rate": 0.00023109612261833963,
"loss": 0.836,
"step": 2040
},
{
"epoch": 0.3873106060606061,
"grad_norm": 0.04795570932520818,
"learning_rate": 0.00023067855975608204,
"loss": 0.8017,
"step": 2045
},
{
"epoch": 0.38825757575757575,
"grad_norm": 0.04351083558421903,
"learning_rate": 0.0002302601153485304,
"loss": 0.8304,
"step": 2050
},
{
"epoch": 0.38920454545454547,
"grad_norm": 0.03951679906520293,
"learning_rate": 0.00022984079396787453,
"loss": 0.8141,
"step": 2055
},
{
"epoch": 0.39015151515151514,
"grad_norm": 0.04196816207750975,
"learning_rate": 0.00022942060019588681,
"loss": 0.8152,
"step": 2060
},
{
"epoch": 0.39109848484848486,
"grad_norm": 0.04118592006087123,
"learning_rate": 0.00022899953862387182,
"loss": 0.8221,
"step": 2065
},
{
"epoch": 0.39204545454545453,
"grad_norm": 0.043398944942405406,
"learning_rate": 0.00022857761385261624,
"loss": 0.8784,
"step": 2070
},
{
"epoch": 0.39299242424242425,
"grad_norm": 0.048877313557472005,
"learning_rate": 0.0002281548304923387,
"loss": 0.8301,
"step": 2075
},
{
"epoch": 0.3939393939393939,
"grad_norm": 0.04660152034578749,
"learning_rate": 0.0002277311931626393,
"loss": 0.8383,
"step": 2080
},
{
"epoch": 0.39488636363636365,
"grad_norm": 0.043844576323543855,
"learning_rate": 0.00022730670649244913,
"loss": 0.8598,
"step": 2085
},
{
"epoch": 0.3958333333333333,
"grad_norm": 0.04598334857841716,
"learning_rate": 0.00022688137511997977,
"loss": 0.8339,
"step": 2090
},
{
"epoch": 0.39678030303030304,
"grad_norm": 0.044197199133415584,
"learning_rate": 0.00022645520369267246,
"loss": 0.8444,
"step": 2095
},
{
"epoch": 0.3977272727272727,
"grad_norm": 0.04324972778637147,
"learning_rate": 0.00022602819686714745,
"loss": 0.8347,
"step": 2100
},
{
"epoch": 0.39867424242424243,
"grad_norm": 0.07113871411866793,
"learning_rate": 0.00022560035930915308,
"loss": 0.8084,
"step": 2105
},
{
"epoch": 0.3996212121212121,
"grad_norm": 0.04394138759860652,
"learning_rate": 0.0002251716956935149,
"loss": 0.7981,
"step": 2110
},
{
"epoch": 0.4005681818181818,
"grad_norm": 0.0418228499942872,
"learning_rate": 0.00022474221070408436,
"loss": 0.8289,
"step": 2115
},
{
"epoch": 0.4015151515151515,
"grad_norm": 0.04161882179160139,
"learning_rate": 0.00022431190903368786,
"loss": 0.847,
"step": 2120
},
{
"epoch": 0.4024621212121212,
"grad_norm": 0.043508618425632965,
"learning_rate": 0.00022388079538407523,
"loss": 0.8437,
"step": 2125
},
{
"epoch": 0.4034090909090909,
"grad_norm": 0.0411877941963478,
"learning_rate": 0.00022344887446586865,
"loss": 0.8397,
"step": 2130
},
{
"epoch": 0.4043560606060606,
"grad_norm": 0.044653146921684504,
"learning_rate": 0.00022301615099851104,
"loss": 0.8387,
"step": 2135
},
{
"epoch": 0.4053030303030303,
"grad_norm": 0.039794410056582685,
"learning_rate": 0.00022258262971021437,
"loss": 0.8602,
"step": 2140
},
{
"epoch": 0.40625,
"grad_norm": 0.04159725529141693,
"learning_rate": 0.00022214831533790813,
"loss": 0.8418,
"step": 2145
},
{
"epoch": 0.4071969696969697,
"grad_norm": 0.04333570277742372,
"learning_rate": 0.00022171321262718765,
"loss": 0.8405,
"step": 2150
},
{
"epoch": 0.4081439393939394,
"grad_norm": 0.041080619204016525,
"learning_rate": 0.00022127732633226205,
"loss": 0.812,
"step": 2155
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.040348407175793716,
"learning_rate": 0.0002208406612159024,
"loss": 0.814,
"step": 2160
},
{
"epoch": 0.4100378787878788,
"grad_norm": 0.04031482700237761,
"learning_rate": 0.0002204032220493897,
"loss": 0.8147,
"step": 2165
},
{
"epoch": 0.4109848484848485,
"grad_norm": 0.039485799909013825,
"learning_rate": 0.00021996501361246277,
"loss": 0.8176,
"step": 2170
},
{
"epoch": 0.4119318181818182,
"grad_norm": 0.04055755183348957,
"learning_rate": 0.00021952604069326579,
"loss": 0.7957,
"step": 2175
},
{
"epoch": 0.4128787878787879,
"grad_norm": 0.042423918520902,
"learning_rate": 0.0002190863080882964,
"loss": 0.8233,
"step": 2180
},
{
"epoch": 0.41382575757575757,
"grad_norm": 0.0425176465118192,
"learning_rate": 0.00021864582060235278,
"loss": 0.8248,
"step": 2185
},
{
"epoch": 0.4147727272727273,
"grad_norm": 0.04617541469982274,
"learning_rate": 0.00021820458304848165,
"loss": 0.8517,
"step": 2190
},
{
"epoch": 0.41571969696969696,
"grad_norm": 0.04672199768475857,
"learning_rate": 0.0002177626002479254,
"loss": 0.8431,
"step": 2195
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.04438315570807549,
"learning_rate": 0.00021731987703006933,
"loss": 0.8259,
"step": 2200
},
{
"epoch": 0.41761363636363635,
"grad_norm": 0.04427543226414618,
"learning_rate": 0.00021687641823238914,
"loss": 0.8297,
"step": 2205
},
{
"epoch": 0.4185606060606061,
"grad_norm": 0.046143213680102727,
"learning_rate": 0.00021643222870039788,
"loss": 0.8183,
"step": 2210
},
{
"epoch": 0.41950757575757575,
"grad_norm": 0.05133388935241887,
"learning_rate": 0.00021598731328759316,
"loss": 0.8433,
"step": 2215
},
{
"epoch": 0.42045454545454547,
"grad_norm": 0.04409942894634961,
"learning_rate": 0.0002155416768554039,
"loss": 0.8341,
"step": 2220
},
{
"epoch": 0.42140151515151514,
"grad_norm": 0.040874633772368184,
"learning_rate": 0.00021509532427313745,
"loss": 0.8257,
"step": 2225
},
{
"epoch": 0.42234848484848486,
"grad_norm": 0.04536497602517255,
"learning_rate": 0.00021464826041792616,
"loss": 0.8265,
"step": 2230
},
{
"epoch": 0.42329545454545453,
"grad_norm": 0.04063288891442798,
"learning_rate": 0.0002142004901746743,
"loss": 0.8157,
"step": 2235
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.04304936310139827,
"learning_rate": 0.00021375201843600448,
"loss": 0.8154,
"step": 2240
},
{
"epoch": 0.4251893939393939,
"grad_norm": 0.03886996255738611,
"learning_rate": 0.00021330285010220444,
"loss": 0.8064,
"step": 2245
},
{
"epoch": 0.42613636363636365,
"grad_norm": 0.0439951415809052,
"learning_rate": 0.00021285299008117327,
"loss": 0.8189,
"step": 2250
},
{
"epoch": 0.4270833333333333,
"grad_norm": 0.04105867295635526,
"learning_rate": 0.00021240244328836786,
"loss": 0.8042,
"step": 2255
},
{
"epoch": 0.42803030303030304,
"grad_norm": 0.0413161086382512,
"learning_rate": 0.0002119512146467492,
"loss": 0.8416,
"step": 2260
},
{
"epoch": 0.4289772727272727,
"grad_norm": 0.04014206208769556,
"learning_rate": 0.00021149930908672868,
"loss": 0.8185,
"step": 2265
},
{
"epoch": 0.42992424242424243,
"grad_norm": 0.04829740014687004,
"learning_rate": 0.00021104673154611408,
"loss": 0.8361,
"step": 2270
},
{
"epoch": 0.4308712121212121,
"grad_norm": 0.03905046798702236,
"learning_rate": 0.0002105934869700556,
"loss": 0.8242,
"step": 2275
},
{
"epoch": 0.4318181818181818,
"grad_norm": 0.053526559819707935,
"learning_rate": 0.00021013958031099205,
"loss": 0.8426,
"step": 2280
},
{
"epoch": 0.4327651515151515,
"grad_norm": 0.04462350019901078,
"learning_rate": 0.0002096850165285964,
"loss": 0.8408,
"step": 2285
},
{
"epoch": 0.4337121212121212,
"grad_norm": 0.04177938781732431,
"learning_rate": 0.00020922980058972194,
"loss": 0.8295,
"step": 2290
},
{
"epoch": 0.4346590909090909,
"grad_norm": 0.03988703975583103,
"learning_rate": 0.00020877393746834768,
"loss": 0.8324,
"step": 2295
},
{
"epoch": 0.4356060606060606,
"grad_norm": 0.04147526586188008,
"learning_rate": 0.0002083174321455243,
"loss": 0.8388,
"step": 2300
},
{
"epoch": 0.4365530303030303,
"grad_norm": 0.038044695307941134,
"learning_rate": 0.0002078602896093194,
"loss": 0.7954,
"step": 2305
},
{
"epoch": 0.4375,
"grad_norm": 0.045559472118838124,
"learning_rate": 0.00020740251485476345,
"loss": 0.8813,
"step": 2310
},
{
"epoch": 0.4384469696969697,
"grad_norm": 0.06279101091973911,
"learning_rate": 0.0002069441128837947,
"loss": 0.839,
"step": 2315
},
{
"epoch": 0.4393939393939394,
"grad_norm": 0.04692715704201686,
"learning_rate": 0.00020648508870520476,
"loss": 0.8352,
"step": 2320
},
{
"epoch": 0.4403409090909091,
"grad_norm": 0.051468594729952064,
"learning_rate": 0.00020602544733458418,
"loss": 0.839,
"step": 2325
},
{
"epoch": 0.4412878787878788,
"grad_norm": 0.049853534068296984,
"learning_rate": 0.00020556519379426693,
"loss": 0.8457,
"step": 2330
},
{
"epoch": 0.4422348484848485,
"grad_norm": 0.043981249031014996,
"learning_rate": 0.0002051043331132762,
"loss": 0.8371,
"step": 2335
},
{
"epoch": 0.4431818181818182,
"grad_norm": 0.044420240602267035,
"learning_rate": 0.00020464287032726913,
"loss": 0.889,
"step": 2340
},
{
"epoch": 0.4441287878787879,
"grad_norm": 0.0386102436252357,
"learning_rate": 0.00020418081047848187,
"loss": 0.8372,
"step": 2345
},
{
"epoch": 0.44507575757575757,
"grad_norm": 0.04140201905259435,
"learning_rate": 0.00020371815861567428,
"loss": 0.8336,
"step": 2350
},
{
"epoch": 0.4460227272727273,
"grad_norm": 0.04249364186602932,
"learning_rate": 0.00020325491979407523,
"loss": 0.8116,
"step": 2355
},
{
"epoch": 0.44696969696969696,
"grad_norm": 0.04505263140825889,
"learning_rate": 0.00020279109907532693,
"loss": 0.8089,
"step": 2360
},
{
"epoch": 0.4479166666666667,
"grad_norm": 0.04905451772451616,
"learning_rate": 0.0002023267015274296,
"loss": 0.8161,
"step": 2365
},
{
"epoch": 0.44886363636363635,
"grad_norm": 0.044083131589713095,
"learning_rate": 0.0002018617322246866,
"loss": 0.7928,
"step": 2370
},
{
"epoch": 0.4498106060606061,
"grad_norm": 0.04087182152366722,
"learning_rate": 0.0002013961962476484,
"loss": 0.8176,
"step": 2375
},
{
"epoch": 0.45075757575757575,
"grad_norm": 0.03992981513666836,
"learning_rate": 0.0002009300986830574,
"loss": 0.8202,
"step": 2380
},
{
"epoch": 0.45170454545454547,
"grad_norm": 0.04118122404075756,
"learning_rate": 0.00020046344462379222,
"loss": 0.8084,
"step": 2385
},
{
"epoch": 0.45265151515151514,
"grad_norm": 0.03836846591943074,
"learning_rate": 0.00019999623916881217,
"loss": 0.7813,
"step": 2390
},
{
"epoch": 0.45359848484848486,
"grad_norm": 0.042256616559667545,
"learning_rate": 0.0001995284874231014,
"loss": 0.8405,
"step": 2395
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.041816536986922656,
"learning_rate": 0.00019906019449761325,
"loss": 0.8265,
"step": 2400
},
{
"epoch": 0.45549242424242425,
"grad_norm": 0.03818514732175339,
"learning_rate": 0.0001985913655092142,
"loss": 0.829,
"step": 2405
},
{
"epoch": 0.4564393939393939,
"grad_norm": 0.040830601356944725,
"learning_rate": 0.00019812200558062817,
"loss": 0.833,
"step": 2410
},
{
"epoch": 0.45738636363636365,
"grad_norm": 0.04011921388896301,
"learning_rate": 0.0001976521198403806,
"loss": 0.7861,
"step": 2415
},
{
"epoch": 0.4583333333333333,
"grad_norm": 0.044386214010181065,
"learning_rate": 0.00019718171342274205,
"loss": 0.8065,
"step": 2420
},
{
"epoch": 0.45928030303030304,
"grad_norm": 0.039421528176134195,
"learning_rate": 0.00019671079146767244,
"loss": 0.8064,
"step": 2425
},
{
"epoch": 0.4602272727272727,
"grad_norm": 0.039658502246534034,
"learning_rate": 0.00019623935912076488,
"loss": 0.8319,
"step": 2430
},
{
"epoch": 0.46117424242424243,
"grad_norm": 0.04350429112923609,
"learning_rate": 0.00019576742153318914,
"loss": 0.7962,
"step": 2435
},
{
"epoch": 0.4621212121212121,
"grad_norm": 0.042692790817445286,
"learning_rate": 0.0001952949838616357,
"loss": 0.8373,
"step": 2440
},
{
"epoch": 0.4630681818181818,
"grad_norm": 0.04339398756235945,
"learning_rate": 0.00019482205126825937,
"loss": 0.8022,
"step": 2445
},
{
"epoch": 0.4640151515151515,
"grad_norm": 0.04304166274280883,
"learning_rate": 0.0001943486289206225,
"loss": 0.8106,
"step": 2450
},
{
"epoch": 0.4649621212121212,
"grad_norm": 0.043750220078402964,
"learning_rate": 0.0001938747219916391,
"loss": 0.8435,
"step": 2455
},
{
"epoch": 0.4659090909090909,
"grad_norm": 0.04664680007453336,
"learning_rate": 0.0001934003356595179,
"loss": 0.8472,
"step": 2460
},
{
"epoch": 0.4668560606060606,
"grad_norm": 0.041222110220423554,
"learning_rate": 0.00019292547510770585,
"loss": 0.7787,
"step": 2465
},
{
"epoch": 0.4678030303030303,
"grad_norm": 0.044722804250566385,
"learning_rate": 0.00019245014552483162,
"loss": 0.8394,
"step": 2470
},
{
"epoch": 0.46875,
"grad_norm": 0.0404146031607715,
"learning_rate": 0.00019197435210464882,
"loss": 0.8154,
"step": 2475
},
{
"epoch": 0.4696969696969697,
"grad_norm": 0.04023149404763966,
"learning_rate": 0.00019149810004597903,
"loss": 0.8191,
"step": 2480
},
{
"epoch": 0.4706439393939394,
"grad_norm": 0.042874304030485456,
"learning_rate": 0.00019102139455265556,
"loss": 0.815,
"step": 2485
},
{
"epoch": 0.4715909090909091,
"grad_norm": 0.042901399276201645,
"learning_rate": 0.00019054424083346592,
"loss": 0.8254,
"step": 2490
},
{
"epoch": 0.4725378787878788,
"grad_norm": 0.048218277435419815,
"learning_rate": 0.00019006664410209533,
"loss": 0.8005,
"step": 2495
},
{
"epoch": 0.4734848484848485,
"grad_norm": 0.04801481036784791,
"learning_rate": 0.00018958860957706973,
"loss": 0.7971,
"step": 2500
},
{
"epoch": 0.4744318181818182,
"grad_norm": 0.045807113572995425,
"learning_rate": 0.00018911014248169862,
"loss": 0.8308,
"step": 2505
},
{
"epoch": 0.4753787878787879,
"grad_norm": 0.04157428170316455,
"learning_rate": 0.00018863124804401792,
"loss": 0.7937,
"step": 2510
},
{
"epoch": 0.47632575757575757,
"grad_norm": 0.044060198787386526,
"learning_rate": 0.0001881519314967331,
"loss": 0.8345,
"step": 2515
},
{
"epoch": 0.4772727272727273,
"grad_norm": 0.04229775014082669,
"learning_rate": 0.00018767219807716185,
"loss": 0.7952,
"step": 2520
},
{
"epoch": 0.47821969696969696,
"grad_norm": 0.038797572756557264,
"learning_rate": 0.00018719205302717687,
"loss": 0.8176,
"step": 2525
},
{
"epoch": 0.4791666666666667,
"grad_norm": 0.03699768912795355,
"learning_rate": 0.00018671150159314855,
"loss": 0.8063,
"step": 2530
},
{
"epoch": 0.48011363636363635,
"grad_norm": 0.044197153844287275,
"learning_rate": 0.00018623054902588775,
"loss": 0.8083,
"step": 2535
},
{
"epoch": 0.4810606060606061,
"grad_norm": 0.04091887354099648,
"learning_rate": 0.00018574920058058824,
"loss": 0.807,
"step": 2540
},
{
"epoch": 0.48200757575757575,
"grad_norm": 0.48718224838979884,
"learning_rate": 0.0001852674615167696,
"loss": 0.8124,
"step": 2545
},
{
"epoch": 0.48295454545454547,
"grad_norm": 0.08791099719174246,
"learning_rate": 0.00018478533709821946,
"loss": 0.8227,
"step": 2550
},
{
"epoch": 0.48390151515151514,
"grad_norm": 0.049480601037611455,
"learning_rate": 0.000184302832592936,
"loss": 0.8321,
"step": 2555
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.0429607961370786,
"learning_rate": 0.00018381995327307067,
"loss": 0.8178,
"step": 2560
},
{
"epoch": 0.48579545454545453,
"grad_norm": 0.0432423766004113,
"learning_rate": 0.0001833367044148701,
"loss": 0.7845,
"step": 2565
},
{
"epoch": 0.48674242424242425,
"grad_norm": 0.04315753569890211,
"learning_rate": 0.00018285309129861905,
"loss": 0.8346,
"step": 2570
},
{
"epoch": 0.4876893939393939,
"grad_norm": 0.04241261149768791,
"learning_rate": 0.00018236911920858215,
"loss": 0.8322,
"step": 2575
},
{
"epoch": 0.48863636363636365,
"grad_norm": 0.042058874287831324,
"learning_rate": 0.00018188479343294648,
"loss": 0.8246,
"step": 2580
},
{
"epoch": 0.4895833333333333,
"grad_norm": 0.04305771883252671,
"learning_rate": 0.0001814001192637638,
"loss": 0.826,
"step": 2585
},
{
"epoch": 0.49053030303030304,
"grad_norm": 0.041842967176783166,
"learning_rate": 0.0001809151019968925,
"loss": 0.7911,
"step": 2590
},
{
"epoch": 0.4914772727272727,
"grad_norm": 0.04061701352923131,
"learning_rate": 0.00018042974693193998,
"loss": 0.797,
"step": 2595
},
{
"epoch": 0.49242424242424243,
"grad_norm": 0.04647922786872379,
"learning_rate": 0.0001799440593722046,
"loss": 0.7946,
"step": 2600
},
{
"epoch": 0.4933712121212121,
"grad_norm": 0.04039980658232629,
"learning_rate": 0.00017945804462461776,
"loss": 0.8,
"step": 2605
},
{
"epoch": 0.4943181818181818,
"grad_norm": 0.03886277509185924,
"learning_rate": 0.00017897170799968583,
"loss": 0.7849,
"step": 2610
},
{
"epoch": 0.4952651515151515,
"grad_norm": 0.038833022410046734,
"learning_rate": 0.00017848505481143253,
"loss": 0.844,
"step": 2615
},
{
"epoch": 0.4962121212121212,
"grad_norm": 0.03944433422520882,
"learning_rate": 0.00017799809037734017,
"loss": 0.8163,
"step": 2620
},
{
"epoch": 0.4971590909090909,
"grad_norm": 0.043008540505396285,
"learning_rate": 0.00017751082001829215,
"loss": 0.8258,
"step": 2625
},
{
"epoch": 0.4981060606060606,
"grad_norm": 0.040199050500284966,
"learning_rate": 0.00017702324905851456,
"loss": 0.8315,
"step": 2630
},
{
"epoch": 0.4990530303030303,
"grad_norm": 0.040791783573880774,
"learning_rate": 0.00017653538282551805,
"loss": 0.7863,
"step": 2635
},
{
"epoch": 0.5,
"grad_norm": 0.0442708357721425,
"learning_rate": 0.00017604722665003956,
"loss": 0.8213,
"step": 2640
},
{
"epoch": 0.5009469696969697,
"grad_norm": 0.04301849579782737,
"learning_rate": 0.00017555878586598413,
"loss": 0.8236,
"step": 2645
},
{
"epoch": 0.5018939393939394,
"grad_norm": 0.04101785865093793,
"learning_rate": 0.00017507006581036678,
"loss": 0.8062,
"step": 2650
},
{
"epoch": 0.5028409090909091,
"grad_norm": 0.04131104487003713,
"learning_rate": 0.00017458107182325374,
"loss": 0.8257,
"step": 2655
},
{
"epoch": 0.5037878787878788,
"grad_norm": 0.043009445132835604,
"learning_rate": 0.00017409180924770468,
"loss": 0.8165,
"step": 2660
},
{
"epoch": 0.5047348484848485,
"grad_norm": 0.03958181194344823,
"learning_rate": 0.00017360228342971383,
"loss": 0.8325,
"step": 2665
},
{
"epoch": 0.5056818181818182,
"grad_norm": 0.0397542558833111,
"learning_rate": 0.00017311249971815185,
"loss": 0.798,
"step": 2670
},
{
"epoch": 0.5066287878787878,
"grad_norm": 0.04623083615061935,
"learning_rate": 0.00017262246346470733,
"loss": 0.8354,
"step": 2675
},
{
"epoch": 0.5075757575757576,
"grad_norm": 0.04071727047371299,
"learning_rate": 0.0001721321800238283,
"loss": 0.7985,
"step": 2680
},
{
"epoch": 0.5085227272727273,
"grad_norm": 0.03863023788708821,
"learning_rate": 0.00017164165475266362,
"loss": 0.8162,
"step": 2685
},
{
"epoch": 0.509469696969697,
"grad_norm": 0.03916331940406132,
"learning_rate": 0.0001711508930110047,
"loss": 0.7845,
"step": 2690
},
{
"epoch": 0.5104166666666666,
"grad_norm": 0.03931132854415474,
"learning_rate": 0.0001706599001612266,
"loss": 0.7776,
"step": 2695
},
{
"epoch": 0.5113636363636364,
"grad_norm": 0.043368245986302636,
"learning_rate": 0.00017016868156822978,
"loss": 0.8054,
"step": 2700
},
{
"epoch": 0.5123106060606061,
"grad_norm": 0.03992828095880944,
"learning_rate": 0.00016967724259938123,
"loss": 0.7988,
"step": 2705
},
{
"epoch": 0.5132575757575758,
"grad_norm": 0.04739921469866324,
"learning_rate": 0.00016918558862445582,
"loss": 0.7943,
"step": 2710
},
{
"epoch": 0.5142045454545454,
"grad_norm": 0.04332958886748095,
"learning_rate": 0.00016869372501557788,
"loss": 0.819,
"step": 2715
},
{
"epoch": 0.5151515151515151,
"grad_norm": 0.04003282248966938,
"learning_rate": 0.00016820165714716227,
"loss": 0.8292,
"step": 2720
},
{
"epoch": 0.5160984848484849,
"grad_norm": 0.04862073540042466,
"learning_rate": 0.00016770939039585571,
"loss": 0.827,
"step": 2725
},
{
"epoch": 0.5170454545454546,
"grad_norm": 0.04320764468844852,
"learning_rate": 0.00016721693014047805,
"loss": 0.804,
"step": 2730
},
{
"epoch": 0.5179924242424242,
"grad_norm": 0.041252462956353965,
"learning_rate": 0.00016672428176196344,
"loss": 0.7767,
"step": 2735
},
{
"epoch": 0.5189393939393939,
"grad_norm": 0.040985992228162016,
"learning_rate": 0.00016623145064330162,
"loss": 0.8092,
"step": 2740
},
{
"epoch": 0.5198863636363636,
"grad_norm": 0.042258864981091555,
"learning_rate": 0.0001657384421694791,
"loss": 0.7994,
"step": 2745
},
{
"epoch": 0.5208333333333334,
"grad_norm": 0.04021897972485824,
"learning_rate": 0.00016524526172742026,
"loss": 0.784,
"step": 2750
},
{
"epoch": 0.521780303030303,
"grad_norm": 0.04326415978694196,
"learning_rate": 0.0001647519147059285,
"loss": 0.8047,
"step": 2755
},
{
"epoch": 0.5227272727272727,
"grad_norm": 0.04410213043970681,
"learning_rate": 0.00016425840649562736,
"loss": 0.8126,
"step": 2760
},
{
"epoch": 0.5236742424242424,
"grad_norm": 0.04353103158360417,
"learning_rate": 0.00016376474248890171,
"loss": 0.8286,
"step": 2765
},
{
"epoch": 0.5246212121212122,
"grad_norm": 0.040750168292872986,
"learning_rate": 0.00016327092807983865,
"loss": 0.808,
"step": 2770
},
{
"epoch": 0.5255681818181818,
"grad_norm": 0.03802927883237263,
"learning_rate": 0.0001627769686641687,
"loss": 0.8053,
"step": 2775
},
{
"epoch": 0.5265151515151515,
"grad_norm": 0.04157749031417506,
"learning_rate": 0.0001622828696392069,
"loss": 0.8244,
"step": 2780
},
{
"epoch": 0.5274621212121212,
"grad_norm": 0.03901280596306352,
"learning_rate": 0.00016178863640379357,
"loss": 0.8057,
"step": 2785
},
{
"epoch": 0.5284090909090909,
"grad_norm": 0.04452601323911491,
"learning_rate": 0.0001612942743582357,
"loss": 0.8382,
"step": 2790
},
{
"epoch": 0.5293560606060606,
"grad_norm": 0.05541519438754282,
"learning_rate": 0.0001607997889042476,
"loss": 0.841,
"step": 2795
},
{
"epoch": 0.5303030303030303,
"grad_norm": 0.058989969809454576,
"learning_rate": 0.00016030518544489213,
"loss": 0.8176,
"step": 2800
},
{
"epoch": 0.53125,
"grad_norm": 0.05002098578052214,
"learning_rate": 0.00015981046938452146,
"loss": 0.8002,
"step": 2805
},
{
"epoch": 0.5321969696969697,
"grad_norm": 0.08021694387476866,
"learning_rate": 0.00015931564612871812,
"loss": 0.81,
"step": 2810
},
{
"epoch": 0.5331439393939394,
"grad_norm": 0.045649438932203974,
"learning_rate": 0.00015882072108423594,
"loss": 0.7931,
"step": 2815
},
{
"epoch": 0.5340909090909091,
"grad_norm": 0.042928318104272216,
"learning_rate": 0.000158325699658941,
"loss": 0.8097,
"step": 2820
},
{
"epoch": 0.5350378787878788,
"grad_norm": 0.041290327628681206,
"learning_rate": 0.0001578305872617525,
"loss": 0.8009,
"step": 2825
},
{
"epoch": 0.5359848484848485,
"grad_norm": 0.043370278921034656,
"learning_rate": 0.0001573353893025835,
"loss": 0.8072,
"step": 2830
},
{
"epoch": 0.5369318181818182,
"grad_norm": 0.04062057924038298,
"learning_rate": 0.00015684011119228224,
"loss": 0.8135,
"step": 2835
},
{
"epoch": 0.5378787878787878,
"grad_norm": 0.03956842259978757,
"learning_rate": 0.00015634475834257246,
"loss": 0.8083,
"step": 2840
},
{
"epoch": 0.5388257575757576,
"grad_norm": 0.04065029502341851,
"learning_rate": 0.00015584933616599473,
"loss": 0.8252,
"step": 2845
},
{
"epoch": 0.5397727272727273,
"grad_norm": 0.0406473529931244,
"learning_rate": 0.00015535385007584706,
"loss": 0.788,
"step": 2850
},
{
"epoch": 0.540719696969697,
"grad_norm": 0.040142414490938486,
"learning_rate": 0.0001548583054861259,
"loss": 0.7869,
"step": 2855
},
{
"epoch": 0.5416666666666666,
"grad_norm": 0.03672349497309872,
"learning_rate": 0.0001543627078114667,
"loss": 0.7999,
"step": 2860
},
{
"epoch": 0.5426136363636364,
"grad_norm": 0.0430040050376934,
"learning_rate": 0.00015386706246708524,
"loss": 0.8061,
"step": 2865
},
{
"epoch": 0.5435606060606061,
"grad_norm": 0.0395889490877207,
"learning_rate": 0.00015337137486871796,
"loss": 0.7938,
"step": 2870
},
{
"epoch": 0.5445075757575758,
"grad_norm": 0.04178894808034304,
"learning_rate": 0.00015287565043256302,
"loss": 0.7898,
"step": 2875
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.04229269873654704,
"learning_rate": 0.00015237989457522118,
"loss": 0.8025,
"step": 2880
},
{
"epoch": 0.5464015151515151,
"grad_norm": 0.04274569352435655,
"learning_rate": 0.00015188411271363646,
"loss": 0.8477,
"step": 2885
},
{
"epoch": 0.5473484848484849,
"grad_norm": 0.04018997606928589,
"learning_rate": 0.00015138831026503702,
"loss": 0.8121,
"step": 2890
},
{
"epoch": 0.5482954545454546,
"grad_norm": 0.04487654715701671,
"learning_rate": 0.00015089249264687603,
"loss": 0.7961,
"step": 2895
},
{
"epoch": 0.5492424242424242,
"grad_norm": 0.03928757645472925,
"learning_rate": 0.00015039666527677233,
"loss": 0.8406,
"step": 2900
},
{
"epoch": 0.5501893939393939,
"grad_norm": 0.042611840013764696,
"learning_rate": 0.00014990083357245128,
"loss": 0.7913,
"step": 2905
},
{
"epoch": 0.5511363636363636,
"grad_norm": 0.04292928983412645,
"learning_rate": 0.0001494050029516858,
"loss": 0.7977,
"step": 2910
},
{
"epoch": 0.5520833333333334,
"grad_norm": 0.038369849549461224,
"learning_rate": 0.00014890917883223677,
"loss": 0.8199,
"step": 2915
},
{
"epoch": 0.553030303030303,
"grad_norm": 0.04541317576278365,
"learning_rate": 0.00014841336663179406,
"loss": 0.8091,
"step": 2920
},
{
"epoch": 0.5539772727272727,
"grad_norm": 0.03684570730569431,
"learning_rate": 0.00014791757176791742,
"loss": 0.8195,
"step": 2925
},
{
"epoch": 0.5549242424242424,
"grad_norm": 0.03872377757151605,
"learning_rate": 0.00014742179965797705,
"loss": 0.8107,
"step": 2930
},
{
"epoch": 0.5558712121212122,
"grad_norm": 0.0395990381746492,
"learning_rate": 0.00014692605571909462,
"loss": 0.8034,
"step": 2935
},
{
"epoch": 0.5568181818181818,
"grad_norm": 0.03886126907028109,
"learning_rate": 0.00014643034536808387,
"loss": 0.7968,
"step": 2940
},
{
"epoch": 0.5577651515151515,
"grad_norm": 0.039986116602809194,
"learning_rate": 0.00014593467402139164,
"loss": 0.7946,
"step": 2945
},
{
"epoch": 0.5587121212121212,
"grad_norm": 0.03812573349649333,
"learning_rate": 0.00014543904709503854,
"loss": 0.7866,
"step": 2950
},
{
"epoch": 0.5596590909090909,
"grad_norm": 0.03932184467330868,
"learning_rate": 0.0001449434700045599,
"loss": 0.8019,
"step": 2955
},
{
"epoch": 0.5606060606060606,
"grad_norm": 0.04445798754887963,
"learning_rate": 0.00014444794816494626,
"loss": 0.825,
"step": 2960
},
{
"epoch": 0.5615530303030303,
"grad_norm": 0.04248501207056126,
"learning_rate": 0.0001439524869905848,
"loss": 0.8226,
"step": 2965
},
{
"epoch": 0.5625,
"grad_norm": 0.040603736504944546,
"learning_rate": 0.0001434570918951996,
"loss": 0.8263,
"step": 2970
},
{
"epoch": 0.5634469696969697,
"grad_norm": 0.044713609887935456,
"learning_rate": 0.00014296176829179275,
"loss": 0.7915,
"step": 2975
},
{
"epoch": 0.5643939393939394,
"grad_norm": 0.042449864997604524,
"learning_rate": 0.00014246652159258526,
"loss": 0.7896,
"step": 2980
},
{
"epoch": 0.5653409090909091,
"grad_norm": 0.03798962681468922,
"learning_rate": 0.0001419713572089577,
"loss": 0.9055,
"step": 2985
},
{
"epoch": 0.5662878787878788,
"grad_norm": 0.04786757520604103,
"learning_rate": 0.0001414762805513914,
"loss": 0.8006,
"step": 2990
},
{
"epoch": 0.5672348484848485,
"grad_norm": 0.04252660263305811,
"learning_rate": 0.00014098129702940892,
"loss": 0.7907,
"step": 2995
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.043829106135149745,
"learning_rate": 0.00014048641205151533,
"loss": 0.7872,
"step": 3000
},
{
"epoch": 0.5691287878787878,
"grad_norm": 0.03916395834671137,
"learning_rate": 0.0001399916310251388,
"loss": 0.7761,
"step": 3005
},
{
"epoch": 0.5700757575757576,
"grad_norm": 0.03967852059796232,
"learning_rate": 0.00013949695935657193,
"loss": 0.7951,
"step": 3010
},
{
"epoch": 0.5710227272727273,
"grad_norm": 0.04226287650595886,
"learning_rate": 0.00013900240245091203,
"loss": 0.7765,
"step": 3015
},
{
"epoch": 0.571969696969697,
"grad_norm": 0.04195996621258936,
"learning_rate": 0.00013850796571200264,
"loss": 0.8174,
"step": 3020
},
{
"epoch": 0.5729166666666666,
"grad_norm": 0.04343356999935917,
"learning_rate": 0.00013801365454237444,
"loss": 0.8048,
"step": 3025
},
{
"epoch": 0.5738636363636364,
"grad_norm": 0.038284495114294666,
"learning_rate": 0.00013751947434318564,
"loss": 0.7818,
"step": 3030
},
{
"epoch": 0.5748106060606061,
"grad_norm": 0.04286762969801166,
"learning_rate": 0.00013702543051416383,
"loss": 0.7904,
"step": 3035
},
{
"epoch": 0.5757575757575758,
"grad_norm": 0.03931256543968111,
"learning_rate": 0.00013653152845354623,
"loss": 0.8209,
"step": 3040
},
{
"epoch": 0.5767045454545454,
"grad_norm": 0.04052581793415016,
"learning_rate": 0.0001360377735580212,
"loss": 0.7895,
"step": 3045
},
{
"epoch": 0.5776515151515151,
"grad_norm": 0.04013847290742192,
"learning_rate": 0.00013554417122266888,
"loss": 0.7997,
"step": 3050
},
{
"epoch": 0.5785984848484849,
"grad_norm": 0.04225466886048973,
"learning_rate": 0.00013505072684090263,
"loss": 0.8018,
"step": 3055
},
{
"epoch": 0.5795454545454546,
"grad_norm": 0.04063423310025803,
"learning_rate": 0.00013455744580440982,
"loss": 0.8103,
"step": 3060
},
{
"epoch": 0.5804924242424242,
"grad_norm": 0.041112600968304276,
"learning_rate": 0.00013406433350309304,
"loss": 0.771,
"step": 3065
},
{
"epoch": 0.5814393939393939,
"grad_norm": 0.043166425761174104,
"learning_rate": 0.0001335713953250111,
"loss": 0.7813,
"step": 3070
},
{
"epoch": 0.5823863636363636,
"grad_norm": 0.042559737261154675,
"learning_rate": 0.0001330786366563203,
"loss": 0.7795,
"step": 3075
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.039294962688869624,
"learning_rate": 0.00013258606288121542,
"loss": 0.7852,
"step": 3080
},
{
"epoch": 0.584280303030303,
"grad_norm": 0.039625489168065825,
"learning_rate": 0.00013209367938187125,
"loss": 0.7602,
"step": 3085
},
{
"epoch": 0.5852272727272727,
"grad_norm": 0.038997900760427306,
"learning_rate": 0.000131601491538383,
"loss": 0.78,
"step": 3090
},
{
"epoch": 0.5861742424242424,
"grad_norm": 0.040146555417594515,
"learning_rate": 0.00013110950472870853,
"loss": 0.8004,
"step": 3095
},
{
"epoch": 0.5871212121212122,
"grad_norm": 0.039349933500868364,
"learning_rate": 0.00013061772432860886,
"loss": 0.8254,
"step": 3100
},
{
"epoch": 0.5880681818181818,
"grad_norm": 0.040521068783339456,
"learning_rate": 0.0001301261557115895,
"loss": 0.7688,
"step": 3105
},
{
"epoch": 0.5890151515151515,
"grad_norm": 0.04280174220822872,
"learning_rate": 0.00012963480424884214,
"loss": 0.7883,
"step": 3110
},
{
"epoch": 0.5899621212121212,
"grad_norm": 0.04000965172907218,
"learning_rate": 0.00012914367530918557,
"loss": 0.7733,
"step": 3115
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.04058021099794167,
"learning_rate": 0.00012865277425900724,
"loss": 0.7816,
"step": 3120
},
{
"epoch": 0.5918560606060606,
"grad_norm": 0.044091148466870456,
"learning_rate": 0.00012816210646220437,
"loss": 0.7797,
"step": 3125
},
{
"epoch": 0.5928030303030303,
"grad_norm": 0.042915659449647994,
"learning_rate": 0.00012767167728012566,
"loss": 0.787,
"step": 3130
},
{
"epoch": 0.59375,
"grad_norm": 0.04044950460083324,
"learning_rate": 0.00012718149207151247,
"loss": 0.8153,
"step": 3135
},
{
"epoch": 0.5946969696969697,
"grad_norm": 0.035974336074393466,
"learning_rate": 0.00012669155619244048,
"loss": 0.7665,
"step": 3140
},
{
"epoch": 0.5956439393939394,
"grad_norm": 0.03990766970124255,
"learning_rate": 0.00012620187499626082,
"loss": 0.7814,
"step": 3145
},
{
"epoch": 0.5965909090909091,
"grad_norm": 0.04063094274983586,
"learning_rate": 0.00012571245383354192,
"loss": 0.8079,
"step": 3150
},
{
"epoch": 0.5975378787878788,
"grad_norm": 0.04192680886047405,
"learning_rate": 0.00012522329805201104,
"loss": 0.7851,
"step": 3155
},
{
"epoch": 0.5984848484848485,
"grad_norm": 0.0405476068701757,
"learning_rate": 0.00012473441299649544,
"loss": 0.8231,
"step": 3160
},
{
"epoch": 0.5994318181818182,
"grad_norm": 0.040569633945997545,
"learning_rate": 0.0001242458040088644,
"loss": 0.7737,
"step": 3165
},
{
"epoch": 0.6003787878787878,
"grad_norm": 0.038360447057587385,
"learning_rate": 0.00012375747642797083,
"loss": 0.7874,
"step": 3170
},
{
"epoch": 0.6013257575757576,
"grad_norm": 0.04006212810733869,
"learning_rate": 0.00012326943558959265,
"loss": 0.7899,
"step": 3175
},
{
"epoch": 0.6022727272727273,
"grad_norm": 0.04200526076077111,
"learning_rate": 0.0001227816868263746,
"loss": 0.8006,
"step": 3180
},
{
"epoch": 0.603219696969697,
"grad_norm": 0.04132950189249958,
"learning_rate": 0.0001222942354677702,
"loss": 0.7927,
"step": 3185
},
{
"epoch": 0.6041666666666666,
"grad_norm": 0.039846023645240154,
"learning_rate": 0.00012180708683998321,
"loss": 0.8127,
"step": 3190
},
{
"epoch": 0.6051136363636364,
"grad_norm": 0.0370381211582106,
"learning_rate": 0.00012132024626590963,
"loss": 0.7977,
"step": 3195
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.03637545691166675,
"learning_rate": 0.00012083371906507937,
"loss": 0.7972,
"step": 3200
},
{
"epoch": 0.6070075757575758,
"grad_norm": 0.03719020082784945,
"learning_rate": 0.00012034751055359836,
"loss": 0.7944,
"step": 3205
},
{
"epoch": 0.6079545454545454,
"grad_norm": 0.04061887180440516,
"learning_rate": 0.00011986162604409015,
"loss": 0.8207,
"step": 3210
},
{
"epoch": 0.6089015151515151,
"grad_norm": 0.03857442410511439,
"learning_rate": 0.00011937607084563836,
"loss": 0.7841,
"step": 3215
},
{
"epoch": 0.6098484848484849,
"grad_norm": 0.03544743527411389,
"learning_rate": 0.00011889085026372792,
"loss": 0.7499,
"step": 3220
},
{
"epoch": 0.6107954545454546,
"grad_norm": 0.040036064198766305,
"learning_rate": 0.00011840596960018779,
"loss": 0.7856,
"step": 3225
},
{
"epoch": 0.6117424242424242,
"grad_norm": 0.03717988537059713,
"learning_rate": 0.00011792143415313285,
"loss": 0.7884,
"step": 3230
},
{
"epoch": 0.6126893939393939,
"grad_norm": 0.038107331310669845,
"learning_rate": 0.00011743724921690557,
"loss": 0.8106,
"step": 3235
},
{
"epoch": 0.6136363636363636,
"grad_norm": 0.04482871942046956,
"learning_rate": 0.00011695342008201888,
"loss": 0.7865,
"step": 3240
},
{
"epoch": 0.6145833333333334,
"grad_norm": 0.04317100302292292,
"learning_rate": 0.00011646995203509786,
"loss": 0.7826,
"step": 3245
},
{
"epoch": 0.615530303030303,
"grad_norm": 0.039413143234785654,
"learning_rate": 0.00011598685035882209,
"loss": 0.8101,
"step": 3250
},
{
"epoch": 0.6164772727272727,
"grad_norm": 0.04160937673240829,
"learning_rate": 0.00011550412033186792,
"loss": 0.8075,
"step": 3255
},
{
"epoch": 0.6174242424242424,
"grad_norm": 0.04265804651359686,
"learning_rate": 0.00011502176722885092,
"loss": 0.7775,
"step": 3260
},
{
"epoch": 0.6183712121212122,
"grad_norm": 0.03635722191647411,
"learning_rate": 0.00011453979632026809,
"loss": 0.791,
"step": 3265
},
{
"epoch": 0.6193181818181818,
"grad_norm": 0.03705221554060922,
"learning_rate": 0.00011405821287244035,
"loss": 0.8008,
"step": 3270
},
{
"epoch": 0.6202651515151515,
"grad_norm": 0.042403810547206766,
"learning_rate": 0.00011357702214745493,
"loss": 0.7652,
"step": 3275
},
{
"epoch": 0.6212121212121212,
"grad_norm": 0.03979753737480537,
"learning_rate": 0.00011309622940310798,
"loss": 0.7991,
"step": 3280
},
{
"epoch": 0.6221590909090909,
"grad_norm": 0.03836363646080294,
"learning_rate": 0.00011261583989284712,
"loss": 0.803,
"step": 3285
},
{
"epoch": 0.6231060606060606,
"grad_norm": 0.04244615364903799,
"learning_rate": 0.00011213585886571376,
"loss": 0.8072,
"step": 3290
},
{
"epoch": 0.6240530303030303,
"grad_norm": 0.04283582850640676,
"learning_rate": 0.00011165629156628613,
"loss": 0.7861,
"step": 3295
},
{
"epoch": 0.625,
"grad_norm": 0.038461780382639685,
"learning_rate": 0.00011117714323462186,
"loss": 0.7835,
"step": 3300
},
{
"epoch": 0.6259469696969697,
"grad_norm": 0.03744497970084062,
"learning_rate": 0.00011069841910620057,
"loss": 0.8062,
"step": 3305
},
{
"epoch": 0.6268939393939394,
"grad_norm": 0.04483128738934721,
"learning_rate": 0.00011022012441186671,
"loss": 0.7961,
"step": 3310
},
{
"epoch": 0.6278409090909091,
"grad_norm": 0.04424941740338033,
"learning_rate": 0.00010974226437777261,
"loss": 0.7949,
"step": 3315
},
{
"epoch": 0.6287878787878788,
"grad_norm": 0.04251454254286352,
"learning_rate": 0.0001092648442253211,
"loss": 0.7725,
"step": 3320
},
{
"epoch": 0.6297348484848485,
"grad_norm": 0.040105374119544505,
"learning_rate": 0.0001087878691711087,
"loss": 0.8147,
"step": 3325
},
{
"epoch": 0.6306818181818182,
"grad_norm": 0.04164828498994665,
"learning_rate": 0.00010831134442686835,
"loss": 0.8076,
"step": 3330
},
{
"epoch": 0.6316287878787878,
"grad_norm": 0.04194724008380887,
"learning_rate": 0.00010783527519941272,
"loss": 0.7514,
"step": 3335
},
{
"epoch": 0.6325757575757576,
"grad_norm": 0.04282010148959667,
"learning_rate": 0.00010735966669057723,
"loss": 0.8084,
"step": 3340
},
{
"epoch": 0.6335227272727273,
"grad_norm": 0.037751992950868556,
"learning_rate": 0.00010688452409716325,
"loss": 0.7971,
"step": 3345
},
{
"epoch": 0.634469696969697,
"grad_norm": 0.040981833047628674,
"learning_rate": 0.00010640985261088102,
"loss": 0.8259,
"step": 3350
},
{
"epoch": 0.6354166666666666,
"grad_norm": 0.03623074593719334,
"learning_rate": 0.00010593565741829331,
"loss": 0.7584,
"step": 3355
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.04085407578588483,
"learning_rate": 0.00010546194370075881,
"loss": 0.7941,
"step": 3360
},
{
"epoch": 0.6373106060606061,
"grad_norm": 0.04107679689904555,
"learning_rate": 0.00010498871663437485,
"loss": 0.7985,
"step": 3365
},
{
"epoch": 0.6382575757575758,
"grad_norm": 0.03850210602630568,
"learning_rate": 0.00010451598138992173,
"loss": 0.7737,
"step": 3370
},
{
"epoch": 0.6392045454545454,
"grad_norm": 0.0375973308222491,
"learning_rate": 0.00010404374313280557,
"loss": 0.7849,
"step": 3375
},
{
"epoch": 0.6401515151515151,
"grad_norm": 0.03545282006804828,
"learning_rate": 0.00010357200702300214,
"loss": 0.7993,
"step": 3380
},
{
"epoch": 0.6410984848484849,
"grad_norm": 0.04099405038912456,
"learning_rate": 0.0001031007782150004,
"loss": 0.7879,
"step": 3385
},
{
"epoch": 0.6420454545454546,
"grad_norm": 0.04269554474417421,
"learning_rate": 0.00010263006185774627,
"loss": 0.7559,
"step": 3390
},
{
"epoch": 0.6429924242424242,
"grad_norm": 0.039655024113479126,
"learning_rate": 0.00010215986309458622,
"loss": 0.7633,
"step": 3395
},
{
"epoch": 0.6439393939393939,
"grad_norm": 0.040202236041103546,
"learning_rate": 0.0001016901870632113,
"loss": 0.7795,
"step": 3400
},
{
"epoch": 0.6448863636363636,
"grad_norm": 0.038440162083217946,
"learning_rate": 0.00010122103889560066,
"loss": 0.788,
"step": 3405
},
{
"epoch": 0.6458333333333334,
"grad_norm": 0.0380210653037665,
"learning_rate": 0.00010075242371796585,
"loss": 0.7796,
"step": 3410
},
{
"epoch": 0.646780303030303,
"grad_norm": 0.038714184645298265,
"learning_rate": 0.00010028434665069456,
"loss": 0.7505,
"step": 3415
},
{
"epoch": 0.6477272727272727,
"grad_norm": 0.036301784575765876,
"learning_rate": 9.981681280829472e-05,
"loss": 0.7863,
"step": 3420
},
{
"epoch": 0.6486742424242424,
"grad_norm": 0.04273246901454883,
"learning_rate": 9.934982729933864e-05,
"loss": 0.7936,
"step": 3425
},
{
"epoch": 0.6496212121212122,
"grad_norm": 0.04096752213327176,
"learning_rate": 9.888339522640727e-05,
"loss": 0.7848,
"step": 3430
},
{
"epoch": 0.6505681818181818,
"grad_norm": 0.03654932535140771,
"learning_rate": 9.84175216860344e-05,
"loss": 0.801,
"step": 3435
},
{
"epoch": 0.6515151515151515,
"grad_norm": 0.03977277870704206,
"learning_rate": 9.795221176865064e-05,
"loss": 0.7817,
"step": 3440
},
{
"epoch": 0.6524621212121212,
"grad_norm": 0.03945778648109342,
"learning_rate": 9.748747055852845e-05,
"loss": 0.8034,
"step": 3445
},
{
"epoch": 0.6534090909090909,
"grad_norm": 0.03750951937019652,
"learning_rate": 9.702330313372607e-05,
"loss": 0.8047,
"step": 3450
},
{
"epoch": 0.6543560606060606,
"grad_norm": 0.04224253753307829,
"learning_rate": 9.655971456603222e-05,
"loss": 0.7741,
"step": 3455
},
{
"epoch": 0.6553030303030303,
"grad_norm": 0.04193635128292089,
"learning_rate": 9.609670992091063e-05,
"loss": 0.7686,
"step": 3460
},
{
"epoch": 0.65625,
"grad_norm": 0.0383707593111435,
"learning_rate": 9.563429425744476e-05,
"loss": 0.7937,
"step": 3465
},
{
"epoch": 0.6571969696969697,
"grad_norm": 0.04221940705987869,
"learning_rate": 9.517247262828245e-05,
"loss": 0.7589,
"step": 3470
},
{
"epoch": 0.6581439393939394,
"grad_norm": 0.03796604736644861,
"learning_rate": 9.47112500795808e-05,
"loss": 0.7673,
"step": 3475
},
{
"epoch": 0.6590909090909091,
"grad_norm": 0.03883212329330115,
"learning_rate": 9.425063165095088e-05,
"loss": 0.7899,
"step": 3480
},
{
"epoch": 0.6600378787878788,
"grad_norm": 0.03775017282994837,
"learning_rate": 9.379062237540282e-05,
"loss": 0.7824,
"step": 3485
},
{
"epoch": 0.6609848484848485,
"grad_norm": 0.040969682549424714,
"learning_rate": 9.333122727929086e-05,
"loss": 0.7744,
"step": 3490
},
{
"epoch": 0.6619318181818182,
"grad_norm": 0.043909710244610795,
"learning_rate": 9.287245138225807e-05,
"loss": 0.7844,
"step": 3495
},
{
"epoch": 0.6628787878787878,
"grad_norm": 0.03979263612757763,
"learning_rate": 9.241429969718193e-05,
"loss": 0.7771,
"step": 3500
},
{
"epoch": 0.6638257575757576,
"grad_norm": 0.036720996927973024,
"learning_rate": 9.195677723011943e-05,
"loss": 0.7787,
"step": 3505
},
{
"epoch": 0.6647727272727273,
"grad_norm": 0.03931047020350426,
"learning_rate": 9.149988898025224e-05,
"loss": 0.7924,
"step": 3510
},
{
"epoch": 0.665719696969697,
"grad_norm": 0.037239986158338824,
"learning_rate": 9.10436399398321e-05,
"loss": 0.763,
"step": 3515
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.03903031387121002,
"learning_rate": 9.058803509412646e-05,
"loss": 0.7948,
"step": 3520
},
{
"epoch": 0.6676136363636364,
"grad_norm": 0.037974042366075905,
"learning_rate": 9.013307942136387e-05,
"loss": 0.7958,
"step": 3525
},
{
"epoch": 0.6685606060606061,
"grad_norm": 0.03996518480569968,
"learning_rate": 8.967877789267957e-05,
"loss": 0.7961,
"step": 3530
},
{
"epoch": 0.6695075757575758,
"grad_norm": 0.04156879401863253,
"learning_rate": 8.92251354720612e-05,
"loss": 0.7805,
"step": 3535
},
{
"epoch": 0.6704545454545454,
"grad_norm": 0.0379764100996147,
"learning_rate": 8.877215711629457e-05,
"loss": 0.776,
"step": 3540
},
{
"epoch": 0.6714015151515151,
"grad_norm": 0.03967109810939677,
"learning_rate": 8.831984777490954e-05,
"loss": 0.7884,
"step": 3545
},
{
"epoch": 0.6723484848484849,
"grad_norm": 0.04222761106798767,
"learning_rate": 8.786821239012582e-05,
"loss": 0.7714,
"step": 3550
},
{
"epoch": 0.6732954545454546,
"grad_norm": 0.04051009224725267,
"learning_rate": 8.741725589679912e-05,
"loss": 0.7656,
"step": 3555
},
{
"epoch": 0.6742424242424242,
"grad_norm": 0.03816042826696097,
"learning_rate": 8.696698322236706e-05,
"loss": 0.7609,
"step": 3560
},
{
"epoch": 0.6751893939393939,
"grad_norm": 0.03985241002351074,
"learning_rate": 8.651739928679556e-05,
"loss": 0.7982,
"step": 3565
},
{
"epoch": 0.6761363636363636,
"grad_norm": 0.039265226899704256,
"learning_rate": 8.606850900252478e-05,
"loss": 0.7886,
"step": 3570
},
{
"epoch": 0.6770833333333334,
"grad_norm": 0.03925154055754282,
"learning_rate": 8.562031727441567e-05,
"loss": 0.7963,
"step": 3575
},
{
"epoch": 0.678030303030303,
"grad_norm": 0.039792050085331696,
"learning_rate": 8.517282899969629e-05,
"loss": 0.8051,
"step": 3580
},
{
"epoch": 0.6789772727272727,
"grad_norm": 0.03879414602215694,
"learning_rate": 8.472604906790852e-05,
"loss": 0.8024,
"step": 3585
},
{
"epoch": 0.6799242424242424,
"grad_norm": 0.0418000610573599,
"learning_rate": 8.427998236085404e-05,
"loss": 0.762,
"step": 3590
},
{
"epoch": 0.6808712121212122,
"grad_norm": 0.045760675836821356,
"learning_rate": 8.38346337525417e-05,
"loss": 0.7923,
"step": 3595
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.040543801463440665,
"learning_rate": 8.339000810913386e-05,
"loss": 0.7809,
"step": 3600
},
{
"epoch": 0.6827651515151515,
"grad_norm": 0.03937397555590882,
"learning_rate": 8.294611028889332e-05,
"loss": 0.7985,
"step": 3605
},
{
"epoch": 0.6837121212121212,
"grad_norm": 0.04054190343244338,
"learning_rate": 8.250294514213009e-05,
"loss": 0.8063,
"step": 3610
},
{
"epoch": 0.6846590909090909,
"grad_norm": 0.039503064995133216,
"learning_rate": 8.206051751114875e-05,
"loss": 0.8033,
"step": 3615
},
{
"epoch": 0.6856060606060606,
"grad_norm": 0.03961793432281865,
"learning_rate": 8.161883223019513e-05,
"loss": 0.7841,
"step": 3620
},
{
"epoch": 0.6865530303030303,
"grad_norm": 0.03964303335275124,
"learning_rate": 8.11778941254037e-05,
"loss": 0.793,
"step": 3625
},
{
"epoch": 0.6875,
"grad_norm": 0.03665153936722208,
"learning_rate": 8.073770801474495e-05,
"loss": 0.776,
"step": 3630
},
{
"epoch": 0.6884469696969697,
"grad_norm": 0.04064557439554845,
"learning_rate": 8.029827870797233e-05,
"loss": 0.7622,
"step": 3635
},
{
"epoch": 0.6893939393939394,
"grad_norm": 0.038999462198328914,
"learning_rate": 7.985961100657029e-05,
"loss": 0.7945,
"step": 3640
},
{
"epoch": 0.6903409090909091,
"grad_norm": 0.03814629462651061,
"learning_rate": 7.942170970370128e-05,
"loss": 0.7907,
"step": 3645
},
{
"epoch": 0.6912878787878788,
"grad_norm": 0.03936834359810894,
"learning_rate": 7.898457958415362e-05,
"loss": 0.8105,
"step": 3650
},
{
"epoch": 0.6922348484848485,
"grad_norm": 0.043208861683073814,
"learning_rate": 7.854822542428923e-05,
"loss": 0.7829,
"step": 3655
},
{
"epoch": 0.6931818181818182,
"grad_norm": 0.04118046552241357,
"learning_rate": 7.811265199199152e-05,
"loss": 0.7881,
"step": 3660
},
{
"epoch": 0.6941287878787878,
"grad_norm": 0.04008925612177105,
"learning_rate": 7.76778640466128e-05,
"loss": 0.7898,
"step": 3665
},
{
"epoch": 0.6950757575757576,
"grad_norm": 0.03889735909259863,
"learning_rate": 7.724386633892306e-05,
"loss": 0.7829,
"step": 3670
},
{
"epoch": 0.6960227272727273,
"grad_norm": 0.04151816317577747,
"learning_rate": 7.681066361105756e-05,
"loss": 0.7767,
"step": 3675
},
{
"epoch": 0.696969696969697,
"grad_norm": 0.036944282946496376,
"learning_rate": 7.63782605964648e-05,
"loss": 0.7765,
"step": 3680
},
{
"epoch": 0.6979166666666666,
"grad_norm": 0.03615917598734965,
"learning_rate": 7.594666201985545e-05,
"loss": 0.7861,
"step": 3685
},
{
"epoch": 0.6988636363636364,
"grad_norm": 0.04067200248262229,
"learning_rate": 7.551587259715034e-05,
"loss": 0.8289,
"step": 3690
},
{
"epoch": 0.6998106060606061,
"grad_norm": 0.037365461143322884,
"learning_rate": 7.508589703542878e-05,
"loss": 0.811,
"step": 3695
},
{
"epoch": 0.7007575757575758,
"grad_norm": 0.04185500665647231,
"learning_rate": 7.465674003287745e-05,
"loss": 0.7682,
"step": 3700
},
{
"epoch": 0.7017045454545454,
"grad_norm": 0.040446504718946015,
"learning_rate": 7.422840627873897e-05,
"loss": 0.795,
"step": 3705
},
{
"epoch": 0.7026515151515151,
"grad_norm": 0.03757009814629214,
"learning_rate": 7.380090045326045e-05,
"loss": 0.7504,
"step": 3710
},
{
"epoch": 0.7035984848484849,
"grad_norm": 0.038548413203444785,
"learning_rate": 7.337422722764275e-05,
"loss": 0.8075,
"step": 3715
},
{
"epoch": 0.7045454545454546,
"grad_norm": 0.03999887548841091,
"learning_rate": 7.294839126398908e-05,
"loss": 0.774,
"step": 3720
},
{
"epoch": 0.7054924242424242,
"grad_norm": 0.04141191338877119,
"learning_rate": 7.252339721525412e-05,
"loss": 0.8107,
"step": 3725
},
{
"epoch": 0.7064393939393939,
"grad_norm": 0.0427189690829545,
"learning_rate": 7.209924972519343e-05,
"loss": 0.783,
"step": 3730
},
{
"epoch": 0.7073863636363636,
"grad_norm": 0.041829790074471566,
"learning_rate": 7.167595342831253e-05,
"loss": 0.8037,
"step": 3735
},
{
"epoch": 0.7083333333333334,
"grad_norm": 0.03932038001837439,
"learning_rate": 7.125351294981598e-05,
"loss": 0.7577,
"step": 3740
},
{
"epoch": 0.709280303030303,
"grad_norm": 0.044979053856984176,
"learning_rate": 7.083193290555744e-05,
"loss": 0.7623,
"step": 3745
},
{
"epoch": 0.7102272727272727,
"grad_norm": 0.040516807472682444,
"learning_rate": 7.041121790198881e-05,
"loss": 0.7796,
"step": 3750
},
{
"epoch": 0.7111742424242424,
"grad_norm": 0.04076058081578708,
"learning_rate": 6.999137253611e-05,
"loss": 0.789,
"step": 3755
},
{
"epoch": 0.7121212121212122,
"grad_norm": 0.03822915371523041,
"learning_rate": 6.95724013954186e-05,
"loss": 0.784,
"step": 3760
},
{
"epoch": 0.7130681818181818,
"grad_norm": 0.04000916349581932,
"learning_rate": 6.91543090578601e-05,
"loss": 0.7722,
"step": 3765
},
{
"epoch": 0.7140151515151515,
"grad_norm": 0.044726470002604886,
"learning_rate": 6.87371000917774e-05,
"loss": 0.7575,
"step": 3770
},
{
"epoch": 0.7149621212121212,
"grad_norm": 0.04471754304335626,
"learning_rate": 6.832077905586119e-05,
"loss": 0.7691,
"step": 3775
},
{
"epoch": 0.7159090909090909,
"grad_norm": 0.03949943632638639,
"learning_rate": 6.790535049910017e-05,
"loss": 0.784,
"step": 3780
},
{
"epoch": 0.7168560606060606,
"grad_norm": 0.038373759541872915,
"learning_rate": 6.749081896073106e-05,
"loss": 0.7601,
"step": 3785
},
{
"epoch": 0.7178030303030303,
"grad_norm": 0.03616627695182055,
"learning_rate": 6.707718897018941e-05,
"loss": 0.7591,
"step": 3790
},
{
"epoch": 0.71875,
"grad_norm": 0.04327639838927876,
"learning_rate": 6.66644650470597e-05,
"loss": 0.7846,
"step": 3795
},
{
"epoch": 0.7196969696969697,
"grad_norm": 0.043231180710510686,
"learning_rate": 6.625265170102615e-05,
"loss": 0.752,
"step": 3800
},
{
"epoch": 0.7206439393939394,
"grad_norm": 0.039624521674453655,
"learning_rate": 6.584175343182359e-05,
"loss": 0.7995,
"step": 3805
},
{
"epoch": 0.7215909090909091,
"grad_norm": 0.04268727987190514,
"learning_rate": 6.543177472918794e-05,
"loss": 0.7877,
"step": 3810
},
{
"epoch": 0.7225378787878788,
"grad_norm": 0.0402914394741491,
"learning_rate": 6.502272007280755e-05,
"loss": 0.7539,
"step": 3815
},
{
"epoch": 0.7234848484848485,
"grad_norm": 0.03869463859402182,
"learning_rate": 6.461459393227385e-05,
"loss": 0.7583,
"step": 3820
},
{
"epoch": 0.7244318181818182,
"grad_norm": 0.03726113086293714,
"learning_rate": 6.420740076703291e-05,
"loss": 0.7435,
"step": 3825
},
{
"epoch": 0.7253787878787878,
"grad_norm": 0.04242697385724998,
"learning_rate": 6.38011450263364e-05,
"loss": 0.7909,
"step": 3830
},
{
"epoch": 0.7263257575757576,
"grad_norm": 0.041072190514661835,
"learning_rate": 6.339583114919301e-05,
"loss": 0.7938,
"step": 3835
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.04140101649993429,
"learning_rate": 6.299146356432029e-05,
"loss": 0.7724,
"step": 3840
},
{
"epoch": 0.728219696969697,
"grad_norm": 0.04276312404745283,
"learning_rate": 6.258804669009575e-05,
"loss": 0.8042,
"step": 3845
},
{
"epoch": 0.7291666666666666,
"grad_norm": 0.03951731344689689,
"learning_rate": 6.218558493450893e-05,
"loss": 0.7555,
"step": 3850
},
{
"epoch": 0.7301136363636364,
"grad_norm": 0.03936898937096199,
"learning_rate": 6.178408269511312e-05,
"loss": 0.7863,
"step": 3855
},
{
"epoch": 0.7310606060606061,
"grad_norm": 0.038716693149565,
"learning_rate": 6.138354435897748e-05,
"loss": 0.7745,
"step": 3860
},
{
"epoch": 0.7320075757575758,
"grad_norm": 0.04072421020095559,
"learning_rate": 6.098397430263858e-05,
"loss": 0.7956,
"step": 3865
},
{
"epoch": 0.7329545454545454,
"grad_norm": 0.04100101910580714,
"learning_rate": 6.058537689205328e-05,
"loss": 0.7578,
"step": 3870
},
{
"epoch": 0.7339015151515151,
"grad_norm": 0.039583105949787575,
"learning_rate": 6.0187756482550645e-05,
"loss": 0.796,
"step": 3875
},
{
"epoch": 0.7348484848484849,
"grad_norm": 0.0398070275621675,
"learning_rate": 5.9791117418784274e-05,
"loss": 0.7667,
"step": 3880
},
{
"epoch": 0.7357954545454546,
"grad_norm": 0.03854944447288334,
"learning_rate": 5.939546403468501e-05,
"loss": 0.7499,
"step": 3885
},
{
"epoch": 0.7367424242424242,
"grad_norm": 0.04368614364362331,
"learning_rate": 5.900080065341363e-05,
"loss": 0.78,
"step": 3890
},
{
"epoch": 0.7376893939393939,
"grad_norm": 0.042603936273429066,
"learning_rate": 5.860713158731333e-05,
"loss": 0.7636,
"step": 3895
},
{
"epoch": 0.7386363636363636,
"grad_norm": 0.043008266779781215,
"learning_rate": 5.821446113786302e-05,
"loss": 0.7631,
"step": 3900
},
{
"epoch": 0.7395833333333334,
"grad_norm": 0.03910221102626979,
"learning_rate": 5.782279359562988e-05,
"loss": 0.7691,
"step": 3905
},
{
"epoch": 0.740530303030303,
"grad_norm": 0.042051023569034784,
"learning_rate": 5.743213324022272e-05,
"loss": 0.7905,
"step": 3910
},
{
"epoch": 0.7414772727272727,
"grad_norm": 0.044919920953571925,
"learning_rate": 5.7042484340245265e-05,
"loss": 0.7715,
"step": 3915
},
{
"epoch": 0.7424242424242424,
"grad_norm": 0.041073163710179876,
"learning_rate": 5.665385115324953e-05,
"loss": 0.7468,
"step": 3920
},
{
"epoch": 0.7433712121212122,
"grad_norm": 0.03567748888143746,
"learning_rate": 5.626623792568885e-05,
"loss": 0.7902,
"step": 3925
},
{
"epoch": 0.7443181818181818,
"grad_norm": 0.040223394497797826,
"learning_rate": 5.587964889287218e-05,
"loss": 0.8142,
"step": 3930
},
{
"epoch": 0.7452651515151515,
"grad_norm": 0.03937628241815354,
"learning_rate": 5.5494088278917434e-05,
"loss": 0.7561,
"step": 3935
},
{
"epoch": 0.7462121212121212,
"grad_norm": 0.039022205324506364,
"learning_rate": 5.5109560296705066e-05,
"loss": 0.7761,
"step": 3940
},
{
"epoch": 0.7471590909090909,
"grad_norm": 0.037705226948508606,
"learning_rate": 5.472606914783266e-05,
"loss": 0.7697,
"step": 3945
},
{
"epoch": 0.7481060606060606,
"grad_norm": 0.03949092410910686,
"learning_rate": 5.434361902256868e-05,
"loss": 0.7804,
"step": 3950
},
{
"epoch": 0.7490530303030303,
"grad_norm": 0.040225291682004415,
"learning_rate": 5.396221409980653e-05,
"loss": 0.7895,
"step": 3955
},
{
"epoch": 0.75,
"grad_norm": 0.03696261298169589,
"learning_rate": 5.358185854701909e-05,
"loss": 0.7715,
"step": 3960
},
{
"epoch": 0.7509469696969697,
"grad_norm": 0.03863523973389968,
"learning_rate": 5.320255652021336e-05,
"loss": 0.7748,
"step": 3965
},
{
"epoch": 0.7518939393939394,
"grad_norm": 0.041539206419021424,
"learning_rate": 5.282431216388457e-05,
"loss": 0.7556,
"step": 3970
},
{
"epoch": 0.7528409090909091,
"grad_norm": 0.040538971828186623,
"learning_rate": 5.244712961097142e-05,
"loss": 0.7843,
"step": 3975
},
{
"epoch": 0.7537878787878788,
"grad_norm": 0.042618390094256595,
"learning_rate": 5.207101298281049e-05,
"loss": 0.7666,
"step": 3980
},
{
"epoch": 0.7547348484848485,
"grad_norm": 0.03893190636534372,
"learning_rate": 5.1695966389091396e-05,
"loss": 0.7793,
"step": 3985
},
{
"epoch": 0.7556818181818182,
"grad_norm": 0.03755845511933422,
"learning_rate": 5.132199392781205e-05,
"loss": 0.77,
"step": 3990
},
{
"epoch": 0.7566287878787878,
"grad_norm": 0.04022540043473745,
"learning_rate": 5.094909968523351e-05,
"loss": 0.78,
"step": 3995
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.04253171860319729,
"learning_rate": 5.057728773583559e-05,
"loss": 0.7478,
"step": 4000
},
{
"epoch": 0.7585227272727273,
"grad_norm": 0.042102270133092194,
"learning_rate": 5.0206562142272334e-05,
"loss": 0.7817,
"step": 4005
},
{
"epoch": 0.759469696969697,
"grad_norm": 0.04424433693890534,
"learning_rate": 4.9836926955327656e-05,
"loss": 0.7774,
"step": 4010
},
{
"epoch": 0.7604166666666666,
"grad_norm": 0.03727474161719155,
"learning_rate": 4.946838621387063e-05,
"loss": 0.7548,
"step": 4015
},
{
"epoch": 0.7613636363636364,
"grad_norm": 0.038729203633207226,
"learning_rate": 4.9100943944812114e-05,
"loss": 0.7723,
"step": 4020
},
{
"epoch": 0.7623106060606061,
"grad_norm": 0.04033403314672209,
"learning_rate": 4.873460416306023e-05,
"loss": 0.7815,
"step": 4025
},
{
"epoch": 0.7632575757575758,
"grad_norm": 0.03663235670257645,
"learning_rate": 4.836937087147655e-05,
"loss": 0.7968,
"step": 4030
},
{
"epoch": 0.7642045454545454,
"grad_norm": 0.038393805155331855,
"learning_rate": 4.8005248060832446e-05,
"loss": 0.7572,
"step": 4035
},
{
"epoch": 0.7651515151515151,
"grad_norm": 0.03692754409484954,
"learning_rate": 4.7642239709765596e-05,
"loss": 0.7707,
"step": 4040
},
{
"epoch": 0.7660984848484849,
"grad_norm": 0.03765360322249641,
"learning_rate": 4.728034978473621e-05,
"loss": 0.7886,
"step": 4045
},
{
"epoch": 0.7670454545454546,
"grad_norm": 0.04124091795917703,
"learning_rate": 4.691958223998401e-05,
"loss": 0.7693,
"step": 4050
},
{
"epoch": 0.7679924242424242,
"grad_norm": 0.038689870405394365,
"learning_rate": 4.655994101748477e-05,
"loss": 0.7921,
"step": 4055
},
{
"epoch": 0.7689393939393939,
"grad_norm": 0.03609443794000663,
"learning_rate": 4.620143004690736e-05,
"loss": 0.7289,
"step": 4060
},
{
"epoch": 0.7698863636363636,
"grad_norm": 0.03987735376629914,
"learning_rate": 4.584405324557092e-05,
"loss": 0.7605,
"step": 4065
},
{
"epoch": 0.7708333333333334,
"grad_norm": 0.04097229768216734,
"learning_rate": 4.548781451840179e-05,
"loss": 0.7663,
"step": 4070
},
{
"epoch": 0.771780303030303,
"grad_norm": 0.040742694287622665,
"learning_rate": 4.513271775789099e-05,
"loss": 0.8028,
"step": 4075
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.04002660818121977,
"learning_rate": 4.477876684405179e-05,
"loss": 0.7613,
"step": 4080
},
{
"epoch": 0.7736742424242424,
"grad_norm": 0.03889418322921735,
"learning_rate": 4.4425965644377206e-05,
"loss": 0.7551,
"step": 4085
},
{
"epoch": 0.7746212121212122,
"grad_norm": 0.041611350969633386,
"learning_rate": 4.407431801379765e-05,
"loss": 0.7626,
"step": 4090
},
{
"epoch": 0.7755681818181818,
"grad_norm": 0.038681908003403036,
"learning_rate": 4.37238277946389e-05,
"loss": 0.7903,
"step": 4095
},
{
"epoch": 0.7765151515151515,
"grad_norm": 0.03548309908592482,
"learning_rate": 4.337449881658027e-05,
"loss": 0.7786,
"step": 4100
},
{
"epoch": 0.7774621212121212,
"grad_norm": 0.039380169633909605,
"learning_rate": 4.3026334896612454e-05,
"loss": 0.7403,
"step": 4105
},
{
"epoch": 0.7784090909090909,
"grad_norm": 0.04079678253532297,
"learning_rate": 4.267933983899601e-05,
"loss": 0.7436,
"step": 4110
},
{
"epoch": 0.7793560606060606,
"grad_norm": 0.039301682797346464,
"learning_rate": 4.233351743521987e-05,
"loss": 0.7671,
"step": 4115
},
{
"epoch": 0.7803030303030303,
"grad_norm": 0.03847308151820405,
"learning_rate": 4.19888714639597e-05,
"loss": 0.7448,
"step": 4120
},
{
"epoch": 0.78125,
"grad_norm": 0.041300407433686306,
"learning_rate": 4.164540569103667e-05,
"loss": 0.7589,
"step": 4125
},
{
"epoch": 0.7821969696969697,
"grad_norm": 0.03941592905737965,
"learning_rate": 4.1303123869376535e-05,
"loss": 0.757,
"step": 4130
},
{
"epoch": 0.7831439393939394,
"grad_norm": 0.037406188407398566,
"learning_rate": 4.096202973896825e-05,
"loss": 0.7725,
"step": 4135
},
{
"epoch": 0.7840909090909091,
"grad_norm": 0.03990816917288711,
"learning_rate": 4.0622127026823445e-05,
"loss": 0.7317,
"step": 4140
},
{
"epoch": 0.7850378787878788,
"grad_norm": 0.03511090719697071,
"learning_rate": 4.028341944693543e-05,
"loss": 0.7529,
"step": 4145
},
{
"epoch": 0.7859848484848485,
"grad_norm": 0.0379918188663595,
"learning_rate": 3.9945910700238865e-05,
"loss": 0.7766,
"step": 4150
},
{
"epoch": 0.7869318181818182,
"grad_norm": 0.04057440463664927,
"learning_rate": 3.960960447456907e-05,
"loss": 0.7828,
"step": 4155
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.03820049794907823,
"learning_rate": 3.9274504444622016e-05,
"loss": 0.7687,
"step": 4160
},
{
"epoch": 0.7888257575757576,
"grad_norm": 0.04059223380009775,
"learning_rate": 3.894061427191384e-05,
"loss": 0.7736,
"step": 4165
},
{
"epoch": 0.7897727272727273,
"grad_norm": 0.03586736668288177,
"learning_rate": 3.860793760474105e-05,
"loss": 0.7504,
"step": 4170
},
{
"epoch": 0.790719696969697,
"grad_norm": 0.03808778963155071,
"learning_rate": 3.8276478078140746e-05,
"loss": 0.7827,
"step": 4175
},
{
"epoch": 0.7916666666666666,
"grad_norm": 0.04112934182070688,
"learning_rate": 3.794623931385062e-05,
"loss": 0.7754,
"step": 4180
},
{
"epoch": 0.7926136363636364,
"grad_norm": 0.03890850582072587,
"learning_rate": 3.7617224920269607e-05,
"loss": 0.7529,
"step": 4185
},
{
"epoch": 0.7935606060606061,
"grad_norm": 0.03888836620814126,
"learning_rate": 3.7289438492418375e-05,
"loss": 0.7797,
"step": 4190
},
{
"epoch": 0.7945075757575758,
"grad_norm": 0.04186176891014295,
"learning_rate": 3.696288361190015e-05,
"loss": 0.7735,
"step": 4195
},
{
"epoch": 0.7954545454545454,
"grad_norm": 0.03922361327888292,
"learning_rate": 3.663756384686127e-05,
"loss": 0.7431,
"step": 4200
},
{
"epoch": 0.7964015151515151,
"grad_norm": 0.037477031699552646,
"learning_rate": 3.631348275195259e-05,
"loss": 0.7477,
"step": 4205
},
{
"epoch": 0.7973484848484849,
"grad_norm": 0.03804879504351854,
"learning_rate": 3.599064386829051e-05,
"loss": 0.7873,
"step": 4210
},
{
"epoch": 0.7982954545454546,
"grad_norm": 0.041760751407490776,
"learning_rate": 3.5669050723418074e-05,
"loss": 0.7644,
"step": 4215
},
{
"epoch": 0.7992424242424242,
"grad_norm": 0.035448569098769006,
"learning_rate": 3.534870683126664e-05,
"loss": 0.7786,
"step": 4220
},
{
"epoch": 0.8001893939393939,
"grad_norm": 0.042355358195710444,
"learning_rate": 3.5029615692117555e-05,
"loss": 0.7576,
"step": 4225
},
{
"epoch": 0.8011363636363636,
"grad_norm": 0.039815508314079394,
"learning_rate": 3.47117807925636e-05,
"loss": 0.7678,
"step": 4230
},
{
"epoch": 0.8020833333333334,
"grad_norm": 0.04043653268326337,
"learning_rate": 3.4395205605471286e-05,
"loss": 0.7763,
"step": 4235
},
{
"epoch": 0.803030303030303,
"grad_norm": 0.03871480607482675,
"learning_rate": 3.4079893589942543e-05,
"loss": 0.761,
"step": 4240
},
{
"epoch": 0.8039772727272727,
"grad_norm": 0.041056778138514105,
"learning_rate": 3.376584819127712e-05,
"loss": 0.7686,
"step": 4245
},
{
"epoch": 0.8049242424242424,
"grad_norm": 0.0402161423571865,
"learning_rate": 3.3453072840935e-05,
"loss": 0.7704,
"step": 4250
},
{
"epoch": 0.8058712121212122,
"grad_norm": 0.03864045638204508,
"learning_rate": 3.314157095649868e-05,
"loss": 0.7707,
"step": 4255
},
{
"epoch": 0.8068181818181818,
"grad_norm": 0.042228336396620804,
"learning_rate": 3.283134594163599e-05,
"loss": 0.7482,
"step": 4260
},
{
"epoch": 0.8077651515151515,
"grad_norm": 0.04047310966274174,
"learning_rate": 3.252240118606293e-05,
"loss": 0.7587,
"step": 4265
},
{
"epoch": 0.8087121212121212,
"grad_norm": 0.03977289729530792,
"learning_rate": 3.221474006550662e-05,
"loss": 0.768,
"step": 4270
},
{
"epoch": 0.8096590909090909,
"grad_norm": 0.03903161422504673,
"learning_rate": 3.1908365941668115e-05,
"loss": 0.7433,
"step": 4275
},
{
"epoch": 0.8106060606060606,
"grad_norm": 0.03937683373863951,
"learning_rate": 3.160328216218617e-05,
"loss": 0.7889,
"step": 4280
},
{
"epoch": 0.8115530303030303,
"grad_norm": 0.039794170439367574,
"learning_rate": 3.129949206060039e-05,
"loss": 0.7418,
"step": 4285
},
{
"epoch": 0.8125,
"grad_norm": 0.041917271276222724,
"learning_rate": 3.099699895631474e-05,
"loss": 0.7451,
"step": 4290
},
{
"epoch": 0.8134469696969697,
"grad_norm": 0.037936541801760024,
"learning_rate": 3.069580615456137e-05,
"loss": 0.7627,
"step": 4295
},
{
"epoch": 0.8143939393939394,
"grad_norm": 0.037614637220929004,
"learning_rate": 3.03959169463646e-05,
"loss": 0.7674,
"step": 4300
},
{
"epoch": 0.8153409090909091,
"grad_norm": 0.03881575160344137,
"learning_rate": 3.009733460850473e-05,
"loss": 0.7646,
"step": 4305
},
{
"epoch": 0.8162878787878788,
"grad_norm": 0.041064989397716814,
"learning_rate": 2.9800062403482493e-05,
"loss": 0.7554,
"step": 4310
},
{
"epoch": 0.8172348484848485,
"grad_norm": 0.03751500962917203,
"learning_rate": 2.9504103579483163e-05,
"loss": 0.772,
"step": 4315
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.03762830736526142,
"learning_rate": 2.9209461370341204e-05,
"loss": 0.7419,
"step": 4320
},
{
"epoch": 0.8191287878787878,
"grad_norm": 0.043952738002868426,
"learning_rate": 2.891613899550499e-05,
"loss": 0.7876,
"step": 4325
},
{
"epoch": 0.8200757575757576,
"grad_norm": 0.04012795700450522,
"learning_rate": 2.8624139660001448e-05,
"loss": 0.7589,
"step": 4330
},
{
"epoch": 0.8210227272727273,
"grad_norm": 0.03722728875218868,
"learning_rate": 2.8333466554401125e-05,
"loss": 0.7521,
"step": 4335
},
{
"epoch": 0.821969696969697,
"grad_norm": 0.037476550396028804,
"learning_rate": 2.804412285478343e-05,
"loss": 0.7393,
"step": 4340
},
{
"epoch": 0.8229166666666666,
"grad_norm": 0.037123539268036035,
"learning_rate": 2.775611172270185e-05,
"loss": 0.7654,
"step": 4345
},
{
"epoch": 0.8238636363636364,
"grad_norm": 0.03776861945697112,
"learning_rate": 2.7469436305149172e-05,
"loss": 0.7629,
"step": 4350
},
{
"epoch": 0.8248106060606061,
"grad_norm": 0.0397986113112224,
"learning_rate": 2.7184099734523567e-05,
"loss": 0.776,
"step": 4355
},
{
"epoch": 0.8257575757575758,
"grad_norm": 0.04058484357450401,
"learning_rate": 2.690010512859403e-05,
"loss": 0.7563,
"step": 4360
},
{
"epoch": 0.8267045454545454,
"grad_norm": 0.03896827353754326,
"learning_rate": 2.6617455590466363e-05,
"loss": 0.7457,
"step": 4365
},
{
"epoch": 0.8276515151515151,
"grad_norm": 0.03825814102977763,
"learning_rate": 2.633615420854928e-05,
"loss": 0.75,
"step": 4370
},
{
"epoch": 0.8285984848484849,
"grad_norm": 0.03716265282693635,
"learning_rate": 2.6056204056520795e-05,
"loss": 0.758,
"step": 4375
},
{
"epoch": 0.8295454545454546,
"grad_norm": 0.04444901558357328,
"learning_rate": 2.5777608193294396e-05,
"loss": 0.7576,
"step": 4380
},
{
"epoch": 0.8304924242424242,
"grad_norm": 0.03624822046682716,
"learning_rate": 2.550036966298581e-05,
"loss": 0.7483,
"step": 4385
},
{
"epoch": 0.8314393939393939,
"grad_norm": 0.038466147912297376,
"learning_rate": 2.5224491494879705e-05,
"loss": 0.7735,
"step": 4390
},
{
"epoch": 0.8323863636363636,
"grad_norm": 0.04216736810756713,
"learning_rate": 2.4949976703396486e-05,
"loss": 0.7666,
"step": 4395
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.037577989344391036,
"learning_rate": 2.4676828288059558e-05,
"loss": 0.7504,
"step": 4400
},
{
"epoch": 0.834280303030303,
"grad_norm": 0.04096373784722742,
"learning_rate": 2.4405049233462316e-05,
"loss": 0.7541,
"step": 4405
},
{
"epoch": 0.8352272727272727,
"grad_norm": 0.03828220611720389,
"learning_rate": 2.413464250923566e-05,
"loss": 0.7512,
"step": 4410
},
{
"epoch": 0.8361742424242424,
"grad_norm": 0.03788635961789358,
"learning_rate": 2.3865611070015605e-05,
"loss": 0.7544,
"step": 4415
},
{
"epoch": 0.8371212121212122,
"grad_norm": 0.04139269826653916,
"learning_rate": 2.3597957855410932e-05,
"loss": 0.7847,
"step": 4420
},
{
"epoch": 0.8380681818181818,
"grad_norm": 0.04098240265367287,
"learning_rate": 2.3331685789970978e-05,
"loss": 0.7548,
"step": 4425
},
{
"epoch": 0.8390151515151515,
"grad_norm": 0.0366730676630168,
"learning_rate": 2.3066797783153767e-05,
"loss": 0.7546,
"step": 4430
},
{
"epoch": 0.8399621212121212,
"grad_norm": 0.03855939364168934,
"learning_rate": 2.280329672929434e-05,
"loss": 0.7526,
"step": 4435
},
{
"epoch": 0.8409090909090909,
"grad_norm": 0.03769398425525407,
"learning_rate": 2.2541185507572858e-05,
"loss": 0.7659,
"step": 4440
},
{
"epoch": 0.8418560606060606,
"grad_norm": 0.037763802543836905,
"learning_rate": 2.228046698198336e-05,
"loss": 0.7492,
"step": 4445
},
{
"epoch": 0.8428030303030303,
"grad_norm": 0.038504139823869195,
"learning_rate": 2.202114400130246e-05,
"loss": 0.7532,
"step": 4450
},
{
"epoch": 0.84375,
"grad_norm": 0.03839729099480198,
"learning_rate": 2.1763219399058042e-05,
"loss": 0.7716,
"step": 4455
},
{
"epoch": 0.8446969696969697,
"grad_norm": 0.03634271413981629,
"learning_rate": 2.150669599349845e-05,
"loss": 0.781,
"step": 4460
},
{
"epoch": 0.8456439393939394,
"grad_norm": 0.038799770819478115,
"learning_rate": 2.1251576587561774e-05,
"loss": 0.7471,
"step": 4465
},
{
"epoch": 0.8465909090909091,
"grad_norm": 0.037447866031002947,
"learning_rate": 2.0997863968844914e-05,
"loss": 0.7454,
"step": 4470
},
{
"epoch": 0.8475378787878788,
"grad_norm": 0.03812532250323499,
"learning_rate": 2.0745560909573534e-05,
"loss": 0.7487,
"step": 4475
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.03967052575801908,
"learning_rate": 2.0494670166571353e-05,
"loss": 0.7448,
"step": 4480
},
{
"epoch": 0.8494318181818182,
"grad_norm": 0.038717835231477656,
"learning_rate": 2.0245194481230386e-05,
"loss": 0.746,
"step": 4485
},
{
"epoch": 0.8503787878787878,
"grad_norm": 0.03978509491909852,
"learning_rate": 1.9997136579480698e-05,
"loss": 0.7591,
"step": 4490
},
{
"epoch": 0.8513257575757576,
"grad_norm": 0.040392976405494746,
"learning_rate": 1.9750499171760864e-05,
"loss": 0.7437,
"step": 4495
},
{
"epoch": 0.8522727272727273,
"grad_norm": 0.03839961150423484,
"learning_rate": 1.9505284952988154e-05,
"loss": 0.7191,
"step": 4500
},
{
"epoch": 0.853219696969697,
"grad_norm": 0.03701174541805748,
"learning_rate": 1.9261496602529163e-05,
"loss": 0.7614,
"step": 4505
},
{
"epoch": 0.8541666666666666,
"grad_norm": 0.03820961258437268,
"learning_rate": 1.9019136784170635e-05,
"loss": 0.7914,
"step": 4510
},
{
"epoch": 0.8551136363636364,
"grad_norm": 0.039034746383769636,
"learning_rate": 1.877820814609018e-05,
"loss": 0.7378,
"step": 4515
},
{
"epoch": 0.8560606060606061,
"grad_norm": 0.035548177827413464,
"learning_rate": 1.8538713320827398e-05,
"loss": 0.7587,
"step": 4520
},
{
"epoch": 0.8570075757575758,
"grad_norm": 0.03927586449468295,
"learning_rate": 1.8300654925255227e-05,
"loss": 0.7505,
"step": 4525
},
{
"epoch": 0.8579545454545454,
"grad_norm": 0.03808728080301323,
"learning_rate": 1.8064035560551254e-05,
"loss": 0.7546,
"step": 4530
},
{
"epoch": 0.8589015151515151,
"grad_norm": 0.03971353114564455,
"learning_rate": 1.7828857812169183e-05,
"loss": 0.7481,
"step": 4535
},
{
"epoch": 0.8598484848484849,
"grad_norm": 0.038394015305144635,
"learning_rate": 1.7595124249810798e-05,
"loss": 0.7512,
"step": 4540
},
{
"epoch": 0.8607954545454546,
"grad_norm": 0.04035971231008132,
"learning_rate": 1.736283742739781e-05,
"loss": 0.7514,
"step": 4545
},
{
"epoch": 0.8617424242424242,
"grad_norm": 0.03852526269337616,
"learning_rate": 1.7131999883043864e-05,
"loss": 0.7324,
"step": 4550
},
{
"epoch": 0.8626893939393939,
"grad_norm": 0.0380464207669555,
"learning_rate": 1.690261413902685e-05,
"loss": 0.778,
"step": 4555
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.03809307250418814,
"learning_rate": 1.6674682701761493e-05,
"loss": 0.741,
"step": 4560
},
{
"epoch": 0.8645833333333334,
"grad_norm": 0.03730545835154613,
"learning_rate": 1.644820806177165e-05,
"loss": 0.7494,
"step": 4565
},
{
"epoch": 0.865530303030303,
"grad_norm": 0.0410319046669476,
"learning_rate": 1.622319269366349e-05,
"loss": 0.7774,
"step": 4570
},
{
"epoch": 0.8664772727272727,
"grad_norm": 0.0365044250054721,
"learning_rate": 1.599963905609807e-05,
"loss": 0.7404,
"step": 4575
},
{
"epoch": 0.8674242424242424,
"grad_norm": 0.039763773803898096,
"learning_rate": 1.5777549591764705e-05,
"loss": 0.7789,
"step": 4580
},
{
"epoch": 0.8683712121212122,
"grad_norm": 0.03734007441823457,
"learning_rate": 1.555692672735431e-05,
"loss": 0.7487,
"step": 4585
},
{
"epoch": 0.8693181818181818,
"grad_norm": 0.040126269127401595,
"learning_rate": 1.5337772873532696e-05,
"loss": 0.7653,
"step": 4590
},
{
"epoch": 0.8702651515151515,
"grad_norm": 0.04012812890151172,
"learning_rate": 1.5120090424914305e-05,
"loss": 0.7763,
"step": 4595
},
{
"epoch": 0.8712121212121212,
"grad_norm": 0.03851463594249241,
"learning_rate": 1.4903881760036163e-05,
"loss": 0.7654,
"step": 4600
},
{
"epoch": 0.8721590909090909,
"grad_norm": 0.03700030549429758,
"learning_rate": 1.46891492413318e-05,
"loss": 0.7481,
"step": 4605
},
{
"epoch": 0.8731060606060606,
"grad_norm": 0.037764961164442196,
"learning_rate": 1.4475895215105299e-05,
"loss": 0.751,
"step": 4610
},
{
"epoch": 0.8740530303030303,
"grad_norm": 0.039658099635677214,
"learning_rate": 1.4264122011505919e-05,
"loss": 0.7454,
"step": 4615
},
{
"epoch": 0.875,
"grad_norm": 0.037877514658421034,
"learning_rate": 1.4053831944502508e-05,
"loss": 0.7311,
"step": 4620
},
{
"epoch": 0.8759469696969697,
"grad_norm": 0.03981948814673359,
"learning_rate": 1.3845027311858149e-05,
"loss": 0.7701,
"step": 4625
},
{
"epoch": 0.8768939393939394,
"grad_norm": 0.03725499979086121,
"learning_rate": 1.3637710395105134e-05,
"loss": 0.7496,
"step": 4630
},
{
"epoch": 0.8778409090909091,
"grad_norm": 0.03845165051106696,
"learning_rate": 1.3431883459520115e-05,
"loss": 0.7598,
"step": 4635
},
{
"epoch": 0.8787878787878788,
"grad_norm": 0.03921039831701288,
"learning_rate": 1.3227548754099148e-05,
"loss": 0.7576,
"step": 4640
},
{
"epoch": 0.8797348484848485,
"grad_norm": 0.03677027674299414,
"learning_rate": 1.3024708511533266e-05,
"loss": 0.7536,
"step": 4645
},
{
"epoch": 0.8806818181818182,
"grad_norm": 0.03769732908420657,
"learning_rate": 1.2823364948184095e-05,
"loss": 0.7631,
"step": 4650
},
{
"epoch": 0.8816287878787878,
"grad_norm": 0.038277457641516056,
"learning_rate": 1.2623520264059528e-05,
"loss": 0.758,
"step": 4655
},
{
"epoch": 0.8825757575757576,
"grad_norm": 0.03835316890120053,
"learning_rate": 1.2425176642789841e-05,
"loss": 0.7545,
"step": 4660
},
{
"epoch": 0.8835227272727273,
"grad_norm": 0.040317569065410515,
"learning_rate": 1.2228336251603632e-05,
"loss": 0.7211,
"step": 4665
},
{
"epoch": 0.884469696969697,
"grad_norm": 0.03605361368261573,
"learning_rate": 1.2033001241304285e-05,
"loss": 0.7356,
"step": 4670
},
{
"epoch": 0.8854166666666666,
"grad_norm": 0.042368074274969164,
"learning_rate": 1.1839173746246462e-05,
"loss": 0.7643,
"step": 4675
},
{
"epoch": 0.8863636363636364,
"grad_norm": 0.04206007952837537,
"learning_rate": 1.164685588431281e-05,
"loss": 0.7694,
"step": 4680
},
{
"epoch": 0.8873106060606061,
"grad_norm": 0.03587287003409619,
"learning_rate": 1.14560497568906e-05,
"loss": 0.7336,
"step": 4685
},
{
"epoch": 0.8882575757575758,
"grad_norm": 0.04055727525356863,
"learning_rate": 1.126675744884904e-05,
"loss": 0.7858,
"step": 4690
},
{
"epoch": 0.8892045454545454,
"grad_norm": 0.03731194721410893,
"learning_rate": 1.1078981028516421e-05,
"loss": 0.7546,
"step": 4695
},
{
"epoch": 0.8901515151515151,
"grad_norm": 0.03913350636593797,
"learning_rate": 1.08927225476574e-05,
"loss": 0.7555,
"step": 4700
},
{
"epoch": 0.8910984848484849,
"grad_norm": 0.03620266304429595,
"learning_rate": 1.0707984041450673e-05,
"loss": 0.7393,
"step": 4705
},
{
"epoch": 0.8920454545454546,
"grad_norm": 0.0372176814841684,
"learning_rate": 1.0524767528466766e-05,
"loss": 0.7815,
"step": 4710
},
{
"epoch": 0.8929924242424242,
"grad_norm": 0.04163117308071071,
"learning_rate": 1.034307501064589e-05,
"loss": 0.7744,
"step": 4715
},
{
"epoch": 0.8939393939393939,
"grad_norm": 0.03841314072028053,
"learning_rate": 1.0162908473276133e-05,
"loss": 0.7441,
"step": 4720
},
{
"epoch": 0.8948863636363636,
"grad_norm": 0.03658511014566751,
"learning_rate": 9.984269884971796e-06,
"loss": 0.7534,
"step": 4725
},
{
"epoch": 0.8958333333333334,
"grad_norm": 0.035726934558083914,
"learning_rate": 9.807161197651742e-06,
"loss": 0.7561,
"step": 4730
},
{
"epoch": 0.896780303030303,
"grad_norm": 0.03786917865045401,
"learning_rate": 9.63158434651825e-06,
"loss": 0.753,
"step": 4735
},
{
"epoch": 0.8977272727272727,
"grad_norm": 0.03878081614015611,
"learning_rate": 9.45754125003576e-06,
"loss": 0.7665,
"step": 4740
},
{
"epoch": 0.8986742424242424,
"grad_norm": 0.03776273136819908,
"learning_rate": 9.285033809909863e-06,
"loss": 0.7882,
"step": 4745
},
{
"epoch": 0.8996212121212122,
"grad_norm": 0.04079662714361428,
"learning_rate": 9.114063911066676e-06,
"loss": 0.7775,
"step": 4750
},
{
"epoch": 0.9005681818181818,
"grad_norm": 0.04107251149823735,
"learning_rate": 8.944633421632169e-06,
"loss": 0.7785,
"step": 4755
},
{
"epoch": 0.9015151515151515,
"grad_norm": 0.043937037368177494,
"learning_rate": 8.776744192911666e-06,
"loss": 0.7709,
"step": 4760
},
{
"epoch": 0.9024621212121212,
"grad_norm": 0.03806032575275296,
"learning_rate": 8.610398059369733e-06,
"loss": 0.7398,
"step": 4765
},
{
"epoch": 0.9034090909090909,
"grad_norm": 0.03989849682979902,
"learning_rate": 8.445596838610136e-06,
"loss": 0.7839,
"step": 4770
},
{
"epoch": 0.9043560606060606,
"grad_norm": 0.03804089571024527,
"learning_rate": 8.282342331355896e-06,
"loss": 0.737,
"step": 4775
},
{
"epoch": 0.9053030303030303,
"grad_norm": 0.036823538728651795,
"learning_rate": 8.120636321429618e-06,
"loss": 0.7365,
"step": 4780
},
{
"epoch": 0.90625,
"grad_norm": 0.039238403212191623,
"learning_rate": 7.960480575734162e-06,
"loss": 0.7679,
"step": 4785
},
{
"epoch": 0.9071969696969697,
"grad_norm": 0.03655300704953951,
"learning_rate": 7.801876844233102e-06,
"loss": 0.7276,
"step": 4790
},
{
"epoch": 0.9081439393939394,
"grad_norm": 0.038671267549804565,
"learning_rate": 7.64482685993174e-06,
"loss": 0.754,
"step": 4795
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.04012488210471297,
"learning_rate": 7.489332338858201e-06,
"loss": 0.7706,
"step": 4800
},
{
"epoch": 0.9100378787878788,
"grad_norm": 0.039340313195190324,
"learning_rate": 7.3353949800445625e-06,
"loss": 0.7437,
"step": 4805
},
{
"epoch": 0.9109848484848485,
"grad_norm": 0.03623402554079789,
"learning_rate": 7.1830164655084175e-06,
"loss": 0.747,
"step": 4810
},
{
"epoch": 0.9119318181818182,
"grad_norm": 0.040697271432715135,
"learning_rate": 7.032198460234367e-06,
"loss": 0.7624,
"step": 4815
},
{
"epoch": 0.9128787878787878,
"grad_norm": 0.03876794678188874,
"learning_rate": 6.88294261215595e-06,
"loss": 0.7132,
"step": 4820
},
{
"epoch": 0.9138257575757576,
"grad_norm": 0.039259366616565435,
"learning_rate": 6.7352505521375445e-06,
"loss": 0.768,
"step": 4825
},
{
"epoch": 0.9147727272727273,
"grad_norm": 0.04205245328852447,
"learning_rate": 6.5891238939566275e-06,
"loss": 0.78,
"step": 4830
},
{
"epoch": 0.915719696969697,
"grad_norm": 0.041430998012228624,
"learning_rate": 6.444564234286059e-06,
"loss": 0.7476,
"step": 4835
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.04090068304483327,
"learning_rate": 6.301573152676664e-06,
"loss": 0.7832,
"step": 4840
},
{
"epoch": 0.9176136363636364,
"grad_norm": 0.03893377913410083,
"learning_rate": 6.160152211540059e-06,
"loss": 0.766,
"step": 4845
},
{
"epoch": 0.9185606060606061,
"grad_norm": 0.03562213581544829,
"learning_rate": 6.020302956131434e-06,
"loss": 0.7506,
"step": 4850
},
{
"epoch": 0.9195075757575758,
"grad_norm": 0.03858250177735203,
"learning_rate": 5.8820269145327335e-06,
"loss": 0.7449,
"step": 4855
},
{
"epoch": 0.9204545454545454,
"grad_norm": 0.03698269538357442,
"learning_rate": 5.7453255976360526e-06,
"loss": 0.7419,
"step": 4860
},
{
"epoch": 0.9214015151515151,
"grad_norm": 0.039313884060948906,
"learning_rate": 5.6102004991269655e-06,
"loss": 0.7509,
"step": 4865
},
{
"epoch": 0.9223484848484849,
"grad_norm": 0.038202779909671226,
"learning_rate": 5.476653095468292e-06,
"loss": 0.7404,
"step": 4870
},
{
"epoch": 0.9232954545454546,
"grad_norm": 0.038488820882748215,
"learning_rate": 5.344684845883957e-06,
"loss": 0.7584,
"step": 4875
},
{
"epoch": 0.9242424242424242,
"grad_norm": 0.03673421514324292,
"learning_rate": 5.214297192343104e-06,
"loss": 0.7493,
"step": 4880
},
{
"epoch": 0.9251893939393939,
"grad_norm": 0.0350920577902006,
"learning_rate": 5.085491559544175e-06,
"loss": 0.7834,
"step": 4885
},
{
"epoch": 0.9261363636363636,
"grad_norm": 0.03508459667371372,
"learning_rate": 4.9582693548994914e-06,
"loss": 0.761,
"step": 4890
},
{
"epoch": 0.9270833333333334,
"grad_norm": 0.03866865889378931,
"learning_rate": 4.832631968519862e-06,
"loss": 0.7536,
"step": 4895
},
{
"epoch": 0.928030303030303,
"grad_norm": 0.03900577265735235,
"learning_rate": 4.708580773199333e-06,
"loss": 0.7588,
"step": 4900
},
{
"epoch": 0.9289772727272727,
"grad_norm": 0.03975550249874538,
"learning_rate": 4.586117124400196e-06,
"loss": 0.7301,
"step": 4905
},
{
"epoch": 0.9299242424242424,
"grad_norm": 0.03658042099322432,
"learning_rate": 4.465242360238269e-06,
"loss": 0.7192,
"step": 4910
},
{
"epoch": 0.9308712121212122,
"grad_norm": 0.03538458441608596,
"learning_rate": 4.345957801468092e-06,
"loss": 0.7537,
"step": 4915
},
{
"epoch": 0.9318181818181818,
"grad_norm": 0.039696089459974056,
"learning_rate": 4.228264751468752e-06,
"loss": 0.7578,
"step": 4920
},
{
"epoch": 0.9327651515151515,
"grad_norm": 0.03891493239675872,
"learning_rate": 4.112164496229381e-06,
"loss": 0.7988,
"step": 4925
},
{
"epoch": 0.9337121212121212,
"grad_norm": 0.0391191766137019,
"learning_rate": 3.997658304335249e-06,
"loss": 0.748,
"step": 4930
},
{
"epoch": 0.9346590909090909,
"grad_norm": 0.041273803822080235,
"learning_rate": 3.88474742695391e-06,
"loss": 0.7444,
"step": 4935
},
{
"epoch": 0.9356060606060606,
"grad_norm": 0.03833780199551714,
"learning_rate": 3.77343309782151e-06,
"loss": 0.7535,
"step": 4940
},
{
"epoch": 0.9365530303030303,
"grad_norm": 0.0377506973575768,
"learning_rate": 3.663716533229183e-06,
"loss": 0.7603,
"step": 4945
},
{
"epoch": 0.9375,
"grad_norm": 0.03920529885627104,
"learning_rate": 3.5555989320099952e-06,
"loss": 0.7346,
"step": 4950
},
{
"epoch": 0.9384469696969697,
"grad_norm": 0.03926812513866438,
"learning_rate": 3.4490814755256724e-06,
"loss": 0.7882,
"step": 4955
},
{
"epoch": 0.9393939393939394,
"grad_norm": 0.03915603389468844,
"learning_rate": 3.344165327653725e-06,
"loss": 0.7804,
"step": 4960
},
{
"epoch": 0.9403409090909091,
"grad_norm": 0.03692758597782573,
"learning_rate": 3.2408516347747606e-06,
"loss": 0.7615,
"step": 4965
},
{
"epoch": 0.9412878787878788,
"grad_norm": 0.040111477519722376,
"learning_rate": 3.1391415257599583e-06,
"loss": 0.7624,
"step": 4970
},
{
"epoch": 0.9422348484848485,
"grad_norm": 0.036054816584654786,
"learning_rate": 3.039036111958715e-06,
"loss": 0.7595,
"step": 4975
},
{
"epoch": 0.9431818181818182,
"grad_norm": 0.03402332704452141,
"learning_rate": 2.9405364871864514e-06,
"loss": 0.7569,
"step": 4980
},
{
"epoch": 0.9441287878787878,
"grad_norm": 0.039782631917374064,
"learning_rate": 2.8436437277128075e-06,
"loss": 0.7616,
"step": 4985
},
{
"epoch": 0.9450757575757576,
"grad_norm": 0.03902157823658662,
"learning_rate": 2.7483588922497025e-06,
"loss": 0.7324,
"step": 4990
},
{
"epoch": 0.9460227272727273,
"grad_norm": 0.039325622745832914,
"learning_rate": 2.6546830219399405e-06,
"loss": 0.7597,
"step": 4995
},
{
"epoch": 0.946969696969697,
"grad_norm": 0.03881835950586153,
"learning_rate": 2.562617140345691e-06,
"loss": 0.7473,
"step": 5000
},
{
"epoch": 0.9479166666666666,
"grad_norm": 0.039364545515671236,
"learning_rate": 2.472162253437343e-06,
"loss": 0.7553,
"step": 5005
},
{
"epoch": 0.9488636363636364,
"grad_norm": 0.03556550322704117,
"learning_rate": 2.3833193495825853e-06,
"loss": 0.7329,
"step": 5010
},
{
"epoch": 0.9498106060606061,
"grad_norm": 0.03769878462512779,
"learning_rate": 2.2960893995355443e-06,
"loss": 0.7677,
"step": 5015
},
{
"epoch": 0.9507575757575758,
"grad_norm": 0.04179821670604519,
"learning_rate": 2.210473356426146e-06,
"loss": 0.7329,
"step": 5020
},
{
"epoch": 0.9517045454545454,
"grad_norm": 0.03551927757361974,
"learning_rate": 2.1264721557497866e-06,
"loss": 0.745,
"step": 5025
},
{
"epoch": 0.9526515151515151,
"grad_norm": 0.035762866838308725,
"learning_rate": 2.0440867153570627e-06,
"loss": 0.757,
"step": 5030
},
{
"epoch": 0.9535984848484849,
"grad_norm": 0.03810233771587777,
"learning_rate": 1.9633179354437257e-06,
"loss": 0.737,
"step": 5035
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.03824899164000916,
"learning_rate": 1.8841666985408566e-06,
"loss": 0.7708,
"step": 5040
},
{
"epoch": 0.9554924242424242,
"grad_norm": 0.03784087071597624,
"learning_rate": 1.8066338695052585e-06,
"loss": 0.7791,
"step": 5045
},
{
"epoch": 0.9564393939393939,
"grad_norm": 0.03875228767293192,
"learning_rate": 1.730720295509963e-06,
"loss": 0.757,
"step": 5050
},
{
"epoch": 0.9573863636363636,
"grad_norm": 0.03959979451862991,
"learning_rate": 1.6564268060349884e-06,
"loss": 0.7581,
"step": 5055
},
{
"epoch": 0.9583333333333334,
"grad_norm": 0.03746814864806074,
"learning_rate": 1.583754212858329e-06,
"loss": 0.7492,
"step": 5060
},
{
"epoch": 0.959280303030303,
"grad_norm": 0.03840085310645041,
"learning_rate": 1.5127033100469477e-06,
"loss": 0.7428,
"step": 5065
},
{
"epoch": 0.9602272727272727,
"grad_norm": 0.04015902576846351,
"learning_rate": 1.4432748739482468e-06,
"loss": 0.7601,
"step": 5070
},
{
"epoch": 0.9611742424242424,
"grad_norm": 0.03815333482717027,
"learning_rate": 1.3754696631815276e-06,
"loss": 0.7781,
"step": 5075
},
{
"epoch": 0.9621212121212122,
"grad_norm": 0.039134552740557424,
"learning_rate": 1.3092884186296282e-06,
"loss": 0.7605,
"step": 5080
},
{
"epoch": 0.9630681818181818,
"grad_norm": 0.03873683575508503,
"learning_rate": 1.2447318634309977e-06,
"loss": 0.7465,
"step": 5085
},
{
"epoch": 0.9640151515151515,
"grad_norm": 0.038640801639052244,
"learning_rate": 1.1818007029716525e-06,
"loss": 0.7616,
"step": 5090
},
{
"epoch": 0.9649621212121212,
"grad_norm": 0.04251912678550345,
"learning_rate": 1.1204956248774655e-06,
"loss": 0.747,
"step": 5095
},
{
"epoch": 0.9659090909090909,
"grad_norm": 0.039291943037907916,
"learning_rate": 1.0608172990067553e-06,
"loss": 0.7628,
"step": 5100
},
{
"epoch": 0.9668560606060606,
"grad_norm": 0.04115060966519561,
"learning_rate": 1.0027663774429096e-06,
"loss": 0.7533,
"step": 5105
},
{
"epoch": 0.9678030303030303,
"grad_norm": 0.040550851971633786,
"learning_rate": 9.463434944872395e-07,
"loss": 0.77,
"step": 5110
},
{
"epoch": 0.96875,
"grad_norm": 0.036797005792547945,
"learning_rate": 8.91549266652053e-07,
"loss": 0.7296,
"step": 5115
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.03621498987532269,
"learning_rate": 8.383842926539929e-07,
"loss": 0.7682,
"step": 5120
},
{
"epoch": 0.9706439393939394,
"grad_norm": 0.03987403439986009,
"learning_rate": 7.868491534073928e-07,
"loss": 0.793,
"step": 5125
},
{
"epoch": 0.9715909090909091,
"grad_norm": 0.03862093235236962,
"learning_rate": 7.369444120179647e-07,
"loss": 0.7388,
"step": 5130
},
{
"epoch": 0.9725378787878788,
"grad_norm": 0.03845042714550149,
"learning_rate": 6.88670613776704e-07,
"loss": 0.7571,
"step": 5135
},
{
"epoch": 0.9734848484848485,
"grad_norm": 0.03537218356309702,
"learning_rate": 6.420282861538283e-07,
"loss": 0.7192,
"step": 5140
},
{
"epoch": 0.9744318181818182,
"grad_norm": 0.03716360855745044,
"learning_rate": 5.970179387931151e-07,
"loss": 0.7498,
"step": 5145
},
{
"epoch": 0.9753787878787878,
"grad_norm": 0.03704971797049268,
"learning_rate": 5.536400635062721e-07,
"loss": 0.7639,
"step": 5150
},
{
"epoch": 0.9763257575757576,
"grad_norm": 0.03658375948794085,
"learning_rate": 5.118951342675592e-07,
"loss": 0.7607,
"step": 5155
},
{
"epoch": 0.9772727272727273,
"grad_norm": 0.038160851981614306,
"learning_rate": 4.717836072086589e-07,
"loss": 0.7761,
"step": 5160
},
{
"epoch": 0.978219696969697,
"grad_norm": 0.03554298384663066,
"learning_rate": 4.3330592061361357e-07,
"loss": 0.7515,
"step": 5165
},
{
"epoch": 0.9791666666666666,
"grad_norm": 0.035988940616932245,
"learning_rate": 3.964624949141626e-07,
"loss": 0.7287,
"step": 5170
},
{
"epoch": 0.9801136363636364,
"grad_norm": 0.038573387129357734,
"learning_rate": 3.6125373268499625e-07,
"loss": 0.7584,
"step": 5175
},
{
"epoch": 0.9810606060606061,
"grad_norm": 0.03560535292438474,
"learning_rate": 3.2768001863945905e-07,
"loss": 0.7381,
"step": 5180
},
{
"epoch": 0.9820075757575758,
"grad_norm": 0.03470144345138998,
"learning_rate": 2.9574171962533644e-07,
"loss": 0.7447,
"step": 5185
},
{
"epoch": 0.9829545454545454,
"grad_norm": 0.038083967145801485,
"learning_rate": 2.654391846207915e-07,
"loss": 0.7667,
"step": 5190
},
{
"epoch": 0.9839015151515151,
"grad_norm": 0.03704077024461041,
"learning_rate": 2.3677274473063444e-07,
"loss": 0.7666,
"step": 5195
},
{
"epoch": 0.9848484848484849,
"grad_norm": 0.04114138125476826,
"learning_rate": 2.0974271318260905e-07,
"loss": 0.7681,
"step": 5200
},
{
"epoch": 0.9857954545454546,
"grad_norm": 0.03557878772125844,
"learning_rate": 1.8434938532406186e-07,
"loss": 0.7482,
"step": 5205
},
{
"epoch": 0.9867424242424242,
"grad_norm": 0.03606070157019983,
"learning_rate": 1.6059303861862826e-07,
"loss": 0.7404,
"step": 5210
},
{
"epoch": 0.9876893939393939,
"grad_norm": 0.037415707092603924,
"learning_rate": 1.3847393264330153e-07,
"loss": 0.768,
"step": 5215
},
{
"epoch": 0.9886363636363636,
"grad_norm": 0.03725866754101771,
"learning_rate": 1.1799230908550173e-07,
"loss": 0.7409,
"step": 5220
},
{
"epoch": 0.9895833333333334,
"grad_norm": 0.039547937064916217,
"learning_rate": 9.914839174049449e-08,
"loss": 0.7408,
"step": 5225
},
{
"epoch": 0.990530303030303,
"grad_norm": 0.03604106643192906,
"learning_rate": 8.194238650889307e-08,
"loss": 0.7571,
"step": 5230
},
{
"epoch": 0.9914772727272727,
"grad_norm": 0.03696081603757769,
"learning_rate": 6.637448139447666e-08,
"loss": 0.7416,
"step": 5235
},
{
"epoch": 0.9924242424242424,
"grad_norm": 0.037770041786195266,
"learning_rate": 5.244484650207548e-08,
"loss": 0.756,
"step": 5240
},
{
"epoch": 0.9933712121212122,
"grad_norm": 0.04046458037414051,
"learning_rate": 4.01536340357389e-08,
"loss": 0.759,
"step": 5245
},
{
"epoch": 0.9943181818181818,
"grad_norm": 0.03827692567603896,
"learning_rate": 2.9500978297103407e-08,
"loss": 0.7787,
"step": 5250
},
{
"epoch": 0.9952651515151515,
"grad_norm": 0.03835615698389389,
"learning_rate": 2.0486995683860476e-08,
"loss": 0.7284,
"step": 5255
},
{
"epoch": 0.9962121212121212,
"grad_norm": 0.04008817912232536,
"learning_rate": 1.3111784688507599e-08,
"loss": 0.7493,
"step": 5260
},
{
"epoch": 0.9971590909090909,
"grad_norm": 0.03940503331438013,
"learning_rate": 7.375425897299115e-09,
"loss": 0.7522,
"step": 5265
},
{
"epoch": 0.9981060606060606,
"grad_norm": 0.03788195266551941,
"learning_rate": 3.277981989346923e-09,
"loss": 0.746,
"step": 5270
},
{
"epoch": 0.9990530303030303,
"grad_norm": 0.03792409661384259,
"learning_rate": 8.194977359210486e-10,
"loss": 0.7443,
"step": 5275
},
{
"epoch": 1.0,
"grad_norm": 0.04145596279829835,
"learning_rate": 0.0,
"loss": 0.7751,
"step": 5280
},
{
"epoch": 1.0,
"eval_loss": 1.116625189781189,
"eval_runtime": 1241.8314,
"eval_samples_per_second": 194.261,
"eval_steps_per_second": 6.071,
"step": 5280
},
{
"epoch": 1.0,
"step": 5280,
"total_flos": 771937243234304.0,
"train_loss": 0.8302312182657646,
"train_runtime": 21905.5744,
"train_samples_per_second": 30.851,
"train_steps_per_second": 0.241
}
],
"logging_steps": 5,
"max_steps": 5280,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 771937243234304.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}