Files
SmolLM2-MagpieUltraPlus/trainer_state.json
ModelHub XC c7134457d2 初始化项目,由ModelHub XC社区提供模型
Model: HuggingFaceTB/SmolLM2-MagpieUltraPlus
Source: Original Platform
2026-06-18 21:38:13 +08:00

6400 lines
157 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998897828722584,
"eval_steps": 500,
"global_step": 4536,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011021712774165104,
"grad_norm": 59.11821880494146,
"learning_rate": 3.303964757709251e-06,
"loss": 3.825,
"step": 5
},
{
"epoch": 0.002204342554833021,
"grad_norm": 42.17582597601867,
"learning_rate": 6.607929515418502e-06,
"loss": 3.6223,
"step": 10
},
{
"epoch": 0.0033065138322495315,
"grad_norm": 18.347454692288814,
"learning_rate": 9.911894273127752e-06,
"loss": 2.8332,
"step": 15
},
{
"epoch": 0.004408685109666042,
"grad_norm": 7.929104229119195,
"learning_rate": 1.3215859030837005e-05,
"loss": 2.0196,
"step": 20
},
{
"epoch": 0.005510856387082552,
"grad_norm": 3.3860390518924492,
"learning_rate": 1.6519823788546254e-05,
"loss": 1.6062,
"step": 25
},
{
"epoch": 0.006613027664499063,
"grad_norm": 1.5094502156188854,
"learning_rate": 1.9823788546255504e-05,
"loss": 1.3491,
"step": 30
},
{
"epoch": 0.007715198941915574,
"grad_norm": 0.8243795908744608,
"learning_rate": 2.3127753303964757e-05,
"loss": 1.1719,
"step": 35
},
{
"epoch": 0.008817370219332083,
"grad_norm": 0.47607081050989597,
"learning_rate": 2.643171806167401e-05,
"loss": 1.1158,
"step": 40
},
{
"epoch": 0.009919541496748594,
"grad_norm": 0.34776556828145655,
"learning_rate": 2.9735682819383256e-05,
"loss": 1.0865,
"step": 45
},
{
"epoch": 0.011021712774165105,
"grad_norm": 0.34304699002193145,
"learning_rate": 3.303964757709251e-05,
"loss": 1.0369,
"step": 50
},
{
"epoch": 0.012123884051581615,
"grad_norm": 0.2531395849305058,
"learning_rate": 3.634361233480176e-05,
"loss": 1.0383,
"step": 55
},
{
"epoch": 0.013226055328998126,
"grad_norm": 0.21165915652546113,
"learning_rate": 3.964757709251101e-05,
"loss": 1.021,
"step": 60
},
{
"epoch": 0.014328226606414637,
"grad_norm": 0.1944258505202001,
"learning_rate": 4.295154185022026e-05,
"loss": 0.9806,
"step": 65
},
{
"epoch": 0.015430397883831147,
"grad_norm": 0.2225104161521765,
"learning_rate": 4.625550660792951e-05,
"loss": 0.9731,
"step": 70
},
{
"epoch": 0.016532569161247658,
"grad_norm": 0.18722320005454055,
"learning_rate": 4.9559471365638766e-05,
"loss": 0.9756,
"step": 75
},
{
"epoch": 0.017634740438664167,
"grad_norm": 0.18788262680578474,
"learning_rate": 5.286343612334802e-05,
"loss": 0.9827,
"step": 80
},
{
"epoch": 0.01873691171608068,
"grad_norm": 0.15447512609368866,
"learning_rate": 5.6167400881057265e-05,
"loss": 0.9681,
"step": 85
},
{
"epoch": 0.019839082993497188,
"grad_norm": 0.16941375745095524,
"learning_rate": 5.947136563876651e-05,
"loss": 0.9765,
"step": 90
},
{
"epoch": 0.0209412542709137,
"grad_norm": 0.14612889382972916,
"learning_rate": 6.277533039647576e-05,
"loss": 0.9359,
"step": 95
},
{
"epoch": 0.02204342554833021,
"grad_norm": 0.1405739682413469,
"learning_rate": 6.607929515418502e-05,
"loss": 0.9543,
"step": 100
},
{
"epoch": 0.023145596825746722,
"grad_norm": 0.130605536696256,
"learning_rate": 6.938325991189426e-05,
"loss": 0.9136,
"step": 105
},
{
"epoch": 0.02424776810316323,
"grad_norm": 0.11300041363364274,
"learning_rate": 7.268722466960352e-05,
"loss": 0.9478,
"step": 110
},
{
"epoch": 0.025349939380579743,
"grad_norm": 0.09643276514367859,
"learning_rate": 7.599118942731278e-05,
"loss": 0.9125,
"step": 115
},
{
"epoch": 0.026452110657996252,
"grad_norm": 0.10011972487944946,
"learning_rate": 7.929515418502201e-05,
"loss": 0.9309,
"step": 120
},
{
"epoch": 0.027554281935412765,
"grad_norm": 0.08587719618641608,
"learning_rate": 8.259911894273126e-05,
"loss": 0.9023,
"step": 125
},
{
"epoch": 0.028656453212829273,
"grad_norm": 0.08726681655122204,
"learning_rate": 8.590308370044052e-05,
"loss": 0.9058,
"step": 130
},
{
"epoch": 0.029758624490245786,
"grad_norm": 0.09342883329751345,
"learning_rate": 8.920704845814977e-05,
"loss": 0.9054,
"step": 135
},
{
"epoch": 0.030860795767662295,
"grad_norm": 0.08738078317709104,
"learning_rate": 9.251101321585903e-05,
"loss": 0.8833,
"step": 140
},
{
"epoch": 0.031962967045078804,
"grad_norm": 0.07421598879157193,
"learning_rate": 9.581497797356827e-05,
"loss": 0.9187,
"step": 145
},
{
"epoch": 0.033065138322495316,
"grad_norm": 0.05948715305053877,
"learning_rate": 9.911894273127753e-05,
"loss": 0.8747,
"step": 150
},
{
"epoch": 0.03416730959991183,
"grad_norm": 0.07139242065324014,
"learning_rate": 0.00010242290748898678,
"loss": 0.8821,
"step": 155
},
{
"epoch": 0.035269480877328334,
"grad_norm": 0.0688597307361162,
"learning_rate": 0.00010572687224669604,
"loss": 0.8818,
"step": 160
},
{
"epoch": 0.036371652154744846,
"grad_norm": 0.06917882261732754,
"learning_rate": 0.00010903083700440527,
"loss": 0.9223,
"step": 165
},
{
"epoch": 0.03747382343216136,
"grad_norm": 0.06477308185348316,
"learning_rate": 0.00011233480176211453,
"loss": 0.8979,
"step": 170
},
{
"epoch": 0.03857599470957787,
"grad_norm": 0.07148480842885613,
"learning_rate": 0.00011563876651982378,
"loss": 0.8864,
"step": 175
},
{
"epoch": 0.039678165986994376,
"grad_norm": 0.06963902310697093,
"learning_rate": 0.00011894273127753302,
"loss": 0.8924,
"step": 180
},
{
"epoch": 0.04078033726441089,
"grad_norm": 0.06681188993008794,
"learning_rate": 0.00012224669603524228,
"loss": 0.8853,
"step": 185
},
{
"epoch": 0.0418825085418274,
"grad_norm": 0.07882523551134729,
"learning_rate": 0.00012555066079295151,
"loss": 0.8752,
"step": 190
},
{
"epoch": 0.042984679819243914,
"grad_norm": 0.07046808160734085,
"learning_rate": 0.00012885462555066077,
"loss": 0.9005,
"step": 195
},
{
"epoch": 0.04408685109666042,
"grad_norm": 0.08831018054166795,
"learning_rate": 0.00013215859030837003,
"loss": 0.8779,
"step": 200
},
{
"epoch": 0.04518902237407693,
"grad_norm": 0.06786610627531549,
"learning_rate": 0.0001354625550660793,
"loss": 0.8865,
"step": 205
},
{
"epoch": 0.046291193651493444,
"grad_norm": 0.06898944984160912,
"learning_rate": 0.00013876651982378853,
"loss": 0.8951,
"step": 210
},
{
"epoch": 0.04739336492890995,
"grad_norm": 0.07193213519196924,
"learning_rate": 0.00014207048458149779,
"loss": 0.91,
"step": 215
},
{
"epoch": 0.04849553620632646,
"grad_norm": 0.06470248286974109,
"learning_rate": 0.00014537444933920705,
"loss": 0.8713,
"step": 220
},
{
"epoch": 0.049597707483742974,
"grad_norm": 0.07558920025422085,
"learning_rate": 0.0001486784140969163,
"loss": 0.9003,
"step": 225
},
{
"epoch": 0.050699878761159486,
"grad_norm": 0.08225650056399321,
"learning_rate": 0.00015198237885462556,
"loss": 0.8744,
"step": 230
},
{
"epoch": 0.05180205003857599,
"grad_norm": 0.08830347321776405,
"learning_rate": 0.0001552863436123348,
"loss": 0.8543,
"step": 235
},
{
"epoch": 0.052904221315992504,
"grad_norm": 0.06812983519818898,
"learning_rate": 0.00015859030837004403,
"loss": 0.9006,
"step": 240
},
{
"epoch": 0.05400639259340902,
"grad_norm": 0.08404581821873025,
"learning_rate": 0.0001618942731277533,
"loss": 0.8915,
"step": 245
},
{
"epoch": 0.05510856387082553,
"grad_norm": 0.07050034877556227,
"learning_rate": 0.00016519823788546252,
"loss": 0.8737,
"step": 250
},
{
"epoch": 0.056210735148242034,
"grad_norm": 0.060716071571555404,
"learning_rate": 0.0001685022026431718,
"loss": 0.8551,
"step": 255
},
{
"epoch": 0.05731290642565855,
"grad_norm": 0.06881772285936742,
"learning_rate": 0.00017180616740088104,
"loss": 0.8895,
"step": 260
},
{
"epoch": 0.05841507770307506,
"grad_norm": 0.0616288543822739,
"learning_rate": 0.0001751101321585903,
"loss": 0.8761,
"step": 265
},
{
"epoch": 0.05951724898049157,
"grad_norm": 0.06164986983886088,
"learning_rate": 0.00017841409691629953,
"loss": 0.8948,
"step": 270
},
{
"epoch": 0.06061942025790808,
"grad_norm": 0.08556452067304546,
"learning_rate": 0.00018171806167400882,
"loss": 0.8965,
"step": 275
},
{
"epoch": 0.06172159153532459,
"grad_norm": 0.06708897999556158,
"learning_rate": 0.00018502202643171805,
"loss": 0.8717,
"step": 280
},
{
"epoch": 0.0628237628127411,
"grad_norm": 0.06626854890594584,
"learning_rate": 0.00018832599118942728,
"loss": 0.8967,
"step": 285
},
{
"epoch": 0.06392593409015761,
"grad_norm": 0.06798823718932381,
"learning_rate": 0.00019162995594713654,
"loss": 0.879,
"step": 290
},
{
"epoch": 0.06502810536757413,
"grad_norm": 0.07021508305766244,
"learning_rate": 0.0001949339207048458,
"loss": 0.8761,
"step": 295
},
{
"epoch": 0.06613027664499063,
"grad_norm": 0.06347368995094158,
"learning_rate": 0.00019823788546255506,
"loss": 0.853,
"step": 300
},
{
"epoch": 0.06723244792240714,
"grad_norm": 0.07136142919506049,
"learning_rate": 0.0002015418502202643,
"loss": 0.8771,
"step": 305
},
{
"epoch": 0.06833461919982366,
"grad_norm": 0.06846198864454035,
"learning_rate": 0.00020484581497797356,
"loss": 0.8903,
"step": 310
},
{
"epoch": 0.06943679047724016,
"grad_norm": 0.06137623690084353,
"learning_rate": 0.0002081497797356828,
"loss": 0.853,
"step": 315
},
{
"epoch": 0.07053896175465667,
"grad_norm": 0.07068069693448537,
"learning_rate": 0.00021145374449339208,
"loss": 0.8868,
"step": 320
},
{
"epoch": 0.07164113303207319,
"grad_norm": 0.0633263499263589,
"learning_rate": 0.0002147577092511013,
"loss": 0.8847,
"step": 325
},
{
"epoch": 0.07274330430948969,
"grad_norm": 0.06653681784940939,
"learning_rate": 0.00021806167400881054,
"loss": 0.8725,
"step": 330
},
{
"epoch": 0.07384547558690621,
"grad_norm": 0.06583964059153263,
"learning_rate": 0.0002213656387665198,
"loss": 0.878,
"step": 335
},
{
"epoch": 0.07494764686432272,
"grad_norm": 0.08061748319197447,
"learning_rate": 0.00022466960352422906,
"loss": 0.8984,
"step": 340
},
{
"epoch": 0.07604981814173922,
"grad_norm": 0.0731004519013094,
"learning_rate": 0.00022797356828193832,
"loss": 0.8629,
"step": 345
},
{
"epoch": 0.07715198941915574,
"grad_norm": 0.06044943906403856,
"learning_rate": 0.00023127753303964755,
"loss": 0.8783,
"step": 350
},
{
"epoch": 0.07825416069657225,
"grad_norm": 0.06920900396817772,
"learning_rate": 0.0002345814977973568,
"loss": 0.8882,
"step": 355
},
{
"epoch": 0.07935633197398875,
"grad_norm": 0.06396348587422171,
"learning_rate": 0.00023788546255506604,
"loss": 0.8557,
"step": 360
},
{
"epoch": 0.08045850325140527,
"grad_norm": 0.06844620445410649,
"learning_rate": 0.00024118942731277533,
"loss": 0.8973,
"step": 365
},
{
"epoch": 0.08156067452882178,
"grad_norm": 0.06653654136399571,
"learning_rate": 0.00024449339207048456,
"loss": 0.8916,
"step": 370
},
{
"epoch": 0.08266284580623828,
"grad_norm": 0.06216710353519921,
"learning_rate": 0.0002477973568281938,
"loss": 0.8499,
"step": 375
},
{
"epoch": 0.0837650170836548,
"grad_norm": 0.05880885627082627,
"learning_rate": 0.00025110132158590303,
"loss": 0.9042,
"step": 380
},
{
"epoch": 0.08486718836107131,
"grad_norm": 0.06754842514493627,
"learning_rate": 0.0002544052863436123,
"loss": 0.8641,
"step": 385
},
{
"epoch": 0.08596935963848783,
"grad_norm": 0.06140819068848091,
"learning_rate": 0.00025770925110132155,
"loss": 0.8943,
"step": 390
},
{
"epoch": 0.08707153091590433,
"grad_norm": 0.061754148705009615,
"learning_rate": 0.00026101321585903083,
"loss": 0.8718,
"step": 395
},
{
"epoch": 0.08817370219332084,
"grad_norm": 0.06597271175452663,
"learning_rate": 0.00026431718061674007,
"loss": 0.8791,
"step": 400
},
{
"epoch": 0.08927587347073736,
"grad_norm": 0.058815224093820354,
"learning_rate": 0.00026762114537444935,
"loss": 0.8822,
"step": 405
},
{
"epoch": 0.09037804474815386,
"grad_norm": 0.06335803406426932,
"learning_rate": 0.0002709251101321586,
"loss": 0.8555,
"step": 410
},
{
"epoch": 0.09148021602557037,
"grad_norm": 0.061177086710910725,
"learning_rate": 0.0002742290748898678,
"loss": 0.8999,
"step": 415
},
{
"epoch": 0.09258238730298689,
"grad_norm": 0.0640511083565074,
"learning_rate": 0.00027753303964757705,
"loss": 0.8907,
"step": 420
},
{
"epoch": 0.0936845585804034,
"grad_norm": 0.06233966383032991,
"learning_rate": 0.0002808370044052863,
"loss": 0.8794,
"step": 425
},
{
"epoch": 0.0947867298578199,
"grad_norm": 0.057022002708166875,
"learning_rate": 0.00028414096916299557,
"loss": 0.9005,
"step": 430
},
{
"epoch": 0.09588890113523642,
"grad_norm": 0.0617487231342341,
"learning_rate": 0.0002874449339207048,
"loss": 0.8574,
"step": 435
},
{
"epoch": 0.09699107241265292,
"grad_norm": 0.07666912755096209,
"learning_rate": 0.0002907488986784141,
"loss": 0.89,
"step": 440
},
{
"epoch": 0.09809324369006944,
"grad_norm": 0.06908006068504674,
"learning_rate": 0.0002940528634361233,
"loss": 0.9058,
"step": 445
},
{
"epoch": 0.09919541496748595,
"grad_norm": 0.06466123609412561,
"learning_rate": 0.0002973568281938326,
"loss": 0.8852,
"step": 450
},
{
"epoch": 0.10029758624490245,
"grad_norm": 0.05620325485859758,
"learning_rate": 0.0002999999555762735,
"loss": 0.882,
"step": 455
},
{
"epoch": 0.10139975752231897,
"grad_norm": 0.05608554381774001,
"learning_rate": 0.0002999984007486092,
"loss": 0.8602,
"step": 460
},
{
"epoch": 0.10250192879973548,
"grad_norm": 0.06300983111668536,
"learning_rate": 0.0002999946247609333,
"loss": 0.8939,
"step": 465
},
{
"epoch": 0.10360410007715198,
"grad_norm": 0.05359836181219753,
"learning_rate": 0.00029998862766916014,
"loss": 0.8719,
"step": 470
},
{
"epoch": 0.1047062713545685,
"grad_norm": 0.05972412283076983,
"learning_rate": 0.0002999804095620941,
"loss": 0.8567,
"step": 475
},
{
"epoch": 0.10580844263198501,
"grad_norm": 0.055736449030028,
"learning_rate": 0.00029996997056142786,
"loss": 0.8928,
"step": 480
},
{
"epoch": 0.10691061390940153,
"grad_norm": 0.05535452244152051,
"learning_rate": 0.0002999573108217412,
"loss": 0.8815,
"step": 485
},
{
"epoch": 0.10801278518681803,
"grad_norm": 0.05571951712028587,
"learning_rate": 0.00029994243053049795,
"loss": 0.8273,
"step": 490
},
{
"epoch": 0.10911495646423454,
"grad_norm": 0.0572367057417111,
"learning_rate": 0.000299925329908044,
"loss": 0.8891,
"step": 495
},
{
"epoch": 0.11021712774165106,
"grad_norm": 0.05206161611783487,
"learning_rate": 0.00029990600920760355,
"loss": 0.8467,
"step": 500
},
{
"epoch": 0.11131929901906756,
"grad_norm": 0.057618220952806956,
"learning_rate": 0.0002998844687152753,
"loss": 0.84,
"step": 505
},
{
"epoch": 0.11242147029648407,
"grad_norm": 0.0731502466891883,
"learning_rate": 0.0002998607087500286,
"loss": 0.8899,
"step": 510
},
{
"epoch": 0.11352364157390059,
"grad_norm": 0.06531011144403108,
"learning_rate": 0.00029983472966369835,
"loss": 0.8805,
"step": 515
},
{
"epoch": 0.1146258128513171,
"grad_norm": 0.05719733051934535,
"learning_rate": 0.0002998065318409801,
"loss": 0.8998,
"step": 520
},
{
"epoch": 0.1157279841287336,
"grad_norm": 0.04786336091320019,
"learning_rate": 0.0002997761156994242,
"loss": 0.8454,
"step": 525
},
{
"epoch": 0.11683015540615012,
"grad_norm": 0.061209707796165434,
"learning_rate": 0.00029974348168942944,
"loss": 0.894,
"step": 530
},
{
"epoch": 0.11793232668356662,
"grad_norm": 0.05567071955981122,
"learning_rate": 0.0002997086302942368,
"loss": 0.8791,
"step": 535
},
{
"epoch": 0.11903449796098314,
"grad_norm": 0.051378774159055826,
"learning_rate": 0.00029967156202992184,
"loss": 0.8908,
"step": 540
},
{
"epoch": 0.12013666923839965,
"grad_norm": 0.06416620508611666,
"learning_rate": 0.0002996322774453875,
"loss": 0.912,
"step": 545
},
{
"epoch": 0.12123884051581615,
"grad_norm": 0.049933851892099514,
"learning_rate": 0.0002995907771223556,
"loss": 0.8819,
"step": 550
},
{
"epoch": 0.12234101179323267,
"grad_norm": 0.052571228924698295,
"learning_rate": 0.00029954706167535834,
"loss": 0.8926,
"step": 555
},
{
"epoch": 0.12344318307064918,
"grad_norm": 0.05946512503425042,
"learning_rate": 0.0002995011317517294,
"loss": 0.8767,
"step": 560
},
{
"epoch": 0.12454535434806568,
"grad_norm": 0.05379133265550323,
"learning_rate": 0.0002994529880315941,
"loss": 0.8541,
"step": 565
},
{
"epoch": 0.1256475256254822,
"grad_norm": 0.04996259410414281,
"learning_rate": 0.00029940263122785936,
"loss": 0.8975,
"step": 570
},
{
"epoch": 0.12674969690289872,
"grad_norm": 0.06293201636932731,
"learning_rate": 0.0002993500620862033,
"loss": 0.8538,
"step": 575
},
{
"epoch": 0.12785186818031521,
"grad_norm": 0.0549834121731086,
"learning_rate": 0.000299295281385064,
"loss": 0.8766,
"step": 580
},
{
"epoch": 0.12895403945773173,
"grad_norm": 0.06031562586347478,
"learning_rate": 0.00029923828993562814,
"loss": 0.8519,
"step": 585
},
{
"epoch": 0.13005621073514825,
"grad_norm": 0.05792440007594611,
"learning_rate": 0.00029917908858181897,
"loss": 0.8295,
"step": 590
},
{
"epoch": 0.13115838201256474,
"grad_norm": 0.22881705102396294,
"learning_rate": 0.00029911767820028364,
"loss": 0.8934,
"step": 595
},
{
"epoch": 0.13226055328998126,
"grad_norm": 0.2022359302630765,
"learning_rate": 0.0002990540597003804,
"loss": 0.9332,
"step": 600
},
{
"epoch": 0.13336272456739778,
"grad_norm": 0.07299327465806,
"learning_rate": 0.0002989882340241651,
"loss": 0.8848,
"step": 605
},
{
"epoch": 0.13446489584481428,
"grad_norm": 0.06761831265802426,
"learning_rate": 0.0002989202021463772,
"loss": 0.8613,
"step": 610
},
{
"epoch": 0.1355670671222308,
"grad_norm": 0.08020708226646049,
"learning_rate": 0.0002988499650744254,
"loss": 0.8961,
"step": 615
},
{
"epoch": 0.1366692383996473,
"grad_norm": 0.08560660775941882,
"learning_rate": 0.0002987775238483725,
"loss": 0.9122,
"step": 620
},
{
"epoch": 0.1377714096770638,
"grad_norm": 0.05636244041478988,
"learning_rate": 0.0002987028795409204,
"loss": 0.8427,
"step": 625
},
{
"epoch": 0.13887358095448032,
"grad_norm": 1.0715539508022556,
"learning_rate": 0.0002986260332573939,
"loss": 0.8535,
"step": 630
},
{
"epoch": 0.13997575223189684,
"grad_norm": 0.07028650609498803,
"learning_rate": 0.0002985469861357243,
"loss": 0.8843,
"step": 635
},
{
"epoch": 0.14107792350931334,
"grad_norm": 0.10912168092090672,
"learning_rate": 0.0002984657393464329,
"loss": 0.8802,
"step": 640
},
{
"epoch": 0.14218009478672985,
"grad_norm": 0.07320669082767357,
"learning_rate": 0.0002983822940926133,
"loss": 0.8534,
"step": 645
},
{
"epoch": 0.14328226606414637,
"grad_norm": 0.049879806960918614,
"learning_rate": 0.0002982966516099137,
"loss": 0.8661,
"step": 650
},
{
"epoch": 0.14438443734156287,
"grad_norm": 0.05068348497813321,
"learning_rate": 0.00029820881316651866,
"loss": 0.881,
"step": 655
},
{
"epoch": 0.14548660861897939,
"grad_norm": 0.05280972060027596,
"learning_rate": 0.00029811878006313046,
"loss": 0.8552,
"step": 660
},
{
"epoch": 0.1465887798963959,
"grad_norm": 0.060948440074130784,
"learning_rate": 0.00029802655363294934,
"loss": 0.8694,
"step": 665
},
{
"epoch": 0.14769095117381242,
"grad_norm": 0.051871602193463616,
"learning_rate": 0.0002979321352416543,
"loss": 0.8482,
"step": 670
},
{
"epoch": 0.14879312245122892,
"grad_norm": 0.05140620390257059,
"learning_rate": 0.0002978355262873826,
"loss": 0.871,
"step": 675
},
{
"epoch": 0.14989529372864543,
"grad_norm": 0.053923668745928,
"learning_rate": 0.00029773672820070915,
"loss": 0.8617,
"step": 680
},
{
"epoch": 0.15099746500606195,
"grad_norm": 0.05491254789252112,
"learning_rate": 0.0002976357424446253,
"loss": 0.8688,
"step": 685
},
{
"epoch": 0.15209963628347845,
"grad_norm": 0.057725313291247395,
"learning_rate": 0.00029753257051451707,
"loss": 0.8725,
"step": 690
},
{
"epoch": 0.15320180756089496,
"grad_norm": 0.06175295050381468,
"learning_rate": 0.0002974272139381433,
"loss": 0.8721,
"step": 695
},
{
"epoch": 0.15430397883831148,
"grad_norm": 0.05416095170725182,
"learning_rate": 0.00029731967427561266,
"loss": 0.8477,
"step": 700
},
{
"epoch": 0.15540615011572798,
"grad_norm": 0.05008825843504415,
"learning_rate": 0.00029720995311936077,
"loss": 0.8539,
"step": 705
},
{
"epoch": 0.1565083213931445,
"grad_norm": 0.048098359842914856,
"learning_rate": 0.0002970980520941266,
"loss": 0.8391,
"step": 710
},
{
"epoch": 0.15761049267056101,
"grad_norm": 0.05792884649127905,
"learning_rate": 0.00029698397285692833,
"loss": 0.836,
"step": 715
},
{
"epoch": 0.1587126639479775,
"grad_norm": 0.04553225276243662,
"learning_rate": 0.000296867717097039,
"loss": 0.8407,
"step": 720
},
{
"epoch": 0.15981483522539403,
"grad_norm": 0.04857966778373228,
"learning_rate": 0.0002967492865359611,
"loss": 0.843,
"step": 725
},
{
"epoch": 0.16091700650281054,
"grad_norm": 0.05464944380446163,
"learning_rate": 0.00029662868292740165,
"loss": 0.85,
"step": 730
},
{
"epoch": 0.16201917778022704,
"grad_norm": 0.045834951820991766,
"learning_rate": 0.00029650590805724574,
"loss": 0.8661,
"step": 735
},
{
"epoch": 0.16312134905764356,
"grad_norm": 0.053929904093643635,
"learning_rate": 0.0002963809637435303,
"loss": 0.9115,
"step": 740
},
{
"epoch": 0.16422352033506007,
"grad_norm": 0.04871618742526512,
"learning_rate": 0.00029625385183641706,
"loss": 0.845,
"step": 745
},
{
"epoch": 0.16532569161247657,
"grad_norm": 0.05247129804792461,
"learning_rate": 0.00029612457421816546,
"loss": 0.8772,
"step": 750
},
{
"epoch": 0.16642786288989309,
"grad_norm": 0.04850518585643222,
"learning_rate": 0.0002959931328031043,
"loss": 0.8687,
"step": 755
},
{
"epoch": 0.1675300341673096,
"grad_norm": 0.05475798664220526,
"learning_rate": 0.00029585952953760386,
"loss": 0.8666,
"step": 760
},
{
"epoch": 0.16863220544472612,
"grad_norm": 0.04884060269417904,
"learning_rate": 0.00029572376640004674,
"loss": 0.8681,
"step": 765
},
{
"epoch": 0.16973437672214262,
"grad_norm": 0.04948889650089674,
"learning_rate": 0.00029558584540079864,
"loss": 0.8822,
"step": 770
},
{
"epoch": 0.17083654799955914,
"grad_norm": 0.044346461043723126,
"learning_rate": 0.0002954457685821789,
"loss": 0.8656,
"step": 775
},
{
"epoch": 0.17193871927697565,
"grad_norm": 0.05856011647955299,
"learning_rate": 0.0002953035380184296,
"loss": 0.8487,
"step": 780
},
{
"epoch": 0.17304089055439215,
"grad_norm": 0.05004499788895783,
"learning_rate": 0.0002951591558156856,
"loss": 0.8219,
"step": 785
},
{
"epoch": 0.17414306183180867,
"grad_norm": 0.04373207037056602,
"learning_rate": 0.0002950126241119429,
"loss": 0.8712,
"step": 790
},
{
"epoch": 0.17524523310922518,
"grad_norm": 0.04536155490811825,
"learning_rate": 0.0002948639450770269,
"loss": 0.8616,
"step": 795
},
{
"epoch": 0.17634740438664168,
"grad_norm": 0.04345801787038758,
"learning_rate": 0.0002947131209125607,
"loss": 0.859,
"step": 800
},
{
"epoch": 0.1774495756640582,
"grad_norm": 0.045860871445007584,
"learning_rate": 0.0002945601538519321,
"loss": 0.8497,
"step": 805
},
{
"epoch": 0.17855174694147471,
"grad_norm": 0.04997067347593218,
"learning_rate": 0.0002944050461602607,
"loss": 0.8428,
"step": 810
},
{
"epoch": 0.1796539182188912,
"grad_norm": 0.04961878609503156,
"learning_rate": 0.00029424780013436434,
"loss": 0.8582,
"step": 815
},
{
"epoch": 0.18075608949630773,
"grad_norm": 0.047930042373250895,
"learning_rate": 0.0002940884181027251,
"loss": 0.8523,
"step": 820
},
{
"epoch": 0.18185826077372425,
"grad_norm": 0.05532903520929939,
"learning_rate": 0.0002939269024254547,
"loss": 0.8544,
"step": 825
},
{
"epoch": 0.18296043205114074,
"grad_norm": 0.051023957267831634,
"learning_rate": 0.0002937632554942598,
"loss": 0.8419,
"step": 830
},
{
"epoch": 0.18406260332855726,
"grad_norm": 0.0453229581659907,
"learning_rate": 0.0002935974797324064,
"loss": 0.8335,
"step": 835
},
{
"epoch": 0.18516477460597378,
"grad_norm": 0.04606389790907126,
"learning_rate": 0.0002934295775946839,
"loss": 0.8368,
"step": 840
},
{
"epoch": 0.18626694588339027,
"grad_norm": 0.045473867263415464,
"learning_rate": 0.00029325955156736885,
"loss": 0.8304,
"step": 845
},
{
"epoch": 0.1873691171608068,
"grad_norm": 0.044416090960029395,
"learning_rate": 0.0002930874041681883,
"loss": 0.8526,
"step": 850
},
{
"epoch": 0.1884712884382233,
"grad_norm": 0.05975247770022884,
"learning_rate": 0.0002929131379462821,
"loss": 0.8442,
"step": 855
},
{
"epoch": 0.1895734597156398,
"grad_norm": 0.048173050625599255,
"learning_rate": 0.00029273675548216563,
"loss": 0.8725,
"step": 860
},
{
"epoch": 0.19067563099305632,
"grad_norm": 0.04762107623243284,
"learning_rate": 0.0002925582593876912,
"loss": 0.8666,
"step": 865
},
{
"epoch": 0.19177780227047284,
"grad_norm": 0.05283512920588099,
"learning_rate": 0.0002923776523060095,
"loss": 0.8584,
"step": 870
},
{
"epoch": 0.19287997354788936,
"grad_norm": 0.05051732507674948,
"learning_rate": 0.0002921949369115307,
"loss": 0.8685,
"step": 875
},
{
"epoch": 0.19398214482530585,
"grad_norm": 0.04737129568146193,
"learning_rate": 0.00029201011590988444,
"loss": 0.8108,
"step": 880
},
{
"epoch": 0.19508431610272237,
"grad_norm": 0.052972441854251665,
"learning_rate": 0.00029182319203788,
"loss": 0.8554,
"step": 885
},
{
"epoch": 0.19618648738013889,
"grad_norm": 0.045056064701286494,
"learning_rate": 0.0002916341680634657,
"loss": 0.8271,
"step": 890
},
{
"epoch": 0.19728865865755538,
"grad_norm": 0.04699409375298448,
"learning_rate": 0.00029144304678568807,
"loss": 0.836,
"step": 895
},
{
"epoch": 0.1983908299349719,
"grad_norm": 0.048175359720427025,
"learning_rate": 0.00029124983103465026,
"loss": 0.8541,
"step": 900
},
{
"epoch": 0.19949300121238842,
"grad_norm": 0.04727829823889592,
"learning_rate": 0.00029105452367147,
"loss": 0.8502,
"step": 905
},
{
"epoch": 0.2005951724898049,
"grad_norm": 0.05148206426329491,
"learning_rate": 0.0002908571275882376,
"loss": 0.8453,
"step": 910
},
{
"epoch": 0.20169734376722143,
"grad_norm": 0.0514916225658676,
"learning_rate": 0.00029065764570797276,
"loss": 0.8609,
"step": 915
},
{
"epoch": 0.20279951504463795,
"grad_norm": 0.04883020506266138,
"learning_rate": 0.0002904560809845814,
"loss": 0.8461,
"step": 920
},
{
"epoch": 0.20390168632205444,
"grad_norm": 0.04568888763813157,
"learning_rate": 0.00029025243640281223,
"loss": 0.8827,
"step": 925
},
{
"epoch": 0.20500385759947096,
"grad_norm": 0.05356399559573866,
"learning_rate": 0.0002900467149782118,
"loss": 0.8606,
"step": 930
},
{
"epoch": 0.20610602887688748,
"grad_norm": 0.04851687061223907,
"learning_rate": 0.0002898389197570808,
"loss": 0.8586,
"step": 935
},
{
"epoch": 0.20720820015430397,
"grad_norm": 0.04669683817318346,
"learning_rate": 0.00028962905381642827,
"loss": 0.834,
"step": 940
},
{
"epoch": 0.2083103714317205,
"grad_norm": 0.04842521203522015,
"learning_rate": 0.0002894171202639262,
"loss": 0.8352,
"step": 945
},
{
"epoch": 0.209412542709137,
"grad_norm": 0.0440579152345845,
"learning_rate": 0.0002892031222378635,
"loss": 0.8324,
"step": 950
},
{
"epoch": 0.2105147139865535,
"grad_norm": 0.04627079587508343,
"learning_rate": 0.0002889870629070998,
"loss": 0.8253,
"step": 955
},
{
"epoch": 0.21161688526397002,
"grad_norm": 0.049721876458835836,
"learning_rate": 0.0002887689454710182,
"loss": 0.8322,
"step": 960
},
{
"epoch": 0.21271905654138654,
"grad_norm": 0.04459805370567298,
"learning_rate": 0.0002885487731594779,
"loss": 0.8522,
"step": 965
},
{
"epoch": 0.21382122781880306,
"grad_norm": 0.04335466084634585,
"learning_rate": 0.0002883265492327666,
"loss": 0.8385,
"step": 970
},
{
"epoch": 0.21492339909621955,
"grad_norm": 0.04968671178179274,
"learning_rate": 0.000288102276981552,
"loss": 0.8293,
"step": 975
},
{
"epoch": 0.21602557037363607,
"grad_norm": 0.04449544356929534,
"learning_rate": 0.00028787595972683326,
"loss": 0.8444,
"step": 980
},
{
"epoch": 0.21712774165105259,
"grad_norm": 0.043972214438083426,
"learning_rate": 0.0002876476008198917,
"loss": 0.8337,
"step": 985
},
{
"epoch": 0.21822991292846908,
"grad_norm": 0.04311951621843307,
"learning_rate": 0.00028741720364224113,
"loss": 0.851,
"step": 990
},
{
"epoch": 0.2193320842058856,
"grad_norm": 0.045060530710752784,
"learning_rate": 0.000287184771605578,
"loss": 0.8404,
"step": 995
},
{
"epoch": 0.22043425548330212,
"grad_norm": 0.045664652704444204,
"learning_rate": 0.0002869503081517305,
"loss": 0.8181,
"step": 1000
},
{
"epoch": 0.2215364267607186,
"grad_norm": 0.048272419665718394,
"learning_rate": 0.0002867138167526081,
"loss": 0.851,
"step": 1005
},
{
"epoch": 0.22263859803813513,
"grad_norm": 0.047040645763608745,
"learning_rate": 0.0002864753009101497,
"loss": 0.8187,
"step": 1010
},
{
"epoch": 0.22374076931555165,
"grad_norm": 0.0428558339017769,
"learning_rate": 0.00028623476415627185,
"loss": 0.8425,
"step": 1015
},
{
"epoch": 0.22484294059296814,
"grad_norm": 0.04197047231886558,
"learning_rate": 0.0002859922100528168,
"loss": 0.8565,
"step": 1020
},
{
"epoch": 0.22594511187038466,
"grad_norm": 0.044195983390617165,
"learning_rate": 0.0002857476421914993,
"loss": 0.8265,
"step": 1025
},
{
"epoch": 0.22704728314780118,
"grad_norm": 0.04490807738250268,
"learning_rate": 0.0002855010641938536,
"loss": 0.8273,
"step": 1030
},
{
"epoch": 0.22814945442521767,
"grad_norm": 0.045467911300816795,
"learning_rate": 0.00028525247971118,
"loss": 0.8448,
"step": 1035
},
{
"epoch": 0.2292516257026342,
"grad_norm": 0.06605089099746904,
"learning_rate": 0.0002850018924244903,
"loss": 0.8452,
"step": 1040
},
{
"epoch": 0.2303537969800507,
"grad_norm": 0.04096505035492289,
"learning_rate": 0.00028474930604445404,
"loss": 0.8205,
"step": 1045
},
{
"epoch": 0.2314559682574672,
"grad_norm": 0.04304934507293174,
"learning_rate": 0.0002844947243113427,
"loss": 0.8488,
"step": 1050
},
{
"epoch": 0.23255813953488372,
"grad_norm": 0.04230208217283045,
"learning_rate": 0.000284238150994975,
"loss": 0.8376,
"step": 1055
},
{
"epoch": 0.23366031081230024,
"grad_norm": 0.04708086600574921,
"learning_rate": 0.00028397958989466064,
"loss": 0.8231,
"step": 1060
},
{
"epoch": 0.23476248208971673,
"grad_norm": 0.046838021251746416,
"learning_rate": 0.00028371904483914437,
"loss": 0.8284,
"step": 1065
},
{
"epoch": 0.23586465336713325,
"grad_norm": 0.05160315046793239,
"learning_rate": 0.00028345651968654897,
"loss": 0.8489,
"step": 1070
},
{
"epoch": 0.23696682464454977,
"grad_norm": 0.04831039086387138,
"learning_rate": 0.0002831920183243184,
"loss": 0.8611,
"step": 1075
},
{
"epoch": 0.2380689959219663,
"grad_norm": 0.05287417954296867,
"learning_rate": 0.00028292554466916004,
"loss": 0.8323,
"step": 1080
},
{
"epoch": 0.23917116719938278,
"grad_norm": 0.04693015235603077,
"learning_rate": 0.00028265710266698685,
"loss": 0.8632,
"step": 1085
},
{
"epoch": 0.2402733384767993,
"grad_norm": 0.041988241282154073,
"learning_rate": 0.00028238669629285885,
"loss": 0.8068,
"step": 1090
},
{
"epoch": 0.24137550975421582,
"grad_norm": 0.04810130529903119,
"learning_rate": 0.0002821143295509241,
"loss": 0.8193,
"step": 1095
},
{
"epoch": 0.2424776810316323,
"grad_norm": 0.04870507456288239,
"learning_rate": 0.0002818400064743599,
"loss": 0.8726,
"step": 1100
},
{
"epoch": 0.24357985230904883,
"grad_norm": 0.04918127091170888,
"learning_rate": 0.00028156373112531234,
"loss": 0.8501,
"step": 1105
},
{
"epoch": 0.24468202358646535,
"grad_norm": 0.04501871928173725,
"learning_rate": 0.0002812855075948369,
"loss": 0.8623,
"step": 1110
},
{
"epoch": 0.24578419486388184,
"grad_norm": 0.04700064721205868,
"learning_rate": 0.00028100534000283727,
"loss": 0.8334,
"step": 1115
},
{
"epoch": 0.24688636614129836,
"grad_norm": 0.043956386695142506,
"learning_rate": 0.0002807232324980048,
"loss": 0.8729,
"step": 1120
},
{
"epoch": 0.24798853741871488,
"grad_norm": 0.044369171455137094,
"learning_rate": 0.00028043918925775666,
"loss": 0.8198,
"step": 1125
},
{
"epoch": 0.24909070869613137,
"grad_norm": 0.04924934475039299,
"learning_rate": 0.00028015321448817435,
"loss": 0.8425,
"step": 1130
},
{
"epoch": 0.2501928799735479,
"grad_norm": 0.052400033371239746,
"learning_rate": 0.0002798653124239411,
"loss": 0.8627,
"step": 1135
},
{
"epoch": 0.2512950512509644,
"grad_norm": 0.04354647857372681,
"learning_rate": 0.0002795754873282794,
"loss": 0.8052,
"step": 1140
},
{
"epoch": 0.2523972225283809,
"grad_norm": 0.050485861813274184,
"learning_rate": 0.0002792837434928878,
"loss": 0.8437,
"step": 1145
},
{
"epoch": 0.25349939380579745,
"grad_norm": 0.04827775784152042,
"learning_rate": 0.00027899008523787726,
"loss": 0.8595,
"step": 1150
},
{
"epoch": 0.2546015650832139,
"grad_norm": 0.04298338268947314,
"learning_rate": 0.0002786945169117073,
"loss": 0.8306,
"step": 1155
},
{
"epoch": 0.25570373636063043,
"grad_norm": 0.04907197256608099,
"learning_rate": 0.0002783970428911216,
"loss": 0.8305,
"step": 1160
},
{
"epoch": 0.25680590763804695,
"grad_norm": 0.049467752786233936,
"learning_rate": 0.000278097667581083,
"loss": 0.848,
"step": 1165
},
{
"epoch": 0.25790807891546347,
"grad_norm": 0.04656416214482384,
"learning_rate": 0.0002777963954147087,
"loss": 0.8165,
"step": 1170
},
{
"epoch": 0.25901025019288,
"grad_norm": 0.051521287489935036,
"learning_rate": 0.0002774932308532041,
"loss": 0.8362,
"step": 1175
},
{
"epoch": 0.2601124214702965,
"grad_norm": 0.053549824085333694,
"learning_rate": 0.00027718817838579706,
"loss": 0.8267,
"step": 1180
},
{
"epoch": 0.26121459274771297,
"grad_norm": 0.054270101506079374,
"learning_rate": 0.0002768812425296714,
"loss": 0.8119,
"step": 1185
},
{
"epoch": 0.2623167640251295,
"grad_norm": 0.05333532704116092,
"learning_rate": 0.00027657242782989987,
"loss": 0.8099,
"step": 1190
},
{
"epoch": 0.263418935302546,
"grad_norm": 0.046490067153489543,
"learning_rate": 0.00027626173885937703,
"loss": 0.806,
"step": 1195
},
{
"epoch": 0.26452110657996253,
"grad_norm": 0.04891723101243592,
"learning_rate": 0.0002759491802187513,
"loss": 0.8336,
"step": 1200
},
{
"epoch": 0.26562327785737905,
"grad_norm": 0.052736333806997764,
"learning_rate": 0.00027563475653635713,
"loss": 0.8471,
"step": 1205
},
{
"epoch": 0.26672544913479557,
"grad_norm": 0.04347575493484375,
"learning_rate": 0.00027531847246814613,
"loss": 0.8388,
"step": 1210
},
{
"epoch": 0.26782762041221203,
"grad_norm": 0.044934221766930585,
"learning_rate": 0.00027500033269761855,
"loss": 0.8382,
"step": 1215
},
{
"epoch": 0.26892979168962855,
"grad_norm": 0.04472911222526219,
"learning_rate": 0.0002746803419357534,
"loss": 0.823,
"step": 1220
},
{
"epoch": 0.27003196296704507,
"grad_norm": 0.042907862680138985,
"learning_rate": 0.0002743585049209391,
"loss": 0.8217,
"step": 1225
},
{
"epoch": 0.2711341342444616,
"grad_norm": 0.0430952497583985,
"learning_rate": 0.00027403482641890324,
"loss": 0.8148,
"step": 1230
},
{
"epoch": 0.2722363055218781,
"grad_norm": 0.04625178973487922,
"learning_rate": 0.0002737093112226418,
"loss": 0.8633,
"step": 1235
},
{
"epoch": 0.2733384767992946,
"grad_norm": 0.05941599713124254,
"learning_rate": 0.00027338196415234857,
"loss": 0.8307,
"step": 1240
},
{
"epoch": 0.27444064807671115,
"grad_norm": 0.055472061651380494,
"learning_rate": 0.0002730527900553432,
"loss": 0.8527,
"step": 1245
},
{
"epoch": 0.2755428193541276,
"grad_norm": 0.04678339387739163,
"learning_rate": 0.00027272179380600006,
"loss": 0.849,
"step": 1250
},
{
"epoch": 0.27664499063154413,
"grad_norm": 0.04692059237171032,
"learning_rate": 0.0002723889803056756,
"loss": 0.8706,
"step": 1255
},
{
"epoch": 0.27774716190896065,
"grad_norm": 0.04559570312510178,
"learning_rate": 0.00027205435448263593,
"loss": 0.8418,
"step": 1260
},
{
"epoch": 0.27884933318637717,
"grad_norm": 0.0419292041399672,
"learning_rate": 0.0002717179212919838,
"loss": 0.8583,
"step": 1265
},
{
"epoch": 0.2799515044637937,
"grad_norm": 0.04430167866474891,
"learning_rate": 0.00027137968571558553,
"loss": 0.8333,
"step": 1270
},
{
"epoch": 0.2810536757412102,
"grad_norm": 0.0456691993264945,
"learning_rate": 0.00027103965276199647,
"loss": 0.8447,
"step": 1275
},
{
"epoch": 0.28215584701862667,
"grad_norm": 0.04665005990926171,
"learning_rate": 0.0002706978274663879,
"loss": 0.7695,
"step": 1280
},
{
"epoch": 0.2832580182960432,
"grad_norm": 0.043712884590464574,
"learning_rate": 0.0002703542148904715,
"loss": 0.8267,
"step": 1285
},
{
"epoch": 0.2843601895734597,
"grad_norm": 0.04652550786503771,
"learning_rate": 0.00027000882012242496,
"loss": 0.8437,
"step": 1290
},
{
"epoch": 0.28546236085087623,
"grad_norm": 0.045612398832845166,
"learning_rate": 0.00026966164827681643,
"loss": 0.8138,
"step": 1295
},
{
"epoch": 0.28656453212829275,
"grad_norm": 0.04839127808608333,
"learning_rate": 0.00026931270449452897,
"loss": 0.8372,
"step": 1300
},
{
"epoch": 0.28766670340570927,
"grad_norm": 0.0445074064874431,
"learning_rate": 0.000268961993942684,
"loss": 0.8214,
"step": 1305
},
{
"epoch": 0.28876887468312573,
"grad_norm": 0.0402466463654265,
"learning_rate": 0.0002686095218145654,
"loss": 0.8086,
"step": 1310
},
{
"epoch": 0.28987104596054225,
"grad_norm": 0.05239340243182599,
"learning_rate": 0.000268255293329542,
"loss": 0.8368,
"step": 1315
},
{
"epoch": 0.29097321723795877,
"grad_norm": 0.04600417253087546,
"learning_rate": 0.0002678993137329908,
"loss": 0.8081,
"step": 1320
},
{
"epoch": 0.2920753885153753,
"grad_norm": 0.04275984799543839,
"learning_rate": 0.0002675415882962189,
"loss": 0.8257,
"step": 1325
},
{
"epoch": 0.2931775597927918,
"grad_norm": 0.044570824361962434,
"learning_rate": 0.0002671821223163858,
"loss": 0.8208,
"step": 1330
},
{
"epoch": 0.29427973107020833,
"grad_norm": 0.045119447979777474,
"learning_rate": 0.0002668209211164244,
"loss": 0.8488,
"step": 1335
},
{
"epoch": 0.29538190234762485,
"grad_norm": 0.04036708835023624,
"learning_rate": 0.00026645799004496306,
"loss": 0.8512,
"step": 1340
},
{
"epoch": 0.2964840736250413,
"grad_norm": 0.041845363942691824,
"learning_rate": 0.0002660933344762455,
"loss": 0.8228,
"step": 1345
},
{
"epoch": 0.29758624490245783,
"grad_norm": 0.052565125884796726,
"learning_rate": 0.0002657269598100518,
"loss": 0.833,
"step": 1350
},
{
"epoch": 0.29868841617987435,
"grad_norm": 0.05263005472080342,
"learning_rate": 0.0002653588714716181,
"loss": 0.8482,
"step": 1355
},
{
"epoch": 0.29979058745729087,
"grad_norm": 0.04489153452747211,
"learning_rate": 0.00026498907491155665,
"loss": 0.7975,
"step": 1360
},
{
"epoch": 0.3008927587347074,
"grad_norm": 0.0477109850798695,
"learning_rate": 0.0002646175756057745,
"loss": 0.8168,
"step": 1365
},
{
"epoch": 0.3019949300121239,
"grad_norm": 0.04476257759581266,
"learning_rate": 0.00026424437905539315,
"loss": 0.8062,
"step": 1370
},
{
"epoch": 0.30309710128954037,
"grad_norm": 0.04415673060651443,
"learning_rate": 0.00026386949078666653,
"loss": 0.8352,
"step": 1375
},
{
"epoch": 0.3041992725669569,
"grad_norm": 0.04044906513322972,
"learning_rate": 0.0002634929163508993,
"loss": 0.8299,
"step": 1380
},
{
"epoch": 0.3053014438443734,
"grad_norm": 0.047407594480881006,
"learning_rate": 0.0002631146613243648,
"loss": 0.8509,
"step": 1385
},
{
"epoch": 0.30640361512178993,
"grad_norm": 0.04212908230670688,
"learning_rate": 0.00026273473130822235,
"loss": 0.8348,
"step": 1390
},
{
"epoch": 0.30750578639920645,
"grad_norm": 0.04777049716354304,
"learning_rate": 0.0002623531319284343,
"loss": 0.8477,
"step": 1395
},
{
"epoch": 0.30860795767662297,
"grad_norm": 0.048704830358582564,
"learning_rate": 0.00026196986883568284,
"loss": 0.8514,
"step": 1400
},
{
"epoch": 0.30971012895403943,
"grad_norm": 0.05238527702204111,
"learning_rate": 0.00026158494770528614,
"loss": 0.82,
"step": 1405
},
{
"epoch": 0.31081230023145595,
"grad_norm": 0.05352166328603628,
"learning_rate": 0.0002611983742371144,
"loss": 0.8293,
"step": 1410
},
{
"epoch": 0.31191447150887247,
"grad_norm": 0.04537796100997172,
"learning_rate": 0.0002608101541555056,
"loss": 0.8001,
"step": 1415
},
{
"epoch": 0.313016642786289,
"grad_norm": 0.04950116503834519,
"learning_rate": 0.0002604202932091805,
"loss": 0.8406,
"step": 1420
},
{
"epoch": 0.3141188140637055,
"grad_norm": 0.042514906932797684,
"learning_rate": 0.0002600287971711576,
"loss": 0.8467,
"step": 1425
},
{
"epoch": 0.31522098534112203,
"grad_norm": 0.03957295732321364,
"learning_rate": 0.0002596356718386676,
"loss": 0.8457,
"step": 1430
},
{
"epoch": 0.31632315661853855,
"grad_norm": 0.04155394927306569,
"learning_rate": 0.0002592409230330677,
"loss": 0.8087,
"step": 1435
},
{
"epoch": 0.317425327895955,
"grad_norm": 0.04562885194483694,
"learning_rate": 0.0002588445565997554,
"loss": 0.8394,
"step": 1440
},
{
"epoch": 0.31852749917337153,
"grad_norm": 0.04641875664850794,
"learning_rate": 0.0002584465784080817,
"loss": 0.8407,
"step": 1445
},
{
"epoch": 0.31962967045078805,
"grad_norm": 0.0430053369111926,
"learning_rate": 0.0002580469943512644,
"loss": 0.8494,
"step": 1450
},
{
"epoch": 0.32073184172820457,
"grad_norm": 0.04973211543509925,
"learning_rate": 0.0002576458103463007,
"loss": 0.798,
"step": 1455
},
{
"epoch": 0.3218340130056211,
"grad_norm": 0.04129288667946417,
"learning_rate": 0.00025724303233387987,
"loss": 0.8446,
"step": 1460
},
{
"epoch": 0.3229361842830376,
"grad_norm": 0.04473055100882414,
"learning_rate": 0.00025683866627829486,
"loss": 0.8455,
"step": 1465
},
{
"epoch": 0.32403835556045407,
"grad_norm": 0.046468538153449125,
"learning_rate": 0.00025643271816735416,
"loss": 0.8194,
"step": 1470
},
{
"epoch": 0.3251405268378706,
"grad_norm": 0.039997009480838126,
"learning_rate": 0.0002560251940122935,
"loss": 0.8198,
"step": 1475
},
{
"epoch": 0.3262426981152871,
"grad_norm": 0.045094913214168995,
"learning_rate": 0.000255616099847686,
"loss": 0.8099,
"step": 1480
},
{
"epoch": 0.32734486939270363,
"grad_norm": 0.04423782452900588,
"learning_rate": 0.0002552054417313538,
"loss": 0.8205,
"step": 1485
},
{
"epoch": 0.32844704067012015,
"grad_norm": 0.044248198901321145,
"learning_rate": 0.0002547932257442775,
"loss": 0.8115,
"step": 1490
},
{
"epoch": 0.32954921194753667,
"grad_norm": 0.04981089857434275,
"learning_rate": 0.00025437945799050674,
"loss": 0.8398,
"step": 1495
},
{
"epoch": 0.33065138322495313,
"grad_norm": 0.04545569085042973,
"learning_rate": 0.00025396414459706926,
"loss": 0.8086,
"step": 1500
},
{
"epoch": 0.33175355450236965,
"grad_norm": 0.0428291316212456,
"learning_rate": 0.00025354729171388077,
"loss": 0.813,
"step": 1505
},
{
"epoch": 0.33285572577978617,
"grad_norm": 0.04223393291828146,
"learning_rate": 0.0002531289055136535,
"loss": 0.8322,
"step": 1510
},
{
"epoch": 0.3339578970572027,
"grad_norm": 0.04755472767341248,
"learning_rate": 0.0002527089921918047,
"loss": 0.8496,
"step": 1515
},
{
"epoch": 0.3350600683346192,
"grad_norm": 0.0440763985224576,
"learning_rate": 0.00025228755796636524,
"loss": 0.8317,
"step": 1520
},
{
"epoch": 0.33616223961203573,
"grad_norm": 0.04739432394265274,
"learning_rate": 0.00025186460907788733,
"loss": 0.8291,
"step": 1525
},
{
"epoch": 0.33726441088945225,
"grad_norm": 0.04365113581228294,
"learning_rate": 0.0002514401517893521,
"loss": 0.8314,
"step": 1530
},
{
"epoch": 0.3383665821668687,
"grad_norm": 0.05295299465042367,
"learning_rate": 0.0002510141923860769,
"loss": 0.8386,
"step": 1535
},
{
"epoch": 0.33946875344428523,
"grad_norm": 0.04480906078196552,
"learning_rate": 0.0002505867371756224,
"loss": 0.8087,
"step": 1540
},
{
"epoch": 0.34057092472170175,
"grad_norm": 0.04166659673582314,
"learning_rate": 0.0002501577924876987,
"loss": 0.8336,
"step": 1545
},
{
"epoch": 0.34167309599911827,
"grad_norm": 0.04392236356125392,
"learning_rate": 0.0002497273646740723,
"loss": 0.8221,
"step": 1550
},
{
"epoch": 0.3427752672765348,
"grad_norm": 0.037583836271015914,
"learning_rate": 0.0002492954601084713,
"loss": 0.8347,
"step": 1555
},
{
"epoch": 0.3438774385539513,
"grad_norm": 0.04245959731615741,
"learning_rate": 0.00024886208518649173,
"loss": 0.8341,
"step": 1560
},
{
"epoch": 0.3449796098313678,
"grad_norm": 0.04299442216812149,
"learning_rate": 0.00024842724632550216,
"loss": 0.8143,
"step": 1565
},
{
"epoch": 0.3460817811087843,
"grad_norm": 0.044416364158721855,
"learning_rate": 0.00024799094996454926,
"loss": 0.817,
"step": 1570
},
{
"epoch": 0.3471839523862008,
"grad_norm": 0.04080370665097008,
"learning_rate": 0.0002475532025642621,
"loss": 0.8404,
"step": 1575
},
{
"epoch": 0.34828612366361733,
"grad_norm": 0.043404652747125856,
"learning_rate": 0.0002471140106067565,
"loss": 0.8056,
"step": 1580
},
{
"epoch": 0.34938829494103385,
"grad_norm": 0.04702543231191137,
"learning_rate": 0.0002466733805955394,
"loss": 0.8364,
"step": 1585
},
{
"epoch": 0.35049046621845037,
"grad_norm": 0.04379810092412971,
"learning_rate": 0.000246231319055412,
"loss": 0.7982,
"step": 1590
},
{
"epoch": 0.35159263749586683,
"grad_norm": 0.04861448921339962,
"learning_rate": 0.0002457878325323735,
"loss": 0.8108,
"step": 1595
},
{
"epoch": 0.35269480877328335,
"grad_norm": 0.05366919048317618,
"learning_rate": 0.00024534292759352414,
"loss": 0.8406,
"step": 1600
},
{
"epoch": 0.35379698005069987,
"grad_norm": 0.04637591531118319,
"learning_rate": 0.000244896610826968,
"loss": 0.7878,
"step": 1605
},
{
"epoch": 0.3548991513281164,
"grad_norm": 0.0451952378709979,
"learning_rate": 0.00024444888884171505,
"loss": 0.8073,
"step": 1610
},
{
"epoch": 0.3560013226055329,
"grad_norm": 0.04229445731850634,
"learning_rate": 0.00024399976826758392,
"loss": 0.8133,
"step": 1615
},
{
"epoch": 0.35710349388294943,
"grad_norm": 0.04303409290387014,
"learning_rate": 0.00024354925575510315,
"loss": 0.7969,
"step": 1620
},
{
"epoch": 0.3582056651603659,
"grad_norm": 0.04070575320795186,
"learning_rate": 0.00024309735797541318,
"loss": 0.8192,
"step": 1625
},
{
"epoch": 0.3593078364377824,
"grad_norm": 0.04197565777082017,
"learning_rate": 0.0002426440816201671,
"loss": 0.8239,
"step": 1630
},
{
"epoch": 0.36041000771519893,
"grad_norm": 0.044014973357833005,
"learning_rate": 0.00024218943340143182,
"loss": 0.8334,
"step": 1635
},
{
"epoch": 0.36151217899261545,
"grad_norm": 0.04571887208233434,
"learning_rate": 0.00024173342005158894,
"loss": 0.8432,
"step": 1640
},
{
"epoch": 0.36261435027003197,
"grad_norm": 0.04094752957919555,
"learning_rate": 0.00024127604832323445,
"loss": 0.7932,
"step": 1645
},
{
"epoch": 0.3637165215474485,
"grad_norm": 0.04013508208109058,
"learning_rate": 0.0002408173249890792,
"loss": 0.8034,
"step": 1650
},
{
"epoch": 0.364818692824865,
"grad_norm": 0.040311829670245755,
"learning_rate": 0.00024035725684184845,
"loss": 0.7866,
"step": 1655
},
{
"epoch": 0.3659208641022815,
"grad_norm": 0.041899754058127306,
"learning_rate": 0.00023989585069418134,
"loss": 0.7872,
"step": 1660
},
{
"epoch": 0.367023035379698,
"grad_norm": 0.043805964643653376,
"learning_rate": 0.0002394331133785299,
"loss": 0.8146,
"step": 1665
},
{
"epoch": 0.3681252066571145,
"grad_norm": 0.04065431765935861,
"learning_rate": 0.000238969051747058,
"loss": 0.8394,
"step": 1670
},
{
"epoch": 0.36922737793453103,
"grad_norm": 0.04291000131859918,
"learning_rate": 0.00023850367267153985,
"loss": 0.8414,
"step": 1675
},
{
"epoch": 0.37032954921194755,
"grad_norm": 0.04361025776347538,
"learning_rate": 0.00023803698304325824,
"loss": 0.7826,
"step": 1680
},
{
"epoch": 0.37143172048936407,
"grad_norm": 0.04765203149725648,
"learning_rate": 0.00023756898977290235,
"loss": 0.8113,
"step": 1685
},
{
"epoch": 0.37253389176678053,
"grad_norm": 0.04008616793311488,
"learning_rate": 0.00023709969979046576,
"loss": 0.8291,
"step": 1690
},
{
"epoch": 0.37363606304419705,
"grad_norm": 0.04271593974112689,
"learning_rate": 0.00023662912004514345,
"loss": 0.82,
"step": 1695
},
{
"epoch": 0.3747382343216136,
"grad_norm": 0.04110249253005406,
"learning_rate": 0.00023615725750522913,
"loss": 0.8305,
"step": 1700
},
{
"epoch": 0.3758404055990301,
"grad_norm": 0.0403637537968849,
"learning_rate": 0.00023568411915801205,
"loss": 0.8177,
"step": 1705
},
{
"epoch": 0.3769425768764466,
"grad_norm": 0.044289241987404186,
"learning_rate": 0.00023520971200967334,
"loss": 0.8215,
"step": 1710
},
{
"epoch": 0.37804474815386313,
"grad_norm": 0.04129576160875138,
"learning_rate": 0.00023473404308518256,
"loss": 0.8337,
"step": 1715
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.04173364684891999,
"learning_rate": 0.00023425711942819333,
"loss": 0.8067,
"step": 1720
},
{
"epoch": 0.3802490907086961,
"grad_norm": 0.04440013453397569,
"learning_rate": 0.00023377894810093944,
"loss": 0.8396,
"step": 1725
},
{
"epoch": 0.38135126198611263,
"grad_norm": 0.04364153613366134,
"learning_rate": 0.00023329953618412985,
"loss": 0.8126,
"step": 1730
},
{
"epoch": 0.38245343326352915,
"grad_norm": 0.03880573318839816,
"learning_rate": 0.0002328188907768441,
"loss": 0.7861,
"step": 1735
},
{
"epoch": 0.38355560454094567,
"grad_norm": 0.044630935282676455,
"learning_rate": 0.00023233701899642712,
"loss": 0.8041,
"step": 1740
},
{
"epoch": 0.3846577758183622,
"grad_norm": 0.045036425102858116,
"learning_rate": 0.0002318539279783839,
"loss": 0.8389,
"step": 1745
},
{
"epoch": 0.3857599470957787,
"grad_norm": 0.044227375694242045,
"learning_rate": 0.0002313696248762737,
"loss": 0.8024,
"step": 1750
},
{
"epoch": 0.3868621183731952,
"grad_norm": 0.04607651991018229,
"learning_rate": 0.00023088411686160415,
"loss": 0.8087,
"step": 1755
},
{
"epoch": 0.3879642896506117,
"grad_norm": 0.04430605414125412,
"learning_rate": 0.00023039741112372528,
"loss": 0.8279,
"step": 1760
},
{
"epoch": 0.3890664609280282,
"grad_norm": 0.047410669524145625,
"learning_rate": 0.00022990951486972258,
"loss": 0.8104,
"step": 1765
},
{
"epoch": 0.39016863220544473,
"grad_norm": 0.041525912031087554,
"learning_rate": 0.0002294204353243109,
"loss": 0.7937,
"step": 1770
},
{
"epoch": 0.39127080348286125,
"grad_norm": 0.03803908868674592,
"learning_rate": 0.00022893017972972686,
"loss": 0.8099,
"step": 1775
},
{
"epoch": 0.39237297476027777,
"grad_norm": 0.04663481529200365,
"learning_rate": 0.00022843875534562204,
"loss": 0.7985,
"step": 1780
},
{
"epoch": 0.39347514603769423,
"grad_norm": 0.04209155232991889,
"learning_rate": 0.0002279461694489553,
"loss": 0.7984,
"step": 1785
},
{
"epoch": 0.39457731731511075,
"grad_norm": 0.04273613495319002,
"learning_rate": 0.00022745242933388507,
"loss": 0.7856,
"step": 1790
},
{
"epoch": 0.3956794885925273,
"grad_norm": 0.04007855374098208,
"learning_rate": 0.00022695754231166125,
"loss": 0.798,
"step": 1795
},
{
"epoch": 0.3967816598699438,
"grad_norm": 0.03874216976594234,
"learning_rate": 0.0002264615157105171,
"loss": 0.8303,
"step": 1800
},
{
"epoch": 0.3978838311473603,
"grad_norm": 0.042291279075714026,
"learning_rate": 0.00022596435687556067,
"loss": 0.8284,
"step": 1805
},
{
"epoch": 0.39898600242477683,
"grad_norm": 0.04134434249625502,
"learning_rate": 0.00022546607316866583,
"loss": 0.8143,
"step": 1810
},
{
"epoch": 0.4000881737021933,
"grad_norm": 0.0426391584789927,
"learning_rate": 0.00022496667196836358,
"loss": 0.8291,
"step": 1815
},
{
"epoch": 0.4011903449796098,
"grad_norm": 0.03943254080642002,
"learning_rate": 0.0002244661606697326,
"loss": 0.8093,
"step": 1820
},
{
"epoch": 0.40229251625702633,
"grad_norm": 0.04164893383401447,
"learning_rate": 0.00022396454668428982,
"loss": 0.8135,
"step": 1825
},
{
"epoch": 0.40339468753444285,
"grad_norm": 0.04110460672662909,
"learning_rate": 0.00022346183743988056,
"loss": 0.8083,
"step": 1830
},
{
"epoch": 0.40449685881185937,
"grad_norm": 0.04143399066869764,
"learning_rate": 0.00022295804038056867,
"loss": 0.798,
"step": 1835
},
{
"epoch": 0.4055990300892759,
"grad_norm": 0.04010672194926782,
"learning_rate": 0.0002224531629665263,
"loss": 0.8132,
"step": 1840
},
{
"epoch": 0.4067012013666924,
"grad_norm": 0.040718372885152405,
"learning_rate": 0.00022194721267392324,
"loss": 0.8237,
"step": 1845
},
{
"epoch": 0.4078033726441089,
"grad_norm": 0.044019144470429636,
"learning_rate": 0.0002214401969948164,
"loss": 0.7955,
"step": 1850
},
{
"epoch": 0.4089055439215254,
"grad_norm": 0.0449280577770788,
"learning_rate": 0.00022093212343703893,
"loss": 0.7929,
"step": 1855
},
{
"epoch": 0.4100077151989419,
"grad_norm": 0.050365477985191316,
"learning_rate": 0.00022042299952408872,
"loss": 0.8389,
"step": 1860
},
{
"epoch": 0.41110988647635843,
"grad_norm": 0.04032290135169436,
"learning_rate": 0.00021991283279501744,
"loss": 0.796,
"step": 1865
},
{
"epoch": 0.41221205775377495,
"grad_norm": 0.03859944961488867,
"learning_rate": 0.0002194016308043185,
"loss": 0.7977,
"step": 1870
},
{
"epoch": 0.41331422903119147,
"grad_norm": 0.04159446854663543,
"learning_rate": 0.00021888940112181542,
"loss": 0.826,
"step": 1875
},
{
"epoch": 0.41441640030860794,
"grad_norm": 0.04359969871410925,
"learning_rate": 0.0002183761513325496,
"loss": 0.8251,
"step": 1880
},
{
"epoch": 0.41551857158602445,
"grad_norm": 0.04343781370305786,
"learning_rate": 0.0002178618890366682,
"loss": 0.7984,
"step": 1885
},
{
"epoch": 0.416620742863441,
"grad_norm": 0.04412546972897982,
"learning_rate": 0.00021734662184931137,
"loss": 0.8275,
"step": 1890
},
{
"epoch": 0.4177229141408575,
"grad_norm": 0.046901376458696624,
"learning_rate": 0.00021683035740049952,
"loss": 0.8286,
"step": 1895
},
{
"epoch": 0.418825085418274,
"grad_norm": 0.04296065580749498,
"learning_rate": 0.00021631310333502062,
"loss": 0.8245,
"step": 1900
},
{
"epoch": 0.41992725669569053,
"grad_norm": 0.042120916788582694,
"learning_rate": 0.00021579486731231653,
"loss": 0.7803,
"step": 1905
},
{
"epoch": 0.421029427973107,
"grad_norm": 0.040883347305042415,
"learning_rate": 0.00021527565700637003,
"loss": 0.8347,
"step": 1910
},
{
"epoch": 0.4221315992505235,
"grad_norm": 0.04111643166609028,
"learning_rate": 0.0002147554801055908,
"loss": 0.7808,
"step": 1915
},
{
"epoch": 0.42323377052794003,
"grad_norm": 0.042728251907987964,
"learning_rate": 0.0002142343443127018,
"loss": 0.8306,
"step": 1920
},
{
"epoch": 0.42433594180535655,
"grad_norm": 0.042082803091619436,
"learning_rate": 0.0002137122573446254,
"loss": 0.8057,
"step": 1925
},
{
"epoch": 0.4254381130827731,
"grad_norm": 0.04346598671784763,
"learning_rate": 0.00021318922693236845,
"loss": 0.812,
"step": 1930
},
{
"epoch": 0.4265402843601896,
"grad_norm": 0.04311043189883461,
"learning_rate": 0.00021266526082090858,
"loss": 0.7732,
"step": 1935
},
{
"epoch": 0.4276424556376061,
"grad_norm": 0.03914987981413652,
"learning_rate": 0.00021214036676907888,
"loss": 0.7875,
"step": 1940
},
{
"epoch": 0.4287446269150226,
"grad_norm": 0.03762591331444608,
"learning_rate": 0.00021161455254945354,
"loss": 0.8256,
"step": 1945
},
{
"epoch": 0.4298467981924391,
"grad_norm": 0.04123350564815786,
"learning_rate": 0.00021108782594823227,
"loss": 0.8177,
"step": 1950
},
{
"epoch": 0.4309489694698556,
"grad_norm": 0.0396041073736449,
"learning_rate": 0.00021056019476512532,
"loss": 0.8145,
"step": 1955
},
{
"epoch": 0.43205114074727213,
"grad_norm": 0.0380641273178196,
"learning_rate": 0.00021003166681323794,
"loss": 0.7952,
"step": 1960
},
{
"epoch": 0.43315331202468865,
"grad_norm": 0.038313042170990505,
"learning_rate": 0.00020950224991895456,
"loss": 0.7872,
"step": 1965
},
{
"epoch": 0.43425548330210517,
"grad_norm": 0.04604786797820384,
"learning_rate": 0.00020897195192182299,
"loss": 0.8094,
"step": 1970
},
{
"epoch": 0.43535765457952164,
"grad_norm": 0.04044192743630967,
"learning_rate": 0.00020844078067443835,
"loss": 0.8141,
"step": 1975
},
{
"epoch": 0.43645982585693816,
"grad_norm": 0.04177226121543752,
"learning_rate": 0.00020790874404232667,
"loss": 0.8181,
"step": 1980
},
{
"epoch": 0.4375619971343547,
"grad_norm": 0.04439891759072682,
"learning_rate": 0.00020737584990382862,
"loss": 0.7925,
"step": 1985
},
{
"epoch": 0.4386641684117712,
"grad_norm": 0.04270949977662141,
"learning_rate": 0.0002068421061499826,
"loss": 0.7786,
"step": 1990
},
{
"epoch": 0.4397663396891877,
"grad_norm": 0.040991624121934515,
"learning_rate": 0.0002063075206844082,
"loss": 0.8308,
"step": 1995
},
{
"epoch": 0.44086851096660423,
"grad_norm": 0.041275862470859626,
"learning_rate": 0.00020577210142318876,
"loss": 0.8342,
"step": 2000
},
{
"epoch": 0.4419706822440207,
"grad_norm": 0.040817642657457055,
"learning_rate": 0.00020523585629475457,
"loss": 0.8274,
"step": 2005
},
{
"epoch": 0.4430728535214372,
"grad_norm": 0.04417509610771959,
"learning_rate": 0.00020469879323976517,
"loss": 0.8176,
"step": 2010
},
{
"epoch": 0.44417502479885373,
"grad_norm": 0.04251521072112325,
"learning_rate": 0.00020416092021099193,
"loss": 0.8049,
"step": 2015
},
{
"epoch": 0.44527719607627025,
"grad_norm": 0.042166821439132904,
"learning_rate": 0.00020362224517320014,
"loss": 0.8014,
"step": 2020
},
{
"epoch": 0.4463793673536868,
"grad_norm": 0.041857744422797494,
"learning_rate": 0.0002030827761030312,
"loss": 0.7916,
"step": 2025
},
{
"epoch": 0.4474815386311033,
"grad_norm": 0.0411849481904813,
"learning_rate": 0.00020254252098888447,
"loss": 0.7706,
"step": 2030
},
{
"epoch": 0.44858370990851976,
"grad_norm": 0.04092873755686162,
"learning_rate": 0.00020200148783079892,
"loss": 0.7896,
"step": 2035
},
{
"epoch": 0.4496858811859363,
"grad_norm": 0.04336026242745058,
"learning_rate": 0.0002014596846403348,
"loss": 0.7672,
"step": 2040
},
{
"epoch": 0.4507880524633528,
"grad_norm": 0.03910209178598118,
"learning_rate": 0.0002009171194404548,
"loss": 0.7752,
"step": 2045
},
{
"epoch": 0.4518902237407693,
"grad_norm": 0.04605371401305054,
"learning_rate": 0.00020037380026540543,
"loss": 0.8172,
"step": 2050
},
{
"epoch": 0.45299239501818583,
"grad_norm": 0.04099809926171493,
"learning_rate": 0.000199829735160598,
"loss": 0.7939,
"step": 2055
},
{
"epoch": 0.45409456629560235,
"grad_norm": 0.04031696273456619,
"learning_rate": 0.0001992849321824894,
"loss": 0.7852,
"step": 2060
},
{
"epoch": 0.4551967375730189,
"grad_norm": 0.038355940898486575,
"learning_rate": 0.0001987393993984629,
"loss": 0.7772,
"step": 2065
},
{
"epoch": 0.45629890885043534,
"grad_norm": 0.04164706481380678,
"learning_rate": 0.00019819314488670866,
"loss": 0.8031,
"step": 2070
},
{
"epoch": 0.45740108012785186,
"grad_norm": 0.042502765015544626,
"learning_rate": 0.00019764617673610413,
"loss": 0.8199,
"step": 2075
},
{
"epoch": 0.4585032514052684,
"grad_norm": 0.0394129908093004,
"learning_rate": 0.0001970985030460942,
"loss": 0.7861,
"step": 2080
},
{
"epoch": 0.4596054226826849,
"grad_norm": 0.03849198049749736,
"learning_rate": 0.00019655013192657135,
"loss": 0.79,
"step": 2085
},
{
"epoch": 0.4607075939601014,
"grad_norm": 0.044138018952375395,
"learning_rate": 0.0001960010714977555,
"loss": 0.7813,
"step": 2090
},
{
"epoch": 0.46180976523751793,
"grad_norm": 0.04225359100682865,
"learning_rate": 0.00019545132989007375,
"loss": 0.7865,
"step": 2095
},
{
"epoch": 0.4629119365149344,
"grad_norm": 0.041786418031829003,
"learning_rate": 0.00019490091524404016,
"loss": 0.7911,
"step": 2100
},
{
"epoch": 0.4640141077923509,
"grad_norm": 0.04528452410421116,
"learning_rate": 0.00019434983571013485,
"loss": 0.7932,
"step": 2105
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.03898208334364371,
"learning_rate": 0.00019379809944868376,
"loss": 0.8061,
"step": 2110
},
{
"epoch": 0.46621845034718395,
"grad_norm": 0.039265674569634,
"learning_rate": 0.00019324571462973737,
"loss": 0.7707,
"step": 2115
},
{
"epoch": 0.4673206216246005,
"grad_norm": 0.040002387303416556,
"learning_rate": 0.00019269268943295013,
"loss": 0.7777,
"step": 2120
},
{
"epoch": 0.468422792902017,
"grad_norm": 0.04208726677362012,
"learning_rate": 0.00019213903204745895,
"loss": 0.7979,
"step": 2125
},
{
"epoch": 0.46952496417943346,
"grad_norm": 0.03816406171273399,
"learning_rate": 0.0001915847506717622,
"loss": 0.806,
"step": 2130
},
{
"epoch": 0.47062713545685,
"grad_norm": 0.04211319505603739,
"learning_rate": 0.00019102985351359832,
"loss": 0.7887,
"step": 2135
},
{
"epoch": 0.4717293067342665,
"grad_norm": 0.041883426079474324,
"learning_rate": 0.00019047434878982403,
"loss": 0.7814,
"step": 2140
},
{
"epoch": 0.472831478011683,
"grad_norm": 0.037960810503627374,
"learning_rate": 0.00018991824472629293,
"loss": 0.7698,
"step": 2145
},
{
"epoch": 0.47393364928909953,
"grad_norm": 0.03832451540546507,
"learning_rate": 0.0001893615495577335,
"loss": 0.7953,
"step": 2150
},
{
"epoch": 0.47503582056651605,
"grad_norm": 0.0399683526421958,
"learning_rate": 0.0001888042715276273,
"loss": 0.7875,
"step": 2155
},
{
"epoch": 0.4761379918439326,
"grad_norm": 0.03995007716254075,
"learning_rate": 0.00018824641888808683,
"loss": 0.7958,
"step": 2160
},
{
"epoch": 0.47724016312134904,
"grad_norm": 0.04082501729363916,
"learning_rate": 0.0001876879998997333,
"loss": 0.8004,
"step": 2165
},
{
"epoch": 0.47834233439876556,
"grad_norm": 0.03753976072129883,
"learning_rate": 0.00018712902283157438,
"loss": 0.7862,
"step": 2170
},
{
"epoch": 0.4794445056761821,
"grad_norm": 0.039247708791779135,
"learning_rate": 0.00018656949596088177,
"loss": 0.7846,
"step": 2175
},
{
"epoch": 0.4805466769535986,
"grad_norm": 0.040194874528480064,
"learning_rate": 0.00018600942757306853,
"loss": 0.7948,
"step": 2180
},
{
"epoch": 0.4816488482310151,
"grad_norm": 0.042684703351327506,
"learning_rate": 0.00018544882596156643,
"loss": 0.8328,
"step": 2185
},
{
"epoch": 0.48275101950843163,
"grad_norm": 0.04256364237578435,
"learning_rate": 0.0001848876994277032,
"loss": 0.8036,
"step": 2190
},
{
"epoch": 0.4838531907858481,
"grad_norm": 0.04084201277383364,
"learning_rate": 0.0001843260562805796,
"loss": 0.7838,
"step": 2195
},
{
"epoch": 0.4849553620632646,
"grad_norm": 0.03951302995836591,
"learning_rate": 0.0001837639048369462,
"loss": 0.7729,
"step": 2200
},
{
"epoch": 0.48605753334068114,
"grad_norm": 0.0440116634757643,
"learning_rate": 0.00018320125342108058,
"loss": 0.8097,
"step": 2205
},
{
"epoch": 0.48715970461809766,
"grad_norm": 0.03880553291311738,
"learning_rate": 0.0001826381103646636,
"loss": 0.7858,
"step": 2210
},
{
"epoch": 0.4882618758955142,
"grad_norm": 0.04347096499790543,
"learning_rate": 0.00018207448400665656,
"loss": 0.7931,
"step": 2215
},
{
"epoch": 0.4893640471729307,
"grad_norm": 0.04137171191530775,
"learning_rate": 0.0001815103826931772,
"loss": 0.7904,
"step": 2220
},
{
"epoch": 0.49046621845034716,
"grad_norm": 0.03924451193006248,
"learning_rate": 0.00018094581477737652,
"loss": 0.7892,
"step": 2225
},
{
"epoch": 0.4915683897277637,
"grad_norm": 0.035326553011083374,
"learning_rate": 0.00018038078861931482,
"loss": 0.7699,
"step": 2230
},
{
"epoch": 0.4926705610051802,
"grad_norm": 0.037295368892913544,
"learning_rate": 0.00017981531258583794,
"loss": 0.7688,
"step": 2235
},
{
"epoch": 0.4937727322825967,
"grad_norm": 0.04102492750892726,
"learning_rate": 0.00017924939505045364,
"loss": 0.7959,
"step": 2240
},
{
"epoch": 0.49487490356001324,
"grad_norm": 0.038911715276106644,
"learning_rate": 0.0001786830443932071,
"loss": 0.8129,
"step": 2245
},
{
"epoch": 0.49597707483742975,
"grad_norm": 0.0401123359070641,
"learning_rate": 0.00017811626900055748,
"loss": 0.8031,
"step": 2250
},
{
"epoch": 0.4970792461148463,
"grad_norm": 0.044018814366690515,
"learning_rate": 0.00017754907726525302,
"loss": 0.7963,
"step": 2255
},
{
"epoch": 0.49818141739226274,
"grad_norm": 0.03902942910566316,
"learning_rate": 0.00017698147758620736,
"loss": 0.7607,
"step": 2260
},
{
"epoch": 0.49928358866967926,
"grad_norm": 0.04968656280514321,
"learning_rate": 0.0001764134783683748,
"loss": 0.8039,
"step": 2265
},
{
"epoch": 0.5003857599470958,
"grad_norm": 0.04093320446137599,
"learning_rate": 0.00017584508802262602,
"loss": 0.8126,
"step": 2270
},
{
"epoch": 0.5014879312245123,
"grad_norm": 0.04617686206504717,
"learning_rate": 0.00017527631496562352,
"loss": 0.8063,
"step": 2275
},
{
"epoch": 0.5025901025019288,
"grad_norm": 0.038529497279601546,
"learning_rate": 0.0001747071676196968,
"loss": 0.7816,
"step": 2280
},
{
"epoch": 0.5036922737793453,
"grad_norm": 0.03888023182945524,
"learning_rate": 0.000174137654412718,
"loss": 0.8077,
"step": 2285
},
{
"epoch": 0.5047944450567619,
"grad_norm": 0.04001737897478531,
"learning_rate": 0.00017356778377797664,
"loss": 0.8262,
"step": 2290
},
{
"epoch": 0.5058966163341784,
"grad_norm": 0.039323331641178884,
"learning_rate": 0.00017299756415405524,
"loss": 0.795,
"step": 2295
},
{
"epoch": 0.5069987876115949,
"grad_norm": 0.04467088321827032,
"learning_rate": 0.00017242700398470393,
"loss": 0.7939,
"step": 2300
},
{
"epoch": 0.5081009588890113,
"grad_norm": 0.043810507147911466,
"learning_rate": 0.00017185611171871573,
"loss": 0.7669,
"step": 2305
},
{
"epoch": 0.5092031301664278,
"grad_norm": 0.040939929874561506,
"learning_rate": 0.0001712848958098012,
"loss": 0.8017,
"step": 2310
},
{
"epoch": 0.5103053014438443,
"grad_norm": 0.03898184972185426,
"learning_rate": 0.00017071336471646348,
"loss": 0.8045,
"step": 2315
},
{
"epoch": 0.5114074727212609,
"grad_norm": 0.038962684331486475,
"learning_rate": 0.0001701415269018728,
"loss": 0.8071,
"step": 2320
},
{
"epoch": 0.5125096439986774,
"grad_norm": 0.04491474953288176,
"learning_rate": 0.0001695693908337414,
"loss": 0.7909,
"step": 2325
},
{
"epoch": 0.5136118152760939,
"grad_norm": 0.04437277653745269,
"learning_rate": 0.00016899696498419794,
"loss": 0.7973,
"step": 2330
},
{
"epoch": 0.5147139865535104,
"grad_norm": 0.04158284754618555,
"learning_rate": 0.00016842425782966224,
"loss": 0.7778,
"step": 2335
},
{
"epoch": 0.5158161578309269,
"grad_norm": 0.04116323767907288,
"learning_rate": 0.00016785127785071949,
"loss": 0.8043,
"step": 2340
},
{
"epoch": 0.5169183291083435,
"grad_norm": 0.034684359802969134,
"learning_rate": 0.000167278033531995,
"loss": 0.79,
"step": 2345
},
{
"epoch": 0.51802050038576,
"grad_norm": 0.04229406791148304,
"learning_rate": 0.0001667045333620283,
"loss": 0.7795,
"step": 2350
},
{
"epoch": 0.5191226716631765,
"grad_norm": 0.03764426814798344,
"learning_rate": 0.00016613078583314756,
"loss": 0.7781,
"step": 2355
},
{
"epoch": 0.520224842940593,
"grad_norm": 0.03973158416131846,
"learning_rate": 0.00016555679944134382,
"loss": 0.7873,
"step": 2360
},
{
"epoch": 0.5213270142180095,
"grad_norm": 0.039706342704437673,
"learning_rate": 0.00016498258268614514,
"loss": 0.761,
"step": 2365
},
{
"epoch": 0.5224291854954259,
"grad_norm": 0.04246187330898323,
"learning_rate": 0.00016440814407049092,
"loss": 0.7904,
"step": 2370
},
{
"epoch": 0.5235313567728425,
"grad_norm": 0.037315547695955284,
"learning_rate": 0.00016383349210060555,
"loss": 0.7916,
"step": 2375
},
{
"epoch": 0.524633528050259,
"grad_norm": 0.03680043140841983,
"learning_rate": 0.000163258635285873,
"loss": 0.7839,
"step": 2380
},
{
"epoch": 0.5257356993276755,
"grad_norm": 0.04240483271155321,
"learning_rate": 0.00016268358213871058,
"loss": 0.7717,
"step": 2385
},
{
"epoch": 0.526837870605092,
"grad_norm": 0.038360421064488774,
"learning_rate": 0.0001621083411744427,
"loss": 0.8082,
"step": 2390
},
{
"epoch": 0.5279400418825085,
"grad_norm": 0.03923682854690225,
"learning_rate": 0.00016153292091117505,
"loss": 0.7675,
"step": 2395
},
{
"epoch": 0.5290422131599251,
"grad_norm": 0.04266472133785625,
"learning_rate": 0.00016095732986966824,
"loss": 0.7826,
"step": 2400
},
{
"epoch": 0.5301443844373416,
"grad_norm": 0.04032119205943179,
"learning_rate": 0.00016038157657321202,
"loss": 0.7694,
"step": 2405
},
{
"epoch": 0.5312465557147581,
"grad_norm": 0.038493359106615645,
"learning_rate": 0.0001598056695474984,
"loss": 0.7851,
"step": 2410
},
{
"epoch": 0.5323487269921746,
"grad_norm": 0.04788764713041358,
"learning_rate": 0.00015922961732049617,
"loss": 0.8041,
"step": 2415
},
{
"epoch": 0.5334508982695911,
"grad_norm": 0.03867840390035932,
"learning_rate": 0.000158653428422324,
"loss": 0.763,
"step": 2420
},
{
"epoch": 0.5345530695470077,
"grad_norm": 0.03926183742629788,
"learning_rate": 0.00015807711138512458,
"loss": 0.774,
"step": 2425
},
{
"epoch": 0.5356552408244241,
"grad_norm": 0.04275754859474233,
"learning_rate": 0.00015750067474293774,
"loss": 0.8008,
"step": 2430
},
{
"epoch": 0.5367574121018406,
"grad_norm": 0.041396745192273696,
"learning_rate": 0.00015692412703157478,
"loss": 0.7899,
"step": 2435
},
{
"epoch": 0.5378595833792571,
"grad_norm": 0.03771777455755809,
"learning_rate": 0.00015634747678849146,
"loss": 0.7662,
"step": 2440
},
{
"epoch": 0.5389617546566736,
"grad_norm": 0.037105475754751184,
"learning_rate": 0.00015577073255266185,
"loss": 0.7963,
"step": 2445
},
{
"epoch": 0.5400639259340901,
"grad_norm": 0.042466734201556076,
"learning_rate": 0.00015519390286445201,
"loss": 0.7795,
"step": 2450
},
{
"epoch": 0.5411660972115067,
"grad_norm": 0.03800624488466557,
"learning_rate": 0.00015461699626549314,
"loss": 0.7789,
"step": 2455
},
{
"epoch": 0.5422682684889232,
"grad_norm": 0.03799071186124082,
"learning_rate": 0.00015404002129855557,
"loss": 0.7621,
"step": 2460
},
{
"epoch": 0.5433704397663397,
"grad_norm": 0.04246315994111129,
"learning_rate": 0.00015346298650742177,
"loss": 0.7898,
"step": 2465
},
{
"epoch": 0.5444726110437562,
"grad_norm": 0.0383853594452228,
"learning_rate": 0.00015288590043676027,
"loss": 0.7838,
"step": 2470
},
{
"epoch": 0.5455747823211727,
"grad_norm": 0.037342426062281935,
"learning_rate": 0.00015230877163199878,
"loss": 0.7746,
"step": 2475
},
{
"epoch": 0.5466769535985893,
"grad_norm": 0.03967766879530587,
"learning_rate": 0.000151731608639198,
"loss": 0.7807,
"step": 2480
},
{
"epoch": 0.5477791248760058,
"grad_norm": 0.038046687905520335,
"learning_rate": 0.0001511544200049247,
"loss": 0.7624,
"step": 2485
},
{
"epoch": 0.5488812961534223,
"grad_norm": 0.038282722756821576,
"learning_rate": 0.00015057721427612548,
"loss": 0.7781,
"step": 2490
},
{
"epoch": 0.5499834674308387,
"grad_norm": 0.04204297605214361,
"learning_rate": 0.00015,
"loss": 0.7889,
"step": 2495
},
{
"epoch": 0.5510856387082552,
"grad_norm": 0.04253941444925998,
"learning_rate": 0.00014942278572387452,
"loss": 0.7874,
"step": 2500
},
{
"epoch": 0.5521878099856717,
"grad_norm": 0.04099337109892425,
"learning_rate": 0.00014884557999507528,
"loss": 0.7932,
"step": 2505
},
{
"epoch": 0.5532899812630883,
"grad_norm": 0.043225237652168194,
"learning_rate": 0.00014826839136080204,
"loss": 0.8035,
"step": 2510
},
{
"epoch": 0.5543921525405048,
"grad_norm": 0.04237211794633771,
"learning_rate": 0.00014769122836800122,
"loss": 0.782,
"step": 2515
},
{
"epoch": 0.5554943238179213,
"grad_norm": 0.0390643188084349,
"learning_rate": 0.00014711409956323976,
"loss": 0.8021,
"step": 2520
},
{
"epoch": 0.5565964950953378,
"grad_norm": 0.038912412857210685,
"learning_rate": 0.00014653701349257823,
"loss": 0.7713,
"step": 2525
},
{
"epoch": 0.5576986663727543,
"grad_norm": 0.04021618253944335,
"learning_rate": 0.00014595997870144443,
"loss": 0.7711,
"step": 2530
},
{
"epoch": 0.5588008376501709,
"grad_norm": 0.04054714580080947,
"learning_rate": 0.00014538300373450683,
"loss": 0.7959,
"step": 2535
},
{
"epoch": 0.5599030089275874,
"grad_norm": 0.0378078133538945,
"learning_rate": 0.00014480609713554796,
"loss": 0.7533,
"step": 2540
},
{
"epoch": 0.5610051802050039,
"grad_norm": 0.03566763348348747,
"learning_rate": 0.0001442292674473381,
"loss": 0.7842,
"step": 2545
},
{
"epoch": 0.5621073514824204,
"grad_norm": 0.04162087831710151,
"learning_rate": 0.0001436525232115086,
"loss": 0.7765,
"step": 2550
},
{
"epoch": 0.5632095227598369,
"grad_norm": 0.039192296915950345,
"learning_rate": 0.00014307587296842524,
"loss": 0.7761,
"step": 2555
},
{
"epoch": 0.5643116940372533,
"grad_norm": 0.040771917142651264,
"learning_rate": 0.00014249932525706223,
"loss": 0.7637,
"step": 2560
},
{
"epoch": 0.5654138653146699,
"grad_norm": 0.0404557988061511,
"learning_rate": 0.00014192288861487545,
"loss": 0.7809,
"step": 2565
},
{
"epoch": 0.5665160365920864,
"grad_norm": 0.0380287543914902,
"learning_rate": 0.00014134657157767593,
"loss": 0.7744,
"step": 2570
},
{
"epoch": 0.5676182078695029,
"grad_norm": 0.037545959627603626,
"learning_rate": 0.00014077038267950383,
"loss": 0.7705,
"step": 2575
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.04121660123612755,
"learning_rate": 0.00014019433045250158,
"loss": 0.7969,
"step": 2580
},
{
"epoch": 0.5698225504243359,
"grad_norm": 0.03543171857215221,
"learning_rate": 0.00013961842342678798,
"loss": 0.7706,
"step": 2585
},
{
"epoch": 0.5709247217017525,
"grad_norm": 0.03970445149727504,
"learning_rate": 0.0001390426701303317,
"loss": 0.7813,
"step": 2590
},
{
"epoch": 0.572026892979169,
"grad_norm": 0.04263582518492721,
"learning_rate": 0.00013846707908882498,
"loss": 0.8044,
"step": 2595
},
{
"epoch": 0.5731290642565855,
"grad_norm": 0.03919599782168947,
"learning_rate": 0.0001378916588255573,
"loss": 0.7709,
"step": 2600
},
{
"epoch": 0.574231235534002,
"grad_norm": 0.04085513710063447,
"learning_rate": 0.0001373164178612894,
"loss": 0.7916,
"step": 2605
},
{
"epoch": 0.5753334068114185,
"grad_norm": 0.03947664604661718,
"learning_rate": 0.0001367413647141269,
"loss": 0.7829,
"step": 2610
},
{
"epoch": 0.576435578088835,
"grad_norm": 0.03819848642986916,
"learning_rate": 0.00013616650789939443,
"loss": 0.7736,
"step": 2615
},
{
"epoch": 0.5775377493662515,
"grad_norm": 0.03928261920892333,
"learning_rate": 0.0001355918559295091,
"loss": 0.7934,
"step": 2620
},
{
"epoch": 0.578639920643668,
"grad_norm": 0.04465492341767027,
"learning_rate": 0.00013501741731385483,
"loss": 0.7872,
"step": 2625
},
{
"epoch": 0.5797420919210845,
"grad_norm": 0.04518050772542813,
"learning_rate": 0.00013444320055865618,
"loss": 0.7978,
"step": 2630
},
{
"epoch": 0.580844263198501,
"grad_norm": 0.03823568510951906,
"learning_rate": 0.00013386921416685239,
"loss": 0.8026,
"step": 2635
},
{
"epoch": 0.5819464344759175,
"grad_norm": 0.03860337235855855,
"learning_rate": 0.0001332954666379717,
"loss": 0.7819,
"step": 2640
},
{
"epoch": 0.5830486057533341,
"grad_norm": 0.040848904672585555,
"learning_rate": 0.00013272196646800497,
"loss": 0.7718,
"step": 2645
},
{
"epoch": 0.5841507770307506,
"grad_norm": 0.039991424568808075,
"learning_rate": 0.0001321487221492805,
"loss": 0.7737,
"step": 2650
},
{
"epoch": 0.5852529483081671,
"grad_norm": 0.04406998907502384,
"learning_rate": 0.00013157574217033773,
"loss": 0.7804,
"step": 2655
},
{
"epoch": 0.5863551195855836,
"grad_norm": 0.042736167667461564,
"learning_rate": 0.00013100303501580206,
"loss": 0.7864,
"step": 2660
},
{
"epoch": 0.5874572908630001,
"grad_norm": 0.039658625402537326,
"learning_rate": 0.0001304306091662586,
"loss": 0.7879,
"step": 2665
},
{
"epoch": 0.5885594621404167,
"grad_norm": 0.03732667476656254,
"learning_rate": 0.0001298584730981272,
"loss": 0.7958,
"step": 2670
},
{
"epoch": 0.5896616334178332,
"grad_norm": 0.0385663925190591,
"learning_rate": 0.00012928663528353652,
"loss": 0.7532,
"step": 2675
},
{
"epoch": 0.5907638046952497,
"grad_norm": 0.039100737225537294,
"learning_rate": 0.00012871510419019876,
"loss": 0.8146,
"step": 2680
},
{
"epoch": 0.5918659759726661,
"grad_norm": 0.04322733978868932,
"learning_rate": 0.0001281438882812843,
"loss": 0.7844,
"step": 2685
},
{
"epoch": 0.5929681472500826,
"grad_norm": 0.03855540614705994,
"learning_rate": 0.00012757299601529604,
"loss": 0.7444,
"step": 2690
},
{
"epoch": 0.5940703185274991,
"grad_norm": 0.03976869418403505,
"learning_rate": 0.00012700243584594479,
"loss": 0.7706,
"step": 2695
},
{
"epoch": 0.5951724898049157,
"grad_norm": 0.038039179428976305,
"learning_rate": 0.00012643221622202336,
"loss": 0.7497,
"step": 2700
},
{
"epoch": 0.5962746610823322,
"grad_norm": 0.03951382972148692,
"learning_rate": 0.00012586234558728207,
"loss": 0.7571,
"step": 2705
},
{
"epoch": 0.5973768323597487,
"grad_norm": 0.04180256816699712,
"learning_rate": 0.0001252928323803032,
"loss": 0.7538,
"step": 2710
},
{
"epoch": 0.5984790036371652,
"grad_norm": 0.037309356744730904,
"learning_rate": 0.00012472368503437648,
"loss": 0.7924,
"step": 2715
},
{
"epoch": 0.5995811749145817,
"grad_norm": 0.04216451657557382,
"learning_rate": 0.00012415491197737395,
"loss": 0.7816,
"step": 2720
},
{
"epoch": 0.6006833461919983,
"grad_norm": 0.039059339374739786,
"learning_rate": 0.00012358652163162523,
"loss": 0.7394,
"step": 2725
},
{
"epoch": 0.6017855174694148,
"grad_norm": 0.03934478577799933,
"learning_rate": 0.00012301852241379267,
"loss": 0.7903,
"step": 2730
},
{
"epoch": 0.6028876887468313,
"grad_norm": 0.03677529471356991,
"learning_rate": 0.00012245092273474695,
"loss": 0.7688,
"step": 2735
},
{
"epoch": 0.6039898600242478,
"grad_norm": 0.04379989553642847,
"learning_rate": 0.00012188373099944252,
"loss": 0.7791,
"step": 2740
},
{
"epoch": 0.6050920313016642,
"grad_norm": 0.03853225124689197,
"learning_rate": 0.00012131695560679285,
"loss": 0.7842,
"step": 2745
},
{
"epoch": 0.6061942025790807,
"grad_norm": 0.04243715932465795,
"learning_rate": 0.0001207506049495464,
"loss": 0.7633,
"step": 2750
},
{
"epoch": 0.6072963738564973,
"grad_norm": 0.042350895868304664,
"learning_rate": 0.00012018468741416206,
"loss": 0.7992,
"step": 2755
},
{
"epoch": 0.6083985451339138,
"grad_norm": 0.042511366861454146,
"learning_rate": 0.00011961921138068517,
"loss": 0.7628,
"step": 2760
},
{
"epoch": 0.6095007164113303,
"grad_norm": 0.03945967376445848,
"learning_rate": 0.00011905418522262343,
"loss": 0.7798,
"step": 2765
},
{
"epoch": 0.6106028876887468,
"grad_norm": 0.04134467702346202,
"learning_rate": 0.00011848961730682276,
"loss": 0.7736,
"step": 2770
},
{
"epoch": 0.6117050589661633,
"grad_norm": 0.04321593442131728,
"learning_rate": 0.00011792551599334342,
"loss": 0.7729,
"step": 2775
},
{
"epoch": 0.6128072302435799,
"grad_norm": 0.03753481531786513,
"learning_rate": 0.00011736188963533636,
"loss": 0.7868,
"step": 2780
},
{
"epoch": 0.6139094015209964,
"grad_norm": 0.046677240052044086,
"learning_rate": 0.0001167987465789194,
"loss": 0.788,
"step": 2785
},
{
"epoch": 0.6150115727984129,
"grad_norm": 0.04004956558986007,
"learning_rate": 0.00011623609516305375,
"loss": 0.7669,
"step": 2790
},
{
"epoch": 0.6161137440758294,
"grad_norm": 0.03877661195736084,
"learning_rate": 0.0001156739437194204,
"loss": 0.7403,
"step": 2795
},
{
"epoch": 0.6172159153532459,
"grad_norm": 0.037935876666486346,
"learning_rate": 0.00011511230057229678,
"loss": 0.7373,
"step": 2800
},
{
"epoch": 0.6183180866306625,
"grad_norm": 0.038133591224345446,
"learning_rate": 0.00011455117403843358,
"loss": 0.7626,
"step": 2805
},
{
"epoch": 0.6194202579080789,
"grad_norm": 0.040488281712004456,
"learning_rate": 0.00011399057242693143,
"loss": 0.7748,
"step": 2810
},
{
"epoch": 0.6205224291854954,
"grad_norm": 0.045588727331748555,
"learning_rate": 0.00011343050403911823,
"loss": 0.7566,
"step": 2815
},
{
"epoch": 0.6216246004629119,
"grad_norm": 0.04322927430800435,
"learning_rate": 0.0001128709771684256,
"loss": 0.7405,
"step": 2820
},
{
"epoch": 0.6227267717403284,
"grad_norm": 0.03941622010511477,
"learning_rate": 0.00011231200010026668,
"loss": 0.7699,
"step": 2825
},
{
"epoch": 0.6238289430177449,
"grad_norm": 0.040341244924510265,
"learning_rate": 0.00011175358111191316,
"loss": 0.7546,
"step": 2830
},
{
"epoch": 0.6249311142951615,
"grad_norm": 0.0370716397793875,
"learning_rate": 0.00011119572847237272,
"loss": 0.7524,
"step": 2835
},
{
"epoch": 0.626033285572578,
"grad_norm": 0.04088692753580051,
"learning_rate": 0.00011063845044226649,
"loss": 0.7737,
"step": 2840
},
{
"epoch": 0.6271354568499945,
"grad_norm": 0.04149954061127777,
"learning_rate": 0.00011008175527370708,
"loss": 0.7635,
"step": 2845
},
{
"epoch": 0.628237628127411,
"grad_norm": 0.04067030099074359,
"learning_rate": 0.00010952565121017595,
"loss": 0.7781,
"step": 2850
},
{
"epoch": 0.6293397994048275,
"grad_norm": 0.037995610342879724,
"learning_rate": 0.00010897014648640164,
"loss": 0.7536,
"step": 2855
},
{
"epoch": 0.6304419706822441,
"grad_norm": 0.038638564159584964,
"learning_rate": 0.0001084152493282378,
"loss": 0.7582,
"step": 2860
},
{
"epoch": 0.6315441419596606,
"grad_norm": 0.03865577853003443,
"learning_rate": 0.00010786096795254105,
"loss": 0.743,
"step": 2865
},
{
"epoch": 0.6326463132370771,
"grad_norm": 0.037742667105078676,
"learning_rate": 0.00010730731056704987,
"loss": 0.7738,
"step": 2870
},
{
"epoch": 0.6337484845144935,
"grad_norm": 0.04087500905979875,
"learning_rate": 0.0001067542853702626,
"loss": 0.7454,
"step": 2875
},
{
"epoch": 0.63485065579191,
"grad_norm": 0.03681788140604627,
"learning_rate": 0.00010620190055131628,
"loss": 0.7513,
"step": 2880
},
{
"epoch": 0.6359528270693265,
"grad_norm": 0.039530646965778786,
"learning_rate": 0.00010565016428986515,
"loss": 0.7863,
"step": 2885
},
{
"epoch": 0.6370549983467431,
"grad_norm": 0.03637232048859645,
"learning_rate": 0.00010509908475595984,
"loss": 0.7871,
"step": 2890
},
{
"epoch": 0.6381571696241596,
"grad_norm": 0.037532035474368244,
"learning_rate": 0.0001045486701099262,
"loss": 0.7868,
"step": 2895
},
{
"epoch": 0.6392593409015761,
"grad_norm": 0.04139117865057032,
"learning_rate": 0.0001039989285022445,
"loss": 0.7544,
"step": 2900
},
{
"epoch": 0.6403615121789926,
"grad_norm": 0.04037843881677972,
"learning_rate": 0.00010344986807342866,
"loss": 0.7612,
"step": 2905
},
{
"epoch": 0.6414636834564091,
"grad_norm": 0.039565719386194985,
"learning_rate": 0.00010290149695390581,
"loss": 0.7616,
"step": 2910
},
{
"epoch": 0.6425658547338257,
"grad_norm": 0.038310169105771584,
"learning_rate": 0.00010235382326389586,
"loss": 0.7576,
"step": 2915
},
{
"epoch": 0.6436680260112422,
"grad_norm": 0.03936468865097429,
"learning_rate": 0.00010180685511329131,
"loss": 0.7702,
"step": 2920
},
{
"epoch": 0.6447701972886587,
"grad_norm": 0.0398256871335756,
"learning_rate": 0.00010126060060153713,
"loss": 0.7822,
"step": 2925
},
{
"epoch": 0.6458723685660752,
"grad_norm": 0.0413034275637383,
"learning_rate": 0.00010071506781751063,
"loss": 0.7542,
"step": 2930
},
{
"epoch": 0.6469745398434916,
"grad_norm": 0.03871254029791003,
"learning_rate": 0.000100170264839402,
"loss": 0.7335,
"step": 2935
},
{
"epoch": 0.6480767111209081,
"grad_norm": 0.036091901012243334,
"learning_rate": 9.962619973459453e-05,
"loss": 0.7748,
"step": 2940
},
{
"epoch": 0.6491788823983247,
"grad_norm": 0.04138951063703848,
"learning_rate": 9.90828805595452e-05,
"loss": 0.7524,
"step": 2945
},
{
"epoch": 0.6502810536757412,
"grad_norm": 0.03858848182433528,
"learning_rate": 9.854031535966521e-05,
"loss": 0.7517,
"step": 2950
},
{
"epoch": 0.6513832249531577,
"grad_norm": 0.03602604340055366,
"learning_rate": 9.799851216920107e-05,
"loss": 0.7337,
"step": 2955
},
{
"epoch": 0.6524853962305742,
"grad_norm": 0.03579532104123597,
"learning_rate": 9.745747901111552e-05,
"loss": 0.7623,
"step": 2960
},
{
"epoch": 0.6535875675079907,
"grad_norm": 0.0381744125557122,
"learning_rate": 9.691722389696879e-05,
"loss": 0.7683,
"step": 2965
},
{
"epoch": 0.6546897387854073,
"grad_norm": 0.03608442329560764,
"learning_rate": 9.637775482679988e-05,
"loss": 0.7732,
"step": 2970
},
{
"epoch": 0.6557919100628238,
"grad_norm": 0.03784439256503652,
"learning_rate": 9.583907978900807e-05,
"loss": 0.7739,
"step": 2975
},
{
"epoch": 0.6568940813402403,
"grad_norm": 0.03945771278463694,
"learning_rate": 9.530120676023482e-05,
"loss": 0.7442,
"step": 2980
},
{
"epoch": 0.6579962526176568,
"grad_norm": 0.03667458666034089,
"learning_rate": 9.476414370524538e-05,
"loss": 0.7456,
"step": 2985
},
{
"epoch": 0.6590984238950733,
"grad_norm": 0.03940939789339881,
"learning_rate": 9.422789857681124e-05,
"loss": 0.7438,
"step": 2990
},
{
"epoch": 0.6602005951724899,
"grad_norm": 0.03803960926078932,
"learning_rate": 9.36924793155918e-05,
"loss": 0.778,
"step": 2995
},
{
"epoch": 0.6613027664499063,
"grad_norm": 0.035231525831095054,
"learning_rate": 9.315789385001738e-05,
"loss": 0.7647,
"step": 3000
},
{
"epoch": 0.6624049377273228,
"grad_norm": 0.03745699399316068,
"learning_rate": 9.262415009617139e-05,
"loss": 0.7684,
"step": 3005
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.03586306355233504,
"learning_rate": 9.209125595767336e-05,
"loss": 0.7458,
"step": 3010
},
{
"epoch": 0.6646092802821558,
"grad_norm": 0.036839355886467764,
"learning_rate": 9.15592193255617e-05,
"loss": 0.7706,
"step": 3015
},
{
"epoch": 0.6657114515595723,
"grad_norm": 0.03906285720782755,
"learning_rate": 9.102804807817699e-05,
"loss": 0.781,
"step": 3020
},
{
"epoch": 0.6668136228369889,
"grad_norm": 0.03869457663028768,
"learning_rate": 9.049775008104542e-05,
"loss": 0.7486,
"step": 3025
},
{
"epoch": 0.6679157941144054,
"grad_norm": 0.0393845820755994,
"learning_rate": 8.996833318676204e-05,
"loss": 0.7473,
"step": 3030
},
{
"epoch": 0.6690179653918219,
"grad_norm": 0.03941498428248022,
"learning_rate": 8.943980523487469e-05,
"loss": 0.7846,
"step": 3035
},
{
"epoch": 0.6701201366692384,
"grad_norm": 0.0420251439088419,
"learning_rate": 8.891217405176774e-05,
"loss": 0.7608,
"step": 3040
},
{
"epoch": 0.6712223079466549,
"grad_norm": 0.035649123267955884,
"learning_rate": 8.838544745054645e-05,
"loss": 0.7719,
"step": 3045
},
{
"epoch": 0.6723244792240715,
"grad_norm": 0.03877525844159031,
"learning_rate": 8.785963323092108e-05,
"loss": 0.7582,
"step": 3050
},
{
"epoch": 0.673426650501488,
"grad_norm": 0.037091463531239946,
"learning_rate": 8.733473917909144e-05,
"loss": 0.7411,
"step": 3055
},
{
"epoch": 0.6745288217789045,
"grad_norm": 0.036271594287721816,
"learning_rate": 8.68107730676315e-05,
"loss": 0.7849,
"step": 3060
},
{
"epoch": 0.6756309930563209,
"grad_norm": 0.03541699632121048,
"learning_rate": 8.628774265537462e-05,
"loss": 0.7514,
"step": 3065
},
{
"epoch": 0.6767331643337374,
"grad_norm": 0.0362990364448018,
"learning_rate": 8.576565568729813e-05,
"loss": 0.7474,
"step": 3070
},
{
"epoch": 0.6778353356111539,
"grad_norm": 0.03921802869726997,
"learning_rate": 8.524451989440918e-05,
"loss": 0.754,
"step": 3075
},
{
"epoch": 0.6789375068885705,
"grad_norm": 0.037337016378149755,
"learning_rate": 8.472434299362998e-05,
"loss": 0.75,
"step": 3080
},
{
"epoch": 0.680039678165987,
"grad_norm": 0.03891038958554527,
"learning_rate": 8.420513268768347e-05,
"loss": 0.7859,
"step": 3085
},
{
"epoch": 0.6811418494434035,
"grad_norm": 0.035894066909538044,
"learning_rate": 8.368689666497938e-05,
"loss": 0.7329,
"step": 3090
},
{
"epoch": 0.68224402072082,
"grad_norm": 0.039979769321902066,
"learning_rate": 8.31696425995004e-05,
"loss": 0.7503,
"step": 3095
},
{
"epoch": 0.6833461919982365,
"grad_norm": 0.03936168369937806,
"learning_rate": 8.26533781506887e-05,
"loss": 0.7726,
"step": 3100
},
{
"epoch": 0.6844483632756531,
"grad_norm": 0.04019560215837221,
"learning_rate": 8.21381109633318e-05,
"loss": 0.7432,
"step": 3105
},
{
"epoch": 0.6855505345530696,
"grad_norm": 0.039269909994462844,
"learning_rate": 8.162384866745036e-05,
"loss": 0.7538,
"step": 3110
},
{
"epoch": 0.6866527058304861,
"grad_norm": 0.03904037612780349,
"learning_rate": 8.111059887818459e-05,
"loss": 0.744,
"step": 3115
},
{
"epoch": 0.6877548771079026,
"grad_norm": 0.03885534367589714,
"learning_rate": 8.059836919568152e-05,
"loss": 0.7328,
"step": 3120
},
{
"epoch": 0.688857048385319,
"grad_norm": 0.03858452994442185,
"learning_rate": 8.008716720498253e-05,
"loss": 0.7701,
"step": 3125
},
{
"epoch": 0.6899592196627355,
"grad_norm": 0.03519383464662896,
"learning_rate": 7.957700047591121e-05,
"loss": 0.7451,
"step": 3130
},
{
"epoch": 0.6910613909401521,
"grad_norm": 0.0380259068615794,
"learning_rate": 7.906787656296107e-05,
"loss": 0.7556,
"step": 3135
},
{
"epoch": 0.6921635622175686,
"grad_norm": 0.038944765250869484,
"learning_rate": 7.855980300518354e-05,
"loss": 0.7389,
"step": 3140
},
{
"epoch": 0.6932657334949851,
"grad_norm": 0.04470561682300718,
"learning_rate": 7.805278732607678e-05,
"loss": 0.7568,
"step": 3145
},
{
"epoch": 0.6943679047724016,
"grad_norm": 0.04297680477477516,
"learning_rate": 7.754683703347372e-05,
"loss": 0.7626,
"step": 3150
},
{
"epoch": 0.6954700760498181,
"grad_norm": 0.034697565387832634,
"learning_rate": 7.704195961943129e-05,
"loss": 0.7721,
"step": 3155
},
{
"epoch": 0.6965722473272347,
"grad_norm": 0.04072535239631796,
"learning_rate": 7.653816256011941e-05,
"loss": 0.7757,
"step": 3160
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.035271346752025576,
"learning_rate": 7.603545331571018e-05,
"loss": 0.7629,
"step": 3165
},
{
"epoch": 0.6987765898820677,
"grad_norm": 0.037357663884374157,
"learning_rate": 7.553383933026741e-05,
"loss": 0.7549,
"step": 3170
},
{
"epoch": 0.6998787611594842,
"grad_norm": 0.03637211308219045,
"learning_rate": 7.503332803163641e-05,
"loss": 0.7529,
"step": 3175
},
{
"epoch": 0.7009809324369007,
"grad_norm": 0.039519222558093065,
"learning_rate": 7.453392683133415e-05,
"loss": 0.7879,
"step": 3180
},
{
"epoch": 0.7020831037143173,
"grad_norm": 0.03323807896911395,
"learning_rate": 7.403564312443932e-05,
"loss": 0.7189,
"step": 3185
},
{
"epoch": 0.7031852749917337,
"grad_norm": 0.03896304992954701,
"learning_rate": 7.353848428948288e-05,
"loss": 0.732,
"step": 3190
},
{
"epoch": 0.7042874462691502,
"grad_norm": 0.03533752667617695,
"learning_rate": 7.304245768833872e-05,
"loss": 0.7499,
"step": 3195
},
{
"epoch": 0.7053896175465667,
"grad_norm": 0.04289239640414403,
"learning_rate": 7.25475706661149e-05,
"loss": 0.7518,
"step": 3200
},
{
"epoch": 0.7064917888239832,
"grad_norm": 0.036700606939269846,
"learning_rate": 7.20538305510447e-05,
"loss": 0.7444,
"step": 3205
},
{
"epoch": 0.7075939601013997,
"grad_norm": 0.039301039880959406,
"learning_rate": 7.156124465437799e-05,
"loss": 0.7647,
"step": 3210
},
{
"epoch": 0.7086961313788163,
"grad_norm": 0.039132429979466775,
"learning_rate": 7.106982027027314e-05,
"loss": 0.7464,
"step": 3215
},
{
"epoch": 0.7097983026562328,
"grad_norm": 0.03668166024260441,
"learning_rate": 7.057956467568913e-05,
"loss": 0.768,
"step": 3220
},
{
"epoch": 0.7109004739336493,
"grad_norm": 0.03763349214718496,
"learning_rate": 7.009048513027738e-05,
"loss": 0.7627,
"step": 3225
},
{
"epoch": 0.7120026452110658,
"grad_norm": 0.03910255645252377,
"learning_rate": 6.960258887627474e-05,
"loss": 0.7393,
"step": 3230
},
{
"epoch": 0.7131048164884823,
"grad_norm": 0.03847225495364757,
"learning_rate": 6.911588313839579e-05,
"loss": 0.758,
"step": 3235
},
{
"epoch": 0.7142069877658989,
"grad_norm": 0.038410315616110316,
"learning_rate": 6.86303751237263e-05,
"loss": 0.7385,
"step": 3240
},
{
"epoch": 0.7153091590433154,
"grad_norm": 0.038761774380026405,
"learning_rate": 6.814607202161606e-05,
"loss": 0.7382,
"step": 3245
},
{
"epoch": 0.7164113303207318,
"grad_norm": 0.03723197930881741,
"learning_rate": 6.766298100357281e-05,
"loss": 0.7359,
"step": 3250
},
{
"epoch": 0.7175135015981483,
"grad_norm": 0.040413685922780995,
"learning_rate": 6.718110922315593e-05,
"loss": 0.7342,
"step": 3255
},
{
"epoch": 0.7186156728755648,
"grad_norm": 0.03978308906273803,
"learning_rate": 6.670046381587016e-05,
"loss": 0.7645,
"step": 3260
},
{
"epoch": 0.7197178441529813,
"grad_norm": 0.03897559054869522,
"learning_rate": 6.622105189906052e-05,
"loss": 0.7455,
"step": 3265
},
{
"epoch": 0.7208200154303979,
"grad_norm": 0.03815718519490893,
"learning_rate": 6.574288057180663e-05,
"loss": 0.7615,
"step": 3270
},
{
"epoch": 0.7219221867078144,
"grad_norm": 0.03856332628344952,
"learning_rate": 6.526595691481746e-05,
"loss": 0.7599,
"step": 3275
},
{
"epoch": 0.7230243579852309,
"grad_norm": 0.037326383736852486,
"learning_rate": 6.479028799032664e-05,
"loss": 0.7727,
"step": 3280
},
{
"epoch": 0.7241265292626474,
"grad_norm": 0.03759649917895476,
"learning_rate": 6.431588084198791e-05,
"loss": 0.733,
"step": 3285
},
{
"epoch": 0.7252287005400639,
"grad_norm": 0.0351188278300472,
"learning_rate": 6.384274249477086e-05,
"loss": 0.7603,
"step": 3290
},
{
"epoch": 0.7263308718174805,
"grad_norm": 0.0387440053943191,
"learning_rate": 6.337087995485658e-05,
"loss": 0.7401,
"step": 3295
},
{
"epoch": 0.727433043094897,
"grad_norm": 0.03680120173341686,
"learning_rate": 6.290030020953423e-05,
"loss": 0.7811,
"step": 3300
},
{
"epoch": 0.7285352143723135,
"grad_norm": 0.037694080907078036,
"learning_rate": 6.243101022709761e-05,
"loss": 0.7279,
"step": 3305
},
{
"epoch": 0.72963738564973,
"grad_norm": 0.04067783323042442,
"learning_rate": 6.196301695674176e-05,
"loss": 0.7827,
"step": 3310
},
{
"epoch": 0.7307395569271464,
"grad_norm": 0.038537648221183,
"learning_rate": 6.14963273284601e-05,
"loss": 0.7586,
"step": 3315
},
{
"epoch": 0.731841728204563,
"grad_norm": 0.038447173395684923,
"learning_rate": 6.1030948252941985e-05,
"loss": 0.7599,
"step": 3320
},
{
"epoch": 0.7329438994819795,
"grad_norm": 0.037364125258692316,
"learning_rate": 6.056688662147012e-05,
"loss": 0.7546,
"step": 3325
},
{
"epoch": 0.734046070759396,
"grad_norm": 0.03886268408011641,
"learning_rate": 6.010414930581866e-05,
"loss": 0.7451,
"step": 3330
},
{
"epoch": 0.7351482420368125,
"grad_norm": 0.03769869242431956,
"learning_rate": 5.96427431581515e-05,
"loss": 0.768,
"step": 3335
},
{
"epoch": 0.736250413314229,
"grad_norm": 0.037020275271875513,
"learning_rate": 5.918267501092078e-05,
"loss": 0.7392,
"step": 3340
},
{
"epoch": 0.7373525845916455,
"grad_norm": 0.03729781683672499,
"learning_rate": 5.872395167676555e-05,
"loss": 0.7541,
"step": 3345
},
{
"epoch": 0.7384547558690621,
"grad_norm": 0.03769698709111463,
"learning_rate": 5.826657994841104e-05,
"loss": 0.7464,
"step": 3350
},
{
"epoch": 0.7395569271464786,
"grad_norm": 0.035749490646957455,
"learning_rate": 5.78105665985681e-05,
"loss": 0.783,
"step": 3355
},
{
"epoch": 0.7406590984238951,
"grad_norm": 0.03815712387980432,
"learning_rate": 5.7355918379832925e-05,
"loss": 0.7415,
"step": 3360
},
{
"epoch": 0.7417612697013116,
"grad_norm": 0.03756149653556473,
"learning_rate": 5.690264202458685e-05,
"loss": 0.7754,
"step": 3365
},
{
"epoch": 0.7428634409787281,
"grad_norm": 0.03958578609528177,
"learning_rate": 5.64507442448968e-05,
"loss": 0.7835,
"step": 3370
},
{
"epoch": 0.7439656122561447,
"grad_norm": 0.038064835951232556,
"learning_rate": 5.6000231732416045e-05,
"loss": 0.7938,
"step": 3375
},
{
"epoch": 0.7450677835335611,
"grad_norm": 0.0371943932393074,
"learning_rate": 5.555111115828492e-05,
"loss": 0.7406,
"step": 3380
},
{
"epoch": 0.7461699548109776,
"grad_norm": 0.03776336387841464,
"learning_rate": 5.510338917303204e-05,
"loss": 0.7459,
"step": 3385
},
{
"epoch": 0.7472721260883941,
"grad_norm": 0.04142505105864296,
"learning_rate": 5.4657072406475816e-05,
"loss": 0.7419,
"step": 3390
},
{
"epoch": 0.7483742973658106,
"grad_norm": 0.03664875530168412,
"learning_rate": 5.421216746762651e-05,
"loss": 0.7701,
"step": 3395
},
{
"epoch": 0.7494764686432271,
"grad_norm": 0.03963080933205579,
"learning_rate": 5.3768680944588006e-05,
"loss": 0.7449,
"step": 3400
},
{
"epoch": 0.7505786399206437,
"grad_norm": 0.03875301352382599,
"learning_rate": 5.3326619404460594e-05,
"loss": 0.7512,
"step": 3405
},
{
"epoch": 0.7516808111980602,
"grad_norm": 0.03812435605779621,
"learning_rate": 5.2885989393243446e-05,
"loss": 0.7524,
"step": 3410
},
{
"epoch": 0.7527829824754767,
"grad_norm": 0.036201552008782494,
"learning_rate": 5.244679743573793e-05,
"loss": 0.7313,
"step": 3415
},
{
"epoch": 0.7538851537528932,
"grad_norm": 0.03508489489495532,
"learning_rate": 5.200905003545072e-05,
"loss": 0.7143,
"step": 3420
},
{
"epoch": 0.7549873250303097,
"grad_norm": 0.03873819352052367,
"learning_rate": 5.1572753674497784e-05,
"loss": 0.7262,
"step": 3425
},
{
"epoch": 0.7560894963077263,
"grad_norm": 0.03878661096421005,
"learning_rate": 5.11379148135083e-05,
"loss": 0.7388,
"step": 3430
},
{
"epoch": 0.7571916675851428,
"grad_norm": 0.03624350442263521,
"learning_rate": 5.070453989152865e-05,
"loss": 0.7516,
"step": 3435
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.03462784231516503,
"learning_rate": 5.0272635325927666e-05,
"loss": 0.735,
"step": 3440
},
{
"epoch": 0.7593960101399757,
"grad_norm": 0.03546841031831082,
"learning_rate": 4.9842207512301255e-05,
"loss": 0.7688,
"step": 3445
},
{
"epoch": 0.7604981814173922,
"grad_norm": 0.03765881036086525,
"learning_rate": 4.941326282437765e-05,
"loss": 0.7584,
"step": 3450
},
{
"epoch": 0.7616003526948087,
"grad_norm": 0.04070540653962422,
"learning_rate": 4.8985807613923084e-05,
"loss": 0.7658,
"step": 3455
},
{
"epoch": 0.7627025239722253,
"grad_norm": 0.041025307893189714,
"learning_rate": 4.855984821064789e-05,
"loss": 0.753,
"step": 3460
},
{
"epoch": 0.7638046952496418,
"grad_norm": 0.03747182722869465,
"learning_rate": 4.8135390922112687e-05,
"loss": 0.7481,
"step": 3465
},
{
"epoch": 0.7649068665270583,
"grad_norm": 0.03475376097595749,
"learning_rate": 4.771244203363478e-05,
"loss": 0.7322,
"step": 3470
},
{
"epoch": 0.7660090378044748,
"grad_norm": 0.03620242697594977,
"learning_rate": 4.72910078081953e-05,
"loss": 0.7289,
"step": 3475
},
{
"epoch": 0.7671112090818913,
"grad_norm": 0.039201952070474604,
"learning_rate": 4.687109448634647e-05,
"loss": 0.7663,
"step": 3480
},
{
"epoch": 0.7682133803593079,
"grad_norm": 0.038508731501384584,
"learning_rate": 4.6452708286119176e-05,
"loss": 0.7554,
"step": 3485
},
{
"epoch": 0.7693155516367244,
"grad_norm": 0.03899698694328063,
"learning_rate": 4.603585540293071e-05,
"loss": 0.7736,
"step": 3490
},
{
"epoch": 0.7704177229141409,
"grad_norm": 0.0368565333958254,
"learning_rate": 4.5620542009493304e-05,
"loss": 0.7516,
"step": 3495
},
{
"epoch": 0.7715198941915574,
"grad_norm": 0.035388497953352936,
"learning_rate": 4.5206774255722504e-05,
"loss": 0.7484,
"step": 3500
},
{
"epoch": 0.7726220654689738,
"grad_norm": 0.03538316494242759,
"learning_rate": 4.4794558268646194e-05,
"loss": 0.7581,
"step": 3505
},
{
"epoch": 0.7737242367463903,
"grad_norm": 0.037362884464824934,
"learning_rate": 4.4383900152313926e-05,
"loss": 0.7459,
"step": 3510
},
{
"epoch": 0.7748264080238069,
"grad_norm": 0.036038446641414534,
"learning_rate": 4.397480598770652e-05,
"loss": 0.7606,
"step": 3515
},
{
"epoch": 0.7759285793012234,
"grad_norm": 0.0402761096628342,
"learning_rate": 4.3567281832645815e-05,
"loss": 0.7813,
"step": 3520
},
{
"epoch": 0.7770307505786399,
"grad_norm": 0.03506614642106647,
"learning_rate": 4.3161333721705146e-05,
"loss": 0.7303,
"step": 3525
},
{
"epoch": 0.7781329218560564,
"grad_norm": 0.03710237528152325,
"learning_rate": 4.275696766612007e-05,
"loss": 0.7658,
"step": 3530
},
{
"epoch": 0.779235093133473,
"grad_norm": 0.039207961974188736,
"learning_rate": 4.2354189653699234e-05,
"loss": 0.7686,
"step": 3535
},
{
"epoch": 0.7803372644108895,
"grad_norm": 0.0400226786429818,
"learning_rate": 4.1953005648735606e-05,
"loss": 0.7365,
"step": 3540
},
{
"epoch": 0.781439435688306,
"grad_norm": 0.038069210231566904,
"learning_rate": 4.1553421591918264e-05,
"loss": 0.7612,
"step": 3545
},
{
"epoch": 0.7825416069657225,
"grad_norm": 0.036731650972072025,
"learning_rate": 4.115544340024456e-05,
"loss": 0.7276,
"step": 3550
},
{
"epoch": 0.783643778243139,
"grad_norm": 0.03761683304943094,
"learning_rate": 4.075907696693224e-05,
"loss": 0.7397,
"step": 3555
},
{
"epoch": 0.7847459495205555,
"grad_norm": 0.039130062081128986,
"learning_rate": 4.036432816133241e-05,
"loss": 0.7412,
"step": 3560
},
{
"epoch": 0.785848120797972,
"grad_norm": 0.03725082169003722,
"learning_rate": 3.99712028288424e-05,
"loss": 0.7378,
"step": 3565
},
{
"epoch": 0.7869502920753885,
"grad_norm": 0.03581598167403878,
"learning_rate": 3.957970679081948e-05,
"loss": 0.7377,
"step": 3570
},
{
"epoch": 0.788052463352805,
"grad_norm": 0.036766846530443355,
"learning_rate": 3.918984584449435e-05,
"loss": 0.7606,
"step": 3575
},
{
"epoch": 0.7891546346302215,
"grad_norm": 0.03708226420234272,
"learning_rate": 3.880162576288557e-05,
"loss": 0.763,
"step": 3580
},
{
"epoch": 0.790256805907638,
"grad_norm": 0.035646641087147025,
"learning_rate": 3.841505229471386e-05,
"loss": 0.7472,
"step": 3585
},
{
"epoch": 0.7913589771850545,
"grad_norm": 0.03623704020179618,
"learning_rate": 3.803013116431716e-05,
"loss": 0.7371,
"step": 3590
},
{
"epoch": 0.7924611484624711,
"grad_norm": 0.03685938607213482,
"learning_rate": 3.764686807156565e-05,
"loss": 0.7636,
"step": 3595
},
{
"epoch": 0.7935633197398876,
"grad_norm": 0.03581401378985991,
"learning_rate": 3.72652686917776e-05,
"loss": 0.7436,
"step": 3600
},
{
"epoch": 0.7946654910173041,
"grad_norm": 0.03576091311918202,
"learning_rate": 3.6885338675635215e-05,
"loss": 0.741,
"step": 3605
},
{
"epoch": 0.7957676622947206,
"grad_norm": 0.03938541436587627,
"learning_rate": 3.65070836491007e-05,
"loss": 0.7511,
"step": 3610
},
{
"epoch": 0.7968698335721371,
"grad_norm": 0.03831985268675037,
"learning_rate": 3.613050921333345e-05,
"loss": 0.7581,
"step": 3615
},
{
"epoch": 0.7979720048495537,
"grad_norm": 0.036183263282557804,
"learning_rate": 3.575562094460682e-05,
"loss": 0.7519,
"step": 3620
},
{
"epoch": 0.7990741761269702,
"grad_norm": 0.039441336486759127,
"learning_rate": 3.5382424394225506e-05,
"loss": 0.7566,
"step": 3625
},
{
"epoch": 0.8001763474043866,
"grad_norm": 0.03791319055471918,
"learning_rate": 3.501092508844339e-05,
"loss": 0.7483,
"step": 3630
},
{
"epoch": 0.8012785186818031,
"grad_norm": 0.034917608244421146,
"learning_rate": 3.464112852838184e-05,
"loss": 0.7434,
"step": 3635
},
{
"epoch": 0.8023806899592196,
"grad_norm": 0.03606780306247915,
"learning_rate": 3.427304018994821e-05,
"loss": 0.7478,
"step": 3640
},
{
"epoch": 0.8034828612366361,
"grad_norm": 0.03680798923538717,
"learning_rate": 3.3906665523754504e-05,
"loss": 0.7496,
"step": 3645
},
{
"epoch": 0.8045850325140527,
"grad_norm": 0.03865353271265747,
"learning_rate": 3.354200995503692e-05,
"loss": 0.7397,
"step": 3650
},
{
"epoch": 0.8056872037914692,
"grad_norm": 0.03635669491983459,
"learning_rate": 3.3179078883575536e-05,
"loss": 0.7718,
"step": 3655
},
{
"epoch": 0.8067893750688857,
"grad_norm": 0.0363260735871896,
"learning_rate": 3.2817877683614244e-05,
"loss": 0.7209,
"step": 3660
},
{
"epoch": 0.8078915463463022,
"grad_norm": 0.03739024777521627,
"learning_rate": 3.245841170378106e-05,
"loss": 0.7276,
"step": 3665
},
{
"epoch": 0.8089937176237187,
"grad_norm": 0.04056944004711678,
"learning_rate": 3.21006862670092e-05,
"loss": 0.7427,
"step": 3670
},
{
"epoch": 0.8100958889011353,
"grad_norm": 0.038893468288220996,
"learning_rate": 3.174470667045801e-05,
"loss": 0.7337,
"step": 3675
},
{
"epoch": 0.8111980601785518,
"grad_norm": 0.037502075980572286,
"learning_rate": 3.139047818543462e-05,
"loss": 0.7536,
"step": 3680
},
{
"epoch": 0.8123002314559683,
"grad_norm": 0.03650106907146565,
"learning_rate": 3.103800605731598e-05,
"loss": 0.7533,
"step": 3685
},
{
"epoch": 0.8134024027333848,
"grad_norm": 0.03819525064272321,
"learning_rate": 3.068729550547105e-05,
"loss": 0.7681,
"step": 3690
},
{
"epoch": 0.8145045740108012,
"grad_norm": 0.03711022658424072,
"learning_rate": 3.033835172318355e-05,
"loss": 0.7449,
"step": 3695
},
{
"epoch": 0.8156067452882177,
"grad_norm": 0.03418347940782499,
"learning_rate": 2.9991179877575032e-05,
"loss": 0.7393,
"step": 3700
},
{
"epoch": 0.8167089165656343,
"grad_norm": 0.03626371876045612,
"learning_rate": 2.964578510952847e-05,
"loss": 0.7371,
"step": 3705
},
{
"epoch": 0.8178110878430508,
"grad_norm": 0.03878297701004356,
"learning_rate": 2.9302172533612077e-05,
"loss": 0.747,
"step": 3710
},
{
"epoch": 0.8189132591204673,
"grad_norm": 0.038112632260334955,
"learning_rate": 2.8960347238003488e-05,
"loss": 0.7579,
"step": 3715
},
{
"epoch": 0.8200154303978838,
"grad_norm": 0.03926882455039221,
"learning_rate": 2.8620314284414486e-05,
"loss": 0.7529,
"step": 3720
},
{
"epoch": 0.8211176016753003,
"grad_norm": 0.03648970703950733,
"learning_rate": 2.8282078708016163e-05,
"loss": 0.7473,
"step": 3725
},
{
"epoch": 0.8222197729527169,
"grad_norm": 0.03630414782533231,
"learning_rate": 2.7945645517364064e-05,
"loss": 0.7355,
"step": 3730
},
{
"epoch": 0.8233219442301334,
"grad_norm": 0.036468505132406466,
"learning_rate": 2.7611019694324415e-05,
"loss": 0.7101,
"step": 3735
},
{
"epoch": 0.8244241155075499,
"grad_norm": 0.037520748694235606,
"learning_rate": 2.727820619399992e-05,
"loss": 0.7431,
"step": 3740
},
{
"epoch": 0.8255262867849664,
"grad_norm": 0.03483967252567691,
"learning_rate": 2.6947209944656784e-05,
"loss": 0.7008,
"step": 3745
},
{
"epoch": 0.8266284580623829,
"grad_norm": 0.03683830824123343,
"learning_rate": 2.661803584765143e-05,
"loss": 0.7397,
"step": 3750
},
{
"epoch": 0.8277306293397994,
"grad_norm": 0.037699100377305624,
"learning_rate": 2.6290688777358164e-05,
"loss": 0.7663,
"step": 3755
},
{
"epoch": 0.8288328006172159,
"grad_norm": 0.04181925311652994,
"learning_rate": 2.5965173581096748e-05,
"loss": 0.7553,
"step": 3760
},
{
"epoch": 0.8299349718946324,
"grad_norm": 0.03779442944677197,
"learning_rate": 2.564149507906089e-05,
"loss": 0.7589,
"step": 3765
},
{
"epoch": 0.8310371431720489,
"grad_norm": 0.03577568203273454,
"learning_rate": 2.5319658064246595e-05,
"loss": 0.7446,
"step": 3770
},
{
"epoch": 0.8321393144494654,
"grad_norm": 0.03931081842755508,
"learning_rate": 2.4999667302381404e-05,
"loss": 0.751,
"step": 3775
},
{
"epoch": 0.833241485726882,
"grad_norm": 0.03745824041736122,
"learning_rate": 2.4681527531853835e-05,
"loss": 0.7123,
"step": 3780
},
{
"epoch": 0.8343436570042985,
"grad_norm": 0.03527193664557951,
"learning_rate": 2.436524346364286e-05,
"loss": 0.7025,
"step": 3785
},
{
"epoch": 0.835445828281715,
"grad_norm": 0.03606947650450607,
"learning_rate": 2.4050819781248647e-05,
"loss": 0.7206,
"step": 3790
},
{
"epoch": 0.8365479995591315,
"grad_norm": 0.035888622608410504,
"learning_rate": 2.373826114062296e-05,
"loss": 0.7537,
"step": 3795
},
{
"epoch": 0.837650170836548,
"grad_norm": 0.036850404718823324,
"learning_rate": 2.3427572170100112e-05,
"loss": 0.7638,
"step": 3800
},
{
"epoch": 0.8387523421139645,
"grad_norm": 0.037724339996222885,
"learning_rate": 2.311875747032858e-05,
"loss": 0.7557,
"step": 3805
},
{
"epoch": 0.8398545133913811,
"grad_norm": 0.03518696843854021,
"learning_rate": 2.2811821614202897e-05,
"loss": 0.7602,
"step": 3810
},
{
"epoch": 0.8409566846687976,
"grad_norm": 0.03634301372317722,
"learning_rate": 2.2506769146795893e-05,
"loss": 0.7427,
"step": 3815
},
{
"epoch": 0.842058855946214,
"grad_norm": 0.03565812651945666,
"learning_rate": 2.2203604585291303e-05,
"loss": 0.7336,
"step": 3820
},
{
"epoch": 0.8431610272236305,
"grad_norm": 0.03768119047665601,
"learning_rate": 2.1902332418916956e-05,
"loss": 0.7661,
"step": 3825
},
{
"epoch": 0.844263198501047,
"grad_norm": 0.035791641873146804,
"learning_rate": 2.1602957108878434e-05,
"loss": 0.7589,
"step": 3830
},
{
"epoch": 0.8453653697784635,
"grad_norm": 0.03888367446787956,
"learning_rate": 2.130548308829267e-05,
"loss": 0.7395,
"step": 3835
},
{
"epoch": 0.8464675410558801,
"grad_norm": 0.03780624024053704,
"learning_rate": 2.1009914762122694e-05,
"loss": 0.7324,
"step": 3840
},
{
"epoch": 0.8475697123332966,
"grad_norm": 0.03701658755711745,
"learning_rate": 2.071625650711217e-05,
"loss": 0.7261,
"step": 3845
},
{
"epoch": 0.8486718836107131,
"grad_norm": 0.03349203678484499,
"learning_rate": 2.0424512671720566e-05,
"loss": 0.7285,
"step": 3850
},
{
"epoch": 0.8497740548881296,
"grad_norm": 0.035807885382574296,
"learning_rate": 2.0134687576058878e-05,
"loss": 0.7513,
"step": 3855
},
{
"epoch": 0.8508762261655461,
"grad_norm": 0.03540300293328915,
"learning_rate": 1.9846785511825618e-05,
"loss": 0.7506,
"step": 3860
},
{
"epoch": 0.8519783974429627,
"grad_norm": 0.036548562423081375,
"learning_rate": 1.9560810742243298e-05,
"loss": 0.7486,
"step": 3865
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.03688805027139149,
"learning_rate": 1.9276767501995206e-05,
"loss": 0.756,
"step": 3870
},
{
"epoch": 0.8541827399977957,
"grad_norm": 0.03511921343987327,
"learning_rate": 1.8994659997162687e-05,
"loss": 0.7188,
"step": 3875
},
{
"epoch": 0.8552849112752122,
"grad_norm": 0.03593480104178096,
"learning_rate": 1.8714492405163072e-05,
"loss": 0.7241,
"step": 3880
},
{
"epoch": 0.8563870825526286,
"grad_norm": 0.03912290065785617,
"learning_rate": 1.843626887468764e-05,
"loss": 0.735,
"step": 3885
},
{
"epoch": 0.8574892538300452,
"grad_norm": 0.036540249139284824,
"learning_rate": 1.8159993525640115e-05,
"loss": 0.7629,
"step": 3890
},
{
"epoch": 0.8585914251074617,
"grad_norm": 0.03586116680339337,
"learning_rate": 1.788567044907585e-05,
"loss": 0.728,
"step": 3895
},
{
"epoch": 0.8596935963848782,
"grad_norm": 0.04108515819021104,
"learning_rate": 1.7613303707141164e-05,
"loss": 0.7544,
"step": 3900
},
{
"epoch": 0.8607957676622947,
"grad_norm": 0.04154327787956279,
"learning_rate": 1.7342897333013112e-05,
"loss": 0.715,
"step": 3905
},
{
"epoch": 0.8618979389397112,
"grad_norm": 0.0375147580423421,
"learning_rate": 1.7074455330839943e-05,
"loss": 0.7325,
"step": 3910
},
{
"epoch": 0.8630001102171277,
"grad_norm": 0.03505267883405521,
"learning_rate": 1.6807981675681587e-05,
"loss": 0.7463,
"step": 3915
},
{
"epoch": 0.8641022814945443,
"grad_norm": 0.035319705305492576,
"learning_rate": 1.654348031345104e-05,
"loss": 0.7225,
"step": 3920
},
{
"epoch": 0.8652044527719608,
"grad_norm": 0.038996518100504834,
"learning_rate": 1.6280955160855628e-05,
"loss": 0.7537,
"step": 3925
},
{
"epoch": 0.8663066240493773,
"grad_norm": 0.03826307362758949,
"learning_rate": 1.602041010533934e-05,
"loss": 0.7287,
"step": 3930
},
{
"epoch": 0.8674087953267938,
"grad_norm": 0.03711543681709775,
"learning_rate": 1.5761849005024985e-05,
"loss": 0.7709,
"step": 3935
},
{
"epoch": 0.8685109666042103,
"grad_norm": 0.03886269645051025,
"learning_rate": 1.5505275688657275e-05,
"loss": 0.733,
"step": 3940
},
{
"epoch": 0.8696131378816268,
"grad_norm": 0.039368459771957895,
"learning_rate": 1.5250693955545929e-05,
"loss": 0.7377,
"step": 3945
},
{
"epoch": 0.8707153091590433,
"grad_norm": 0.03542501994719629,
"learning_rate": 1.4998107575509633e-05,
"loss": 0.7509,
"step": 3950
},
{
"epoch": 0.8718174804364598,
"grad_norm": 0.03662399917022349,
"learning_rate": 1.4747520288820014e-05,
"loss": 0.7221,
"step": 3955
},
{
"epoch": 0.8729196517138763,
"grad_norm": 0.03657298741205027,
"learning_rate": 1.449893580614636e-05,
"loss": 0.7497,
"step": 3960
},
{
"epoch": 0.8740218229912928,
"grad_norm": 0.03781150680434851,
"learning_rate": 1.425235780850067e-05,
"loss": 0.7582,
"step": 3965
},
{
"epoch": 0.8751239942687093,
"grad_norm": 0.04032655125637331,
"learning_rate": 1.4007789947183168e-05,
"loss": 0.7447,
"step": 3970
},
{
"epoch": 0.8762261655461259,
"grad_norm": 0.03589883470863497,
"learning_rate": 1.3765235843728129e-05,
"loss": 0.7276,
"step": 3975
},
{
"epoch": 0.8773283368235424,
"grad_norm": 0.03717658884234944,
"learning_rate": 1.3524699089850328e-05,
"loss": 0.7401,
"step": 3980
},
{
"epoch": 0.8784305081009589,
"grad_norm": 0.03497351776047112,
"learning_rate": 1.3286183247391868e-05,
"loss": 0.7392,
"step": 3985
},
{
"epoch": 0.8795326793783754,
"grad_norm": 0.03680345383865433,
"learning_rate": 1.3049691848269461e-05,
"loss": 0.7397,
"step": 3990
},
{
"epoch": 0.880634850655792,
"grad_norm": 0.037067904261301174,
"learning_rate": 1.2815228394421995e-05,
"loss": 0.7543,
"step": 3995
},
{
"epoch": 0.8817370219332085,
"grad_norm": 0.03635951971560963,
"learning_rate": 1.2582796357758829e-05,
"loss": 0.7268,
"step": 4000
},
{
"epoch": 0.882839193210625,
"grad_norm": 0.037149225593830666,
"learning_rate": 1.2352399180108286e-05,
"loss": 0.7447,
"step": 4005
},
{
"epoch": 0.8839413644880414,
"grad_norm": 0.034848783140386315,
"learning_rate": 1.2124040273166691e-05,
"loss": 0.7311,
"step": 4010
},
{
"epoch": 0.8850435357654579,
"grad_norm": 0.03985554439292569,
"learning_rate": 1.1897723018447946e-05,
"loss": 0.7288,
"step": 4015
},
{
"epoch": 0.8861457070428744,
"grad_norm": 0.035706356323576875,
"learning_rate": 1.1673450767233388e-05,
"loss": 0.7326,
"step": 4020
},
{
"epoch": 0.887247878320291,
"grad_norm": 0.036740845098738074,
"learning_rate": 1.1451226840522077e-05,
"loss": 0.7496,
"step": 4025
},
{
"epoch": 0.8883500495977075,
"grad_norm": 0.037031961816322526,
"learning_rate": 1.1231054528981765e-05,
"loss": 0.7524,
"step": 4030
},
{
"epoch": 0.889452220875124,
"grad_norm": 0.03756810225346114,
"learning_rate": 1.1012937092900126e-05,
"loss": 0.7312,
"step": 4035
},
{
"epoch": 0.8905543921525405,
"grad_norm": 0.037472375253416255,
"learning_rate": 1.0796877762136458e-05,
"loss": 0.7544,
"step": 4040
},
{
"epoch": 0.891656563429957,
"grad_norm": 0.0359317417796455,
"learning_rate": 1.0582879736073819e-05,
"loss": 0.7354,
"step": 4045
},
{
"epoch": 0.8927587347073735,
"grad_norm": 0.03749781446172768,
"learning_rate": 1.03709461835717e-05,
"loss": 0.7546,
"step": 4050
},
{
"epoch": 0.8938609059847901,
"grad_norm": 0.03733411968021827,
"learning_rate": 1.0161080242919129e-05,
"loss": 0.7259,
"step": 4055
},
{
"epoch": 0.8949630772622066,
"grad_norm": 0.03761288676599098,
"learning_rate": 9.953285021788143e-06,
"loss": 0.7489,
"step": 4060
},
{
"epoch": 0.8960652485396231,
"grad_norm": 0.03412569507517913,
"learning_rate": 9.747563597187791e-06,
"loss": 0.7286,
"step": 4065
},
{
"epoch": 0.8971674198170395,
"grad_norm": 0.03525939402552114,
"learning_rate": 9.543919015418516e-06,
"loss": 0.7513,
"step": 4070
},
{
"epoch": 0.898269591094456,
"grad_norm": 0.03720593815884494,
"learning_rate": 9.342354292027215e-06,
"loss": 0.7474,
"step": 4075
},
{
"epoch": 0.8993717623718726,
"grad_norm": 0.0375416497695239,
"learning_rate": 9.142872411762354e-06,
"loss": 0.7685,
"step": 4080
},
{
"epoch": 0.9004739336492891,
"grad_norm": 0.03906878754343892,
"learning_rate": 8.945476328529949e-06,
"loss": 0.732,
"step": 4085
},
{
"epoch": 0.9015761049267056,
"grad_norm": 0.035056491589514995,
"learning_rate": 8.750168965349713e-06,
"loss": 0.7436,
"step": 4090
},
{
"epoch": 0.9026782762041221,
"grad_norm": 0.03470624619912502,
"learning_rate": 8.556953214311896e-06,
"loss": 0.6928,
"step": 4095
},
{
"epoch": 0.9037804474815386,
"grad_norm": 0.03557139688126229,
"learning_rate": 8.365831936534289e-06,
"loss": 0.7236,
"step": 4100
},
{
"epoch": 0.9048826187589551,
"grad_norm": 0.03860114842216224,
"learning_rate": 8.17680796212003e-06,
"loss": 0.7367,
"step": 4105
},
{
"epoch": 0.9059847900363717,
"grad_norm": 0.0358454686914662,
"learning_rate": 7.989884090115579e-06,
"loss": 0.7393,
"step": 4110
},
{
"epoch": 0.9070869613137882,
"grad_norm": 0.036131468273917416,
"learning_rate": 7.80506308846927e-06,
"loss": 0.7187,
"step": 4115
},
{
"epoch": 0.9081891325912047,
"grad_norm": 0.03431907826116676,
"learning_rate": 7.622347693990438e-06,
"loss": 0.7368,
"step": 4120
},
{
"epoch": 0.9092913038686212,
"grad_norm": 0.036378482006826245,
"learning_rate": 7.4417406123088e-06,
"loss": 0.7123,
"step": 4125
},
{
"epoch": 0.9103934751460377,
"grad_norm": 0.03610068662140505,
"learning_rate": 7.263244517834365e-06,
"loss": 0.7298,
"step": 4130
},
{
"epoch": 0.9114956464234542,
"grad_norm": 0.035675737631172474,
"learning_rate": 7.086862053717867e-06,
"loss": 0.7329,
"step": 4135
},
{
"epoch": 0.9125978177008707,
"grad_norm": 0.03488018397228746,
"learning_rate": 6.91259583181169e-06,
"loss": 0.7459,
"step": 4140
},
{
"epoch": 0.9136999889782872,
"grad_norm": 0.037711098080699654,
"learning_rate": 6.740448432631118e-06,
"loss": 0.7456,
"step": 4145
},
{
"epoch": 0.9148021602557037,
"grad_norm": 0.03683158716361964,
"learning_rate": 6.570422405316117e-06,
"loss": 0.7477,
"step": 4150
},
{
"epoch": 0.9159043315331202,
"grad_norm": 0.03722178270525642,
"learning_rate": 6.4025202675935635e-06,
"loss": 0.7668,
"step": 4155
},
{
"epoch": 0.9170065028105367,
"grad_norm": 0.03548538866880862,
"learning_rate": 6.236744505740126e-06,
"loss": 0.7612,
"step": 4160
},
{
"epoch": 0.9181086740879533,
"grad_norm": 0.03573889132178891,
"learning_rate": 6.073097574545244e-06,
"loss": 0.7374,
"step": 4165
},
{
"epoch": 0.9192108453653698,
"grad_norm": 0.03490127726620679,
"learning_rate": 5.91158189727487e-06,
"loss": 0.7233,
"step": 4170
},
{
"epoch": 0.9203130166427863,
"grad_norm": 0.04015606965771483,
"learning_rate": 5.752199865635604e-06,
"loss": 0.7356,
"step": 4175
},
{
"epoch": 0.9214151879202028,
"grad_norm": 0.038889913565105266,
"learning_rate": 5.594953839739252e-06,
"loss": 0.7571,
"step": 4180
},
{
"epoch": 0.9225173591976193,
"grad_norm": 0.03632902640398831,
"learning_rate": 5.439846148067856e-06,
"loss": 0.7478,
"step": 4185
},
{
"epoch": 0.9236195304750359,
"grad_norm": 0.036911826600897674,
"learning_rate": 5.2868790874392495e-06,
"loss": 0.7351,
"step": 4190
},
{
"epoch": 0.9247217017524524,
"grad_norm": 0.035394959022726914,
"learning_rate": 5.13605492297306e-06,
"loss": 0.7513,
"step": 4195
},
{
"epoch": 0.9258238730298688,
"grad_norm": 0.038584292984150205,
"learning_rate": 4.98737588805711e-06,
"loss": 0.7503,
"step": 4200
},
{
"epoch": 0.9269260443072853,
"grad_norm": 0.03443755510774623,
"learning_rate": 4.840844184314368e-06,
"loss": 0.7467,
"step": 4205
},
{
"epoch": 0.9280282155847018,
"grad_norm": 0.03507393739539605,
"learning_rate": 4.696461981570371e-06,
"loss": 0.7479,
"step": 4210
},
{
"epoch": 0.9291303868621184,
"grad_norm": 0.03572284290016147,
"learning_rate": 4.554231417821147e-06,
"loss": 0.7438,
"step": 4215
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.03939452564404578,
"learning_rate": 4.414154599201314e-06,
"loss": 0.7528,
"step": 4220
},
{
"epoch": 0.9313347294169514,
"grad_norm": 0.0343058423932778,
"learning_rate": 4.2762335999532494e-06,
"loss": 0.7123,
"step": 4225
},
{
"epoch": 0.9324369006943679,
"grad_norm": 0.03688347815350498,
"learning_rate": 4.140470462396101e-06,
"loss": 0.7363,
"step": 4230
},
{
"epoch": 0.9335390719717844,
"grad_norm": 0.03669486942706104,
"learning_rate": 4.006867196895641e-06,
"loss": 0.7285,
"step": 4235
},
{
"epoch": 0.934641243249201,
"grad_norm": 0.035278173584095886,
"learning_rate": 3.8754257818345125e-06,
"loss": 0.7273,
"step": 4240
},
{
"epoch": 0.9357434145266175,
"grad_norm": 0.03838555636403366,
"learning_rate": 3.7461481635828793e-06,
"loss": 0.7406,
"step": 4245
},
{
"epoch": 0.936845585804034,
"grad_norm": 0.03449587188356417,
"learning_rate": 3.619036256469704e-06,
"loss": 0.719,
"step": 4250
},
{
"epoch": 0.9379477570814505,
"grad_norm": 0.03617056410014076,
"learning_rate": 3.4940919427542345e-06,
"loss": 0.7074,
"step": 4255
},
{
"epoch": 0.9390499283588669,
"grad_norm": 0.03720545719113871,
"learning_rate": 3.371317072598312e-06,
"loss": 0.761,
"step": 4260
},
{
"epoch": 0.9401520996362834,
"grad_norm": 0.03733982842334115,
"learning_rate": 3.2507134640388566e-06,
"loss": 0.7373,
"step": 4265
},
{
"epoch": 0.9412542709137,
"grad_norm": 0.03832577468879599,
"learning_rate": 3.132282902961025e-06,
"loss": 0.7744,
"step": 4270
},
{
"epoch": 0.9423564421911165,
"grad_norm": 0.03561487376514044,
"learning_rate": 3.016027143071631e-06,
"loss": 0.7367,
"step": 4275
},
{
"epoch": 0.943458613468533,
"grad_norm": 0.037260538854841714,
"learning_rate": 2.9019479058733974e-06,
"loss": 0.7412,
"step": 4280
},
{
"epoch": 0.9445607847459495,
"grad_norm": 0.033914247331261665,
"learning_rate": 2.7900468806392128e-06,
"loss": 0.7191,
"step": 4285
},
{
"epoch": 0.945662956023366,
"grad_norm": 0.036359448459938014,
"learning_rate": 2.6803257243873165e-06,
"loss": 0.7236,
"step": 4290
},
{
"epoch": 0.9467651273007825,
"grad_norm": 0.036671268303379516,
"learning_rate": 2.572786061856652e-06,
"loss": 0.706,
"step": 4295
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.03688898129821375,
"learning_rate": 2.467429485482869e-06,
"loss": 0.7719,
"step": 4300
},
{
"epoch": 0.9489694698556156,
"grad_norm": 0.035848288240590206,
"learning_rate": 2.3642575553746933e-06,
"loss": 0.7375,
"step": 4305
},
{
"epoch": 0.9500716411330321,
"grad_norm": 0.03649010626732304,
"learning_rate": 2.2632717992908278e-06,
"loss": 0.7492,
"step": 4310
},
{
"epoch": 0.9511738124104486,
"grad_norm": 0.035622089704062276,
"learning_rate": 2.164473712617387e-06,
"loss": 0.7277,
"step": 4315
},
{
"epoch": 0.9522759836878651,
"grad_norm": 0.034793596935363234,
"learning_rate": 2.0678647583456995e-06,
"loss": 0.7167,
"step": 4320
},
{
"epoch": 0.9533781549652816,
"grad_norm": 0.038314433912779305,
"learning_rate": 1.973446367050674e-06,
"loss": 0.694,
"step": 4325
},
{
"epoch": 0.9544803262426981,
"grad_norm": 0.032175274076676946,
"learning_rate": 1.8812199368695325e-06,
"loss": 0.7399,
"step": 4330
},
{
"epoch": 0.9555824975201146,
"grad_norm": 0.03747625530620859,
"learning_rate": 1.7911868334812618e-06,
"loss": 0.7543,
"step": 4335
},
{
"epoch": 0.9566846687975311,
"grad_norm": 0.03469398206914549,
"learning_rate": 1.7033483900862953e-06,
"loss": 0.719,
"step": 4340
},
{
"epoch": 0.9577868400749476,
"grad_norm": 0.03716371251632767,
"learning_rate": 1.617705907386696e-06,
"loss": 0.7242,
"step": 4345
},
{
"epoch": 0.9588890113523642,
"grad_norm": 0.033096194137156504,
"learning_rate": 1.5342606535670877e-06,
"loss": 0.7395,
"step": 4350
},
{
"epoch": 0.9599911826297807,
"grad_norm": 0.038402200718579416,
"learning_rate": 1.4530138642756872e-06,
"loss": 0.75,
"step": 4355
},
{
"epoch": 0.9610933539071972,
"grad_norm": 0.03629339352134901,
"learning_rate": 1.3739667426061196e-06,
"loss": 0.7121,
"step": 4360
},
{
"epoch": 0.9621955251846137,
"grad_norm": 0.03480704310265325,
"learning_rate": 1.2971204590795813e-06,
"loss": 0.7347,
"step": 4365
},
{
"epoch": 0.9632976964620302,
"grad_norm": 0.03454475792553819,
"learning_rate": 1.2224761516274883e-06,
"loss": 0.7322,
"step": 4370
},
{
"epoch": 0.9643998677394467,
"grad_norm": 0.0348941999387763,
"learning_rate": 1.1500349255746055e-06,
"loss": 0.744,
"step": 4375
},
{
"epoch": 0.9655020390168633,
"grad_norm": 0.03661845220058871,
"learning_rate": 1.0797978536227602e-06,
"loss": 0.7429,
"step": 4380
},
{
"epoch": 0.9666042102942798,
"grad_norm": 0.03670130845399715,
"learning_rate": 1.011765975834855e-06,
"loss": 0.7508,
"step": 4385
},
{
"epoch": 0.9677063815716962,
"grad_norm": 0.037482737750151054,
"learning_rate": 9.459402996195797e-07,
"loss": 0.7199,
"step": 4390
},
{
"epoch": 0.9688085528491127,
"grad_norm": 0.03609308663664764,
"learning_rate": 8.823217997163401e-07,
"loss": 0.7202,
"step": 4395
},
{
"epoch": 0.9699107241265292,
"grad_norm": 0.03562801152195268,
"learning_rate": 8.209114181810029e-07,
"loss": 0.7519,
"step": 4400
},
{
"epoch": 0.9710128954039458,
"grad_norm": 0.037875623718305995,
"learning_rate": 7.617100643718066e-07,
"loss": 0.736,
"step": 4405
},
{
"epoch": 0.9721150666813623,
"grad_norm": 0.0343579172461373,
"learning_rate": 7.04718614935973e-07,
"loss": 0.7319,
"step": 4410
},
{
"epoch": 0.9732172379587788,
"grad_norm": 0.0351522271270778,
"learning_rate": 6.499379137966831e-07,
"loss": 0.7158,
"step": 4415
},
{
"epoch": 0.9743194092361953,
"grad_norm": 0.03685325990487051,
"learning_rate": 5.973687721405884e-07,
"loss": 0.7343,
"step": 4420
},
{
"epoch": 0.9754215805136118,
"grad_norm": 0.03353546299147592,
"learning_rate": 5.470119684058527e-07,
"loss": 0.73,
"step": 4425
},
{
"epoch": 0.9765237517910283,
"grad_norm": 0.03850593917914308,
"learning_rate": 4.988682482705286e-07,
"loss": 0.756,
"step": 4430
},
{
"epoch": 0.9776259230684449,
"grad_norm": 0.040410693859280186,
"learning_rate": 4.5293832464159965e-07,
"loss": 0.7463,
"step": 4435
},
{
"epoch": 0.9787280943458614,
"grad_norm": 0.03585489720397234,
"learning_rate": 4.0922287764438843e-07,
"loss": 0.7227,
"step": 4440
},
{
"epoch": 0.9798302656232779,
"grad_norm": 0.03643983778724927,
"learning_rate": 3.677225546124818e-07,
"loss": 0.7297,
"step": 4445
},
{
"epoch": 0.9809324369006943,
"grad_norm": 0.03527743845105325,
"learning_rate": 3.2843797007812147e-07,
"loss": 0.7343,
"step": 4450
},
{
"epoch": 0.9820346081781108,
"grad_norm": 0.03420044354382039,
"learning_rate": 2.913697057632114e-07,
"loss": 0.7211,
"step": 4455
},
{
"epoch": 0.9831367794555274,
"grad_norm": 0.03541898225117583,
"learning_rate": 2.565183105705415e-07,
"loss": 0.7596,
"step": 4460
},
{
"epoch": 0.9842389507329439,
"grad_norm": 0.03803822863693149,
"learning_rate": 2.23884300575794e-07,
"loss": 0.7484,
"step": 4465
},
{
"epoch": 0.9853411220103604,
"grad_norm": 0.0339643099555736,
"learning_rate": 1.9346815901984947e-07,
"loss": 0.7259,
"step": 4470
},
{
"epoch": 0.9864432932877769,
"grad_norm": 0.035078257970822355,
"learning_rate": 1.6527033630162613e-07,
"loss": 0.7306,
"step": 4475
},
{
"epoch": 0.9875454645651934,
"grad_norm": 0.0345533584530425,
"learning_rate": 1.392912499714016e-07,
"loss": 0.734,
"step": 4480
},
{
"epoch": 0.98864763584261,
"grad_norm": 0.03944350230563913,
"learning_rate": 1.1553128472468476e-07,
"loss": 0.764,
"step": 4485
},
{
"epoch": 0.9897498071200265,
"grad_norm": 0.0357818169230016,
"learning_rate": 9.39907923964367e-08,
"loss": 0.7235,
"step": 4490
},
{
"epoch": 0.990851978397443,
"grad_norm": 0.03566232250402245,
"learning_rate": 7.467009195594176e-08,
"loss": 0.7433,
"step": 4495
},
{
"epoch": 0.9919541496748595,
"grad_norm": 0.03513694284161295,
"learning_rate": 5.7569469502011247e-08,
"loss": 0.7419,
"step": 4500
},
{
"epoch": 0.993056320952276,
"grad_norm": 0.033604401356817075,
"learning_rate": 4.2689178258820125e-08,
"loss": 0.7302,
"step": 4505
},
{
"epoch": 0.9941584922296925,
"grad_norm": 0.038757934920082,
"learning_rate": 3.0029438572110045e-08,
"loss": 0.7509,
"step": 4510
},
{
"epoch": 0.995260663507109,
"grad_norm": 0.03542769692423219,
"learning_rate": 1.959043790590864e-08,
"loss": 0.7425,
"step": 4515
},
{
"epoch": 0.9963628347845255,
"grad_norm": 0.037185347711832795,
"learning_rate": 1.137233083983169e-08,
"loss": 0.7187,
"step": 4520
},
{
"epoch": 0.997465006061942,
"grad_norm": 0.03758234390906879,
"learning_rate": 5.375239066685022e-09,
"loss": 0.7592,
"step": 4525
},
{
"epoch": 0.9985671773393585,
"grad_norm": 0.036473312127062285,
"learning_rate": 1.5992513907658878e-09,
"loss": 0.7272,
"step": 4530
},
{
"epoch": 0.999669348616775,
"grad_norm": 0.03634038177846733,
"learning_rate": 4.442372649737791e-11,
"loss": 0.7268,
"step": 4535
},
{
"epoch": 0.9998897828722584,
"eval_loss": 1.1339110136032104,
"eval_runtime": 1020.4828,
"eval_samples_per_second": 187.325,
"eval_steps_per_second": 5.854,
"step": 4536
},
{
"epoch": 0.9998897828722584,
"step": 4536,
"total_flos": 693442503278592.0,
"train_loss": 0.8111531063651491,
"train_runtime": 19545.8709,
"train_samples_per_second": 29.708,
"train_steps_per_second": 0.232
}
],
"logging_steps": 5,
"max_steps": 4536,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 693442503278592.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}