Files
SmolLM2-MagpieUltraPlus-Met…/trainer_state.json
ModelHub XC 490c02f464 初始化项目,由ModelHub XC社区提供模型
Model: HuggingFaceTB/SmolLM2-MagpieUltraPlus-MetamathQA
Source: Original Platform
2026-06-18 21:38:13 +08:00

7436 lines
182 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998579209093061,
"eval_steps": 500,
"global_step": 5278,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000947193937958797,
"grad_norm": 52.55722061842073,
"learning_rate": 2.8409090909090907e-06,
"loss": 3.7816,
"step": 5
},
{
"epoch": 0.001894387875917594,
"grad_norm": 38.95340149423018,
"learning_rate": 5.6818181818181815e-06,
"loss": 3.5056,
"step": 10
},
{
"epoch": 0.0028415818138763913,
"grad_norm": 21.712424534014453,
"learning_rate": 8.522727272727271e-06,
"loss": 2.882,
"step": 15
},
{
"epoch": 0.003788775751835188,
"grad_norm": 9.040565708034416,
"learning_rate": 1.1363636363636363e-05,
"loss": 2.173,
"step": 20
},
{
"epoch": 0.004735969689793985,
"grad_norm": 4.220912189865154,
"learning_rate": 1.4204545454545453e-05,
"loss": 1.6834,
"step": 25
},
{
"epoch": 0.005683163627752783,
"grad_norm": 1.9714256696877792,
"learning_rate": 1.7045454545454543e-05,
"loss": 1.3769,
"step": 30
},
{
"epoch": 0.006630357565711579,
"grad_norm": 1.0423474521511698,
"learning_rate": 1.9886363636363634e-05,
"loss": 1.2239,
"step": 35
},
{
"epoch": 0.007577551503670376,
"grad_norm": 1.2391396539653674,
"learning_rate": 2.2727272727272726e-05,
"loss": 1.1367,
"step": 40
},
{
"epoch": 0.008524745441629173,
"grad_norm": 0.46298881388281793,
"learning_rate": 2.5568181818181814e-05,
"loss": 1.0701,
"step": 45
},
{
"epoch": 0.00947193937958797,
"grad_norm": 0.3539086469394409,
"learning_rate": 2.8409090909090906e-05,
"loss": 1.0452,
"step": 50
},
{
"epoch": 0.010419133317546767,
"grad_norm": 0.303699683898037,
"learning_rate": 3.125e-05,
"loss": 1.0418,
"step": 55
},
{
"epoch": 0.011366327255505565,
"grad_norm": 0.2133546406490209,
"learning_rate": 3.4090909090909085e-05,
"loss": 0.9768,
"step": 60
},
{
"epoch": 0.012313521193464362,
"grad_norm": 0.2101107113029947,
"learning_rate": 3.693181818181818e-05,
"loss": 0.9808,
"step": 65
},
{
"epoch": 0.013260715131423158,
"grad_norm": 0.2001787181890806,
"learning_rate": 3.977272727272727e-05,
"loss": 0.9558,
"step": 70
},
{
"epoch": 0.014207909069381956,
"grad_norm": 0.2060240507616133,
"learning_rate": 4.261363636363637e-05,
"loss": 0.961,
"step": 75
},
{
"epoch": 0.015155103007340753,
"grad_norm": 0.1998791188434844,
"learning_rate": 4.545454545454545e-05,
"loss": 0.9515,
"step": 80
},
{
"epoch": 0.01610229694529955,
"grad_norm": 0.15833294762038227,
"learning_rate": 4.8295454545454537e-05,
"loss": 0.9414,
"step": 85
},
{
"epoch": 0.017049490883258345,
"grad_norm": 0.1520760900480208,
"learning_rate": 5.113636363636363e-05,
"loss": 0.8963,
"step": 90
},
{
"epoch": 0.017996684821217145,
"grad_norm": 0.1388506087010938,
"learning_rate": 5.3977272727272727e-05,
"loss": 0.9501,
"step": 95
},
{
"epoch": 0.01894387875917594,
"grad_norm": 0.1410344131187388,
"learning_rate": 5.681818181818181e-05,
"loss": 0.909,
"step": 100
},
{
"epoch": 0.019891072697134738,
"grad_norm": 0.12318568339151473,
"learning_rate": 5.96590909090909e-05,
"loss": 0.9107,
"step": 105
},
{
"epoch": 0.020838266635093534,
"grad_norm": 0.1279058146057783,
"learning_rate": 6.25e-05,
"loss": 0.8978,
"step": 110
},
{
"epoch": 0.02178546057305233,
"grad_norm": 0.11032351265233929,
"learning_rate": 6.534090909090909e-05,
"loss": 0.931,
"step": 115
},
{
"epoch": 0.02273265451101113,
"grad_norm": 0.1119942887320865,
"learning_rate": 6.818181818181817e-05,
"loss": 0.8893,
"step": 120
},
{
"epoch": 0.023679848448969927,
"grad_norm": 0.10935745636043816,
"learning_rate": 7.102272727272727e-05,
"loss": 0.9006,
"step": 125
},
{
"epoch": 0.024627042386928723,
"grad_norm": 0.1083392524866063,
"learning_rate": 7.386363636363635e-05,
"loss": 0.8987,
"step": 130
},
{
"epoch": 0.02557423632488752,
"grad_norm": 0.09975528745630019,
"learning_rate": 7.670454545454545e-05,
"loss": 0.8831,
"step": 135
},
{
"epoch": 0.026521430262846316,
"grad_norm": 0.08603398438897839,
"learning_rate": 7.954545454545454e-05,
"loss": 0.8836,
"step": 140
},
{
"epoch": 0.027468624200805116,
"grad_norm": 0.08005996371772706,
"learning_rate": 8.238636363636362e-05,
"loss": 0.8932,
"step": 145
},
{
"epoch": 0.028415818138763912,
"grad_norm": 0.09105132634043626,
"learning_rate": 8.522727272727273e-05,
"loss": 0.8953,
"step": 150
},
{
"epoch": 0.02936301207672271,
"grad_norm": 0.08350347397936869,
"learning_rate": 8.806818181818182e-05,
"loss": 0.8901,
"step": 155
},
{
"epoch": 0.030310206014681505,
"grad_norm": 0.06553921290329484,
"learning_rate": 9.09090909090909e-05,
"loss": 0.84,
"step": 160
},
{
"epoch": 0.031257399952640305,
"grad_norm": 0.07002312647287917,
"learning_rate": 9.374999999999999e-05,
"loss": 0.9037,
"step": 165
},
{
"epoch": 0.0322045938905991,
"grad_norm": 0.07442774560302336,
"learning_rate": 9.659090909090907e-05,
"loss": 0.8961,
"step": 170
},
{
"epoch": 0.0331517878285579,
"grad_norm": 0.07978205419119488,
"learning_rate": 9.943181818181817e-05,
"loss": 0.8472,
"step": 175
},
{
"epoch": 0.03409898176651669,
"grad_norm": 0.08212508538706279,
"learning_rate": 0.00010227272727272726,
"loss": 0.8828,
"step": 180
},
{
"epoch": 0.03504617570447549,
"grad_norm": 0.06703812335472381,
"learning_rate": 0.00010511363636363635,
"loss": 0.8952,
"step": 185
},
{
"epoch": 0.03599336964243429,
"grad_norm": 0.07274197409833637,
"learning_rate": 0.00010795454545454545,
"loss": 0.8884,
"step": 190
},
{
"epoch": 0.03694056358039308,
"grad_norm": 0.06566094578471512,
"learning_rate": 0.00011079545454545454,
"loss": 0.8457,
"step": 195
},
{
"epoch": 0.03788775751835188,
"grad_norm": 0.08307007406251704,
"learning_rate": 0.00011363636363636362,
"loss": 0.8723,
"step": 200
},
{
"epoch": 0.038834951456310676,
"grad_norm": 0.08535904339597694,
"learning_rate": 0.00011647727272727271,
"loss": 0.9134,
"step": 205
},
{
"epoch": 0.039782145394269476,
"grad_norm": 0.06790982585326019,
"learning_rate": 0.0001193181818181818,
"loss": 0.888,
"step": 210
},
{
"epoch": 0.040729339332228276,
"grad_norm": 0.07237225374916947,
"learning_rate": 0.0001221590909090909,
"loss": 0.8629,
"step": 215
},
{
"epoch": 0.04167653327018707,
"grad_norm": 0.07663584947407301,
"learning_rate": 0.000125,
"loss": 0.8979,
"step": 220
},
{
"epoch": 0.04262372720814587,
"grad_norm": 0.06530101929213837,
"learning_rate": 0.00012784090909090907,
"loss": 0.893,
"step": 225
},
{
"epoch": 0.04357092114610466,
"grad_norm": 0.0750675120954495,
"learning_rate": 0.00013068181818181817,
"loss": 0.8826,
"step": 230
},
{
"epoch": 0.04451811508406346,
"grad_norm": 0.06833860955837964,
"learning_rate": 0.00013352272727272727,
"loss": 0.9042,
"step": 235
},
{
"epoch": 0.04546530902202226,
"grad_norm": 0.06776526573633952,
"learning_rate": 0.00013636363636363634,
"loss": 0.8771,
"step": 240
},
{
"epoch": 0.046412502959981054,
"grad_norm": 0.06177830383391277,
"learning_rate": 0.00013920454545454544,
"loss": 0.8735,
"step": 245
},
{
"epoch": 0.047359696897939854,
"grad_norm": 0.06679265204109729,
"learning_rate": 0.00014204545454545454,
"loss": 0.8705,
"step": 250
},
{
"epoch": 0.04830689083589865,
"grad_norm": 0.06611716647725416,
"learning_rate": 0.00014488636363636364,
"loss": 0.8716,
"step": 255
},
{
"epoch": 0.04925408477385745,
"grad_norm": 0.06159220932412366,
"learning_rate": 0.0001477272727272727,
"loss": 0.8713,
"step": 260
},
{
"epoch": 0.05020127871181625,
"grad_norm": 0.06823008150239136,
"learning_rate": 0.00015056818181818183,
"loss": 0.8969,
"step": 265
},
{
"epoch": 0.05114847264977504,
"grad_norm": 0.0703778333367766,
"learning_rate": 0.0001534090909090909,
"loss": 0.8687,
"step": 270
},
{
"epoch": 0.05209566658773384,
"grad_norm": 0.06489353446808296,
"learning_rate": 0.00015625,
"loss": 0.8639,
"step": 275
},
{
"epoch": 0.05304286052569263,
"grad_norm": 0.06986694992701606,
"learning_rate": 0.00015909090909090907,
"loss": 0.8666,
"step": 280
},
{
"epoch": 0.05399005446365143,
"grad_norm": 0.06723519153451205,
"learning_rate": 0.00016193181818181817,
"loss": 0.8764,
"step": 285
},
{
"epoch": 0.05493724840161023,
"grad_norm": 0.07077673680927957,
"learning_rate": 0.00016477272727272724,
"loss": 0.8618,
"step": 290
},
{
"epoch": 0.055884442339569025,
"grad_norm": 0.06891807376211805,
"learning_rate": 0.00016761363636363634,
"loss": 0.8745,
"step": 295
},
{
"epoch": 0.056831636277527825,
"grad_norm": 0.06841751882175513,
"learning_rate": 0.00017045454545454547,
"loss": 0.8461,
"step": 300
},
{
"epoch": 0.05777883021548662,
"grad_norm": 0.07176128138475886,
"learning_rate": 0.00017329545454545454,
"loss": 0.8667,
"step": 305
},
{
"epoch": 0.05872602415344542,
"grad_norm": 0.07290498390552146,
"learning_rate": 0.00017613636363636364,
"loss": 0.8739,
"step": 310
},
{
"epoch": 0.05967321809140422,
"grad_norm": 0.06773178575121884,
"learning_rate": 0.0001789772727272727,
"loss": 0.8871,
"step": 315
},
{
"epoch": 0.06062041202936301,
"grad_norm": 0.0678345156257685,
"learning_rate": 0.0001818181818181818,
"loss": 0.8677,
"step": 320
},
{
"epoch": 0.06156760596732181,
"grad_norm": 0.07520267059018662,
"learning_rate": 0.00018465909090909088,
"loss": 0.8762,
"step": 325
},
{
"epoch": 0.06251479990528061,
"grad_norm": 0.06412335985278948,
"learning_rate": 0.00018749999999999998,
"loss": 0.8935,
"step": 330
},
{
"epoch": 0.0634619938432394,
"grad_norm": 0.09455831472598197,
"learning_rate": 0.00019034090909090908,
"loss": 0.8799,
"step": 335
},
{
"epoch": 0.0644091877811982,
"grad_norm": 0.08784917380610333,
"learning_rate": 0.00019318181818181815,
"loss": 0.8622,
"step": 340
},
{
"epoch": 0.065356381719157,
"grad_norm": 0.0706205634228429,
"learning_rate": 0.00019602272727272727,
"loss": 0.8574,
"step": 345
},
{
"epoch": 0.0663035756571158,
"grad_norm": 0.0647307909886003,
"learning_rate": 0.00019886363636363634,
"loss": 0.8542,
"step": 350
},
{
"epoch": 0.06725076959507459,
"grad_norm": 0.06150113901124715,
"learning_rate": 0.00020170454545454544,
"loss": 0.8407,
"step": 355
},
{
"epoch": 0.06819796353303338,
"grad_norm": 0.06257476247931755,
"learning_rate": 0.0002045454545454545,
"loss": 0.8564,
"step": 360
},
{
"epoch": 0.06914515747099219,
"grad_norm": 0.06213175384030445,
"learning_rate": 0.0002073863636363636,
"loss": 0.8481,
"step": 365
},
{
"epoch": 0.07009235140895098,
"grad_norm": 0.07243770022823882,
"learning_rate": 0.0002102272727272727,
"loss": 0.8742,
"step": 370
},
{
"epoch": 0.07103954534690977,
"grad_norm": 0.06094029025046222,
"learning_rate": 0.00021306818181818178,
"loss": 0.8686,
"step": 375
},
{
"epoch": 0.07198673928486858,
"grad_norm": 0.07211978873607085,
"learning_rate": 0.0002159090909090909,
"loss": 0.8404,
"step": 380
},
{
"epoch": 0.07293393322282737,
"grad_norm": 0.0664949706014046,
"learning_rate": 0.00021874999999999998,
"loss": 0.8767,
"step": 385
},
{
"epoch": 0.07388112716078617,
"grad_norm": 0.07048102073887626,
"learning_rate": 0.00022159090909090908,
"loss": 0.8662,
"step": 390
},
{
"epoch": 0.07482832109874497,
"grad_norm": 0.06550092590742955,
"learning_rate": 0.00022443181818181815,
"loss": 0.8546,
"step": 395
},
{
"epoch": 0.07577551503670377,
"grad_norm": 0.07566899187849191,
"learning_rate": 0.00022727272727272725,
"loss": 0.8551,
"step": 400
},
{
"epoch": 0.07672270897466256,
"grad_norm": 0.06303208036750815,
"learning_rate": 0.00023011363636363634,
"loss": 0.8699,
"step": 405
},
{
"epoch": 0.07766990291262135,
"grad_norm": 0.06875716346372687,
"learning_rate": 0.00023295454545454542,
"loss": 0.8627,
"step": 410
},
{
"epoch": 0.07861709685058016,
"grad_norm": 0.08595111194659674,
"learning_rate": 0.00023579545454545454,
"loss": 0.8834,
"step": 415
},
{
"epoch": 0.07956429078853895,
"grad_norm": 0.061252177097668066,
"learning_rate": 0.0002386363636363636,
"loss": 0.8589,
"step": 420
},
{
"epoch": 0.08051148472649775,
"grad_norm": 0.06674992779765852,
"learning_rate": 0.0002414772727272727,
"loss": 0.8807,
"step": 425
},
{
"epoch": 0.08145867866445655,
"grad_norm": 0.06735600365269058,
"learning_rate": 0.0002443181818181818,
"loss": 0.8632,
"step": 430
},
{
"epoch": 0.08240587260241534,
"grad_norm": 0.06263588681026308,
"learning_rate": 0.0002471590909090909,
"loss": 0.9034,
"step": 435
},
{
"epoch": 0.08335306654037414,
"grad_norm": 0.064198353107829,
"learning_rate": 0.00025,
"loss": 0.9008,
"step": 440
},
{
"epoch": 0.08430026047833294,
"grad_norm": 0.06187143454881962,
"learning_rate": 0.00025284090909090905,
"loss": 0.8421,
"step": 445
},
{
"epoch": 0.08524745441629174,
"grad_norm": 0.05826341458036729,
"learning_rate": 0.00025568181818181815,
"loss": 0.8654,
"step": 450
},
{
"epoch": 0.08619464835425053,
"grad_norm": 0.06153983398074908,
"learning_rate": 0.00025852272727272725,
"loss": 0.8345,
"step": 455
},
{
"epoch": 0.08714184229220932,
"grad_norm": 0.057544439891252096,
"learning_rate": 0.00026136363636363634,
"loss": 0.8474,
"step": 460
},
{
"epoch": 0.08808903623016813,
"grad_norm": 0.05386176310877567,
"learning_rate": 0.00026420454545454544,
"loss": 0.8449,
"step": 465
},
{
"epoch": 0.08903623016812692,
"grad_norm": 0.05733131738608226,
"learning_rate": 0.00026704545454545454,
"loss": 0.8557,
"step": 470
},
{
"epoch": 0.08998342410608572,
"grad_norm": 0.05468199774083347,
"learning_rate": 0.00026988636363636364,
"loss": 0.8738,
"step": 475
},
{
"epoch": 0.09093061804404452,
"grad_norm": 0.0732652071369859,
"learning_rate": 0.0002727272727272727,
"loss": 0.8764,
"step": 480
},
{
"epoch": 0.09187781198200332,
"grad_norm": 0.06609967510300549,
"learning_rate": 0.0002755681818181818,
"loss": 0.8695,
"step": 485
},
{
"epoch": 0.09282500591996211,
"grad_norm": 0.06205800779765995,
"learning_rate": 0.0002784090909090909,
"loss": 0.8616,
"step": 490
},
{
"epoch": 0.09377219985792092,
"grad_norm": 0.05861483996354783,
"learning_rate": 0.00028125,
"loss": 0.8701,
"step": 495
},
{
"epoch": 0.09471939379587971,
"grad_norm": 0.060736665872329516,
"learning_rate": 0.0002840909090909091,
"loss": 0.8947,
"step": 500
},
{
"epoch": 0.0956665877338385,
"grad_norm": 0.06142593731376609,
"learning_rate": 0.0002869318181818182,
"loss": 0.8526,
"step": 505
},
{
"epoch": 0.0966137816717973,
"grad_norm": 0.054392069599614346,
"learning_rate": 0.0002897727272727273,
"loss": 0.8466,
"step": 510
},
{
"epoch": 0.0975609756097561,
"grad_norm": 0.06371556067004604,
"learning_rate": 0.0002926136363636363,
"loss": 0.8121,
"step": 515
},
{
"epoch": 0.0985081695477149,
"grad_norm": 0.06255544482298064,
"learning_rate": 0.0002954545454545454,
"loss": 0.8398,
"step": 520
},
{
"epoch": 0.09945536348567369,
"grad_norm": 0.05906918417826802,
"learning_rate": 0.0002982954545454545,
"loss": 0.8763,
"step": 525
},
{
"epoch": 0.1004025574236325,
"grad_norm": 0.05701807064591476,
"learning_rate": 0.0002999998687698221,
"loss": 0.8712,
"step": 530
},
{
"epoch": 0.10134975136159129,
"grad_norm": 0.061038500218446035,
"learning_rate": 0.00029999839243295787,
"loss": 0.8712,
"step": 535
},
{
"epoch": 0.10229694529955008,
"grad_norm": 0.056276485767051056,
"learning_rate": 0.0002999952757377059,
"loss": 0.8761,
"step": 540
},
{
"epoch": 0.10324413923750889,
"grad_norm": 0.06244892878431511,
"learning_rate": 0.00029999051871814974,
"loss": 0.8711,
"step": 545
},
{
"epoch": 0.10419133317546768,
"grad_norm": 0.05743365231512095,
"learning_rate": 0.0002999841214263116,
"loss": 0.8457,
"step": 550
},
{
"epoch": 0.10513852711342647,
"grad_norm": 0.06440086459939007,
"learning_rate": 0.000299976083932151,
"loss": 0.8994,
"step": 555
},
{
"epoch": 0.10608572105138526,
"grad_norm": 0.056927632875731916,
"learning_rate": 0.0002999664063235649,
"loss": 0.841,
"step": 560
},
{
"epoch": 0.10703291498934407,
"grad_norm": 0.0583247436509342,
"learning_rate": 0.00029995508870638596,
"loss": 0.8765,
"step": 565
},
{
"epoch": 0.10798010892730286,
"grad_norm": 0.04974164281813745,
"learning_rate": 0.00029994213120438187,
"loss": 0.8429,
"step": 570
},
{
"epoch": 0.10892730286526166,
"grad_norm": 0.06125862188074254,
"learning_rate": 0.0002999275339592538,
"loss": 0.8935,
"step": 575
},
{
"epoch": 0.10987449680322046,
"grad_norm": 0.059759664506794964,
"learning_rate": 0.0002999112971306348,
"loss": 0.869,
"step": 580
},
{
"epoch": 0.11082169074117926,
"grad_norm": 0.06328973302665653,
"learning_rate": 0.00029989342089608835,
"loss": 0.852,
"step": 585
},
{
"epoch": 0.11176888467913805,
"grad_norm": 0.060243737818932684,
"learning_rate": 0.00029987390545110605,
"loss": 0.857,
"step": 590
},
{
"epoch": 0.11271607861709686,
"grad_norm": 0.055808252373465206,
"learning_rate": 0.0002998527510091056,
"loss": 0.8774,
"step": 595
},
{
"epoch": 0.11366327255505565,
"grad_norm": 0.06324311731152125,
"learning_rate": 0.0002998299578014287,
"loss": 0.8726,
"step": 600
},
{
"epoch": 0.11461046649301444,
"grad_norm": 0.05610986410453152,
"learning_rate": 0.0002998055260773381,
"loss": 0.8589,
"step": 605
},
{
"epoch": 0.11555766043097324,
"grad_norm": 0.04965479001175545,
"learning_rate": 0.0002997794561040153,
"loss": 0.8383,
"step": 610
},
{
"epoch": 0.11650485436893204,
"grad_norm": 0.05064790996842586,
"learning_rate": 0.00029975174816655736,
"loss": 0.8524,
"step": 615
},
{
"epoch": 0.11745204830689084,
"grad_norm": 0.052969432132795244,
"learning_rate": 0.00029972240256797384,
"loss": 0.8848,
"step": 620
},
{
"epoch": 0.11839924224484963,
"grad_norm": 0.04987789018051327,
"learning_rate": 0.0002996914196291835,
"loss": 0.8579,
"step": 625
},
{
"epoch": 0.11934643618280844,
"grad_norm": 0.05418008688918587,
"learning_rate": 0.0002996587996890107,
"loss": 0.9321,
"step": 630
},
{
"epoch": 0.12029363012076723,
"grad_norm": 0.057606406769674844,
"learning_rate": 0.000299624543104182,
"loss": 0.864,
"step": 635
},
{
"epoch": 0.12124082405872602,
"grad_norm": 0.050896712551105165,
"learning_rate": 0.0002995886502493219,
"loss": 0.8508,
"step": 640
},
{
"epoch": 0.12218801799668483,
"grad_norm": 0.05856562263921288,
"learning_rate": 0.00029955112151694885,
"loss": 0.8557,
"step": 645
},
{
"epoch": 0.12313521193464362,
"grad_norm": 0.056397198481637226,
"learning_rate": 0.00029951195731747114,
"loss": 0.8763,
"step": 650
},
{
"epoch": 0.12408240587260241,
"grad_norm": 0.06021365518577683,
"learning_rate": 0.00029947115807918217,
"loss": 0.8691,
"step": 655
},
{
"epoch": 0.12502959981056122,
"grad_norm": 0.049632655250353507,
"learning_rate": 0.0002994287242482558,
"loss": 0.8593,
"step": 660
},
{
"epoch": 0.12597679374852,
"grad_norm": 0.04958168029549867,
"learning_rate": 0.00029938465628874165,
"loss": 0.8591,
"step": 665
},
{
"epoch": 0.1269239876864788,
"grad_norm": 0.05757210103501373,
"learning_rate": 0.00029933895468255985,
"loss": 0.8402,
"step": 670
},
{
"epoch": 0.1278711816244376,
"grad_norm": 0.06892314154494911,
"learning_rate": 0.0002992916199294959,
"loss": 0.8689,
"step": 675
},
{
"epoch": 0.1288183755623964,
"grad_norm": 0.049748765290627474,
"learning_rate": 0.000299242652547195,
"loss": 0.8486,
"step": 680
},
{
"epoch": 0.1297655695003552,
"grad_norm": 0.054839939836728246,
"learning_rate": 0.0002991920530711566,
"loss": 0.8673,
"step": 685
},
{
"epoch": 0.130712763438314,
"grad_norm": 0.05978086373502768,
"learning_rate": 0.00029913982205472857,
"loss": 0.8608,
"step": 690
},
{
"epoch": 0.13165995737627278,
"grad_norm": 0.052502322799696084,
"learning_rate": 0.0002990859600691008,
"loss": 0.8613,
"step": 695
},
{
"epoch": 0.1326071513142316,
"grad_norm": 0.05054364592204903,
"learning_rate": 0.0002990304677032994,
"loss": 0.8746,
"step": 700
},
{
"epoch": 0.1335543452521904,
"grad_norm": 0.05172804041468556,
"learning_rate": 0.00029897334556418004,
"loss": 0.8256,
"step": 705
},
{
"epoch": 0.13450153919014918,
"grad_norm": 0.05101864907200138,
"learning_rate": 0.0002989145942764212,
"loss": 0.8655,
"step": 710
},
{
"epoch": 0.13544873312810798,
"grad_norm": 0.05509456096234295,
"learning_rate": 0.0002988542144825176,
"loss": 0.8692,
"step": 715
},
{
"epoch": 0.13639592706606676,
"grad_norm": 0.049333296450028125,
"learning_rate": 0.000298792206842773,
"loss": 0.8572,
"step": 720
},
{
"epoch": 0.13734312100402557,
"grad_norm": 0.059122961216738656,
"learning_rate": 0.0002987285720352929,
"loss": 0.8735,
"step": 725
},
{
"epoch": 0.13829031494198438,
"grad_norm": 0.050375990420733686,
"learning_rate": 0.0002986633107559775,
"loss": 0.82,
"step": 730
},
{
"epoch": 0.13923750887994316,
"grad_norm": 0.0496346403585563,
"learning_rate": 0.0002985964237185136,
"loss": 0.8467,
"step": 735
},
{
"epoch": 0.14018470281790196,
"grad_norm": 0.053630745330135815,
"learning_rate": 0.00029852791165436716,
"loss": 0.8858,
"step": 740
},
{
"epoch": 0.14113189675586077,
"grad_norm": 0.055544816267542034,
"learning_rate": 0.0002984577753127752,
"loss": 0.8707,
"step": 745
},
{
"epoch": 0.14207909069381955,
"grad_norm": 0.04983399501220757,
"learning_rate": 0.00029838601546073744,
"loss": 0.846,
"step": 750
},
{
"epoch": 0.14302628463177836,
"grad_norm": 0.05531741725439223,
"learning_rate": 0.00029831263288300817,
"loss": 0.8716,
"step": 755
},
{
"epoch": 0.14397347856973716,
"grad_norm": 0.05305914413910715,
"learning_rate": 0.00029823762838208744,
"loss": 0.8694,
"step": 760
},
{
"epoch": 0.14492067250769594,
"grad_norm": 0.063066365795915,
"learning_rate": 0.00029816100277821247,
"loss": 0.8575,
"step": 765
},
{
"epoch": 0.14586786644565475,
"grad_norm": 0.052014222449902975,
"learning_rate": 0.00029808275690934864,
"loss": 0.8553,
"step": 770
},
{
"epoch": 0.14681506038361355,
"grad_norm": 0.05627981583655042,
"learning_rate": 0.00029800289163118014,
"loss": 0.8491,
"step": 775
},
{
"epoch": 0.14776225432157233,
"grad_norm": 0.048744553655055825,
"learning_rate": 0.00029792140781710103,
"loss": 0.8597,
"step": 780
},
{
"epoch": 0.14870944825953114,
"grad_norm": 0.07103922836346321,
"learning_rate": 0.00029783830635820506,
"loss": 0.8685,
"step": 785
},
{
"epoch": 0.14965664219748995,
"grad_norm": 0.05572062793930259,
"learning_rate": 0.0002977535881632766,
"loss": 0.8144,
"step": 790
},
{
"epoch": 0.15060383613544873,
"grad_norm": 0.047330412499616226,
"learning_rate": 0.00029766725415878017,
"loss": 0.8353,
"step": 795
},
{
"epoch": 0.15155103007340753,
"grad_norm": 0.05324033535511291,
"learning_rate": 0.00029757930528885064,
"loss": 0.8411,
"step": 800
},
{
"epoch": 0.15249822401136634,
"grad_norm": 0.05513304510602078,
"learning_rate": 0.0002974897425152828,
"loss": 0.8809,
"step": 805
},
{
"epoch": 0.15344541794932512,
"grad_norm": 0.052990990094527124,
"learning_rate": 0.0002973985668175207,
"loss": 0.8608,
"step": 810
},
{
"epoch": 0.15439261188728393,
"grad_norm": 0.05324896337561592,
"learning_rate": 0.0002973057791926473,
"loss": 0.8458,
"step": 815
},
{
"epoch": 0.1553398058252427,
"grad_norm": 0.05276687776977392,
"learning_rate": 0.000297211380655373,
"loss": 0.8697,
"step": 820
},
{
"epoch": 0.1562869997632015,
"grad_norm": 0.052354949797073405,
"learning_rate": 0.0002971153722380253,
"loss": 0.8507,
"step": 825
},
{
"epoch": 0.15723419370116032,
"grad_norm": 0.049368244945149506,
"learning_rate": 0.0002970177549905368,
"loss": 0.8403,
"step": 830
},
{
"epoch": 0.1581813876391191,
"grad_norm": 0.046532042464774784,
"learning_rate": 0.00029691852998043396,
"loss": 0.8552,
"step": 835
},
{
"epoch": 0.1591285815770779,
"grad_norm": 0.04876609183561892,
"learning_rate": 0.00029681769829282574,
"loss": 0.8479,
"step": 840
},
{
"epoch": 0.1600757755150367,
"grad_norm": 0.059730813463699885,
"learning_rate": 0.0002967152610303913,
"loss": 0.8545,
"step": 845
},
{
"epoch": 0.1610229694529955,
"grad_norm": 0.055750160324234604,
"learning_rate": 0.00029661121931336804,
"loss": 0.8504,
"step": 850
},
{
"epoch": 0.1619701633909543,
"grad_norm": 0.0528593038524647,
"learning_rate": 0.0002965055742795395,
"loss": 0.8814,
"step": 855
},
{
"epoch": 0.1629173573289131,
"grad_norm": 0.05558247953451502,
"learning_rate": 0.000296398327084223,
"loss": 0.85,
"step": 860
},
{
"epoch": 0.16386455126687188,
"grad_norm": 0.06143605437889347,
"learning_rate": 0.00029628947890025656,
"loss": 0.8561,
"step": 865
},
{
"epoch": 0.1648117452048307,
"grad_norm": 0.05186425147430888,
"learning_rate": 0.0002961790309179866,
"loss": 0.8393,
"step": 870
},
{
"epoch": 0.1657589391427895,
"grad_norm": 0.047149331325217335,
"learning_rate": 0.00029606698434525434,
"loss": 0.8668,
"step": 875
},
{
"epoch": 0.16670613308074828,
"grad_norm": 0.048689384932807,
"learning_rate": 0.00029595334040738333,
"loss": 0.8374,
"step": 880
},
{
"epoch": 0.16765332701870708,
"grad_norm": 0.053510975406836386,
"learning_rate": 0.00029583810034716545,
"loss": 0.8491,
"step": 885
},
{
"epoch": 0.1686005209566659,
"grad_norm": 0.05595964353451741,
"learning_rate": 0.00029572126542484745,
"loss": 0.8727,
"step": 890
},
{
"epoch": 0.16954771489462467,
"grad_norm": 0.055885278431375376,
"learning_rate": 0.0002956028369181174,
"loss": 0.882,
"step": 895
},
{
"epoch": 0.17049490883258347,
"grad_norm": 0.047842403175001005,
"learning_rate": 0.00029548281612209044,
"loss": 0.8682,
"step": 900
},
{
"epoch": 0.17144210277054228,
"grad_norm": 0.058823537208354766,
"learning_rate": 0.00029536120434929476,
"loss": 0.8373,
"step": 905
},
{
"epoch": 0.17238929670850106,
"grad_norm": 0.05444610376517603,
"learning_rate": 0.00029523800292965724,
"loss": 0.8783,
"step": 910
},
{
"epoch": 0.17333649064645987,
"grad_norm": 0.054957105759307595,
"learning_rate": 0.00029511321321048893,
"loss": 0.843,
"step": 915
},
{
"epoch": 0.17428368458441865,
"grad_norm": 0.06583345091917218,
"learning_rate": 0.0002949868365564701,
"loss": 0.8504,
"step": 920
},
{
"epoch": 0.17523087852237745,
"grad_norm": 0.04777073198426105,
"learning_rate": 0.00029485887434963566,
"loss": 0.8298,
"step": 925
},
{
"epoch": 0.17617807246033626,
"grad_norm": 0.05562673540582162,
"learning_rate": 0.00029472932798935977,
"loss": 0.8418,
"step": 930
},
{
"epoch": 0.17712526639829504,
"grad_norm": 0.04785779459273509,
"learning_rate": 0.0002945981988923406,
"loss": 0.8328,
"step": 935
},
{
"epoch": 0.17807246033625385,
"grad_norm": 0.05554074332095169,
"learning_rate": 0.00029446548849258513,
"loss": 0.8279,
"step": 940
},
{
"epoch": 0.17901965427421265,
"grad_norm": 0.046624447216736774,
"learning_rate": 0.00029433119824139286,
"loss": 0.8494,
"step": 945
},
{
"epoch": 0.17996684821217143,
"grad_norm": 0.051194260228541774,
"learning_rate": 0.0002941953296073405,
"loss": 0.8594,
"step": 950
},
{
"epoch": 0.18091404215013024,
"grad_norm": 0.05090727374729561,
"learning_rate": 0.0002940578840762658,
"loss": 0.8422,
"step": 955
},
{
"epoch": 0.18186123608808905,
"grad_norm": 0.04639853886400584,
"learning_rate": 0.00029391886315125083,
"loss": 0.8344,
"step": 960
},
{
"epoch": 0.18280843002604782,
"grad_norm": 0.056061389000908554,
"learning_rate": 0.0002937782683526063,
"loss": 0.8131,
"step": 965
},
{
"epoch": 0.18375562396400663,
"grad_norm": 0.04702100386110801,
"learning_rate": 0.00029363610121785447,
"loss": 0.8141,
"step": 970
},
{
"epoch": 0.18470281790196544,
"grad_norm": 0.04844408584935392,
"learning_rate": 0.00029349236330171224,
"loss": 0.8149,
"step": 975
},
{
"epoch": 0.18565001183992422,
"grad_norm": 0.048586646278994214,
"learning_rate": 0.0002933470561760744,
"loss": 0.8723,
"step": 980
},
{
"epoch": 0.18659720577788302,
"grad_norm": 0.0510188898302652,
"learning_rate": 0.00029320018142999643,
"loss": 0.8319,
"step": 985
},
{
"epoch": 0.18754439971584183,
"grad_norm": 0.04664479134380501,
"learning_rate": 0.0002930517406696771,
"loss": 0.8425,
"step": 990
},
{
"epoch": 0.1884915936538006,
"grad_norm": 0.04274961179387165,
"learning_rate": 0.0002929017355184407,
"loss": 0.8252,
"step": 995
},
{
"epoch": 0.18943878759175942,
"grad_norm": 0.05336071927501094,
"learning_rate": 0.00029275016761671954,
"loss": 0.8343,
"step": 1000
},
{
"epoch": 0.19038598152971822,
"grad_norm": 0.0732142750483461,
"learning_rate": 0.00029259703862203587,
"loss": 0.8305,
"step": 1005
},
{
"epoch": 0.191333175467677,
"grad_norm": 0.059075683989297266,
"learning_rate": 0.00029244235020898395,
"loss": 0.8487,
"step": 1010
},
{
"epoch": 0.1922803694056358,
"grad_norm": 0.04650016716241106,
"learning_rate": 0.0002922861040692115,
"loss": 0.8583,
"step": 1015
},
{
"epoch": 0.1932275633435946,
"grad_norm": 0.04767602957865441,
"learning_rate": 0.0002921283019114011,
"loss": 0.8496,
"step": 1020
},
{
"epoch": 0.1941747572815534,
"grad_norm": 0.05644985578762857,
"learning_rate": 0.00029196894546125197,
"loss": 0.8429,
"step": 1025
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.049699082298498856,
"learning_rate": 0.0002918080364614607,
"loss": 0.8121,
"step": 1030
},
{
"epoch": 0.19606914515747098,
"grad_norm": 0.05678361846269368,
"learning_rate": 0.0002916455766717024,
"loss": 0.831,
"step": 1035
},
{
"epoch": 0.1970163390954298,
"grad_norm": 0.05634582924907757,
"learning_rate": 0.00029148156786861125,
"loss": 0.8411,
"step": 1040
},
{
"epoch": 0.1979635330333886,
"grad_norm": 0.044578510424992744,
"learning_rate": 0.0002913160118457612,
"loss": 0.8163,
"step": 1045
},
{
"epoch": 0.19891072697134737,
"grad_norm": 0.049243336108268164,
"learning_rate": 0.00029114891041364646,
"loss": 0.8651,
"step": 1050
},
{
"epoch": 0.19985792090930618,
"grad_norm": 0.05392438131527857,
"learning_rate": 0.00029098026539966143,
"loss": 0.8304,
"step": 1055
},
{
"epoch": 0.200805114847265,
"grad_norm": 0.05004934480263781,
"learning_rate": 0.0002908100786480811,
"loss": 0.8686,
"step": 1060
},
{
"epoch": 0.20175230878522377,
"grad_norm": 0.04733082301433883,
"learning_rate": 0.00029063835202004036,
"loss": 0.8346,
"step": 1065
},
{
"epoch": 0.20269950272318257,
"grad_norm": 0.05783680843141235,
"learning_rate": 0.0002904650873935143,
"loss": 0.8312,
"step": 1070
},
{
"epoch": 0.20364669666114138,
"grad_norm": 0.04565338494016318,
"learning_rate": 0.0002902902866632969,
"loss": 0.8595,
"step": 1075
},
{
"epoch": 0.20459389059910016,
"grad_norm": 0.050858909948530574,
"learning_rate": 0.0002901139517409811,
"loss": 0.8642,
"step": 1080
},
{
"epoch": 0.20554108453705897,
"grad_norm": 0.05066970549535419,
"learning_rate": 0.0002899360845549373,
"loss": 0.8342,
"step": 1085
},
{
"epoch": 0.20648827847501777,
"grad_norm": 0.04679512262177614,
"learning_rate": 0.0002897566870502925,
"loss": 0.8306,
"step": 1090
},
{
"epoch": 0.20743547241297655,
"grad_norm": 0.05217676490462024,
"learning_rate": 0.00028957576118890914,
"loss": 0.8225,
"step": 1095
},
{
"epoch": 0.20838266635093536,
"grad_norm": 0.05437239079138474,
"learning_rate": 0.0002893933089493635,
"loss": 0.8553,
"step": 1100
},
{
"epoch": 0.20932986028889414,
"grad_norm": 0.0509435046538677,
"learning_rate": 0.00028920933232692386,
"loss": 0.8086,
"step": 1105
},
{
"epoch": 0.21027705422685294,
"grad_norm": 0.044222391512642414,
"learning_rate": 0.00028902383333352926,
"loss": 0.8412,
"step": 1110
},
{
"epoch": 0.21122424816481175,
"grad_norm": 0.04967664422872056,
"learning_rate": 0.0002888368139977669,
"loss": 0.8506,
"step": 1115
},
{
"epoch": 0.21217144210277053,
"grad_norm": 0.0465634897763295,
"learning_rate": 0.0002886482763648503,
"loss": 0.8217,
"step": 1120
},
{
"epoch": 0.21311863604072934,
"grad_norm": 0.05052365351929177,
"learning_rate": 0.0002884582224965968,
"loss": 0.8332,
"step": 1125
},
{
"epoch": 0.21406582997868814,
"grad_norm": 0.05180493973251838,
"learning_rate": 0.000288266654471405,
"loss": 0.8347,
"step": 1130
},
{
"epoch": 0.21501302391664692,
"grad_norm": 0.0471780634040616,
"learning_rate": 0.0002880735743842322,
"loss": 0.8366,
"step": 1135
},
{
"epoch": 0.21596021785460573,
"grad_norm": 0.049108577250984026,
"learning_rate": 0.0002878789843465713,
"loss": 0.8362,
"step": 1140
},
{
"epoch": 0.21690741179256454,
"grad_norm": 0.05564424373715666,
"learning_rate": 0.0002876828864864277,
"loss": 0.8514,
"step": 1145
},
{
"epoch": 0.21785460573052332,
"grad_norm": 0.0564159881319112,
"learning_rate": 0.0002874852829482963,
"loss": 0.8723,
"step": 1150
},
{
"epoch": 0.21880179966848212,
"grad_norm": 0.051069627553698455,
"learning_rate": 0.0002872861758931376,
"loss": 0.851,
"step": 1155
},
{
"epoch": 0.21974899360644093,
"grad_norm": 0.05117455703332741,
"learning_rate": 0.00028708556749835454,
"loss": 0.8434,
"step": 1160
},
{
"epoch": 0.2206961875443997,
"grad_norm": 0.04934525970498773,
"learning_rate": 0.0002868834599577684,
"loss": 0.841,
"step": 1165
},
{
"epoch": 0.22164338148235851,
"grad_norm": 0.05894722590210803,
"learning_rate": 0.0002866798554815948,
"loss": 0.8458,
"step": 1170
},
{
"epoch": 0.22259057542031732,
"grad_norm": 0.05630861952888565,
"learning_rate": 0.0002864747562964197,
"loss": 0.8343,
"step": 1175
},
{
"epoch": 0.2235377693582761,
"grad_norm": 0.045749561983576756,
"learning_rate": 0.000286268164645175,
"loss": 0.8421,
"step": 1180
},
{
"epoch": 0.2244849632962349,
"grad_norm": 0.04499091282837436,
"learning_rate": 0.00028606008278711373,
"loss": 0.8397,
"step": 1185
},
{
"epoch": 0.22543215723419371,
"grad_norm": 0.042402412945632365,
"learning_rate": 0.00028585051299778594,
"loss": 0.8061,
"step": 1190
},
{
"epoch": 0.2263793511721525,
"grad_norm": 0.046372304084499535,
"learning_rate": 0.00028563945756901314,
"loss": 0.8514,
"step": 1195
},
{
"epoch": 0.2273265451101113,
"grad_norm": 0.050066239277336104,
"learning_rate": 0.00028542691880886376,
"loss": 0.8473,
"step": 1200
},
{
"epoch": 0.22827373904807008,
"grad_norm": 0.04750268427944095,
"learning_rate": 0.0002852128990416275,
"loss": 0.8155,
"step": 1205
},
{
"epoch": 0.22922093298602889,
"grad_norm": 0.04448032581142824,
"learning_rate": 0.0002849974006077904,
"loss": 0.8462,
"step": 1210
},
{
"epoch": 0.2301681269239877,
"grad_norm": 0.045988358494773375,
"learning_rate": 0.00028478042586400876,
"loss": 0.8139,
"step": 1215
},
{
"epoch": 0.23111532086194647,
"grad_norm": 0.05150874461710223,
"learning_rate": 0.00028456197718308365,
"loss": 0.8511,
"step": 1220
},
{
"epoch": 0.23206251479990528,
"grad_norm": 0.04407477102954397,
"learning_rate": 0.00028434205695393477,
"loss": 0.8374,
"step": 1225
},
{
"epoch": 0.23300970873786409,
"grad_norm": 0.04739621311473698,
"learning_rate": 0.0002841206675815745,
"loss": 0.8126,
"step": 1230
},
{
"epoch": 0.23395690267582286,
"grad_norm": 0.04187717815582618,
"learning_rate": 0.0002838978114870816,
"loss": 0.8274,
"step": 1235
},
{
"epoch": 0.23490409661378167,
"grad_norm": 0.04177075006556251,
"learning_rate": 0.0002836734911075746,
"loss": 0.8168,
"step": 1240
},
{
"epoch": 0.23585129055174048,
"grad_norm": 0.045942488951301354,
"learning_rate": 0.0002834477088961853,
"loss": 0.8054,
"step": 1245
},
{
"epoch": 0.23679848448969926,
"grad_norm": 0.04359836043504769,
"learning_rate": 0.00028322046732203165,
"loss": 0.8538,
"step": 1250
},
{
"epoch": 0.23774567842765806,
"grad_norm": 0.04439950975585717,
"learning_rate": 0.0002829917688701912,
"loss": 0.8352,
"step": 1255
},
{
"epoch": 0.23869287236561687,
"grad_norm": 0.06555071271570542,
"learning_rate": 0.00028276161604167354,
"loss": 0.8395,
"step": 1260
},
{
"epoch": 0.23964006630357565,
"grad_norm": 0.06429653649306202,
"learning_rate": 0.0002825300113533932,
"loss": 0.8639,
"step": 1265
},
{
"epoch": 0.24058726024153446,
"grad_norm": 0.050999285390159056,
"learning_rate": 0.0002822969573381418,
"loss": 0.8265,
"step": 1270
},
{
"epoch": 0.24153445417949326,
"grad_norm": 0.06883273146747126,
"learning_rate": 0.0002820624565445608,
"loss": 0.8505,
"step": 1275
},
{
"epoch": 0.24248164811745204,
"grad_norm": 0.08792312386360097,
"learning_rate": 0.00028182651153711334,
"loss": 0.8393,
"step": 1280
},
{
"epoch": 0.24342884205541085,
"grad_norm": 0.05834297565610207,
"learning_rate": 0.0002815891248960562,
"loss": 0.8198,
"step": 1285
},
{
"epoch": 0.24437603599336966,
"grad_norm": 0.05226579523827474,
"learning_rate": 0.0002813502992174116,
"loss": 0.8127,
"step": 1290
},
{
"epoch": 0.24532322993132843,
"grad_norm": 0.040640751080577776,
"learning_rate": 0.00028111003711293897,
"loss": 0.8068,
"step": 1295
},
{
"epoch": 0.24627042386928724,
"grad_norm": 0.04679354987985417,
"learning_rate": 0.00028086834121010616,
"loss": 0.8368,
"step": 1300
},
{
"epoch": 0.24721761780724602,
"grad_norm": 0.04499421671023844,
"learning_rate": 0.0002806252141520608,
"loss": 0.8492,
"step": 1305
},
{
"epoch": 0.24816481174520483,
"grad_norm": 0.5315442282070766,
"learning_rate": 0.00028038065859760147,
"loss": 0.8775,
"step": 1310
},
{
"epoch": 0.24911200568316363,
"grad_norm": 49.60479029682665,
"learning_rate": 0.0002801346772211486,
"loss": 14.7667,
"step": 1315
},
{
"epoch": 0.25005919962112244,
"grad_norm": 74.14171657249427,
"learning_rate": 0.000279887272712715,
"loss": 7.0642,
"step": 1320
},
{
"epoch": 0.2510063935590812,
"grad_norm": 0.9679325181066564,
"learning_rate": 0.00027963844777787687,
"loss": 2.8211,
"step": 1325
},
{
"epoch": 0.25195358749704,
"grad_norm": 1.8750749955853339,
"learning_rate": 0.0002793882051377437,
"loss": 2.5509,
"step": 1330
},
{
"epoch": 0.25290078143499883,
"grad_norm": 1.256663906662595,
"learning_rate": 0.00027913654752892897,
"loss": 1.6113,
"step": 1335
},
{
"epoch": 0.2538479753729576,
"grad_norm": 0.4517499508288906,
"learning_rate": 0.00027888347770352,
"loss": 1.3621,
"step": 1340
},
{
"epoch": 0.2547951693109164,
"grad_norm": 0.16403760364117914,
"learning_rate": 0.00027862899842904783,
"loss": 1.1522,
"step": 1345
},
{
"epoch": 0.2557423632488752,
"grad_norm": 0.15782986879922667,
"learning_rate": 0.00027837311248845697,
"loss": 1.0121,
"step": 1350
},
{
"epoch": 0.256689557186834,
"grad_norm": 0.07939626217767812,
"learning_rate": 0.00027811582268007516,
"loss": 0.9976,
"step": 1355
},
{
"epoch": 0.2576367511247928,
"grad_norm": 0.08824920573476226,
"learning_rate": 0.0002778571318175825,
"loss": 0.937,
"step": 1360
},
{
"epoch": 0.2585839450627516,
"grad_norm": 0.05529848573271428,
"learning_rate": 0.0002775970427299808,
"loss": 0.9259,
"step": 1365
},
{
"epoch": 0.2595311390007104,
"grad_norm": 0.05160154895612758,
"learning_rate": 0.00027733555826156266,
"loss": 0.932,
"step": 1370
},
{
"epoch": 0.2604783329386692,
"grad_norm": 0.6967363411222061,
"learning_rate": 0.00027707268127188033,
"loss": 0.941,
"step": 1375
},
{
"epoch": 0.261425526876628,
"grad_norm": 0.05931503868977922,
"learning_rate": 0.00027680841463571446,
"loss": 0.8775,
"step": 1380
},
{
"epoch": 0.2623727208145868,
"grad_norm": 0.07337997434432302,
"learning_rate": 0.0002765427612430426,
"loss": 0.887,
"step": 1385
},
{
"epoch": 0.26331991475254557,
"grad_norm": 0.053104858087786935,
"learning_rate": 0.00027627572399900775,
"loss": 0.8484,
"step": 1390
},
{
"epoch": 0.2642671086905044,
"grad_norm": 0.06163074713437679,
"learning_rate": 0.00027600730582388644,
"loss": 0.8812,
"step": 1395
},
{
"epoch": 0.2652143026284632,
"grad_norm": 0.05016009889342552,
"learning_rate": 0.00027573750965305676,
"loss": 0.8678,
"step": 1400
},
{
"epoch": 0.26616149656642196,
"grad_norm": 0.053793608113706266,
"learning_rate": 0.0002754663384369664,
"loss": 0.8421,
"step": 1405
},
{
"epoch": 0.2671086905043808,
"grad_norm": 0.054043702396785645,
"learning_rate": 0.0002751937951411005,
"loss": 0.8374,
"step": 1410
},
{
"epoch": 0.2680558844423396,
"grad_norm": 0.05993998463791112,
"learning_rate": 0.00027491988274594865,
"loss": 0.8521,
"step": 1415
},
{
"epoch": 0.26900307838029835,
"grad_norm": 0.05214973518061344,
"learning_rate": 0.00027464460424697304,
"loss": 0.8563,
"step": 1420
},
{
"epoch": 0.2699502723182572,
"grad_norm": 0.04937709748344037,
"learning_rate": 0.0002743679626545753,
"loss": 0.8611,
"step": 1425
},
{
"epoch": 0.27089746625621597,
"grad_norm": 0.04690788256444743,
"learning_rate": 0.0002740899609940634,
"loss": 0.8737,
"step": 1430
},
{
"epoch": 0.27184466019417475,
"grad_norm": 0.04458440143618464,
"learning_rate": 0.00027381060230561904,
"loss": 0.8393,
"step": 1435
},
{
"epoch": 0.2727918541321335,
"grad_norm": 0.044640210601826116,
"learning_rate": 0.0002735298896442641,
"loss": 0.8569,
"step": 1440
},
{
"epoch": 0.27373904807009236,
"grad_norm": 0.048448777510886915,
"learning_rate": 0.00027324782607982727,
"loss": 0.8348,
"step": 1445
},
{
"epoch": 0.27468624200805114,
"grad_norm": 0.0500357269022256,
"learning_rate": 0.0002729644146969104,
"loss": 0.8676,
"step": 1450
},
{
"epoch": 0.2756334359460099,
"grad_norm": 0.053604059700079344,
"learning_rate": 0.0002726796585948551,
"loss": 0.8495,
"step": 1455
},
{
"epoch": 0.27658062988396875,
"grad_norm": 0.061024238560798645,
"learning_rate": 0.00027239356088770846,
"loss": 0.84,
"step": 1460
},
{
"epoch": 0.27752782382192753,
"grad_norm": 0.05542695537862334,
"learning_rate": 0.0002721061247041891,
"loss": 0.8445,
"step": 1465
},
{
"epoch": 0.2784750177598863,
"grad_norm": 0.05174226319950172,
"learning_rate": 0.00027181735318765305,
"loss": 0.8239,
"step": 1470
},
{
"epoch": 0.27942221169784515,
"grad_norm": 0.0524720390301642,
"learning_rate": 0.0002715272494960594,
"loss": 0.8717,
"step": 1475
},
{
"epoch": 0.2803694056358039,
"grad_norm": 0.05078251806460792,
"learning_rate": 0.00027123581680193575,
"loss": 0.8776,
"step": 1480
},
{
"epoch": 0.2813165995737627,
"grad_norm": 0.0508208511214285,
"learning_rate": 0.0002709430582923432,
"loss": 0.8337,
"step": 1485
},
{
"epoch": 0.28226379351172154,
"grad_norm": 0.05428792173232585,
"learning_rate": 0.00027064897716884195,
"loss": 0.8331,
"step": 1490
},
{
"epoch": 0.2832109874496803,
"grad_norm": 0.04339480572676772,
"learning_rate": 0.0002703535766474561,
"loss": 0.8474,
"step": 1495
},
{
"epoch": 0.2841581813876391,
"grad_norm": 0.04801286233761752,
"learning_rate": 0.00027005685995863833,
"loss": 0.8538,
"step": 1500
},
{
"epoch": 0.28510537532559793,
"grad_norm": 0.05126840483010234,
"learning_rate": 0.00026975883034723486,
"loss": 0.8508,
"step": 1505
},
{
"epoch": 0.2860525692635567,
"grad_norm": 0.05632538740067954,
"learning_rate": 0.00026945949107244984,
"loss": 0.8239,
"step": 1510
},
{
"epoch": 0.2869997632015155,
"grad_norm": 0.05321785003972056,
"learning_rate": 0.0002691588454078095,
"loss": 0.809,
"step": 1515
},
{
"epoch": 0.2879469571394743,
"grad_norm": 0.05267212261846467,
"learning_rate": 0.00026885689664112673,
"loss": 0.8235,
"step": 1520
},
{
"epoch": 0.2888941510774331,
"grad_norm": 0.06239400176564442,
"learning_rate": 0.0002685536480744648,
"loss": 0.8336,
"step": 1525
},
{
"epoch": 0.2898413450153919,
"grad_norm": 0.05392065914950409,
"learning_rate": 0.0002682491030241016,
"loss": 0.8227,
"step": 1530
},
{
"epoch": 0.2907885389533507,
"grad_norm": 0.042067002075187244,
"learning_rate": 0.0002679432648204928,
"loss": 0.8336,
"step": 1535
},
{
"epoch": 0.2917357328913095,
"grad_norm": 0.04983258400363465,
"learning_rate": 0.0002676361368082362,
"loss": 0.7947,
"step": 1540
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.04869904064999139,
"learning_rate": 0.00026732772234603437,
"loss": 0.8127,
"step": 1545
},
{
"epoch": 0.2936301207672271,
"grad_norm": 0.05851898617300107,
"learning_rate": 0.00026701802480665857,
"loss": 0.8313,
"step": 1550
},
{
"epoch": 0.2945773147051859,
"grad_norm": 0.0552687029482635,
"learning_rate": 0.0002667070475769114,
"loss": 0.8049,
"step": 1555
},
{
"epoch": 0.29552450864314467,
"grad_norm": 0.06477348854060364,
"learning_rate": 0.00026639479405759006,
"loss": 0.83,
"step": 1560
},
{
"epoch": 0.2964717025811035,
"grad_norm": 0.04555333394215088,
"learning_rate": 0.000266081267663449,
"loss": 0.84,
"step": 1565
},
{
"epoch": 0.2974188965190623,
"grad_norm": 0.04197167864122965,
"learning_rate": 0.00026576647182316264,
"loss": 0.8192,
"step": 1570
},
{
"epoch": 0.29836609045702106,
"grad_norm": 0.15360351138708875,
"learning_rate": 0.00026545040997928785,
"loss": 0.8756,
"step": 1575
},
{
"epoch": 0.2993132843949799,
"grad_norm": 0.07002447455001899,
"learning_rate": 0.00026513308558822636,
"loss": 0.8182,
"step": 1580
},
{
"epoch": 0.3002604783329387,
"grad_norm": 0.05720839543286207,
"learning_rate": 0.0002648145021201868,
"loss": 0.8334,
"step": 1585
},
{
"epoch": 0.30120767227089745,
"grad_norm": 0.05201720217949488,
"learning_rate": 0.0002644946630591469,
"loss": 0.8494,
"step": 1590
},
{
"epoch": 0.3021548662088563,
"grad_norm": 0.04850167766978546,
"learning_rate": 0.0002641735719028155,
"loss": 0.8285,
"step": 1595
},
{
"epoch": 0.30310206014681507,
"grad_norm": 0.044133764645286594,
"learning_rate": 0.000263851232162594,
"loss": 0.8225,
"step": 1600
},
{
"epoch": 0.30404925408477385,
"grad_norm": 0.0488939501306738,
"learning_rate": 0.00026352764736353815,
"loss": 0.8395,
"step": 1605
},
{
"epoch": 0.3049964480227327,
"grad_norm": 0.044482887206502425,
"learning_rate": 0.0002632028210443194,
"loss": 0.8199,
"step": 1610
},
{
"epoch": 0.30594364196069146,
"grad_norm": 0.054188307478421044,
"learning_rate": 0.00026287675675718653,
"loss": 0.833,
"step": 1615
},
{
"epoch": 0.30689083589865024,
"grad_norm": 0.048205088442678685,
"learning_rate": 0.00026254945806792614,
"loss": 0.8287,
"step": 1620
},
{
"epoch": 0.3078380298366091,
"grad_norm": 0.0457249850341829,
"learning_rate": 0.0002622209285558244,
"loss": 0.8104,
"step": 1625
},
{
"epoch": 0.30878522377456785,
"grad_norm": 0.04764496488482527,
"learning_rate": 0.00026189117181362733,
"loss": 0.807,
"step": 1630
},
{
"epoch": 0.30973241771252663,
"grad_norm": 0.04577871219106504,
"learning_rate": 0.0002615601914475018,
"loss": 0.8387,
"step": 1635
},
{
"epoch": 0.3106796116504854,
"grad_norm": 0.0560487648361042,
"learning_rate": 0.0002612279910769962,
"loss": 0.8209,
"step": 1640
},
{
"epoch": 0.31162680558844424,
"grad_norm": 0.051106440022587435,
"learning_rate": 0.0002608945743350004,
"loss": 0.8066,
"step": 1645
},
{
"epoch": 0.312573999526403,
"grad_norm": 0.060741538352692886,
"learning_rate": 0.0002605599448677066,
"loss": 0.8258,
"step": 1650
},
{
"epoch": 0.3135211934643618,
"grad_norm": 0.0600167744118878,
"learning_rate": 0.000260224106334569,
"loss": 0.8174,
"step": 1655
},
{
"epoch": 0.31446838740232064,
"grad_norm": 0.05182728019691824,
"learning_rate": 0.000259887062408264,
"loss": 0.8379,
"step": 1660
},
{
"epoch": 0.3154155813402794,
"grad_norm": 0.048406806604266626,
"learning_rate": 0.00025954881677464994,
"loss": 0.8239,
"step": 1665
},
{
"epoch": 0.3163627752782382,
"grad_norm": 0.04614485855762265,
"learning_rate": 0.0002592093731327269,
"loss": 0.8328,
"step": 1670
},
{
"epoch": 0.31730996921619703,
"grad_norm": 0.04097208134051075,
"learning_rate": 0.0002588687351945962,
"loss": 0.8054,
"step": 1675
},
{
"epoch": 0.3182571631541558,
"grad_norm": 0.04859899532989667,
"learning_rate": 0.0002585269066854197,
"loss": 0.828,
"step": 1680
},
{
"epoch": 0.3192043570921146,
"grad_norm": 0.0514636005763012,
"learning_rate": 0.00025818389134337925,
"loss": 0.805,
"step": 1685
},
{
"epoch": 0.3201515510300734,
"grad_norm": 0.04510538821375225,
"learning_rate": 0.0002578396929196356,
"loss": 0.8296,
"step": 1690
},
{
"epoch": 0.3210987449680322,
"grad_norm": 0.04625807399119475,
"learning_rate": 0.00025749431517828775,
"loss": 0.8085,
"step": 1695
},
{
"epoch": 0.322045938905991,
"grad_norm": 0.04353062203420096,
"learning_rate": 0.0002571477618963311,
"loss": 0.8169,
"step": 1700
},
{
"epoch": 0.3229931328439498,
"grad_norm": 0.045940703703086845,
"learning_rate": 0.00025680003686361704,
"loss": 0.8337,
"step": 1705
},
{
"epoch": 0.3239403267819086,
"grad_norm": 0.047707507442658635,
"learning_rate": 0.00025645114388281066,
"loss": 0.8097,
"step": 1710
},
{
"epoch": 0.3248875207198674,
"grad_norm": 0.04488951827418563,
"learning_rate": 0.00025610108676934974,
"loss": 0.8296,
"step": 1715
},
{
"epoch": 0.3258347146578262,
"grad_norm": 0.05157676249590623,
"learning_rate": 0.00025574986935140287,
"loss": 0.832,
"step": 1720
},
{
"epoch": 0.326781908595785,
"grad_norm": 0.045481015370253376,
"learning_rate": 0.00025539749546982736,
"loss": 0.812,
"step": 1725
},
{
"epoch": 0.32772910253374377,
"grad_norm": 0.04671492664808977,
"learning_rate": 0.0002550439689781276,
"loss": 0.783,
"step": 1730
},
{
"epoch": 0.3286762964717026,
"grad_norm": 0.056706366227313135,
"learning_rate": 0.00025468929374241256,
"loss": 0.829,
"step": 1735
},
{
"epoch": 0.3296234904096614,
"grad_norm": 0.04717465337329956,
"learning_rate": 0.0002543334736413539,
"loss": 0.8482,
"step": 1740
},
{
"epoch": 0.33057068434762016,
"grad_norm": 0.048757000553929425,
"learning_rate": 0.0002539765125661432,
"loss": 0.807,
"step": 1745
},
{
"epoch": 0.331517878285579,
"grad_norm": 0.04444091877456076,
"learning_rate": 0.00025361841442044956,
"loss": 0.8321,
"step": 1750
},
{
"epoch": 0.33246507222353777,
"grad_norm": 0.04389186380472923,
"learning_rate": 0.00025325918312037697,
"loss": 0.806,
"step": 1755
},
{
"epoch": 0.33341226616149655,
"grad_norm": 0.0451220202476477,
"learning_rate": 0.0002528988225944214,
"loss": 0.8239,
"step": 1760
},
{
"epoch": 0.3343594600994554,
"grad_norm": 0.04940473146055937,
"learning_rate": 0.00025253733678342775,
"loss": 0.7978,
"step": 1765
},
{
"epoch": 0.33530665403741416,
"grad_norm": 0.04360662632042812,
"learning_rate": 0.000252174729640547,
"loss": 0.7936,
"step": 1770
},
{
"epoch": 0.33625384797537294,
"grad_norm": 0.04508266007873255,
"learning_rate": 0.0002518110051311927,
"loss": 0.8354,
"step": 1775
},
{
"epoch": 0.3372010419133318,
"grad_norm": 0.0447597919709376,
"learning_rate": 0.00025144616723299785,
"loss": 0.8128,
"step": 1780
},
{
"epoch": 0.33814823585129056,
"grad_norm": 0.04068239709765713,
"learning_rate": 0.0002510802199357713,
"loss": 0.8173,
"step": 1785
},
{
"epoch": 0.33909542978924934,
"grad_norm": 0.04478708527154376,
"learning_rate": 0.000250713167241454,
"loss": 0.8192,
"step": 1790
},
{
"epoch": 0.34004262372720817,
"grad_norm": 0.04547217242470498,
"learning_rate": 0.00025034501316407537,
"loss": 0.8418,
"step": 1795
},
{
"epoch": 0.34098981766516695,
"grad_norm": 0.04674703701128052,
"learning_rate": 0.0002499757617297095,
"loss": 0.7595,
"step": 1800
},
{
"epoch": 0.34193701160312573,
"grad_norm": 0.0456363229285353,
"learning_rate": 0.00024960541697643094,
"loss": 0.8125,
"step": 1805
},
{
"epoch": 0.34288420554108456,
"grad_norm": 0.04312589291081109,
"learning_rate": 0.00024923398295427046,
"loss": 0.7931,
"step": 1810
},
{
"epoch": 0.34383139947904334,
"grad_norm": 0.04696710814032231,
"learning_rate": 0.00024886146372517107,
"loss": 0.8062,
"step": 1815
},
{
"epoch": 0.3447785934170021,
"grad_norm": 0.043291328088353766,
"learning_rate": 0.00024848786336294346,
"loss": 0.7962,
"step": 1820
},
{
"epoch": 0.3457257873549609,
"grad_norm": 0.04284288738527321,
"learning_rate": 0.0002481131859532212,
"loss": 0.8031,
"step": 1825
},
{
"epoch": 0.34667298129291974,
"grad_norm": 0.04649906122780062,
"learning_rate": 0.0002477374355934165,
"loss": 0.7931,
"step": 1830
},
{
"epoch": 0.3476201752308785,
"grad_norm": 0.05184640676841213,
"learning_rate": 0.0002473606163926751,
"loss": 0.833,
"step": 1835
},
{
"epoch": 0.3485673691688373,
"grad_norm": 0.04860441277746714,
"learning_rate": 0.00024698273247183137,
"loss": 0.8212,
"step": 1840
},
{
"epoch": 0.34951456310679613,
"grad_norm": 0.05114423505303399,
"learning_rate": 0.0002466037879633633,
"loss": 0.7971,
"step": 1845
},
{
"epoch": 0.3504617570447549,
"grad_norm": 0.047498936911412486,
"learning_rate": 0.00024622378701134737,
"loss": 0.8274,
"step": 1850
},
{
"epoch": 0.3514089509827137,
"grad_norm": 0.04549364277100318,
"learning_rate": 0.00024584273377141306,
"loss": 0.7948,
"step": 1855
},
{
"epoch": 0.3523561449206725,
"grad_norm": 0.05177910151205359,
"learning_rate": 0.0002454606324106977,
"loss": 0.8036,
"step": 1860
},
{
"epoch": 0.3533033388586313,
"grad_norm": 0.04603732482580829,
"learning_rate": 0.00024507748710780034,
"loss": 0.8062,
"step": 1865
},
{
"epoch": 0.3542505327965901,
"grad_norm": 0.05348425306250115,
"learning_rate": 0.00024469330205273676,
"loss": 0.7993,
"step": 1870
},
{
"epoch": 0.3551977267345489,
"grad_norm": 0.04956827787982081,
"learning_rate": 0.0002443080814468931,
"loss": 0.8036,
"step": 1875
},
{
"epoch": 0.3561449206725077,
"grad_norm": 0.05048490898899849,
"learning_rate": 0.00024392182950298033,
"loss": 0.8339,
"step": 1880
},
{
"epoch": 0.35709211461046647,
"grad_norm": 0.04546839597515788,
"learning_rate": 0.0002435345504449877,
"loss": 0.8127,
"step": 1885
},
{
"epoch": 0.3580393085484253,
"grad_norm": 0.04742371580337266,
"learning_rate": 0.00024314624850813689,
"loss": 0.8226,
"step": 1890
},
{
"epoch": 0.3589865024863841,
"grad_norm": 0.04631643296750854,
"learning_rate": 0.00024275692793883577,
"loss": 0.8133,
"step": 1895
},
{
"epoch": 0.35993369642434286,
"grad_norm": 0.04058836177118087,
"learning_rate": 0.00024236659299463171,
"loss": 0.7976,
"step": 1900
},
{
"epoch": 0.3608808903623017,
"grad_norm": 0.049761880526735185,
"learning_rate": 0.00024197524794416508,
"loss": 0.8144,
"step": 1905
},
{
"epoch": 0.3618280843002605,
"grad_norm": 0.0434144853404768,
"learning_rate": 0.00024158289706712266,
"loss": 0.7961,
"step": 1910
},
{
"epoch": 0.36277527823821926,
"grad_norm": 0.04729858009338802,
"learning_rate": 0.0002411895446541908,
"loss": 0.8092,
"step": 1915
},
{
"epoch": 0.3637224721761781,
"grad_norm": 0.04391468404782121,
"learning_rate": 0.00024079519500700848,
"loss": 0.7873,
"step": 1920
},
{
"epoch": 0.36466966611413687,
"grad_norm": 0.046136629260664565,
"learning_rate": 0.00024039985243812017,
"loss": 0.8358,
"step": 1925
},
{
"epoch": 0.36561686005209565,
"grad_norm": 0.04922463227417696,
"learning_rate": 0.000240003521270929,
"loss": 0.7982,
"step": 1930
},
{
"epoch": 0.3665640539900545,
"grad_norm": 0.050099522270096786,
"learning_rate": 0.00023960620583964905,
"loss": 0.8119,
"step": 1935
},
{
"epoch": 0.36751124792801326,
"grad_norm": 0.04673234998587366,
"learning_rate": 0.00023920791048925817,
"loss": 0.7916,
"step": 1940
},
{
"epoch": 0.36845844186597204,
"grad_norm": 0.044736938963155615,
"learning_rate": 0.00023880863957545065,
"loss": 0.8092,
"step": 1945
},
{
"epoch": 0.3694056358039309,
"grad_norm": 0.04403494677156711,
"learning_rate": 0.00023840839746458906,
"loss": 0.8007,
"step": 1950
},
{
"epoch": 0.37035282974188966,
"grad_norm": 0.04207278974539967,
"learning_rate": 0.00023800718853365707,
"loss": 0.8079,
"step": 1955
},
{
"epoch": 0.37130002367984843,
"grad_norm": 0.04441047413325019,
"learning_rate": 0.00023760501717021127,
"loss": 0.7981,
"step": 1960
},
{
"epoch": 0.37224721761780727,
"grad_norm": 0.04392750904458252,
"learning_rate": 0.00023720188777233328,
"loss": 0.8189,
"step": 1965
},
{
"epoch": 0.37319441155576605,
"grad_norm": 0.0409355126519413,
"learning_rate": 0.0002367978047485816,
"loss": 0.8065,
"step": 1970
},
{
"epoch": 0.3741416054937248,
"grad_norm": 0.044747845633113445,
"learning_rate": 0.00023639277251794342,
"loss": 0.8152,
"step": 1975
},
{
"epoch": 0.37508879943168366,
"grad_norm": 0.04360255348673726,
"learning_rate": 0.0002359867955097863,
"loss": 0.797,
"step": 1980
},
{
"epoch": 0.37603599336964244,
"grad_norm": 0.04405719390782307,
"learning_rate": 0.00023557987816380985,
"loss": 0.8058,
"step": 1985
},
{
"epoch": 0.3769831873076012,
"grad_norm": 0.04839154568146625,
"learning_rate": 0.00023517202492999686,
"loss": 0.8114,
"step": 1990
},
{
"epoch": 0.37793038124556005,
"grad_norm": 0.04779278427510571,
"learning_rate": 0.00023476324026856503,
"loss": 0.7969,
"step": 1995
},
{
"epoch": 0.37887757518351883,
"grad_norm": 0.05404705371255034,
"learning_rate": 0.00023435352864991787,
"loss": 0.8054,
"step": 2000
},
{
"epoch": 0.3798247691214776,
"grad_norm": 0.04669907966605441,
"learning_rate": 0.000233942894554596,
"loss": 0.8018,
"step": 2005
},
{
"epoch": 0.38077196305943645,
"grad_norm": 0.043137725132093525,
"learning_rate": 0.0002335313424732282,
"loss": 0.7924,
"step": 2010
},
{
"epoch": 0.3817191569973952,
"grad_norm": 0.04354182985885191,
"learning_rate": 0.00023311887690648196,
"loss": 0.7958,
"step": 2015
},
{
"epoch": 0.382666350935354,
"grad_norm": 0.042644363380025696,
"learning_rate": 0.00023270550236501467,
"loss": 0.8399,
"step": 2020
},
{
"epoch": 0.3836135448733128,
"grad_norm": 0.04539230083821745,
"learning_rate": 0.00023229122336942417,
"loss": 0.8038,
"step": 2025
},
{
"epoch": 0.3845607388112716,
"grad_norm": 0.04633684365426799,
"learning_rate": 0.0002318760444501991,
"loss": 0.7918,
"step": 2030
},
{
"epoch": 0.3855079327492304,
"grad_norm": 0.04726122562269177,
"learning_rate": 0.0002314599701476696,
"loss": 0.8095,
"step": 2035
},
{
"epoch": 0.3864551266871892,
"grad_norm": 0.04700120263284989,
"learning_rate": 0.00023104300501195765,
"loss": 0.7986,
"step": 2040
},
{
"epoch": 0.387402320625148,
"grad_norm": 0.04621030089117987,
"learning_rate": 0.0002306251536029271,
"loss": 0.803,
"step": 2045
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.04183970144784169,
"learning_rate": 0.00023020642049013403,
"loss": 0.785,
"step": 2050
},
{
"epoch": 0.38929670850106557,
"grad_norm": 0.0469484416435775,
"learning_rate": 0.0002297868102527767,
"loss": 0.7991,
"step": 2055
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.04737826498167925,
"learning_rate": 0.0002293663274796454,
"loss": 0.8004,
"step": 2060
},
{
"epoch": 0.3911910963769832,
"grad_norm": 0.04713629528918563,
"learning_rate": 0.00022894497676907244,
"loss": 0.7856,
"step": 2065
},
{
"epoch": 0.39213829031494196,
"grad_norm": 0.0539311516444963,
"learning_rate": 0.0002285227627288816,
"loss": 0.8007,
"step": 2070
},
{
"epoch": 0.3930854842529008,
"grad_norm": 0.04352919743448704,
"learning_rate": 0.00022809968997633803,
"loss": 0.7976,
"step": 2075
},
{
"epoch": 0.3940326781908596,
"grad_norm": 0.0438878761761322,
"learning_rate": 0.00022767576313809757,
"loss": 0.8084,
"step": 2080
},
{
"epoch": 0.39497987212881835,
"grad_norm": 0.04356708774126194,
"learning_rate": 0.0002272509868501561,
"loss": 0.8018,
"step": 2085
},
{
"epoch": 0.3959270660667772,
"grad_norm": 0.044799245818807745,
"learning_rate": 0.00022682536575779926,
"loss": 0.8185,
"step": 2090
},
{
"epoch": 0.39687426000473597,
"grad_norm": 0.04366762752086422,
"learning_rate": 0.00022639890451555094,
"loss": 0.8082,
"step": 2095
},
{
"epoch": 0.39782145394269475,
"grad_norm": 0.04970721314212078,
"learning_rate": 0.00022597160778712303,
"loss": 0.8163,
"step": 2100
},
{
"epoch": 0.3987686478806536,
"grad_norm": 0.04264183467808777,
"learning_rate": 0.00022554348024536413,
"loss": 0.7765,
"step": 2105
},
{
"epoch": 0.39971584181861236,
"grad_norm": 0.046000942745726546,
"learning_rate": 0.00022511452657220836,
"loss": 0.7767,
"step": 2110
},
{
"epoch": 0.40066303575657114,
"grad_norm": 0.04416028306809829,
"learning_rate": 0.0002246847514586244,
"loss": 0.7756,
"step": 2115
},
{
"epoch": 0.40161022969453,
"grad_norm": 0.04652032350471012,
"learning_rate": 0.00022425415960456406,
"loss": 0.785,
"step": 2120
},
{
"epoch": 0.40255742363248875,
"grad_norm": 0.039319004016494394,
"learning_rate": 0.00022382275571891088,
"loss": 0.8171,
"step": 2125
},
{
"epoch": 0.40350461757044753,
"grad_norm": 0.043191035340779275,
"learning_rate": 0.00022339054451942853,
"loss": 0.7888,
"step": 2130
},
{
"epoch": 0.40445181150840637,
"grad_norm": 0.04850980353591229,
"learning_rate": 0.00022295753073270957,
"loss": 0.8024,
"step": 2135
},
{
"epoch": 0.40539900544636515,
"grad_norm": 0.05346932816843321,
"learning_rate": 0.00022252371909412338,
"loss": 0.7943,
"step": 2140
},
{
"epoch": 0.4063461993843239,
"grad_norm": 0.0504819764321655,
"learning_rate": 0.00022208911434776446,
"loss": 0.8113,
"step": 2145
},
{
"epoch": 0.40729339332228276,
"grad_norm": 0.04358873484038266,
"learning_rate": 0.00022165372124640075,
"loss": 0.7792,
"step": 2150
},
{
"epoch": 0.40824058726024154,
"grad_norm": 0.048136044036710554,
"learning_rate": 0.0002212175445514214,
"loss": 0.8271,
"step": 2155
},
{
"epoch": 0.4091877811982003,
"grad_norm": 0.0482886839866423,
"learning_rate": 0.00022078058903278493,
"loss": 0.8082,
"step": 2160
},
{
"epoch": 0.41013497513615915,
"grad_norm": 0.04932102621776615,
"learning_rate": 0.00022034285946896683,
"loss": 0.8157,
"step": 2165
},
{
"epoch": 0.41108216907411793,
"grad_norm": 0.04558271327582903,
"learning_rate": 0.0002199043606469075,
"loss": 0.8205,
"step": 2170
},
{
"epoch": 0.4120293630120767,
"grad_norm": 0.04637987890505172,
"learning_rate": 0.00021946509736195982,
"loss": 0.8104,
"step": 2175
},
{
"epoch": 0.41297655695003554,
"grad_norm": 0.04402627600967093,
"learning_rate": 0.00021902507441783666,
"loss": 0.7735,
"step": 2180
},
{
"epoch": 0.4139237508879943,
"grad_norm": 0.04343306123899328,
"learning_rate": 0.0002185842966265585,
"loss": 0.8137,
"step": 2185
},
{
"epoch": 0.4148709448259531,
"grad_norm": 0.04049175499820178,
"learning_rate": 0.00021814276880840057,
"loss": 0.7666,
"step": 2190
},
{
"epoch": 0.41581813876391194,
"grad_norm": 0.04190114466718279,
"learning_rate": 0.0002177004957918404,
"loss": 0.7941,
"step": 2195
},
{
"epoch": 0.4167653327018707,
"grad_norm": 0.04167140065120627,
"learning_rate": 0.00021725748241350486,
"loss": 0.8049,
"step": 2200
},
{
"epoch": 0.4177125266398295,
"grad_norm": 0.04030091050865229,
"learning_rate": 0.00021681373351811715,
"loss": 0.7765,
"step": 2205
},
{
"epoch": 0.4186597205777883,
"grad_norm": 0.043928276470467834,
"learning_rate": 0.00021636925395844425,
"loss": 0.8004,
"step": 2210
},
{
"epoch": 0.4196069145157471,
"grad_norm": 0.041711739434037935,
"learning_rate": 0.00021592404859524338,
"loss": 0.8014,
"step": 2215
},
{
"epoch": 0.4205541084537059,
"grad_norm": 0.08515140585912362,
"learning_rate": 0.00021547812229720905,
"loss": 0.7925,
"step": 2220
},
{
"epoch": 0.42150130239166467,
"grad_norm": 0.046571948096373585,
"learning_rate": 0.0002150314799409198,
"loss": 0.7995,
"step": 2225
},
{
"epoch": 0.4224484963296235,
"grad_norm": 0.04291996713823751,
"learning_rate": 0.00021458412641078484,
"loss": 0.7833,
"step": 2230
},
{
"epoch": 0.4233956902675823,
"grad_norm": 0.04800959860061716,
"learning_rate": 0.00021413606659899075,
"loss": 0.8056,
"step": 2235
},
{
"epoch": 0.42434288420554106,
"grad_norm": 0.04165302702567769,
"learning_rate": 0.00021368730540544784,
"loss": 0.8031,
"step": 2240
},
{
"epoch": 0.4252900781434999,
"grad_norm": 0.042443453575190054,
"learning_rate": 0.0002132378477377366,
"loss": 0.8342,
"step": 2245
},
{
"epoch": 0.4262372720814587,
"grad_norm": 0.04364197763796348,
"learning_rate": 0.00021278769851105413,
"loss": 0.8069,
"step": 2250
},
{
"epoch": 0.42718446601941745,
"grad_norm": 0.04424427184165216,
"learning_rate": 0.00021233686264816024,
"loss": 0.8093,
"step": 2255
},
{
"epoch": 0.4281316599573763,
"grad_norm": 0.04438652665500913,
"learning_rate": 0.00021188534507932369,
"loss": 0.812,
"step": 2260
},
{
"epoch": 0.42907885389533507,
"grad_norm": 0.04194291606682371,
"learning_rate": 0.0002114331507422682,
"loss": 0.7999,
"step": 2265
},
{
"epoch": 0.43002604783329385,
"grad_norm": 0.040114143951823826,
"learning_rate": 0.0002109802845821187,
"loss": 0.776,
"step": 2270
},
{
"epoch": 0.4309732417712527,
"grad_norm": 0.049855477781271945,
"learning_rate": 0.0002105267515513469,
"loss": 0.7898,
"step": 2275
},
{
"epoch": 0.43192043570921146,
"grad_norm": 0.04832409909314252,
"learning_rate": 0.00021007255660971736,
"loss": 0.7705,
"step": 2280
},
{
"epoch": 0.43286762964717024,
"grad_norm": 0.04448804574954983,
"learning_rate": 0.00020961770472423323,
"loss": 0.7856,
"step": 2285
},
{
"epoch": 0.4338148235851291,
"grad_norm": 0.04056107663630711,
"learning_rate": 0.00020916220086908185,
"loss": 0.8386,
"step": 2290
},
{
"epoch": 0.43476201752308785,
"grad_norm": 0.04562375800507603,
"learning_rate": 0.00020870605002558038,
"loss": 0.7919,
"step": 2295
},
{
"epoch": 0.43570921146104663,
"grad_norm": 0.04990170126587913,
"learning_rate": 0.00020824925718212133,
"loss": 0.7812,
"step": 2300
},
{
"epoch": 0.43665640539900547,
"grad_norm": 0.04782898096811606,
"learning_rate": 0.00020779182733411813,
"loss": 0.8204,
"step": 2305
},
{
"epoch": 0.43760359933696424,
"grad_norm": 0.05977747562829158,
"learning_rate": 0.00020733376548395026,
"loss": 0.7674,
"step": 2310
},
{
"epoch": 0.438550793274923,
"grad_norm": 0.050186919112014665,
"learning_rate": 0.00020687507664090873,
"loss": 0.7842,
"step": 2315
},
{
"epoch": 0.43949798721288186,
"grad_norm": 0.06774152080561698,
"learning_rate": 0.0002064157658211413,
"loss": 0.7863,
"step": 2320
},
{
"epoch": 0.44044518115084064,
"grad_norm": 0.05251562387148029,
"learning_rate": 0.0002059558380475974,
"loss": 0.7803,
"step": 2325
},
{
"epoch": 0.4413923750887994,
"grad_norm": 0.11110903402386875,
"learning_rate": 0.00020549529834997356,
"loss": 0.8211,
"step": 2330
},
{
"epoch": 0.44233956902675825,
"grad_norm": 0.05088713402476131,
"learning_rate": 0.0002050341517646581,
"loss": 0.8229,
"step": 2335
},
{
"epoch": 0.44328676296471703,
"grad_norm": 0.048748937702802024,
"learning_rate": 0.00020457240333467618,
"loss": 0.8308,
"step": 2340
},
{
"epoch": 0.4442339569026758,
"grad_norm": 0.04414994149656158,
"learning_rate": 0.00020411005810963467,
"loss": 0.7783,
"step": 2345
},
{
"epoch": 0.44518115084063464,
"grad_norm": 0.04297554142966596,
"learning_rate": 0.00020364712114566682,
"loss": 0.7994,
"step": 2350
},
{
"epoch": 0.4461283447785934,
"grad_norm": 0.0466918891507033,
"learning_rate": 0.00020318359750537722,
"loss": 0.7766,
"step": 2355
},
{
"epoch": 0.4470755387165522,
"grad_norm": 0.0426774503592908,
"learning_rate": 0.00020271949225778604,
"loss": 0.7689,
"step": 2360
},
{
"epoch": 0.44802273265451104,
"grad_norm": 0.04339210841225298,
"learning_rate": 0.00020225481047827395,
"loss": 0.7629,
"step": 2365
},
{
"epoch": 0.4489699265924698,
"grad_norm": 0.04157539901744288,
"learning_rate": 0.0002017895572485264,
"loss": 0.7993,
"step": 2370
},
{
"epoch": 0.4499171205304286,
"grad_norm": 0.04747869551715979,
"learning_rate": 0.00020132373765647824,
"loss": 0.7831,
"step": 2375
},
{
"epoch": 0.45086431446838743,
"grad_norm": 0.041036503272024266,
"learning_rate": 0.00020085735679625785,
"loss": 0.7938,
"step": 2380
},
{
"epoch": 0.4518115084063462,
"grad_norm": 0.046349793056969124,
"learning_rate": 0.00020039041976813155,
"loss": 0.8213,
"step": 2385
},
{
"epoch": 0.452758702344305,
"grad_norm": 0.04266113778547003,
"learning_rate": 0.000199922931678448,
"loss": 0.7797,
"step": 2390
},
{
"epoch": 0.4537058962822638,
"grad_norm": 0.04138658311957195,
"learning_rate": 0.00019945489763958192,
"loss": 0.7855,
"step": 2395
},
{
"epoch": 0.4546530902202226,
"grad_norm": 0.03957822721175349,
"learning_rate": 0.00019898632276987865,
"loss": 0.7802,
"step": 2400
},
{
"epoch": 0.4556002841581814,
"grad_norm": 0.043723247514117734,
"learning_rate": 0.00019851721219359787,
"loss": 0.7914,
"step": 2405
},
{
"epoch": 0.45654747809614016,
"grad_norm": 0.040483397534663346,
"learning_rate": 0.0001980475710408577,
"loss": 0.7784,
"step": 2410
},
{
"epoch": 0.457494672034099,
"grad_norm": 0.047435378340581154,
"learning_rate": 0.00019757740444757856,
"loss": 0.8099,
"step": 2415
},
{
"epoch": 0.45844186597205777,
"grad_norm": 0.04167097525052772,
"learning_rate": 0.00019710671755542684,
"loss": 0.8004,
"step": 2420
},
{
"epoch": 0.45938905991001655,
"grad_norm": 0.045347168717410416,
"learning_rate": 0.0001966355155117592,
"loss": 0.7503,
"step": 2425
},
{
"epoch": 0.4603362538479754,
"grad_norm": 0.04793512210403035,
"learning_rate": 0.00019616380346956555,
"loss": 0.8034,
"step": 2430
},
{
"epoch": 0.46128344778593416,
"grad_norm": 0.044750662926773134,
"learning_rate": 0.00019569158658741325,
"loss": 0.8036,
"step": 2435
},
{
"epoch": 0.46223064172389294,
"grad_norm": 0.0408279434029639,
"learning_rate": 0.0001952188700293905,
"loss": 0.7744,
"step": 2440
},
{
"epoch": 0.4631778356618518,
"grad_norm": 0.04475763705667161,
"learning_rate": 0.0001947456589650498,
"loss": 0.7831,
"step": 2445
},
{
"epoch": 0.46412502959981056,
"grad_norm": 0.048943569028928394,
"learning_rate": 0.00019427195856935156,
"loss": 0.7584,
"step": 2450
},
{
"epoch": 0.46507222353776934,
"grad_norm": 0.0470737958639738,
"learning_rate": 0.00019379777402260735,
"loss": 0.8045,
"step": 2455
},
{
"epoch": 0.46601941747572817,
"grad_norm": 0.04333259402866556,
"learning_rate": 0.0001933231105104235,
"loss": 0.8252,
"step": 2460
},
{
"epoch": 0.46696661141368695,
"grad_norm": 0.04500393126058614,
"learning_rate": 0.00019284797322364412,
"loss": 0.7963,
"step": 2465
},
{
"epoch": 0.46791380535164573,
"grad_norm": 0.04617455518775394,
"learning_rate": 0.00019237236735829434,
"loss": 0.7905,
"step": 2470
},
{
"epoch": 0.46886099928960456,
"grad_norm": 0.04568226599386409,
"learning_rate": 0.0001918962981155238,
"loss": 0.7878,
"step": 2475
},
{
"epoch": 0.46980819322756334,
"grad_norm": 0.04384781746645944,
"learning_rate": 0.00019141977070154945,
"loss": 0.8155,
"step": 2480
},
{
"epoch": 0.4707553871655221,
"grad_norm": 0.04128165433631373,
"learning_rate": 0.0001909427903275988,
"loss": 0.8024,
"step": 2485
},
{
"epoch": 0.47170258110348096,
"grad_norm": 0.04842154647206823,
"learning_rate": 0.00019046536220985267,
"loss": 0.7762,
"step": 2490
},
{
"epoch": 0.47264977504143973,
"grad_norm": 0.042673491173335736,
"learning_rate": 0.00018998749156938854,
"loss": 0.7709,
"step": 2495
},
{
"epoch": 0.4735969689793985,
"grad_norm": 0.04785439305575584,
"learning_rate": 0.00018950918363212324,
"loss": 0.7804,
"step": 2500
},
{
"epoch": 0.47454416291735735,
"grad_norm": 0.0461863217228199,
"learning_rate": 0.00018903044362875558,
"loss": 0.7925,
"step": 2505
},
{
"epoch": 0.4754913568553161,
"grad_norm": 0.044243673212673404,
"learning_rate": 0.0001885512767947097,
"loss": 0.7941,
"step": 2510
},
{
"epoch": 0.4764385507932749,
"grad_norm": 0.040611884775210036,
"learning_rate": 0.0001880716883700772,
"loss": 0.7562,
"step": 2515
},
{
"epoch": 0.47738574473123374,
"grad_norm": 0.041073708433040416,
"learning_rate": 0.00018759168359956034,
"loss": 0.7856,
"step": 2520
},
{
"epoch": 0.4783329386691925,
"grad_norm": 0.040555781455264096,
"learning_rate": 0.00018711126773241434,
"loss": 0.7808,
"step": 2525
},
{
"epoch": 0.4792801326071513,
"grad_norm": 0.04412592641007611,
"learning_rate": 0.00018663044602239016,
"loss": 0.7527,
"step": 2530
},
{
"epoch": 0.48022732654511013,
"grad_norm": 0.04484068242737891,
"learning_rate": 0.00018614922372767705,
"loss": 0.8026,
"step": 2535
},
{
"epoch": 0.4811745204830689,
"grad_norm": 0.04494181539453932,
"learning_rate": 0.00018566760611084482,
"loss": 0.7884,
"step": 2540
},
{
"epoch": 0.4821217144210277,
"grad_norm": 0.04177209726984591,
"learning_rate": 0.00018518559843878663,
"loss": 0.7944,
"step": 2545
},
{
"epoch": 0.4830689083589865,
"grad_norm": 0.0418891994027945,
"learning_rate": 0.00018470320598266114,
"loss": 0.7876,
"step": 2550
},
{
"epoch": 0.4840161022969453,
"grad_norm": 0.04509400985562226,
"learning_rate": 0.00018422043401783499,
"loss": 0.7906,
"step": 2555
},
{
"epoch": 0.4849632962349041,
"grad_norm": 0.03886301281064671,
"learning_rate": 0.00018373728782382497,
"loss": 0.7658,
"step": 2560
},
{
"epoch": 0.4859104901728629,
"grad_norm": 0.04186589657661449,
"learning_rate": 0.00018325377268424054,
"loss": 0.7921,
"step": 2565
},
{
"epoch": 0.4868576841108217,
"grad_norm": 0.04025799467986521,
"learning_rate": 0.00018276989388672573,
"loss": 0.8143,
"step": 2570
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.04058499655814065,
"learning_rate": 0.0001822856567229016,
"loss": 0.7819,
"step": 2575
},
{
"epoch": 0.4887520719867393,
"grad_norm": 0.04628173800240841,
"learning_rate": 0.0001818010664883082,
"loss": 0.7944,
"step": 2580
},
{
"epoch": 0.4896992659246981,
"grad_norm": 0.042223328062815924,
"learning_rate": 0.0001813161284823466,
"loss": 0.7975,
"step": 2585
},
{
"epoch": 0.49064645986265687,
"grad_norm": 0.041313006294521955,
"learning_rate": 0.00018083084800822128,
"loss": 0.7954,
"step": 2590
},
{
"epoch": 0.49159365380061565,
"grad_norm": 0.042569155544217974,
"learning_rate": 0.0001803452303728816,
"loss": 0.7628,
"step": 2595
},
{
"epoch": 0.4925408477385745,
"grad_norm": 0.043867104326580246,
"learning_rate": 0.00017985928088696434,
"loss": 0.7558,
"step": 2600
},
{
"epoch": 0.49348804167653326,
"grad_norm": 0.04371521207560914,
"learning_rate": 0.0001793730048647352,
"loss": 0.7686,
"step": 2605
},
{
"epoch": 0.49443523561449204,
"grad_norm": 0.04369821893875824,
"learning_rate": 0.00017888640762403078,
"loss": 0.7961,
"step": 2610
},
{
"epoch": 0.4953824295524509,
"grad_norm": 0.043782837772372955,
"learning_rate": 0.00017839949448620064,
"loss": 0.8211,
"step": 2615
},
{
"epoch": 0.49632962349040965,
"grad_norm": 0.04427063556293102,
"learning_rate": 0.00017791227077604876,
"loss": 0.7961,
"step": 2620
},
{
"epoch": 0.49727681742836843,
"grad_norm": 0.04245267391622513,
"learning_rate": 0.00017742474182177567,
"loss": 0.7556,
"step": 2625
},
{
"epoch": 0.49822401136632727,
"grad_norm": 0.04105426628219163,
"learning_rate": 0.00017693691295491982,
"loss": 0.7994,
"step": 2630
},
{
"epoch": 0.49917120530428605,
"grad_norm": 0.03984608708960842,
"learning_rate": 0.0001764487895102995,
"loss": 0.7818,
"step": 2635
},
{
"epoch": 0.5001183992422449,
"grad_norm": 0.04428069948873579,
"learning_rate": 0.00017596037682595465,
"loss": 0.7862,
"step": 2640
},
{
"epoch": 0.5010655931802036,
"grad_norm": 0.04490329464859081,
"learning_rate": 0.00017547168024308806,
"loss": 0.7975,
"step": 2645
},
{
"epoch": 0.5020127871181624,
"grad_norm": 0.0413111891075842,
"learning_rate": 0.0001749827051060072,
"loss": 0.7678,
"step": 2650
},
{
"epoch": 0.5029599810561213,
"grad_norm": 0.04240726873268668,
"learning_rate": 0.00017449345676206595,
"loss": 0.796,
"step": 2655
},
{
"epoch": 0.50390717499408,
"grad_norm": 0.041665058788828147,
"learning_rate": 0.0001740039405616057,
"loss": 0.7769,
"step": 2660
},
{
"epoch": 0.5048543689320388,
"grad_norm": 0.044320910745836974,
"learning_rate": 0.00017351416185789725,
"loss": 0.7805,
"step": 2665
},
{
"epoch": 0.5058015628699977,
"grad_norm": 0.04059793612300337,
"learning_rate": 0.00017302412600708202,
"loss": 0.7585,
"step": 2670
},
{
"epoch": 0.5067487568079564,
"grad_norm": 0.04002124902466378,
"learning_rate": 0.00017253383836811356,
"loss": 0.7902,
"step": 2675
},
{
"epoch": 0.5076959507459152,
"grad_norm": 0.03946188221690789,
"learning_rate": 0.00017204330430269896,
"loss": 0.7883,
"step": 2680
},
{
"epoch": 0.5086431446838741,
"grad_norm": 0.04168808355706374,
"learning_rate": 0.00017155252917524014,
"loss": 0.7623,
"step": 2685
},
{
"epoch": 0.5095903386218328,
"grad_norm": 0.0403800938955808,
"learning_rate": 0.0001710615183527753,
"loss": 0.7837,
"step": 2690
},
{
"epoch": 0.5105375325597916,
"grad_norm": 0.04265285946105665,
"learning_rate": 0.0001705702772049201,
"loss": 0.782,
"step": 2695
},
{
"epoch": 0.5114847264977505,
"grad_norm": 0.03909716815369796,
"learning_rate": 0.00017007881110380903,
"loss": 0.7992,
"step": 2700
},
{
"epoch": 0.5124319204357092,
"grad_norm": 0.04390278736912629,
"learning_rate": 0.00016958712542403665,
"loss": 0.7925,
"step": 2705
},
{
"epoch": 0.513379114373668,
"grad_norm": 0.040328289968045196,
"learning_rate": 0.00016909522554259875,
"loss": 0.7888,
"step": 2710
},
{
"epoch": 0.5143263083116268,
"grad_norm": 0.042972772541450946,
"learning_rate": 0.00016860311683883366,
"loss": 0.7522,
"step": 2715
},
{
"epoch": 0.5152735022495856,
"grad_norm": 0.040623359273199364,
"learning_rate": 0.0001681108046943633,
"loss": 0.7673,
"step": 2720
},
{
"epoch": 0.5162206961875444,
"grad_norm": 0.0451021298457456,
"learning_rate": 0.00016761829449303442,
"loss": 0.7803,
"step": 2725
},
{
"epoch": 0.5171678901255032,
"grad_norm": 0.041924502748264085,
"learning_rate": 0.00016712559162085963,
"loss": 0.7691,
"step": 2730
},
{
"epoch": 0.518115084063462,
"grad_norm": 0.04149735002775417,
"learning_rate": 0.0001666327014659587,
"loss": 0.7889,
"step": 2735
},
{
"epoch": 0.5190622780014208,
"grad_norm": 0.04415559586614686,
"learning_rate": 0.00016613962941849924,
"loss": 0.7808,
"step": 2740
},
{
"epoch": 0.5200094719393796,
"grad_norm": 0.03925128460208889,
"learning_rate": 0.00016564638087063834,
"loss": 0.7773,
"step": 2745
},
{
"epoch": 0.5209566658773384,
"grad_norm": 0.04368831843596003,
"learning_rate": 0.00016515296121646299,
"loss": 0.7882,
"step": 2750
},
{
"epoch": 0.5219038598152972,
"grad_norm": 0.0422781127201221,
"learning_rate": 0.00016465937585193144,
"loss": 0.764,
"step": 2755
},
{
"epoch": 0.522851053753256,
"grad_norm": 0.040472037162999186,
"learning_rate": 0.0001641656301748143,
"loss": 0.7667,
"step": 2760
},
{
"epoch": 0.5237982476912147,
"grad_norm": 0.04379401207313861,
"learning_rate": 0.00016367172958463503,
"loss": 0.7792,
"step": 2765
},
{
"epoch": 0.5247454416291736,
"grad_norm": 0.04114160448623085,
"learning_rate": 0.00016317767948261148,
"loss": 0.7812,
"step": 2770
},
{
"epoch": 0.5256926355671324,
"grad_norm": 0.03953583092530725,
"learning_rate": 0.00016268348527159632,
"loss": 0.751,
"step": 2775
},
{
"epoch": 0.5266398295050911,
"grad_norm": 0.046057484794039295,
"learning_rate": 0.0001621891523560183,
"loss": 0.8031,
"step": 2780
},
{
"epoch": 0.52758702344305,
"grad_norm": 0.044096492253313226,
"learning_rate": 0.00016169468614182306,
"loss": 0.768,
"step": 2785
},
{
"epoch": 0.5285342173810088,
"grad_norm": 0.04028812660467143,
"learning_rate": 0.00016120009203641374,
"loss": 0.7417,
"step": 2790
},
{
"epoch": 0.5294814113189675,
"grad_norm": 0.0421254976442694,
"learning_rate": 0.00016070537544859238,
"loss": 0.7525,
"step": 2795
},
{
"epoch": 0.5304286052569264,
"grad_norm": 0.041297980776658014,
"learning_rate": 0.00016021054178850025,
"loss": 0.7555,
"step": 2800
},
{
"epoch": 0.5313757991948852,
"grad_norm": 0.04218044701907684,
"learning_rate": 0.000159715596467559,
"loss": 0.7636,
"step": 2805
},
{
"epoch": 0.5323229931328439,
"grad_norm": 0.04257545791321493,
"learning_rate": 0.00015922054489841134,
"loss": 0.7877,
"step": 2810
},
{
"epoch": 0.5332701870708028,
"grad_norm": 0.04037611852463485,
"learning_rate": 0.0001587253924948619,
"loss": 0.7967,
"step": 2815
},
{
"epoch": 0.5342173810087616,
"grad_norm": 0.04724884702232222,
"learning_rate": 0.00015823014467181813,
"loss": 0.778,
"step": 2820
},
{
"epoch": 0.5351645749467203,
"grad_norm": 0.041552815375501054,
"learning_rate": 0.00015773480684523082,
"loss": 0.7644,
"step": 2825
},
{
"epoch": 0.5361117688846792,
"grad_norm": 0.04060976824870222,
"learning_rate": 0.00015723938443203505,
"loss": 0.7568,
"step": 2830
},
{
"epoch": 0.537058962822638,
"grad_norm": 0.039968469240745465,
"learning_rate": 0.000156743882850091,
"loss": 0.7641,
"step": 2835
},
{
"epoch": 0.5380061567605967,
"grad_norm": 0.04550185985191349,
"learning_rate": 0.00015624830751812452,
"loss": 0.7631,
"step": 2840
},
{
"epoch": 0.5389533506985555,
"grad_norm": 0.04397422552677812,
"learning_rate": 0.0001557526638556681,
"loss": 0.7898,
"step": 2845
},
{
"epoch": 0.5399005446365144,
"grad_norm": 0.04128096672528751,
"learning_rate": 0.00015525695728300142,
"loss": 0.8049,
"step": 2850
},
{
"epoch": 0.5408477385744731,
"grad_norm": 0.043926555415344445,
"learning_rate": 0.00015476119322109215,
"loss": 0.7856,
"step": 2855
},
{
"epoch": 0.5417949325124319,
"grad_norm": 0.04191285035326946,
"learning_rate": 0.00015426537709153665,
"loss": 0.7811,
"step": 2860
},
{
"epoch": 0.5427421264503908,
"grad_norm": 0.04398915485090648,
"learning_rate": 0.00015376951431650063,
"loss": 0.7642,
"step": 2865
},
{
"epoch": 0.5436893203883495,
"grad_norm": 0.043931690655324165,
"learning_rate": 0.00015327361031865994,
"loss": 0.7453,
"step": 2870
},
{
"epoch": 0.5446365143263083,
"grad_norm": 0.04098372350907064,
"learning_rate": 0.00015277767052114134,
"loss": 0.791,
"step": 2875
},
{
"epoch": 0.545583708264267,
"grad_norm": 0.044827794864255185,
"learning_rate": 0.00015228170034746287,
"loss": 0.7742,
"step": 2880
},
{
"epoch": 0.5465309022022259,
"grad_norm": 0.043361929504781155,
"learning_rate": 0.00015178570522147503,
"loss": 0.7721,
"step": 2885
},
{
"epoch": 0.5474780961401847,
"grad_norm": 0.04140360142754399,
"learning_rate": 0.00015128969056730094,
"loss": 0.7638,
"step": 2890
},
{
"epoch": 0.5484252900781434,
"grad_norm": 0.044541139967598814,
"learning_rate": 0.00015079366180927747,
"loss": 0.7648,
"step": 2895
},
{
"epoch": 0.5493724840161023,
"grad_norm": 0.04373950378809721,
"learning_rate": 0.00015029762437189555,
"loss": 0.764,
"step": 2900
},
{
"epoch": 0.5503196779540611,
"grad_norm": 0.044464277569613665,
"learning_rate": 0.00014980158367974123,
"loss": 0.7584,
"step": 2905
},
{
"epoch": 0.5512668718920198,
"grad_norm": 0.041529612627100455,
"learning_rate": 0.000149305545157436,
"loss": 0.784,
"step": 2910
},
{
"epoch": 0.5522140658299787,
"grad_norm": 0.04387046595628468,
"learning_rate": 0.00014880951422957764,
"loss": 0.7829,
"step": 2915
},
{
"epoch": 0.5531612597679375,
"grad_norm": 0.03928205510955695,
"learning_rate": 0.00014831349632068097,
"loss": 0.7838,
"step": 2920
},
{
"epoch": 0.5541084537058962,
"grad_norm": 0.03792796655680272,
"learning_rate": 0.0001478174968551183,
"loss": 0.7585,
"step": 2925
},
{
"epoch": 0.5550556476438551,
"grad_norm": 0.0418350076429959,
"learning_rate": 0.00014732152125706042,
"loss": 0.7892,
"step": 2930
},
{
"epoch": 0.5560028415818139,
"grad_norm": 0.0459671460831212,
"learning_rate": 0.00014682557495041684,
"loss": 0.733,
"step": 2935
},
{
"epoch": 0.5569500355197726,
"grad_norm": 0.04058456377794918,
"learning_rate": 0.00014632966335877706,
"loss": 0.7686,
"step": 2940
},
{
"epoch": 0.5578972294577315,
"grad_norm": 0.040486030304535854,
"learning_rate": 0.00014583379190535075,
"loss": 0.7396,
"step": 2945
},
{
"epoch": 0.5588444233956903,
"grad_norm": 0.04178283807566538,
"learning_rate": 0.00014533796601290868,
"loss": 0.7982,
"step": 2950
},
{
"epoch": 0.559791617333649,
"grad_norm": 0.040277272924603875,
"learning_rate": 0.0001448421911037234,
"loss": 0.7607,
"step": 2955
},
{
"epoch": 0.5607388112716079,
"grad_norm": 0.041258343955637974,
"learning_rate": 0.0001443464725995098,
"loss": 0.7443,
"step": 2960
},
{
"epoch": 0.5616860052095667,
"grad_norm": 0.04032233073174216,
"learning_rate": 0.00014385081592136614,
"loss": 0.7993,
"step": 2965
},
{
"epoch": 0.5626331991475254,
"grad_norm": 0.03958174726228383,
"learning_rate": 0.0001433552264897143,
"loss": 0.7897,
"step": 2970
},
{
"epoch": 0.5635803930854842,
"grad_norm": 0.03858023826870275,
"learning_rate": 0.000142859709724241,
"loss": 0.7531,
"step": 2975
},
{
"epoch": 0.5645275870234431,
"grad_norm": 0.03863381061427778,
"learning_rate": 0.00014236427104383827,
"loss": 0.7683,
"step": 2980
},
{
"epoch": 0.5654747809614018,
"grad_norm": 0.04304392912853754,
"learning_rate": 0.00014186891586654395,
"loss": 0.7611,
"step": 2985
},
{
"epoch": 0.5664219748993606,
"grad_norm": 0.040517793123203284,
"learning_rate": 0.00014137364960948307,
"loss": 0.7597,
"step": 2990
},
{
"epoch": 0.5673691688373195,
"grad_norm": 0.04010964103438508,
"learning_rate": 0.0001408784776888079,
"loss": 0.7886,
"step": 2995
},
{
"epoch": 0.5683163627752782,
"grad_norm": 0.16933225878617628,
"learning_rate": 0.00014038340551963946,
"loss": 0.7754,
"step": 3000
},
{
"epoch": 0.569263556713237,
"grad_norm": 0.049628098964115824,
"learning_rate": 0.0001398884385160074,
"loss": 0.7557,
"step": 3005
},
{
"epoch": 0.5702107506511959,
"grad_norm": 0.04117704234522045,
"learning_rate": 0.00013939358209079177,
"loss": 0.7662,
"step": 3010
},
{
"epoch": 0.5711579445891546,
"grad_norm": 0.04404870833994479,
"learning_rate": 0.00013889884165566317,
"loss": 0.7802,
"step": 3015
},
{
"epoch": 0.5721051385271134,
"grad_norm": 0.0400783837624163,
"learning_rate": 0.00013840422262102357,
"loss": 0.7772,
"step": 3020
},
{
"epoch": 0.5730523324650723,
"grad_norm": 0.04445272029297132,
"learning_rate": 0.00013790973039594766,
"loss": 0.7403,
"step": 3025
},
{
"epoch": 0.573999526403031,
"grad_norm": 0.04189631829289255,
"learning_rate": 0.000137415370388123,
"loss": 0.7811,
"step": 3030
},
{
"epoch": 0.5749467203409898,
"grad_norm": 0.04116724359754346,
"learning_rate": 0.00013692114800379165,
"loss": 0.7696,
"step": 3035
},
{
"epoch": 0.5758939142789486,
"grad_norm": 0.03811498143887281,
"learning_rate": 0.00013642706864769023,
"loss": 0.7974,
"step": 3040
},
{
"epoch": 0.5768411082169074,
"grad_norm": 0.03813260068428526,
"learning_rate": 0.00013593313772299151,
"loss": 0.7491,
"step": 3045
},
{
"epoch": 0.5777883021548662,
"grad_norm": 0.04258997197360236,
"learning_rate": 0.00013543936063124503,
"loss": 0.7611,
"step": 3050
},
{
"epoch": 0.578735496092825,
"grad_norm": 0.044652230022576414,
"learning_rate": 0.00013494574277231772,
"loss": 0.7639,
"step": 3055
},
{
"epoch": 0.5796826900307838,
"grad_norm": 0.040644166117172274,
"learning_rate": 0.00013445228954433568,
"loss": 0.7871,
"step": 3060
},
{
"epoch": 0.5806298839687426,
"grad_norm": 0.03834957657226225,
"learning_rate": 0.00013395900634362418,
"loss": 0.7516,
"step": 3065
},
{
"epoch": 0.5815770779067014,
"grad_norm": 0.0425924434925859,
"learning_rate": 0.0001334658985646493,
"loss": 0.767,
"step": 3070
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.04603157938094156,
"learning_rate": 0.00013297297159995872,
"loss": 0.7642,
"step": 3075
},
{
"epoch": 0.583471465782619,
"grad_norm": 0.04251195805977364,
"learning_rate": 0.00013248023084012268,
"loss": 0.7695,
"step": 3080
},
{
"epoch": 0.5844186597205778,
"grad_norm": 0.04025338997238214,
"learning_rate": 0.0001319876816736754,
"loss": 0.7428,
"step": 3085
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.04211750103789674,
"learning_rate": 0.00013149532948705542,
"loss": 0.7621,
"step": 3090
},
{
"epoch": 0.5863130475964954,
"grad_norm": 0.04455741942115236,
"learning_rate": 0.0001310031796645475,
"loss": 0.8198,
"step": 3095
},
{
"epoch": 0.5872602415344542,
"grad_norm": 0.042122208500608355,
"learning_rate": 0.00013051123758822317,
"loss": 0.7902,
"step": 3100
},
{
"epoch": 0.5882074354724129,
"grad_norm": 0.04026734370639814,
"learning_rate": 0.0001300195086378822,
"loss": 0.743,
"step": 3105
},
{
"epoch": 0.5891546294103718,
"grad_norm": 0.04263858640190378,
"learning_rate": 0.00012952799819099362,
"loss": 0.7611,
"step": 3110
},
{
"epoch": 0.5901018233483306,
"grad_norm": 0.042020491856201506,
"learning_rate": 0.00012903671162263692,
"loss": 0.7638,
"step": 3115
},
{
"epoch": 0.5910490172862893,
"grad_norm": 0.0411120885986108,
"learning_rate": 0.0001285456543054433,
"loss": 0.7779,
"step": 3120
},
{
"epoch": 0.5919962112242482,
"grad_norm": 0.0406466480831228,
"learning_rate": 0.0001280548316095369,
"loss": 0.7648,
"step": 3125
},
{
"epoch": 0.592943405162207,
"grad_norm": 0.039876223182510155,
"learning_rate": 0.00012756424890247612,
"loss": 0.7465,
"step": 3130
},
{
"epoch": 0.5938905991001657,
"grad_norm": 0.03968060112874898,
"learning_rate": 0.00012707391154919478,
"loss": 0.7788,
"step": 3135
},
{
"epoch": 0.5948377930381246,
"grad_norm": 0.04109563591743239,
"learning_rate": 0.00012658382491194368,
"loss": 0.7629,
"step": 3140
},
{
"epoch": 0.5957849869760834,
"grad_norm": 0.04068329826647931,
"learning_rate": 0.0001260939943502317,
"loss": 0.7652,
"step": 3145
},
{
"epoch": 0.5967321809140421,
"grad_norm": 0.039457301232553726,
"learning_rate": 0.00012560442522076745,
"loss": 0.771,
"step": 3150
},
{
"epoch": 0.597679374852001,
"grad_norm": 0.039313871399632806,
"learning_rate": 0.0001251151228774005,
"loss": 0.7665,
"step": 3155
},
{
"epoch": 0.5986265687899598,
"grad_norm": 0.04012444827177292,
"learning_rate": 0.0001246260926710628,
"loss": 0.7672,
"step": 3160
},
{
"epoch": 0.5995737627279185,
"grad_norm": 0.04199451923312326,
"learning_rate": 0.00012413733994971044,
"loss": 0.7767,
"step": 3165
},
{
"epoch": 0.6005209566658773,
"grad_norm": 0.043536826355634925,
"learning_rate": 0.0001236488700582648,
"loss": 0.7447,
"step": 3170
},
{
"epoch": 0.6014681506038362,
"grad_norm": 0.04225224126474484,
"learning_rate": 0.00012316068833855438,
"loss": 0.7705,
"step": 3175
},
{
"epoch": 0.6024153445417949,
"grad_norm": 0.04122031488132078,
"learning_rate": 0.00012267280012925622,
"loss": 0.7553,
"step": 3180
},
{
"epoch": 0.6033625384797537,
"grad_norm": 0.04052877747206926,
"learning_rate": 0.00012218521076583767,
"loss": 0.7395,
"step": 3185
},
{
"epoch": 0.6043097324177126,
"grad_norm": 0.03976724566633056,
"learning_rate": 0.00012169792558049789,
"loss": 0.7902,
"step": 3190
},
{
"epoch": 0.6052569263556713,
"grad_norm": 0.03994928147888954,
"learning_rate": 0.00012121094990210951,
"loss": 0.7492,
"step": 3195
},
{
"epoch": 0.6062041202936301,
"grad_norm": 0.0410556694914752,
"learning_rate": 0.00012072428905616064,
"loss": 0.7513,
"step": 3200
},
{
"epoch": 0.607151314231589,
"grad_norm": 0.04147107567296144,
"learning_rate": 0.00012023794836469624,
"loss": 0.7321,
"step": 3205
},
{
"epoch": 0.6080985081695477,
"grad_norm": 0.042191697935109296,
"learning_rate": 0.00011975193314626025,
"loss": 0.7553,
"step": 3210
},
{
"epoch": 0.6090457021075065,
"grad_norm": 0.04045134486807547,
"learning_rate": 0.00011926624871583717,
"loss": 0.7352,
"step": 3215
},
{
"epoch": 0.6099928960454654,
"grad_norm": 0.04292542392593404,
"learning_rate": 0.00011878090038479416,
"loss": 0.771,
"step": 3220
},
{
"epoch": 0.6109400899834241,
"grad_norm": 0.03947257757285626,
"learning_rate": 0.00011829589346082281,
"loss": 0.7555,
"step": 3225
},
{
"epoch": 0.6118872839213829,
"grad_norm": 0.03798498100177421,
"learning_rate": 0.00011781123324788111,
"loss": 0.7717,
"step": 3230
},
{
"epoch": 0.6128344778593418,
"grad_norm": 0.040633643895124465,
"learning_rate": 0.00011732692504613554,
"loss": 0.7412,
"step": 3235
},
{
"epoch": 0.6137816717973005,
"grad_norm": 0.038994248859382026,
"learning_rate": 0.00011684297415190295,
"loss": 0.7626,
"step": 3240
},
{
"epoch": 0.6147288657352593,
"grad_norm": 0.037087473994001245,
"learning_rate": 0.00011635938585759284,
"loss": 0.7485,
"step": 3245
},
{
"epoch": 0.6156760596732181,
"grad_norm": 0.04283707658057122,
"learning_rate": 0.00011587616545164923,
"loss": 0.76,
"step": 3250
},
{
"epoch": 0.6166232536111769,
"grad_norm": 0.03994982225913288,
"learning_rate": 0.00011539331821849317,
"loss": 0.7867,
"step": 3255
},
{
"epoch": 0.6175704475491357,
"grad_norm": 0.045291416938963186,
"learning_rate": 0.00011491084943846459,
"loss": 0.7909,
"step": 3260
},
{
"epoch": 0.6185176414870944,
"grad_norm": 0.04461380336072227,
"learning_rate": 0.00011442876438776475,
"loss": 0.7501,
"step": 3265
},
{
"epoch": 0.6194648354250533,
"grad_norm": 0.04724285794795771,
"learning_rate": 0.00011394706833839858,
"loss": 0.7663,
"step": 3270
},
{
"epoch": 0.6204120293630121,
"grad_norm": 0.04043233809259392,
"learning_rate": 0.00011346576655811683,
"loss": 0.7573,
"step": 3275
},
{
"epoch": 0.6213592233009708,
"grad_norm": 0.04083620126750758,
"learning_rate": 0.00011298486431035874,
"loss": 0.796,
"step": 3280
},
{
"epoch": 0.6223064172389297,
"grad_norm": 0.04080275832439418,
"learning_rate": 0.00011250436685419418,
"loss": 0.7631,
"step": 3285
},
{
"epoch": 0.6232536111768885,
"grad_norm": 0.03886958426209183,
"learning_rate": 0.00011202427944426636,
"loss": 0.75,
"step": 3290
},
{
"epoch": 0.6242008051148472,
"grad_norm": 0.03941484340988021,
"learning_rate": 0.00011154460733073433,
"loss": 0.7562,
"step": 3295
},
{
"epoch": 0.625147999052806,
"grad_norm": 0.04351471227388938,
"learning_rate": 0.00011106535575921536,
"loss": 0.7714,
"step": 3300
},
{
"epoch": 0.6260951929907649,
"grad_norm": 0.04704801990878809,
"learning_rate": 0.00011058652997072802,
"loss": 0.7793,
"step": 3305
},
{
"epoch": 0.6270423869287236,
"grad_norm": 0.04584785902650524,
"learning_rate": 0.00011010813520163427,
"loss": 0.7626,
"step": 3310
},
{
"epoch": 0.6279895808666824,
"grad_norm": 0.04629280784526772,
"learning_rate": 0.00010963017668358273,
"loss": 0.7418,
"step": 3315
},
{
"epoch": 0.6289367748046413,
"grad_norm": 0.04407747166586352,
"learning_rate": 0.00010915265964345114,
"loss": 0.7459,
"step": 3320
},
{
"epoch": 0.6298839687426,
"grad_norm": 0.039913077554486434,
"learning_rate": 0.00010867558930328934,
"loss": 0.7504,
"step": 3325
},
{
"epoch": 0.6308311626805588,
"grad_norm": 0.04379451323559438,
"learning_rate": 0.00010819897088026224,
"loss": 0.7633,
"step": 3330
},
{
"epoch": 0.6317783566185177,
"grad_norm": 0.04536301609111961,
"learning_rate": 0.00010772280958659241,
"loss": 0.7657,
"step": 3335
},
{
"epoch": 0.6327255505564764,
"grad_norm": 0.041972499038774445,
"learning_rate": 0.00010724711062950358,
"loss": 0.774,
"step": 3340
},
{
"epoch": 0.6336727444944352,
"grad_norm": 0.04243182090390366,
"learning_rate": 0.00010677187921116325,
"loss": 0.7593,
"step": 3345
},
{
"epoch": 0.6346199384323941,
"grad_norm": 0.041997095162117505,
"learning_rate": 0.00010629712052862619,
"loss": 0.7525,
"step": 3350
},
{
"epoch": 0.6355671323703528,
"grad_norm": 0.043340122902892075,
"learning_rate": 0.00010582283977377709,
"loss": 0.7554,
"step": 3355
},
{
"epoch": 0.6365143263083116,
"grad_norm": 0.041977926659558386,
"learning_rate": 0.00010534904213327447,
"loss": 0.7503,
"step": 3360
},
{
"epoch": 0.6374615202462705,
"grad_norm": 0.04155046135436731,
"learning_rate": 0.00010487573278849338,
"loss": 0.7555,
"step": 3365
},
{
"epoch": 0.6384087141842292,
"grad_norm": 0.04049957089110068,
"learning_rate": 0.00010440291691546895,
"loss": 0.7701,
"step": 3370
},
{
"epoch": 0.639355908122188,
"grad_norm": 0.042538488276278326,
"learning_rate": 0.00010393059968483989,
"loss": 0.765,
"step": 3375
},
{
"epoch": 0.6403031020601468,
"grad_norm": 0.03712808828646828,
"learning_rate": 0.00010345878626179162,
"loss": 0.7492,
"step": 3380
},
{
"epoch": 0.6412502959981056,
"grad_norm": 0.04192587249684641,
"learning_rate": 0.00010298748180600031,
"loss": 0.7644,
"step": 3385
},
{
"epoch": 0.6421974899360644,
"grad_norm": 0.037429843394214256,
"learning_rate": 0.00010251669147157582,
"loss": 0.7484,
"step": 3390
},
{
"epoch": 0.6431446838740232,
"grad_norm": 0.0454827756794171,
"learning_rate": 0.00010204642040700593,
"loss": 0.7432,
"step": 3395
},
{
"epoch": 0.644091877811982,
"grad_norm": 0.03998857520778792,
"learning_rate": 0.00010157667375509966,
"loss": 0.7767,
"step": 3400
},
{
"epoch": 0.6450390717499408,
"grad_norm": 0.03874189126294134,
"learning_rate": 0.00010110745665293102,
"loss": 0.7613,
"step": 3405
},
{
"epoch": 0.6459862656878996,
"grad_norm": 0.04315072772814885,
"learning_rate": 0.00010063877423178327,
"loss": 0.7615,
"step": 3410
},
{
"epoch": 0.6469334596258584,
"grad_norm": 0.039915598481917704,
"learning_rate": 0.00010017063161709203,
"loss": 0.7368,
"step": 3415
},
{
"epoch": 0.6478806535638172,
"grad_norm": 0.043320616700844417,
"learning_rate": 9.970303392839016e-05,
"loss": 0.7643,
"step": 3420
},
{
"epoch": 0.648827847501776,
"grad_norm": 0.038608629273838305,
"learning_rate": 9.923598627925085e-05,
"loss": 0.7647,
"step": 3425
},
{
"epoch": 0.6497750414397347,
"grad_norm": 0.0403187718238657,
"learning_rate": 9.876949377723254e-05,
"loss": 0.7583,
"step": 3430
},
{
"epoch": 0.6507222353776936,
"grad_norm": 0.04071452984696662,
"learning_rate": 9.830356152382245e-05,
"loss": 0.7543,
"step": 3435
},
{
"epoch": 0.6516694293156524,
"grad_norm": 0.04123560216855826,
"learning_rate": 9.783819461438097e-05,
"loss": 0.7503,
"step": 3440
},
{
"epoch": 0.6526166232536111,
"grad_norm": 0.04169247579017777,
"learning_rate": 9.737339813808621e-05,
"loss": 0.7633,
"step": 3445
},
{
"epoch": 0.65356381719157,
"grad_norm": 0.03950148386345426,
"learning_rate": 9.69091771778778e-05,
"loss": 0.7797,
"step": 3450
},
{
"epoch": 0.6545110111295288,
"grad_norm": 0.041505476095881656,
"learning_rate": 9.644553681040196e-05,
"loss": 0.7464,
"step": 3455
},
{
"epoch": 0.6554582050674875,
"grad_norm": 0.04278317549795013,
"learning_rate": 9.598248210595531e-05,
"loss": 0.7758,
"step": 3460
},
{
"epoch": 0.6564053990054464,
"grad_norm": 0.04182779902664369,
"learning_rate": 9.552001812842996e-05,
"loss": 0.7786,
"step": 3465
},
{
"epoch": 0.6573525929434052,
"grad_norm": 0.03765966562832586,
"learning_rate": 9.505814993525797e-05,
"loss": 0.748,
"step": 3470
},
{
"epoch": 0.6582997868813639,
"grad_norm": 0.03856379305262854,
"learning_rate": 9.459688257735575e-05,
"loss": 0.7265,
"step": 3475
},
{
"epoch": 0.6592469808193228,
"grad_norm": 0.042262933505673055,
"learning_rate": 9.413622109906937e-05,
"loss": 0.7608,
"step": 3480
},
{
"epoch": 0.6601941747572816,
"grad_norm": 0.03934049459172944,
"learning_rate": 9.367617053811885e-05,
"loss": 0.7355,
"step": 3485
},
{
"epoch": 0.6611413686952403,
"grad_norm": 0.040505520508700994,
"learning_rate": 9.321673592554346e-05,
"loss": 0.7285,
"step": 3490
},
{
"epoch": 0.6620885626331992,
"grad_norm": 0.0443739357214151,
"learning_rate": 9.275792228564647e-05,
"loss": 0.7465,
"step": 3495
},
{
"epoch": 0.663035756571158,
"grad_norm": 0.042948832615535365,
"learning_rate": 9.229973463594036e-05,
"loss": 0.7415,
"step": 3500
},
{
"epoch": 0.6639829505091167,
"grad_norm": 0.037284235954941944,
"learning_rate": 9.184217798709195e-05,
"loss": 0.7624,
"step": 3505
},
{
"epoch": 0.6649301444470755,
"grad_norm": 0.041373070240537976,
"learning_rate": 9.13852573428673e-05,
"loss": 0.76,
"step": 3510
},
{
"epoch": 0.6658773383850344,
"grad_norm": 0.04078003790880156,
"learning_rate": 9.092897770007748e-05,
"loss": 0.7696,
"step": 3515
},
{
"epoch": 0.6668245323229931,
"grad_norm": 0.04227188813941138,
"learning_rate": 9.047334404852349e-05,
"loss": 0.7385,
"step": 3520
},
{
"epoch": 0.6677717262609519,
"grad_norm": 0.04055813753473042,
"learning_rate": 9.001836137094199e-05,
"loss": 0.7411,
"step": 3525
},
{
"epoch": 0.6687189201989108,
"grad_norm": 0.03979213857043945,
"learning_rate": 8.95640346429506e-05,
"loss": 0.7419,
"step": 3530
},
{
"epoch": 0.6696661141368695,
"grad_norm": 0.04277127849333733,
"learning_rate": 8.911036883299367e-05,
"loss": 0.7459,
"step": 3535
},
{
"epoch": 0.6706133080748283,
"grad_norm": 0.0414740211347812,
"learning_rate": 8.865736890228782e-05,
"loss": 0.7663,
"step": 3540
},
{
"epoch": 0.6715605020127872,
"grad_norm": 0.03968770903759535,
"learning_rate": 8.820503980476766e-05,
"loss": 0.7397,
"step": 3545
},
{
"epoch": 0.6725076959507459,
"grad_norm": 0.04028225688842527,
"learning_rate": 8.775338648703182e-05,
"loss": 0.7359,
"step": 3550
},
{
"epoch": 0.6734548898887047,
"grad_norm": 0.03957291045780008,
"learning_rate": 8.730241388828852e-05,
"loss": 0.7458,
"step": 3555
},
{
"epoch": 0.6744020838266636,
"grad_norm": 0.04075648253621573,
"learning_rate": 8.685212694030197e-05,
"loss": 0.7334,
"step": 3560
},
{
"epoch": 0.6753492777646223,
"grad_norm": 0.03417551964299854,
"learning_rate": 8.640253056733788e-05,
"loss": 0.7105,
"step": 3565
},
{
"epoch": 0.6762964717025811,
"grad_norm": 0.041401776497170924,
"learning_rate": 8.595362968611036e-05,
"loss": 0.714,
"step": 3570
},
{
"epoch": 0.67724366564054,
"grad_norm": 0.04079313998460472,
"learning_rate": 8.550542920572751e-05,
"loss": 0.7426,
"step": 3575
},
{
"epoch": 0.6781908595784987,
"grad_norm": 0.03987821946868629,
"learning_rate": 8.505793402763786e-05,
"loss": 0.763,
"step": 3580
},
{
"epoch": 0.6791380535164575,
"grad_norm": 0.04091134166237045,
"learning_rate": 8.461114904557712e-05,
"loss": 0.751,
"step": 3585
},
{
"epoch": 0.6800852474544163,
"grad_norm": 0.04177068343214537,
"learning_rate": 8.416507914551405e-05,
"loss": 0.78,
"step": 3590
},
{
"epoch": 0.6810324413923751,
"grad_norm": 0.03961181610952186,
"learning_rate": 8.371972920559791e-05,
"loss": 0.7335,
"step": 3595
},
{
"epoch": 0.6819796353303339,
"grad_norm": 0.039417654158232895,
"learning_rate": 8.327510409610408e-05,
"loss": 0.7642,
"step": 3600
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.04073914119422375,
"learning_rate": 8.283120867938156e-05,
"loss": 0.7468,
"step": 3605
},
{
"epoch": 0.6838740232062515,
"grad_norm": 0.03905373593771108,
"learning_rate": 8.23880478097996e-05,
"loss": 0.7377,
"step": 3610
},
{
"epoch": 0.6848212171442103,
"grad_norm": 0.04307956957119377,
"learning_rate": 8.194562633369428e-05,
"loss": 0.7536,
"step": 3615
},
{
"epoch": 0.6857684110821691,
"grad_norm": 0.04210126693770651,
"learning_rate": 8.150394908931622e-05,
"loss": 0.7554,
"step": 3620
},
{
"epoch": 0.6867156050201279,
"grad_norm": 0.0421490482386118,
"learning_rate": 8.106302090677682e-05,
"loss": 0.7936,
"step": 3625
},
{
"epoch": 0.6876627989580867,
"grad_norm": 0.03588928471279015,
"learning_rate": 8.062284660799617e-05,
"loss": 0.7287,
"step": 3630
},
{
"epoch": 0.6886099928960455,
"grad_norm": 0.03952924524135366,
"learning_rate": 8.018343100664975e-05,
"loss": 0.7527,
"step": 3635
},
{
"epoch": 0.6895571868340042,
"grad_norm": 0.03984238587807663,
"learning_rate": 7.974477890811622e-05,
"loss": 0.7528,
"step": 3640
},
{
"epoch": 0.6905043807719631,
"grad_norm": 0.04306536866980951,
"learning_rate": 7.930689510942467e-05,
"loss": 0.7263,
"step": 3645
},
{
"epoch": 0.6914515747099218,
"grad_norm": 0.03823091396622685,
"learning_rate": 7.886978439920219e-05,
"loss": 0.7262,
"step": 3650
},
{
"epoch": 0.6923987686478806,
"grad_norm": 0.040037669190792775,
"learning_rate": 7.84334515576215e-05,
"loss": 0.761,
"step": 3655
},
{
"epoch": 0.6933459625858395,
"grad_norm": 0.04316597192554324,
"learning_rate": 7.799790135634848e-05,
"loss": 0.7654,
"step": 3660
},
{
"epoch": 0.6942931565237982,
"grad_norm": 0.040090557871707184,
"learning_rate": 7.756313855849061e-05,
"loss": 0.7576,
"step": 3665
},
{
"epoch": 0.695240350461757,
"grad_norm": 0.03885025639567031,
"learning_rate": 7.712916791854398e-05,
"loss": 0.7337,
"step": 3670
},
{
"epoch": 0.6961875443997159,
"grad_norm": 0.039817406008376306,
"learning_rate": 7.669599418234209e-05,
"loss": 0.7827,
"step": 3675
},
{
"epoch": 0.6971347383376746,
"grad_norm": 0.03796969889976453,
"learning_rate": 7.626362208700345e-05,
"loss": 0.7401,
"step": 3680
},
{
"epoch": 0.6980819322756334,
"grad_norm": 0.04250157953569218,
"learning_rate": 7.583205636087998e-05,
"loss": 0.7849,
"step": 3685
},
{
"epoch": 0.6990291262135923,
"grad_norm": 0.039617912653361516,
"learning_rate": 7.540130172350553e-05,
"loss": 0.7299,
"step": 3690
},
{
"epoch": 0.699976320151551,
"grad_norm": 0.03816192045516508,
"learning_rate": 7.497136288554358e-05,
"loss": 0.7514,
"step": 3695
},
{
"epoch": 0.7009235140895098,
"grad_norm": 0.03983670858460556,
"learning_rate": 7.454224454873653e-05,
"loss": 0.726,
"step": 3700
},
{
"epoch": 0.7018707080274686,
"grad_norm": 0.03934247221089433,
"learning_rate": 7.411395140585366e-05,
"loss": 0.755,
"step": 3705
},
{
"epoch": 0.7028179019654274,
"grad_norm": 0.041762302180065505,
"learning_rate": 7.368648814064017e-05,
"loss": 0.7731,
"step": 3710
},
{
"epoch": 0.7037650959033862,
"grad_norm": 0.03718549677733008,
"learning_rate": 7.325985942776586e-05,
"loss": 0.7245,
"step": 3715
},
{
"epoch": 0.704712289841345,
"grad_norm": 0.03979809659047382,
"learning_rate": 7.283406993277401e-05,
"loss": 0.7493,
"step": 3720
},
{
"epoch": 0.7056594837793038,
"grad_norm": 0.03891391378511992,
"learning_rate": 7.240912431203036e-05,
"loss": 0.7372,
"step": 3725
},
{
"epoch": 0.7066066777172626,
"grad_norm": 0.03760484285201233,
"learning_rate": 7.198502721267201e-05,
"loss": 0.7319,
"step": 3730
},
{
"epoch": 0.7075538716552214,
"grad_norm": 0.039788628547401186,
"learning_rate": 7.156178327255696e-05,
"loss": 0.7107,
"step": 3735
},
{
"epoch": 0.7085010655931802,
"grad_norm": 0.03854056853156896,
"learning_rate": 7.113939712021312e-05,
"loss": 0.7195,
"step": 3740
},
{
"epoch": 0.709448259531139,
"grad_norm": 0.04238606817116954,
"learning_rate": 7.071787337478785e-05,
"loss": 0.7448,
"step": 3745
},
{
"epoch": 0.7103954534690978,
"grad_norm": 0.03913911448999711,
"learning_rate": 7.029721664599718e-05,
"loss": 0.7553,
"step": 3750
},
{
"epoch": 0.7113426474070565,
"grad_norm": 0.04214126618606677,
"learning_rate": 6.987743153407576e-05,
"loss": 0.7263,
"step": 3755
},
{
"epoch": 0.7122898413450154,
"grad_norm": 0.044083161857802124,
"learning_rate": 6.94585226297263e-05,
"loss": 0.7366,
"step": 3760
},
{
"epoch": 0.7132370352829742,
"grad_norm": 0.04073891396330348,
"learning_rate": 6.90404945140695e-05,
"loss": 0.7389,
"step": 3765
},
{
"epoch": 0.7141842292209329,
"grad_norm": 0.03825315556026045,
"learning_rate": 6.862335175859387e-05,
"loss": 0.7347,
"step": 3770
},
{
"epoch": 0.7151314231588918,
"grad_norm": 0.03504164436950897,
"learning_rate": 6.820709892510566e-05,
"loss": 0.7563,
"step": 3775
},
{
"epoch": 0.7160786170968506,
"grad_norm": 0.040944422736284514,
"learning_rate": 6.779174056567923e-05,
"loss": 0.7324,
"step": 3780
},
{
"epoch": 0.7170258110348093,
"grad_norm": 0.042252601374869914,
"learning_rate": 6.737728122260705e-05,
"loss": 0.7428,
"step": 3785
},
{
"epoch": 0.7179730049727682,
"grad_norm": 0.04198448843744255,
"learning_rate": 6.696372542835007e-05,
"loss": 0.7563,
"step": 3790
},
{
"epoch": 0.718920198910727,
"grad_norm": 0.03891146859575185,
"learning_rate": 6.655107770548829e-05,
"loss": 0.7653,
"step": 3795
},
{
"epoch": 0.7198673928486857,
"grad_norm": 0.047863846470899335,
"learning_rate": 6.613934256667098e-05,
"loss": 0.7443,
"step": 3800
},
{
"epoch": 0.7208145867866446,
"grad_norm": 0.04392069667011626,
"learning_rate": 6.572852451456766e-05,
"loss": 0.7506,
"step": 3805
},
{
"epoch": 0.7217617807246034,
"grad_norm": 0.04054436230218552,
"learning_rate": 6.53186280418188e-05,
"loss": 0.7472,
"step": 3810
},
{
"epoch": 0.7227089746625621,
"grad_norm": 0.03885365408522593,
"learning_rate": 6.490965763098654e-05,
"loss": 0.719,
"step": 3815
},
{
"epoch": 0.723656168600521,
"grad_norm": 0.04194579275932475,
"learning_rate": 6.450161775450572e-05,
"loss": 0.7125,
"step": 3820
},
{
"epoch": 0.7246033625384798,
"grad_norm": 0.04052378750052248,
"learning_rate": 6.409451287463508e-05,
"loss": 0.7766,
"step": 3825
},
{
"epoch": 0.7255505564764385,
"grad_norm": 0.039568496333215346,
"learning_rate": 6.368834744340837e-05,
"loss": 0.7278,
"step": 3830
},
{
"epoch": 0.7264977504143973,
"grad_norm": 0.036717341021427846,
"learning_rate": 6.328312590258568e-05,
"loss": 0.7389,
"step": 3835
},
{
"epoch": 0.7274449443523562,
"grad_norm": 0.04020458964083549,
"learning_rate": 6.28788526836049e-05,
"loss": 0.7484,
"step": 3840
},
{
"epoch": 0.7283921382903149,
"grad_norm": 0.04041885111843109,
"learning_rate": 6.247553220753305e-05,
"loss": 0.7286,
"step": 3845
},
{
"epoch": 0.7293393322282737,
"grad_norm": 0.03932262216800061,
"learning_rate": 6.207316888501833e-05,
"loss": 0.7211,
"step": 3850
},
{
"epoch": 0.7302865261662326,
"grad_norm": 0.03945586913466572,
"learning_rate": 6.167176711624157e-05,
"loss": 0.7343,
"step": 3855
},
{
"epoch": 0.7312337201041913,
"grad_norm": 0.03793066654633331,
"learning_rate": 6.127133129086818e-05,
"loss": 0.7283,
"step": 3860
},
{
"epoch": 0.7321809140421501,
"grad_norm": 0.03754478471345365,
"learning_rate": 6.087186578800027e-05,
"loss": 0.7537,
"step": 3865
},
{
"epoch": 0.733128107980109,
"grad_norm": 0.04134693277562893,
"learning_rate": 6.0473374976128444e-05,
"loss": 0.7279,
"step": 3870
},
{
"epoch": 0.7340753019180677,
"grad_norm": 0.03826648717909671,
"learning_rate": 6.007586321308445e-05,
"loss": 0.722,
"step": 3875
},
{
"epoch": 0.7350224958560265,
"grad_norm": 0.041876625013751154,
"learning_rate": 5.967933484599324e-05,
"loss": 0.7488,
"step": 3880
},
{
"epoch": 0.7359696897939854,
"grad_norm": 0.04361394297294752,
"learning_rate": 5.928379421122557e-05,
"loss": 0.751,
"step": 3885
},
{
"epoch": 0.7369168837319441,
"grad_norm": 0.040558338035631775,
"learning_rate": 5.888924563435032e-05,
"loss": 0.7359,
"step": 3890
},
{
"epoch": 0.7378640776699029,
"grad_norm": 0.041973741302183905,
"learning_rate": 5.849569343008758e-05,
"loss": 0.746,
"step": 3895
},
{
"epoch": 0.7388112716078618,
"grad_norm": 0.043105183047686346,
"learning_rate": 5.8103141902261205e-05,
"loss": 0.7403,
"step": 3900
},
{
"epoch": 0.7397584655458205,
"grad_norm": 0.038539690413186195,
"learning_rate": 5.7711595343751806e-05,
"loss": 0.7467,
"step": 3905
},
{
"epoch": 0.7407056594837793,
"grad_norm": 0.040297657563285356,
"learning_rate": 5.732105803644986e-05,
"loss": 0.7256,
"step": 3910
},
{
"epoch": 0.7416528534217381,
"grad_norm": 0.040590581894694416,
"learning_rate": 5.693153425120872e-05,
"loss": 0.7301,
"step": 3915
},
{
"epoch": 0.7426000473596969,
"grad_norm": 0.039579719033313615,
"learning_rate": 5.654302824779815e-05,
"loss": 0.7343,
"step": 3920
},
{
"epoch": 0.7435472412976557,
"grad_norm": 0.03807616540727068,
"learning_rate": 5.6155544274857436e-05,
"loss": 0.7219,
"step": 3925
},
{
"epoch": 0.7444944352356145,
"grad_norm": 0.03907724233269258,
"learning_rate": 5.576908656984938e-05,
"loss": 0.7359,
"step": 3930
},
{
"epoch": 0.7454416291735733,
"grad_norm": 0.0411669903197421,
"learning_rate": 5.5383659359013516e-05,
"loss": 0.7606,
"step": 3935
},
{
"epoch": 0.7463888231115321,
"grad_norm": 0.03660616286712515,
"learning_rate": 5.499926685731999e-05,
"loss": 0.7144,
"step": 3940
},
{
"epoch": 0.7473360170494909,
"grad_norm": 0.04196508435156298,
"learning_rate": 5.461591326842368e-05,
"loss": 0.7268,
"step": 3945
},
{
"epoch": 0.7482832109874497,
"grad_norm": 0.03782538627658778,
"learning_rate": 5.4233602784617875e-05,
"loss": 0.7538,
"step": 3950
},
{
"epoch": 0.7492304049254085,
"grad_norm": 0.039917199754721966,
"learning_rate": 5.385233958678899e-05,
"loss": 0.7471,
"step": 3955
},
{
"epoch": 0.7501775988633673,
"grad_norm": 0.041462493245785374,
"learning_rate": 5.347212784437014e-05,
"loss": 0.7335,
"step": 3960
},
{
"epoch": 0.751124792801326,
"grad_norm": 0.04262717035805544,
"learning_rate": 5.3092971715296036e-05,
"loss": 0.7517,
"step": 3965
},
{
"epoch": 0.7520719867392849,
"grad_norm": 0.03912615098583992,
"learning_rate": 5.2714875345957364e-05,
"loss": 0.7505,
"step": 3970
},
{
"epoch": 0.7530191806772437,
"grad_norm": 0.03843483966970995,
"learning_rate": 5.2337842871155464e-05,
"loss": 0.729,
"step": 3975
},
{
"epoch": 0.7539663746152024,
"grad_norm": 0.04001769351762854,
"learning_rate": 5.1961878414057116e-05,
"loss": 0.743,
"step": 3980
},
{
"epoch": 0.7549135685531613,
"grad_norm": 0.03903544150596959,
"learning_rate": 5.158698608614928e-05,
"loss": 0.7231,
"step": 3985
},
{
"epoch": 0.7558607624911201,
"grad_norm": 0.04246943536811293,
"learning_rate": 5.1213169987194506e-05,
"loss": 0.7376,
"step": 3990
},
{
"epoch": 0.7568079564290788,
"grad_norm": 0.04255248262700945,
"learning_rate": 5.08404342051856e-05,
"loss": 0.769,
"step": 3995
},
{
"epoch": 0.7577551503670377,
"grad_norm": 0.038324457218321194,
"learning_rate": 5.04687828163015e-05,
"loss": 0.7171,
"step": 4000
},
{
"epoch": 0.7587023443049965,
"grad_norm": 0.04478588785492951,
"learning_rate": 5.0098219884862265e-05,
"loss": 0.764,
"step": 4005
},
{
"epoch": 0.7596495382429552,
"grad_norm": 0.040448163186758916,
"learning_rate": 4.9728749463284634e-05,
"loss": 0.7416,
"step": 4010
},
{
"epoch": 0.7605967321809141,
"grad_norm": 0.04321236917872768,
"learning_rate": 4.936037559203806e-05,
"loss": 0.754,
"step": 4015
},
{
"epoch": 0.7615439261188729,
"grad_norm": 0.041244116901498824,
"learning_rate": 4.899310229960002e-05,
"loss": 0.745,
"step": 4020
},
{
"epoch": 0.7624911200568316,
"grad_norm": 0.04064461271404752,
"learning_rate": 4.862693360241259e-05,
"loss": 0.7351,
"step": 4025
},
{
"epoch": 0.7634383139947905,
"grad_norm": 0.03781319026471734,
"learning_rate": 4.826187350483783e-05,
"loss": 0.7307,
"step": 4030
},
{
"epoch": 0.7643855079327492,
"grad_norm": 0.03974939076217195,
"learning_rate": 4.789792599911453e-05,
"loss": 0.7438,
"step": 4035
},
{
"epoch": 0.765332701870708,
"grad_norm": 0.04065565436383743,
"learning_rate": 4.753509506531436e-05,
"loss": 0.7636,
"step": 4040
},
{
"epoch": 0.7662798958086668,
"grad_norm": 0.04062956705746941,
"learning_rate": 4.717338467129813e-05,
"loss": 0.7569,
"step": 4045
},
{
"epoch": 0.7672270897466256,
"grad_norm": 0.041349177857734835,
"learning_rate": 4.6812798772672936e-05,
"loss": 0.7026,
"step": 4050
},
{
"epoch": 0.7681742836845844,
"grad_norm": 0.04354632478814716,
"learning_rate": 4.645334131274828e-05,
"loss": 0.7145,
"step": 4055
},
{
"epoch": 0.7691214776225432,
"grad_norm": 0.03936898140216215,
"learning_rate": 4.609501622249343e-05,
"loss": 0.7286,
"step": 4060
},
{
"epoch": 0.770068671560502,
"grad_norm": 0.03929846725341178,
"learning_rate": 4.573782742049407e-05,
"loss": 0.7304,
"step": 4065
},
{
"epoch": 0.7710158654984608,
"grad_norm": 0.04124206397422393,
"learning_rate": 4.538177881290973e-05,
"loss": 0.7306,
"step": 4070
},
{
"epoch": 0.7719630594364196,
"grad_norm": 0.042221939835918945,
"learning_rate": 4.502687429343106e-05,
"loss": 0.7519,
"step": 4075
},
{
"epoch": 0.7729102533743784,
"grad_norm": 0.04130656943974972,
"learning_rate": 4.4673117743236884e-05,
"loss": 0.7245,
"step": 4080
},
{
"epoch": 0.7738574473123372,
"grad_norm": 0.03913493498829275,
"learning_rate": 4.432051303095225e-05,
"loss": 0.7487,
"step": 4085
},
{
"epoch": 0.774804641250296,
"grad_norm": 0.037086715638808006,
"learning_rate": 4.396906401260573e-05,
"loss": 0.7308,
"step": 4090
},
{
"epoch": 0.7757518351882547,
"grad_norm": 0.041005709394066454,
"learning_rate": 4.361877453158749e-05,
"loss": 0.7222,
"step": 4095
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.0390323446969787,
"learning_rate": 4.3269648418607194e-05,
"loss": 0.7187,
"step": 4100
},
{
"epoch": 0.7776462230641724,
"grad_norm": 0.04019349411682957,
"learning_rate": 4.29216894916521e-05,
"loss": 0.7089,
"step": 4105
},
{
"epoch": 0.7785934170021311,
"grad_norm": 0.04095873534287484,
"learning_rate": 4.257490155594528e-05,
"loss": 0.7546,
"step": 4110
},
{
"epoch": 0.77954061094009,
"grad_norm": 0.040023174171211935,
"learning_rate": 4.2229288403903994e-05,
"loss": 0.7151,
"step": 4115
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.04008455709185164,
"learning_rate": 4.188485381509833e-05,
"loss": 0.7317,
"step": 4120
},
{
"epoch": 0.7814349988160075,
"grad_norm": 0.041977277262445226,
"learning_rate": 4.154160155620977e-05,
"loss": 0.73,
"step": 4125
},
{
"epoch": 0.7823821927539664,
"grad_norm": 0.04088401185241922,
"learning_rate": 4.119953538099006e-05,
"loss": 0.7639,
"step": 4130
},
{
"epoch": 0.7833293866919252,
"grad_norm": 0.037283132614974145,
"learning_rate": 4.085865903021999e-05,
"loss": 0.7456,
"step": 4135
},
{
"epoch": 0.7842765806298839,
"grad_norm": 0.046399869448183154,
"learning_rate": 4.051897623166879e-05,
"loss": 0.748,
"step": 4140
},
{
"epoch": 0.7852237745678428,
"grad_norm": 0.03912459186813119,
"learning_rate": 4.0180490700053105e-05,
"loss": 0.7518,
"step": 4145
},
{
"epoch": 0.7861709685058016,
"grad_norm": 0.04059706834387849,
"learning_rate": 3.984320613699648e-05,
"loss": 0.7174,
"step": 4150
},
{
"epoch": 0.7871181624437603,
"grad_norm": 0.04279168262844896,
"learning_rate": 3.950712623098892e-05,
"loss": 0.717,
"step": 4155
},
{
"epoch": 0.7880653563817192,
"grad_norm": 0.04328417157580087,
"learning_rate": 3.917225465734632e-05,
"loss": 0.7402,
"step": 4160
},
{
"epoch": 0.789012550319678,
"grad_norm": 0.03907178607474956,
"learning_rate": 3.883859507817061e-05,
"loss": 0.7109,
"step": 4165
},
{
"epoch": 0.7899597442576367,
"grad_norm": 0.03842633012724885,
"learning_rate": 3.850615114230949e-05,
"loss": 0.7565,
"step": 4170
},
{
"epoch": 0.7909069381955955,
"grad_norm": 0.04237253188344206,
"learning_rate": 3.81749264853166e-05,
"loss": 0.7489,
"step": 4175
},
{
"epoch": 0.7918541321335544,
"grad_norm": 0.04212676511762711,
"learning_rate": 3.784492472941173e-05,
"loss": 0.7506,
"step": 4180
},
{
"epoch": 0.7928013260715131,
"grad_norm": 0.040564694587832524,
"learning_rate": 3.751614948344116e-05,
"loss": 0.7594,
"step": 4185
},
{
"epoch": 0.7937485200094719,
"grad_norm": 0.03846320035915876,
"learning_rate": 3.718860434283832e-05,
"loss": 0.7416,
"step": 4190
},
{
"epoch": 0.7946957139474308,
"grad_norm": 0.04058426964727867,
"learning_rate": 3.686229288958442e-05,
"loss": 0.7703,
"step": 4195
},
{
"epoch": 0.7956429078853895,
"grad_norm": 0.03767236001284344,
"learning_rate": 3.653721869216926e-05,
"loss": 0.7344,
"step": 4200
},
{
"epoch": 0.7965901018233483,
"grad_norm": 0.03990592712721354,
"learning_rate": 3.621338530555207e-05,
"loss": 0.7329,
"step": 4205
},
{
"epoch": 0.7975372957613072,
"grad_norm": 0.037505327559176606,
"learning_rate": 3.589079627112298e-05,
"loss": 0.7033,
"step": 4210
},
{
"epoch": 0.7984844896992659,
"grad_norm": 0.03592112011887372,
"learning_rate": 3.5569455116663944e-05,
"loss": 0.75,
"step": 4215
},
{
"epoch": 0.7994316836372247,
"grad_norm": 0.037737720756345204,
"learning_rate": 3.524936535631036e-05,
"loss": 0.7178,
"step": 4220
},
{
"epoch": 0.8003788775751836,
"grad_norm": 0.041074083535645345,
"learning_rate": 3.49305304905126e-05,
"loss": 0.7296,
"step": 4225
},
{
"epoch": 0.8013260715131423,
"grad_norm": 0.037206587197529284,
"learning_rate": 3.461295400599759e-05,
"loss": 0.7318,
"step": 4230
},
{
"epoch": 0.8022732654511011,
"grad_norm": 0.04155270440503186,
"learning_rate": 3.429663937573095e-05,
"loss": 0.7643,
"step": 4235
},
{
"epoch": 0.80322045938906,
"grad_norm": 0.04200718034951944,
"learning_rate": 3.3981590058878764e-05,
"loss": 0.7303,
"step": 4240
},
{
"epoch": 0.8041676533270187,
"grad_norm": 0.0391722359803136,
"learning_rate": 3.36678095007699e-05,
"loss": 0.7551,
"step": 4245
},
{
"epoch": 0.8051148472649775,
"grad_norm": 0.04162052982671362,
"learning_rate": 3.335530113285832e-05,
"loss": 0.7429,
"step": 4250
},
{
"epoch": 0.8060620412029363,
"grad_norm": 0.038860916165607134,
"learning_rate": 3.304406837268538e-05,
"loss": 0.7304,
"step": 4255
},
{
"epoch": 0.8070092351408951,
"grad_norm": 0.03765860345115411,
"learning_rate": 3.2734114623842714e-05,
"loss": 0.7541,
"step": 4260
},
{
"epoch": 0.8079564290788539,
"grad_norm": 0.03985364166742812,
"learning_rate": 3.242544327593487e-05,
"loss": 0.7159,
"step": 4265
},
{
"epoch": 0.8089036230168127,
"grad_norm": 0.04096122915397548,
"learning_rate": 3.211805770454229e-05,
"loss": 0.7494,
"step": 4270
},
{
"epoch": 0.8098508169547715,
"grad_norm": 0.03718014396684494,
"learning_rate": 3.181196127118425e-05,
"loss": 0.7228,
"step": 4275
},
{
"epoch": 0.8107980108927303,
"grad_norm": 0.037725347863418725,
"learning_rate": 3.150715732328235e-05,
"loss": 0.7507,
"step": 4280
},
{
"epoch": 0.8117452048306891,
"grad_norm": 0.04354928220465528,
"learning_rate": 3.120364919412374e-05,
"loss": 0.744,
"step": 4285
},
{
"epoch": 0.8126923987686479,
"grad_norm": 0.03829035376988075,
"learning_rate": 3.090144020282469e-05,
"loss": 0.7497,
"step": 4290
},
{
"epoch": 0.8136395927066067,
"grad_norm": 0.04107370767206898,
"learning_rate": 3.060053365429433e-05,
"loss": 0.7087,
"step": 4295
},
{
"epoch": 0.8145867866445655,
"grad_norm": 0.04270910227284078,
"learning_rate": 3.030093283919841e-05,
"loss": 0.7301,
"step": 4300
},
{
"epoch": 0.8155339805825242,
"grad_norm": 0.040080109853962084,
"learning_rate": 3.000264103392348e-05,
"loss": 0.7113,
"step": 4305
},
{
"epoch": 0.8164811745204831,
"grad_norm": 0.03923392398314568,
"learning_rate": 2.9705661500540916e-05,
"loss": 0.7235,
"step": 4310
},
{
"epoch": 0.8174283684584419,
"grad_norm": 0.04092925596400112,
"learning_rate": 2.9409997486771332e-05,
"loss": 0.7086,
"step": 4315
},
{
"epoch": 0.8183755623964006,
"grad_norm": 0.03723866823377466,
"learning_rate": 2.911565222594904e-05,
"loss": 0.7154,
"step": 4320
},
{
"epoch": 0.8193227563343595,
"grad_norm": 0.04026831218954151,
"learning_rate": 2.8822628936986576e-05,
"loss": 0.7166,
"step": 4325
},
{
"epoch": 0.8202699502723183,
"grad_norm": 0.03924390038270272,
"learning_rate": 2.8530930824339725e-05,
"loss": 0.7114,
"step": 4330
},
{
"epoch": 0.821217144210277,
"grad_norm": 0.03857672055842463,
"learning_rate": 2.8240561077972336e-05,
"loss": 0.7275,
"step": 4335
},
{
"epoch": 0.8221643381482359,
"grad_norm": 0.03813437644170459,
"learning_rate": 2.795152287332143e-05,
"loss": 0.7407,
"step": 4340
},
{
"epoch": 0.8231115320861947,
"grad_norm": 0.039511538349970426,
"learning_rate": 2.766381937126246e-05,
"loss": 0.7224,
"step": 4345
},
{
"epoch": 0.8240587260241534,
"grad_norm": 0.04035022629840493,
"learning_rate": 2.737745371807484e-05,
"loss": 0.7226,
"step": 4350
},
{
"epoch": 0.8250059199621123,
"grad_norm": 0.041748400769491566,
"learning_rate": 2.7092429045407493e-05,
"loss": 0.7076,
"step": 4355
},
{
"epoch": 0.8259531139000711,
"grad_norm": 0.04074616252719918,
"learning_rate": 2.6808748470244596e-05,
"loss": 0.733,
"step": 4360
},
{
"epoch": 0.8269003078380298,
"grad_norm": 0.040304189290435735,
"learning_rate": 2.6526415094871456e-05,
"loss": 0.7275,
"step": 4365
},
{
"epoch": 0.8278475017759886,
"grad_norm": 0.04056642286405685,
"learning_rate": 2.624543200684059e-05,
"loss": 0.7419,
"step": 4370
},
{
"epoch": 0.8287946957139475,
"grad_norm": 0.044474637104153496,
"learning_rate": 2.5965802278938104e-05,
"loss": 0.7029,
"step": 4375
},
{
"epoch": 0.8297418896519062,
"grad_norm": 0.040065168017582885,
"learning_rate": 2.5687528969149797e-05,
"loss": 0.7375,
"step": 4380
},
{
"epoch": 0.830689083589865,
"grad_norm": 0.039500986446779594,
"learning_rate": 2.541061512062808e-05,
"loss": 0.7475,
"step": 4385
},
{
"epoch": 0.8316362775278239,
"grad_norm": 0.04336888291320547,
"learning_rate": 2.5135063761658465e-05,
"loss": 0.7506,
"step": 4390
},
{
"epoch": 0.8325834714657826,
"grad_norm": 0.04238948284006484,
"learning_rate": 2.4860877905626385e-05,
"loss": 0.7072,
"step": 4395
},
{
"epoch": 0.8335306654037414,
"grad_norm": 0.03937330992282642,
"learning_rate": 2.4588060550984517e-05,
"loss": 0.7271,
"step": 4400
},
{
"epoch": 0.8344778593417003,
"grad_norm": 0.04329748465343663,
"learning_rate": 2.4316614681219616e-05,
"loss": 0.7726,
"step": 4405
},
{
"epoch": 0.835425053279659,
"grad_norm": 0.04153391947066848,
"learning_rate": 2.4046543264820367e-05,
"loss": 0.7623,
"step": 4410
},
{
"epoch": 0.8363722472176178,
"grad_norm": 0.03864210650963097,
"learning_rate": 2.3777849255244402e-05,
"loss": 0.7335,
"step": 4415
},
{
"epoch": 0.8373194411555765,
"grad_norm": 0.03605009140484424,
"learning_rate": 2.3510535590886464e-05,
"loss": 0.7185,
"step": 4420
},
{
"epoch": 0.8382666350935354,
"grad_norm": 0.04343264310904382,
"learning_rate": 2.324460519504584e-05,
"loss": 0.7278,
"step": 4425
},
{
"epoch": 0.8392138290314942,
"grad_norm": 0.04222856281924883,
"learning_rate": 2.298006097589478e-05,
"loss": 0.7272,
"step": 4430
},
{
"epoch": 0.8401610229694529,
"grad_norm": 0.03693060964307286,
"learning_rate": 2.2716905826446553e-05,
"loss": 0.728,
"step": 4435
},
{
"epoch": 0.8411082169074118,
"grad_norm": 0.03882562220045244,
"learning_rate": 2.2455142624523632e-05,
"loss": 0.7228,
"step": 4440
},
{
"epoch": 0.8420554108453706,
"grad_norm": 0.04087387623558309,
"learning_rate": 2.2194774232726492e-05,
"loss": 0.7155,
"step": 4445
},
{
"epoch": 0.8430026047833293,
"grad_norm": 0.04058338081432673,
"learning_rate": 2.193580349840211e-05,
"loss": 0.7023,
"step": 4450
},
{
"epoch": 0.8439497987212882,
"grad_norm": 0.04404654162614751,
"learning_rate": 2.167823325361297e-05,
"loss": 0.6959,
"step": 4455
},
{
"epoch": 0.844896992659247,
"grad_norm": 0.03855188004859097,
"learning_rate": 2.1422066315106007e-05,
"loss": 0.7258,
"step": 4460
},
{
"epoch": 0.8458441865972057,
"grad_norm": 0.03928313344333361,
"learning_rate": 2.1167305484281814e-05,
"loss": 0.7372,
"step": 4465
},
{
"epoch": 0.8467913805351646,
"grad_norm": 0.03940286973106492,
"learning_rate": 2.0913953547164058e-05,
"loss": 0.7163,
"step": 4470
},
{
"epoch": 0.8477385744731234,
"grad_norm": 0.04323326221582775,
"learning_rate": 2.0662013274368854e-05,
"loss": 0.7378,
"step": 4475
},
{
"epoch": 0.8486857684110821,
"grad_norm": 0.03858572389634398,
"learning_rate": 2.041148742107471e-05,
"loss": 0.7397,
"step": 4480
},
{
"epoch": 0.849632962349041,
"grad_norm": 0.040751992473504785,
"learning_rate": 2.0162378726992222e-05,
"loss": 0.7581,
"step": 4485
},
{
"epoch": 0.8505801562869998,
"grad_norm": 0.03829683763062958,
"learning_rate": 1.9914689916334175e-05,
"loss": 0.6946,
"step": 4490
},
{
"epoch": 0.8515273502249585,
"grad_norm": 0.03695727697699672,
"learning_rate": 1.9668423697785656e-05,
"loss": 0.7331,
"step": 4495
},
{
"epoch": 0.8524745441629173,
"grad_norm": 0.03752543633546998,
"learning_rate": 1.942358276447462e-05,
"loss": 0.7281,
"step": 4500
},
{
"epoch": 0.8534217381008762,
"grad_norm": 0.04068189729909485,
"learning_rate": 1.9180169793942272e-05,
"loss": 0.7639,
"step": 4505
},
{
"epoch": 0.8543689320388349,
"grad_norm": 0.04048730316852887,
"learning_rate": 1.893818744811388e-05,
"loss": 0.748,
"step": 4510
},
{
"epoch": 0.8553161259767937,
"grad_norm": 0.04595934306302146,
"learning_rate": 1.869763837326963e-05,
"loss": 0.7799,
"step": 4515
},
{
"epoch": 0.8562633199147526,
"grad_norm": 0.04070824155336302,
"learning_rate": 1.8458525200015593e-05,
"loss": 0.7525,
"step": 4520
},
{
"epoch": 0.8572105138527113,
"grad_norm": 0.03956623366079343,
"learning_rate": 1.822085054325515e-05,
"loss": 0.7159,
"step": 4525
},
{
"epoch": 0.8581577077906701,
"grad_norm": 0.03807785792934746,
"learning_rate": 1.798461700216029e-05,
"loss": 0.7562,
"step": 4530
},
{
"epoch": 0.859104901728629,
"grad_norm": 0.03756680254096828,
"learning_rate": 1.7749827160143164e-05,
"loss": 0.7292,
"step": 4535
},
{
"epoch": 0.8600520956665877,
"grad_norm": 0.03976826946310067,
"learning_rate": 1.751648358482789e-05,
"loss": 0.7282,
"step": 4540
},
{
"epoch": 0.8609992896045465,
"grad_norm": 0.043044198125544504,
"learning_rate": 1.7284588828022378e-05,
"loss": 0.7152,
"step": 4545
},
{
"epoch": 0.8619464835425054,
"grad_norm": 0.04230346678410999,
"learning_rate": 1.7054145425690536e-05,
"loss": 0.7297,
"step": 4550
},
{
"epoch": 0.8628936774804641,
"grad_norm": 0.03893557149385341,
"learning_rate": 1.6825155897924513e-05,
"loss": 0.7239,
"step": 4555
},
{
"epoch": 0.8638408714184229,
"grad_norm": 0.037068694217922235,
"learning_rate": 1.6597622748917132e-05,
"loss": 0.7142,
"step": 4560
},
{
"epoch": 0.8647880653563818,
"grad_norm": 0.03991740786553955,
"learning_rate": 1.6371548466934385e-05,
"loss": 0.7308,
"step": 4565
},
{
"epoch": 0.8657352592943405,
"grad_norm": 0.04471672391377206,
"learning_rate": 1.6146935524288446e-05,
"loss": 0.7301,
"step": 4570
},
{
"epoch": 0.8666824532322993,
"grad_norm": 0.037180999778756545,
"learning_rate": 1.5923786377310433e-05,
"loss": 0.7203,
"step": 4575
},
{
"epoch": 0.8676296471702581,
"grad_norm": 0.04146480156497774,
"learning_rate": 1.5702103466323708e-05,
"loss": 0.7119,
"step": 4580
},
{
"epoch": 0.8685768411082169,
"grad_norm": 0.036947710794122034,
"learning_rate": 1.5481889215617073e-05,
"loss": 0.7196,
"step": 4585
},
{
"epoch": 0.8695240350461757,
"grad_norm": 0.03548597961029977,
"learning_rate": 1.5263146033418227e-05,
"loss": 0.7051,
"step": 4590
},
{
"epoch": 0.8704712289841345,
"grad_norm": 0.03955821537818952,
"learning_rate": 1.5045876311867628e-05,
"loss": 0.7206,
"step": 4595
},
{
"epoch": 0.8714184229220933,
"grad_norm": 0.038962735470389774,
"learning_rate": 1.4830082426992112e-05,
"loss": 0.7266,
"step": 4600
},
{
"epoch": 0.8723656168600521,
"grad_norm": 0.03843209348318749,
"learning_rate": 1.4615766738679036e-05,
"loss": 0.7236,
"step": 4605
},
{
"epoch": 0.8733128107980109,
"grad_norm": 0.03583871656951685,
"learning_rate": 1.4402931590650462e-05,
"loss": 0.7037,
"step": 4610
},
{
"epoch": 0.8742600047359697,
"grad_norm": 0.03814447238860827,
"learning_rate": 1.4191579310437412e-05,
"loss": 0.7142,
"step": 4615
},
{
"epoch": 0.8752071986739285,
"grad_norm": 0.04131298607004148,
"learning_rate": 1.398171220935459e-05,
"loss": 0.7472,
"step": 4620
},
{
"epoch": 0.8761543926118873,
"grad_norm": 0.04027601558619238,
"learning_rate": 1.3773332582474995e-05,
"loss": 0.7222,
"step": 4625
},
{
"epoch": 0.877101586549846,
"grad_norm": 0.03807892715571328,
"learning_rate": 1.356644270860487e-05,
"loss": 0.724,
"step": 4630
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.037140756994390095,
"learning_rate": 1.3361044850258657e-05,
"loss": 0.7313,
"step": 4635
},
{
"epoch": 0.8789959744257637,
"grad_norm": 0.040598401756478456,
"learning_rate": 1.3157141253634469e-05,
"loss": 0.7418,
"step": 4640
},
{
"epoch": 0.8799431683637224,
"grad_norm": 0.03636159365334358,
"learning_rate": 1.2954734148589369e-05,
"loss": 0.733,
"step": 4645
},
{
"epoch": 0.8808903623016813,
"grad_norm": 0.04100817868151093,
"learning_rate": 1.2753825748615032e-05,
"loss": 0.715,
"step": 4650
},
{
"epoch": 0.8818375562396401,
"grad_norm": 0.040249623789949056,
"learning_rate": 1.255441825081354e-05,
"loss": 0.7177,
"step": 4655
},
{
"epoch": 0.8827847501775988,
"grad_norm": 0.038473649906406296,
"learning_rate": 1.235651383587331e-05,
"loss": 0.735,
"step": 4660
},
{
"epoch": 0.8837319441155577,
"grad_norm": 0.03716433351702097,
"learning_rate": 1.2160114668045335e-05,
"loss": 0.7109,
"step": 4665
},
{
"epoch": 0.8846791380535165,
"grad_norm": 0.0366065266385142,
"learning_rate": 1.1965222895119442e-05,
"loss": 0.7098,
"step": 4670
},
{
"epoch": 0.8856263319914752,
"grad_norm": 0.041156901959537195,
"learning_rate": 1.1771840648400849e-05,
"loss": 0.7422,
"step": 4675
},
{
"epoch": 0.8865735259294341,
"grad_norm": 0.03951261248655999,
"learning_rate": 1.1579970042686843e-05,
"loss": 0.7434,
"step": 4680
},
{
"epoch": 0.8875207198673929,
"grad_norm": 0.042505748782389406,
"learning_rate": 1.1389613176243567e-05,
"loss": 0.7422,
"step": 4685
},
{
"epoch": 0.8884679138053516,
"grad_norm": 0.036559177807256635,
"learning_rate": 1.1200772130783259e-05,
"loss": 0.6995,
"step": 4690
},
{
"epoch": 0.8894151077433105,
"grad_norm": 0.038540183149753306,
"learning_rate": 1.1013448971441313e-05,
"loss": 0.7386,
"step": 4695
},
{
"epoch": 0.8903623016812693,
"grad_norm": 0.03778323009855525,
"learning_rate": 1.0827645746753837e-05,
"loss": 0.7293,
"step": 4700
},
{
"epoch": 0.891309495619228,
"grad_norm": 0.037960112685593676,
"learning_rate": 1.064336448863507e-05,
"loss": 0.7132,
"step": 4705
},
{
"epoch": 0.8922566895571868,
"grad_norm": 0.038143343588638426,
"learning_rate": 1.0460607212355343e-05,
"loss": 0.7157,
"step": 4710
},
{
"epoch": 0.8932038834951457,
"grad_norm": 0.03885540215502875,
"learning_rate": 1.0279375916518956e-05,
"loss": 0.7329,
"step": 4715
},
{
"epoch": 0.8941510774331044,
"grad_norm": 0.036527462926812734,
"learning_rate": 1.0099672583042306e-05,
"loss": 0.706,
"step": 4720
},
{
"epoch": 0.8950982713710632,
"grad_norm": 0.04170086636332326,
"learning_rate": 9.921499177132325e-06,
"loss": 0.7159,
"step": 4725
},
{
"epoch": 0.8960454653090221,
"grad_norm": 0.03917784068162278,
"learning_rate": 9.744857647264743e-06,
"loss": 0.7151,
"step": 4730
},
{
"epoch": 0.8969926592469808,
"grad_norm": 0.0368623059179063,
"learning_rate": 9.56974992516309e-06,
"loss": 0.7175,
"step": 4735
},
{
"epoch": 0.8979398531849396,
"grad_norm": 0.03761895854461036,
"learning_rate": 9.396177925777315e-06,
"loss": 0.7376,
"step": 4740
},
{
"epoch": 0.8988870471228985,
"grad_norm": 0.03943918919221873,
"learning_rate": 9.224143547263018e-06,
"loss": 0.727,
"step": 4745
},
{
"epoch": 0.8998342410608572,
"grad_norm": 0.041135228489504315,
"learning_rate": 9.053648670960634e-06,
"loss": 0.7079,
"step": 4750
},
{
"epoch": 0.900781434998816,
"grad_norm": 0.037659487176961826,
"learning_rate": 8.88469516137476e-06,
"loss": 0.719,
"step": 4755
},
{
"epoch": 0.9017286289367749,
"grad_norm": 0.03937100685930776,
"learning_rate": 8.717284866153967e-06,
"loss": 0.704,
"step": 4760
},
{
"epoch": 0.9026758228747336,
"grad_norm": 0.038814959136852366,
"learning_rate": 8.551419616070322e-06,
"loss": 0.7329,
"step": 4765
},
{
"epoch": 0.9036230168126924,
"grad_norm": 0.03791616596156232,
"learning_rate": 8.387101224999738e-06,
"loss": 0.7544,
"step": 4770
},
{
"epoch": 0.9045702107506512,
"grad_norm": 0.036920295113180866,
"learning_rate": 8.224331489901747e-06,
"loss": 0.7353,
"step": 4775
},
{
"epoch": 0.90551740468861,
"grad_norm": 0.04102354744430632,
"learning_rate": 8.063112190800114e-06,
"loss": 0.743,
"step": 4780
},
{
"epoch": 0.9064645986265688,
"grad_norm": 0.039647668406205074,
"learning_rate": 7.903445090763278e-06,
"loss": 0.7288,
"step": 4785
},
{
"epoch": 0.9074117925645276,
"grad_norm": 0.037692364175638995,
"learning_rate": 7.745331935885008e-06,
"loss": 0.7185,
"step": 4790
},
{
"epoch": 0.9083589865024864,
"grad_norm": 0.036226337301176685,
"learning_rate": 7.588774455265517e-06,
"loss": 0.7396,
"step": 4795
},
{
"epoch": 0.9093061804404452,
"grad_norm": 0.03741364399731542,
"learning_rate": 7.433774360992279e-06,
"loss": 0.7226,
"step": 4800
},
{
"epoch": 0.9102533743784039,
"grad_norm": 0.03775216663355994,
"learning_rate": 7.280333348121503e-06,
"loss": 0.716,
"step": 4805
},
{
"epoch": 0.9112005683163628,
"grad_norm": 0.03958684677935284,
"learning_rate": 7.128453094659508e-06,
"loss": 0.7364,
"step": 4810
},
{
"epoch": 0.9121477622543216,
"grad_norm": 0.03816974922462119,
"learning_rate": 6.978135261544398e-06,
"loss": 0.726,
"step": 4815
},
{
"epoch": 0.9130949561922803,
"grad_norm": 0.03916663794581213,
"learning_rate": 6.829381492627978e-06,
"loss": 0.7091,
"step": 4820
},
{
"epoch": 0.9140421501302392,
"grad_norm": 0.03819017711079349,
"learning_rate": 6.682193414657583e-06,
"loss": 0.7225,
"step": 4825
},
{
"epoch": 0.914989344068198,
"grad_norm": 0.03825337879044159,
"learning_rate": 6.5365726372584805e-06,
"loss": 0.7167,
"step": 4830
},
{
"epoch": 0.9159365380061567,
"grad_norm": 0.03783436042390378,
"learning_rate": 6.392520752916097e-06,
"loss": 0.7425,
"step": 4835
},
{
"epoch": 0.9168837319441155,
"grad_norm": 0.03893124825143902,
"learning_rate": 6.2500393369588505e-06,
"loss": 0.7272,
"step": 4840
},
{
"epoch": 0.9178309258820744,
"grad_norm": 0.03729369559567071,
"learning_rate": 6.109129947540631e-06,
"loss": 0.741,
"step": 4845
},
{
"epoch": 0.9187781198200331,
"grad_norm": 0.03912193568036812,
"learning_rate": 5.969794125623928e-06,
"loss": 0.7276,
"step": 4850
},
{
"epoch": 0.9197253137579919,
"grad_norm": 0.03810873022721955,
"learning_rate": 5.832033394963015e-06,
"loss": 0.7231,
"step": 4855
},
{
"epoch": 0.9206725076959508,
"grad_norm": 0.03730402627420899,
"learning_rate": 5.69584926208711e-06,
"loss": 0.7047,
"step": 4860
},
{
"epoch": 0.9216197016339095,
"grad_norm": 0.03667813336307107,
"learning_rate": 5.561243216284139e-06,
"loss": 0.7152,
"step": 4865
},
{
"epoch": 0.9225668955718683,
"grad_norm": 0.03964877444312505,
"learning_rate": 5.4282167295842e-06,
"loss": 0.7151,
"step": 4870
},
{
"epoch": 0.9235140895098272,
"grad_norm": 0.04352873821823642,
"learning_rate": 5.296771256743676e-06,
"loss": 0.7148,
"step": 4875
},
{
"epoch": 0.9244612834477859,
"grad_norm": 0.04158839264995075,
"learning_rate": 5.166908235229178e-06,
"loss": 0.699,
"step": 4880
},
{
"epoch": 0.9254084773857447,
"grad_norm": 0.03955083160304519,
"learning_rate": 5.038629085201878e-06,
"loss": 0.727,
"step": 4885
},
{
"epoch": 0.9263556713237036,
"grad_norm": 0.044593469584970485,
"learning_rate": 4.911935209502072e-06,
"loss": 0.7399,
"step": 4890
},
{
"epoch": 0.9273028652616623,
"grad_norm": 0.03845070002008913,
"learning_rate": 4.786827993633635e-06,
"loss": 0.7197,
"step": 4895
},
{
"epoch": 0.9282500591996211,
"grad_norm": 0.037419281999128515,
"learning_rate": 4.663308805749061e-06,
"loss": 0.7318,
"step": 4900
},
{
"epoch": 0.92919725313758,
"grad_norm": 0.03903737099095032,
"learning_rate": 4.541378996634382e-06,
"loss": 0.7339,
"step": 4905
},
{
"epoch": 0.9301444470755387,
"grad_norm": 0.03913683799560393,
"learning_rate": 4.421039899694468e-06,
"loss": 0.7229,
"step": 4910
},
{
"epoch": 0.9310916410134975,
"grad_norm": 0.03740496457952301,
"learning_rate": 4.302292830938403e-06,
"loss": 0.7138,
"step": 4915
},
{
"epoch": 0.9320388349514563,
"grad_norm": 0.03949065539672211,
"learning_rate": 4.185139088965083e-06,
"loss": 0.7036,
"step": 4920
},
{
"epoch": 0.9329860288894151,
"grad_norm": 0.03718724530040875,
"learning_rate": 4.06957995494911e-06,
"loss": 0.7241,
"step": 4925
},
{
"epoch": 0.9339332228273739,
"grad_norm": 0.038984046127228444,
"learning_rate": 3.955616692626612e-06,
"loss": 0.7132,
"step": 4930
},
{
"epoch": 0.9348804167653327,
"grad_norm": 0.03705946169516573,
"learning_rate": 3.843250548281584e-06,
"loss": 0.7205,
"step": 4935
},
{
"epoch": 0.9358276107032915,
"grad_norm": 0.04056507468322176,
"learning_rate": 3.7324827507321907e-06,
"loss": 0.7095,
"step": 4940
},
{
"epoch": 0.9367748046412503,
"grad_norm": 0.03838686948370597,
"learning_rate": 3.62331451131731e-06,
"loss": 0.7402,
"step": 4945
},
{
"epoch": 0.9377219985792091,
"grad_norm": 0.03735322865661384,
"learning_rate": 3.5157470238832975e-06,
"loss": 0.7113,
"step": 4950
},
{
"epoch": 0.9386691925171678,
"grad_norm": 0.03865946802077697,
"learning_rate": 3.4097814647709775e-06,
"loss": 0.7327,
"step": 4955
},
{
"epoch": 0.9396163864551267,
"grad_norm": 0.03750616620805965,
"learning_rate": 3.3054189928027386e-06,
"loss": 0.7078,
"step": 4960
},
{
"epoch": 0.9405635803930855,
"grad_norm": 0.040179517311287265,
"learning_rate": 3.202660749269842e-06,
"loss": 0.7168,
"step": 4965
},
{
"epoch": 0.9415107743310442,
"grad_norm": 0.038488343198023446,
"learning_rate": 3.1015078579199992e-06,
"loss": 0.7263,
"step": 4970
},
{
"epoch": 0.9424579682690031,
"grad_norm": 0.03846115414340801,
"learning_rate": 3.0019614249449818e-06,
"loss": 0.7396,
"step": 4975
},
{
"epoch": 0.9434051622069619,
"grad_norm": 0.03891780338800125,
"learning_rate": 2.9040225389686477e-06,
"loss": 0.7197,
"step": 4980
},
{
"epoch": 0.9443523561449206,
"grad_norm": 0.038745010189377524,
"learning_rate": 2.8076922710349836e-06,
"loss": 0.6982,
"step": 4985
},
{
"epoch": 0.9452995500828795,
"grad_norm": 0.03993490506698604,
"learning_rate": 2.7129716745963316e-06,
"loss": 0.6958,
"step": 4990
},
{
"epoch": 0.9462467440208383,
"grad_norm": 0.03974439268947578,
"learning_rate": 2.6198617855020143e-06,
"loss": 0.7312,
"step": 4995
},
{
"epoch": 0.947193937958797,
"grad_norm": 0.04146912204153186,
"learning_rate": 2.5283636219867954e-06,
"loss": 0.7385,
"step": 5000
},
{
"epoch": 0.9481411318967559,
"grad_norm": 0.03707229834469002,
"learning_rate": 2.43847818465997e-06,
"loss": 0.7345,
"step": 5005
},
{
"epoch": 0.9490883258347147,
"grad_norm": 0.039133364688565486,
"learning_rate": 2.3502064564942578e-06,
"loss": 0.7075,
"step": 5010
},
{
"epoch": 0.9500355197726734,
"grad_norm": 0.03622200599308819,
"learning_rate": 2.263549402815179e-06,
"loss": 0.6983,
"step": 5015
},
{
"epoch": 0.9509827137106323,
"grad_norm": 0.038295665871847934,
"learning_rate": 2.1785079712903275e-06,
"loss": 0.7334,
"step": 5020
},
{
"epoch": 0.9519299076485911,
"grad_norm": 0.03915418927573118,
"learning_rate": 2.095083091919214e-06,
"loss": 0.7372,
"step": 5025
},
{
"epoch": 0.9528771015865498,
"grad_norm": 0.03728899024964666,
"learning_rate": 2.0132756770229576e-06,
"loss": 0.7046,
"step": 5030
},
{
"epoch": 0.9538242955245086,
"grad_norm": 0.03775405052284275,
"learning_rate": 1.9330866212343086e-06,
"loss": 0.7143,
"step": 5035
},
{
"epoch": 0.9547714894624675,
"grad_norm": 0.038310827788718325,
"learning_rate": 1.8545168014879764e-06,
"loss": 0.7111,
"step": 5040
},
{
"epoch": 0.9557186834004262,
"grad_norm": 0.03835494831636439,
"learning_rate": 1.777567077010883e-06,
"loss": 0.7398,
"step": 5045
},
{
"epoch": 0.956665877338385,
"grad_norm": 0.038374737236019835,
"learning_rate": 1.7022382893129072e-06,
"loss": 0.7149,
"step": 5050
},
{
"epoch": 0.9576130712763439,
"grad_norm": 0.039365530128439526,
"learning_rate": 1.6285312621775903e-06,
"loss": 0.7074,
"step": 5055
},
{
"epoch": 0.9585602652143026,
"grad_norm": 0.038447131703807806,
"learning_rate": 1.5564468016531773e-06,
"loss": 0.7531,
"step": 5060
},
{
"epoch": 0.9595074591522614,
"grad_norm": 0.03768227162346853,
"learning_rate": 1.48598569604379e-06,
"loss": 0.7215,
"step": 5065
},
{
"epoch": 0.9604546530902203,
"grad_norm": 0.03691120803905496,
"learning_rate": 1.4171487159007843e-06,
"loss": 0.7037,
"step": 5070
},
{
"epoch": 0.961401847028179,
"grad_norm": 0.03847682888310551,
"learning_rate": 1.349936614014341e-06,
"loss": 0.7386,
"step": 5075
},
{
"epoch": 0.9623490409661378,
"grad_norm": 0.038562458996163625,
"learning_rate": 1.2843501254052368e-06,
"loss": 0.7145,
"step": 5080
},
{
"epoch": 0.9632962349040967,
"grad_norm": 0.039915946654852284,
"learning_rate": 1.2203899673168205e-06,
"loss": 0.723,
"step": 5085
},
{
"epoch": 0.9642434288420554,
"grad_norm": 0.03842715857163489,
"learning_rate": 1.1580568392071e-06,
"loss": 0.7092,
"step": 5090
},
{
"epoch": 0.9651906227800142,
"grad_norm": 0.040087737408694465,
"learning_rate": 1.0973514227412161e-06,
"loss": 0.7467,
"step": 5095
},
{
"epoch": 0.966137816717973,
"grad_norm": 0.03868018943060032,
"learning_rate": 1.038274381783849e-06,
"loss": 0.7034,
"step": 5100
},
{
"epoch": 0.9670850106559318,
"grad_norm": 0.03888690917055077,
"learning_rate": 9.80826362392073e-07,
"loss": 0.724,
"step": 5105
},
{
"epoch": 0.9680322045938906,
"grad_norm": 0.0361796924871166,
"learning_rate": 9.250079928082132e-07,
"loss": 0.736,
"step": 5110
},
{
"epoch": 0.9689793985318494,
"grad_norm": 0.03965669973295369,
"learning_rate": 8.708198834530166e-07,
"loss": 0.7261,
"step": 5115
},
{
"epoch": 0.9699265924698082,
"grad_norm": 0.039430492673631544,
"learning_rate": 8.182626269189752e-07,
"loss": 0.7242,
"step": 5120
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.0348287855119034,
"learning_rate": 7.673367979637968e-07,
"loss": 0.7082,
"step": 5125
},
{
"epoch": 0.9718209803457258,
"grad_norm": 0.03706016619862877,
"learning_rate": 7.180429535042276e-07,
"loss": 0.7518,
"step": 5130
},
{
"epoch": 0.9727681742836846,
"grad_norm": 0.04056580951745248,
"learning_rate": 6.703816326098399e-07,
"loss": 0.7401,
"step": 5135
},
{
"epoch": 0.9737153682216434,
"grad_norm": 0.04032169255601853,
"learning_rate": 6.24353356497187e-07,
"loss": 0.7491,
"step": 5140
},
{
"epoch": 0.9746625621596022,
"grad_norm": 0.04118720081247295,
"learning_rate": 5.799586285241242e-07,
"loss": 0.7569,
"step": 5145
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.039992603944723155,
"learning_rate": 5.371979341843136e-07,
"loss": 0.719,
"step": 5150
},
{
"epoch": 0.9765569500355198,
"grad_norm": 0.038975715445144526,
"learning_rate": 4.960717411018277e-07,
"loss": 0.7183,
"step": 5155
},
{
"epoch": 0.9775041439734786,
"grad_norm": 0.03558255003932097,
"learning_rate": 4.565804990261379e-07,
"loss": 0.72,
"step": 5160
},
{
"epoch": 0.9784513379114373,
"grad_norm": 0.041836063593222644,
"learning_rate": 4.187246398271171e-07,
"loss": 0.7227,
"step": 5165
},
{
"epoch": 0.9793985318493962,
"grad_norm": 0.0401155869593642,
"learning_rate": 3.825045774904112e-07,
"loss": 0.723,
"step": 5170
},
{
"epoch": 0.980345725787355,
"grad_norm": 0.04241569076307143,
"learning_rate": 3.4792070811280884e-07,
"loss": 0.7329,
"step": 5175
},
{
"epoch": 0.9812929197253137,
"grad_norm": 0.04084926695436618,
"learning_rate": 3.149734098979617e-07,
"loss": 0.7126,
"step": 5180
},
{
"epoch": 0.9822401136632726,
"grad_norm": 0.03939583534788028,
"learning_rate": 2.83663043152238e-07,
"loss": 0.7426,
"step": 5185
},
{
"epoch": 0.9831873076012313,
"grad_norm": 0.039426635966572685,
"learning_rate": 2.5398995028079184e-07,
"loss": 0.7086,
"step": 5190
},
{
"epoch": 0.9841345015391901,
"grad_norm": 0.03761530845106783,
"learning_rate": 2.2595445578381665e-07,
"loss": 0.7132,
"step": 5195
},
{
"epoch": 0.985081695477149,
"grad_norm": 0.03983372031881901,
"learning_rate": 1.9955686625299782e-07,
"loss": 0.7041,
"step": 5200
},
{
"epoch": 0.9860288894151077,
"grad_norm": 0.038745755637894036,
"learning_rate": 1.7479747036813207e-07,
"loss": 0.7565,
"step": 5205
},
{
"epoch": 0.9869760833530665,
"grad_norm": 0.03451565749276735,
"learning_rate": 1.5167653889401332e-07,
"loss": 0.7053,
"step": 5210
},
{
"epoch": 0.9879232772910254,
"grad_norm": 0.0377767563169325,
"learning_rate": 1.3019432467743508e-07,
"loss": 0.7165,
"step": 5215
},
{
"epoch": 0.9888704712289841,
"grad_norm": 0.03919747343677102,
"learning_rate": 1.1035106264445925e-07,
"loss": 0.7135,
"step": 5220
},
{
"epoch": 0.9898176651669429,
"grad_norm": 0.035742270061366835,
"learning_rate": 9.214696979781833e-08,
"loss": 0.7422,
"step": 5225
},
{
"epoch": 0.9907648591049018,
"grad_norm": 0.03988070344903895,
"learning_rate": 7.558224521455048e-08,
"loss": 0.7176,
"step": 5230
},
{
"epoch": 0.9917120530428605,
"grad_norm": 0.03844068057098456,
"learning_rate": 6.065707004383468e-08,
"loss": 0.7275,
"step": 5235
},
{
"epoch": 0.9926592469808193,
"grad_norm": 0.037145775071304184,
"learning_rate": 4.737160750500901e-08,
"loss": 0.7158,
"step": 5240
},
{
"epoch": 0.9936064409187781,
"grad_norm": 0.03801708244408533,
"learning_rate": 3.572600288572203e-08,
"loss": 0.7286,
"step": 5245
},
{
"epoch": 0.9945536348567369,
"grad_norm": 0.03898719180494596,
"learning_rate": 2.5720383540484002e-08,
"loss": 0.7208,
"step": 5250
},
{
"epoch": 0.9955008287946957,
"grad_norm": 0.03854350834493005,
"learning_rate": 1.7354858889134793e-08,
"loss": 0.7347,
"step": 5255
},
{
"epoch": 0.9964480227326545,
"grad_norm": 0.039328700531041635,
"learning_rate": 1.0629520415694759e-08,
"loss": 0.7603,
"step": 5260
},
{
"epoch": 0.9973952166706133,
"grad_norm": 0.03842843734298471,
"learning_rate": 5.544441667398869e-09,
"loss": 0.7204,
"step": 5265
},
{
"epoch": 0.9983424106085721,
"grad_norm": 0.03655798359456603,
"learning_rate": 2.099678253847381e-09,
"loss": 0.7179,
"step": 5270
},
{
"epoch": 0.9992896045465309,
"grad_norm": 0.03754524737942178,
"learning_rate": 2.952678464229752e-10,
"loss": 0.6995,
"step": 5275
},
{
"epoch": 0.9998579209093061,
"eval_loss": 1.1249996423721313,
"eval_runtime": 1040.1086,
"eval_samples_per_second": 188.598,
"eval_steps_per_second": 5.895,
"step": 5278
},
{
"epoch": 0.9998579209093061,
"step": 5278,
"total_flos": 768453779423232.0,
"train_loss": 0.8248233945137535,
"train_runtime": 21690.489,
"train_samples_per_second": 31.151,
"train_steps_per_second": 0.243
}
],
"logging_steps": 5,
"max_steps": 5278,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 768453779423232.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}