Files
SmolLM2-MagpieUltraPlus-Mat…/trainer_state.json
ModelHub XC 509b2716e1 初始化项目,由ModelHub XC社区提供模型
Model: HuggingFaceTB/SmolLM2-MagpieUltraPlus-MathInstruct
Source: Original Platform
2026-06-18 21:32:12 +08:00

7485 lines
183 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5314,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000940910801656003,
"grad_norm": 60.01997358091019,
"learning_rate": 2.8195488721804507e-06,
"loss": 3.7648,
"step": 5
},
{
"epoch": 0.001881821603312006,
"grad_norm": 43.13189358709948,
"learning_rate": 5.6390977443609015e-06,
"loss": 3.7027,
"step": 10
},
{
"epoch": 0.002822732404968009,
"grad_norm": 21.06725791656804,
"learning_rate": 8.458646616541352e-06,
"loss": 2.9202,
"step": 15
},
{
"epoch": 0.003763643206624012,
"grad_norm": 10.467981092560883,
"learning_rate": 1.1278195488721803e-05,
"loss": 2.2189,
"step": 20
},
{
"epoch": 0.004704554008280015,
"grad_norm": 4.304956998410812,
"learning_rate": 1.4097744360902254e-05,
"loss": 1.7602,
"step": 25
},
{
"epoch": 0.005645464809936018,
"grad_norm": 2.039731988186442,
"learning_rate": 1.6917293233082704e-05,
"loss": 1.421,
"step": 30
},
{
"epoch": 0.006586375611592021,
"grad_norm": 1.142378863342944,
"learning_rate": 1.9736842105263155e-05,
"loss": 1.2312,
"step": 35
},
{
"epoch": 0.007527286413248024,
"grad_norm": 0.560089452128016,
"learning_rate": 2.2556390977443606e-05,
"loss": 1.1372,
"step": 40
},
{
"epoch": 0.008468197214904027,
"grad_norm": 0.5228484190309799,
"learning_rate": 2.5375939849624057e-05,
"loss": 1.0764,
"step": 45
},
{
"epoch": 0.00940910801656003,
"grad_norm": 0.34335801536002225,
"learning_rate": 2.8195488721804508e-05,
"loss": 1.0306,
"step": 50
},
{
"epoch": 0.010350018818216034,
"grad_norm": 0.3215605332639539,
"learning_rate": 3.101503759398496e-05,
"loss": 0.9845,
"step": 55
},
{
"epoch": 0.011290929619872036,
"grad_norm": 0.22796155046045471,
"learning_rate": 3.383458646616541e-05,
"loss": 1.0076,
"step": 60
},
{
"epoch": 0.012231840421528039,
"grad_norm": 0.19722227209725182,
"learning_rate": 3.665413533834586e-05,
"loss": 1.0149,
"step": 65
},
{
"epoch": 0.013172751223184042,
"grad_norm": 0.21801488393560356,
"learning_rate": 3.947368421052631e-05,
"loss": 0.9928,
"step": 70
},
{
"epoch": 0.014113662024840046,
"grad_norm": 0.2160945691538292,
"learning_rate": 4.2293233082706764e-05,
"loss": 0.9793,
"step": 75
},
{
"epoch": 0.015054572826496047,
"grad_norm": 0.1709423275538542,
"learning_rate": 4.511278195488721e-05,
"loss": 0.9415,
"step": 80
},
{
"epoch": 0.015995483628152053,
"grad_norm": 0.1984231147448857,
"learning_rate": 4.7932330827067666e-05,
"loss": 0.9456,
"step": 85
},
{
"epoch": 0.016936394429808054,
"grad_norm": 0.16064378379367358,
"learning_rate": 5.0751879699248114e-05,
"loss": 0.937,
"step": 90
},
{
"epoch": 0.017877305231464056,
"grad_norm": 0.17183543893075945,
"learning_rate": 5.357142857142857e-05,
"loss": 0.939,
"step": 95
},
{
"epoch": 0.01881821603312006,
"grad_norm": 0.13904432551213514,
"learning_rate": 5.6390977443609016e-05,
"loss": 0.9358,
"step": 100
},
{
"epoch": 0.019759126834776063,
"grad_norm": 0.14843412643116974,
"learning_rate": 5.921052631578947e-05,
"loss": 0.9221,
"step": 105
},
{
"epoch": 0.020700037636432068,
"grad_norm": 0.14289835041826568,
"learning_rate": 6.203007518796992e-05,
"loss": 0.9335,
"step": 110
},
{
"epoch": 0.02164094843808807,
"grad_norm": 0.12123282065253636,
"learning_rate": 6.484962406015037e-05,
"loss": 0.9428,
"step": 115
},
{
"epoch": 0.02258185923974407,
"grad_norm": 0.12416935651368655,
"learning_rate": 6.766917293233081e-05,
"loss": 0.9183,
"step": 120
},
{
"epoch": 0.023522770041400076,
"grad_norm": 0.09188631695179955,
"learning_rate": 7.048872180451127e-05,
"loss": 0.9055,
"step": 125
},
{
"epoch": 0.024463680843056078,
"grad_norm": 0.0876665353586765,
"learning_rate": 7.330827067669172e-05,
"loss": 0.9103,
"step": 130
},
{
"epoch": 0.025404591644712083,
"grad_norm": 0.08243508531454458,
"learning_rate": 7.612781954887218e-05,
"loss": 0.8813,
"step": 135
},
{
"epoch": 0.026345502446368085,
"grad_norm": 0.09264651144613481,
"learning_rate": 7.894736842105262e-05,
"loss": 0.9165,
"step": 140
},
{
"epoch": 0.027286413248024086,
"grad_norm": 0.10132147434306207,
"learning_rate": 8.176691729323307e-05,
"loss": 0.9094,
"step": 145
},
{
"epoch": 0.02822732404968009,
"grad_norm": 0.07777446691897923,
"learning_rate": 8.458646616541353e-05,
"loss": 0.919,
"step": 150
},
{
"epoch": 0.029168234851336093,
"grad_norm": 0.0877254827678949,
"learning_rate": 8.740601503759398e-05,
"loss": 0.9092,
"step": 155
},
{
"epoch": 0.030109145652992095,
"grad_norm": 0.0698681608512868,
"learning_rate": 9.022556390977442e-05,
"loss": 0.8901,
"step": 160
},
{
"epoch": 0.0310500564546481,
"grad_norm": 0.07865064250292489,
"learning_rate": 9.304511278195488e-05,
"loss": 0.9177,
"step": 165
},
{
"epoch": 0.031990967256304105,
"grad_norm": 0.07217084698029017,
"learning_rate": 9.586466165413533e-05,
"loss": 0.8612,
"step": 170
},
{
"epoch": 0.03293187805796011,
"grad_norm": 0.07821397459573423,
"learning_rate": 9.868421052631579e-05,
"loss": 0.9028,
"step": 175
},
{
"epoch": 0.03387278885961611,
"grad_norm": 0.07354195280117189,
"learning_rate": 0.00010150375939849623,
"loss": 0.8853,
"step": 180
},
{
"epoch": 0.03481369966127211,
"grad_norm": 0.07326370701811867,
"learning_rate": 0.00010432330827067668,
"loss": 0.8976,
"step": 185
},
{
"epoch": 0.03575461046292811,
"grad_norm": 0.0771216114229083,
"learning_rate": 0.00010714285714285714,
"loss": 0.8974,
"step": 190
},
{
"epoch": 0.03669552126458412,
"grad_norm": 0.07423933567725373,
"learning_rate": 0.00010996240601503759,
"loss": 0.8983,
"step": 195
},
{
"epoch": 0.03763643206624012,
"grad_norm": 0.07359448396484719,
"learning_rate": 0.00011278195488721803,
"loss": 0.8855,
"step": 200
},
{
"epoch": 0.038577342867896124,
"grad_norm": 0.0705322988260988,
"learning_rate": 0.00011560150375939849,
"loss": 0.8902,
"step": 205
},
{
"epoch": 0.039518253669552125,
"grad_norm": 0.07553017144819835,
"learning_rate": 0.00011842105263157894,
"loss": 0.8979,
"step": 210
},
{
"epoch": 0.04045916447120813,
"grad_norm": 0.06639469040581934,
"learning_rate": 0.0001212406015037594,
"loss": 0.8726,
"step": 215
},
{
"epoch": 0.041400075272864136,
"grad_norm": 0.07164418665916035,
"learning_rate": 0.00012406015037593984,
"loss": 0.8805,
"step": 220
},
{
"epoch": 0.04234098607452014,
"grad_norm": 0.08024783310853623,
"learning_rate": 0.00012687969924812028,
"loss": 0.8846,
"step": 225
},
{
"epoch": 0.04328189687617614,
"grad_norm": 0.06967695982721508,
"learning_rate": 0.00012969924812030075,
"loss": 0.8562,
"step": 230
},
{
"epoch": 0.04422280767783214,
"grad_norm": 0.06791598862412057,
"learning_rate": 0.0001325187969924812,
"loss": 0.8981,
"step": 235
},
{
"epoch": 0.04516371847948814,
"grad_norm": 0.06926389506888998,
"learning_rate": 0.00013533834586466163,
"loss": 0.8889,
"step": 240
},
{
"epoch": 0.04610462928114415,
"grad_norm": 0.07683826514393188,
"learning_rate": 0.0001381578947368421,
"loss": 0.8801,
"step": 245
},
{
"epoch": 0.04704554008280015,
"grad_norm": 0.08813591407993811,
"learning_rate": 0.00014097744360902254,
"loss": 0.8885,
"step": 250
},
{
"epoch": 0.047986450884456154,
"grad_norm": 0.07234142660651707,
"learning_rate": 0.000143796992481203,
"loss": 0.8517,
"step": 255
},
{
"epoch": 0.048927361686112156,
"grad_norm": 0.0648457622804495,
"learning_rate": 0.00014661654135338345,
"loss": 0.8912,
"step": 260
},
{
"epoch": 0.04986827248776816,
"grad_norm": 0.07356170701724246,
"learning_rate": 0.0001494360902255639,
"loss": 0.9047,
"step": 265
},
{
"epoch": 0.050809183289424166,
"grad_norm": 0.06615172685965298,
"learning_rate": 0.00015225563909774436,
"loss": 0.8807,
"step": 270
},
{
"epoch": 0.05175009409108017,
"grad_norm": 0.06082657075881043,
"learning_rate": 0.0001550751879699248,
"loss": 0.8834,
"step": 275
},
{
"epoch": 0.05269100489273617,
"grad_norm": 0.06342890619901809,
"learning_rate": 0.00015789473684210524,
"loss": 0.864,
"step": 280
},
{
"epoch": 0.05363191569439217,
"grad_norm": 0.06319688554101384,
"learning_rate": 0.0001607142857142857,
"loss": 0.8684,
"step": 285
},
{
"epoch": 0.05457282649604817,
"grad_norm": 0.06334243363623335,
"learning_rate": 0.00016353383458646615,
"loss": 0.8909,
"step": 290
},
{
"epoch": 0.055513737297704174,
"grad_norm": 0.06333186578028213,
"learning_rate": 0.00016635338345864662,
"loss": 0.9081,
"step": 295
},
{
"epoch": 0.05645464809936018,
"grad_norm": 0.07330638979979831,
"learning_rate": 0.00016917293233082706,
"loss": 0.8736,
"step": 300
},
{
"epoch": 0.057395558901016185,
"grad_norm": 0.061476218135212986,
"learning_rate": 0.00017199248120300752,
"loss": 0.8572,
"step": 305
},
{
"epoch": 0.058336469702672186,
"grad_norm": 0.06885650230162942,
"learning_rate": 0.00017481203007518797,
"loss": 0.8825,
"step": 310
},
{
"epoch": 0.05927738050432819,
"grad_norm": 0.06201718392161657,
"learning_rate": 0.00017763157894736838,
"loss": 0.8791,
"step": 315
},
{
"epoch": 0.06021829130598419,
"grad_norm": 0.05952975722354683,
"learning_rate": 0.00018045112781954885,
"loss": 0.8722,
"step": 320
},
{
"epoch": 0.0611592021076402,
"grad_norm": 0.0845033558899056,
"learning_rate": 0.0001832706766917293,
"loss": 0.8779,
"step": 325
},
{
"epoch": 0.0621001129092962,
"grad_norm": 0.0696024287202025,
"learning_rate": 0.00018609022556390976,
"loss": 0.8589,
"step": 330
},
{
"epoch": 0.0630410237109522,
"grad_norm": 0.06743400038358317,
"learning_rate": 0.00018890977443609022,
"loss": 0.8943,
"step": 335
},
{
"epoch": 0.06398193451260821,
"grad_norm": 0.06448702771901722,
"learning_rate": 0.00019172932330827067,
"loss": 0.8706,
"step": 340
},
{
"epoch": 0.0649228453142642,
"grad_norm": 0.06648697936479611,
"learning_rate": 0.00019454887218045113,
"loss": 0.8715,
"step": 345
},
{
"epoch": 0.06586375611592021,
"grad_norm": 0.07133760531354241,
"learning_rate": 0.00019736842105263157,
"loss": 0.8909,
"step": 350
},
{
"epoch": 0.06680466691757621,
"grad_norm": 0.07400726358762413,
"learning_rate": 0.000200187969924812,
"loss": 0.8829,
"step": 355
},
{
"epoch": 0.06774557771923222,
"grad_norm": 0.066751649475411,
"learning_rate": 0.00020300751879699246,
"loss": 0.8602,
"step": 360
},
{
"epoch": 0.06868648852088823,
"grad_norm": 0.07170376738210353,
"learning_rate": 0.0002058270676691729,
"loss": 0.8519,
"step": 365
},
{
"epoch": 0.06962739932254422,
"grad_norm": 0.06344374065815822,
"learning_rate": 0.00020864661654135337,
"loss": 0.8373,
"step": 370
},
{
"epoch": 0.07056831012420023,
"grad_norm": 0.06254663540702332,
"learning_rate": 0.0002114661654135338,
"loss": 0.9122,
"step": 375
},
{
"epoch": 0.07150922092585622,
"grad_norm": 0.06630769532444106,
"learning_rate": 0.00021428571428571427,
"loss": 0.8654,
"step": 380
},
{
"epoch": 0.07245013172751223,
"grad_norm": 0.07154196055342273,
"learning_rate": 0.00021710526315789472,
"loss": 0.8698,
"step": 385
},
{
"epoch": 0.07339104252916824,
"grad_norm": 0.06473991172822582,
"learning_rate": 0.00021992481203007518,
"loss": 0.8558,
"step": 390
},
{
"epoch": 0.07433195333082424,
"grad_norm": 0.0641271642923791,
"learning_rate": 0.0002227443609022556,
"loss": 0.8825,
"step": 395
},
{
"epoch": 0.07527286413248024,
"grad_norm": 0.07171043288638893,
"learning_rate": 0.00022556390977443607,
"loss": 0.8715,
"step": 400
},
{
"epoch": 0.07621377493413624,
"grad_norm": 0.07945386130978546,
"learning_rate": 0.0002283834586466165,
"loss": 0.8718,
"step": 405
},
{
"epoch": 0.07715468573579225,
"grad_norm": 0.06502361339008453,
"learning_rate": 0.00023120300751879697,
"loss": 0.8945,
"step": 410
},
{
"epoch": 0.07809559653744826,
"grad_norm": 0.07560856132420715,
"learning_rate": 0.00023402255639097742,
"loss": 0.8527,
"step": 415
},
{
"epoch": 0.07903650733910425,
"grad_norm": 0.06652060140595817,
"learning_rate": 0.00023684210526315788,
"loss": 0.8757,
"step": 420
},
{
"epoch": 0.07997741814076026,
"grad_norm": 0.0634870168645207,
"learning_rate": 0.00023966165413533832,
"loss": 0.8795,
"step": 425
},
{
"epoch": 0.08091832894241625,
"grad_norm": 0.06045441546519768,
"learning_rate": 0.0002424812030075188,
"loss": 0.8696,
"step": 430
},
{
"epoch": 0.08185923974407226,
"grad_norm": 0.061655818665132714,
"learning_rate": 0.00024530075187969923,
"loss": 0.8558,
"step": 435
},
{
"epoch": 0.08280015054572827,
"grad_norm": 0.06182009827579582,
"learning_rate": 0.0002481203007518797,
"loss": 0.8547,
"step": 440
},
{
"epoch": 0.08374106134738427,
"grad_norm": 0.0639219310698945,
"learning_rate": 0.0002509398496240601,
"loss": 0.8752,
"step": 445
},
{
"epoch": 0.08468197214904027,
"grad_norm": 0.05045686701916079,
"learning_rate": 0.00025375939849624056,
"loss": 0.8377,
"step": 450
},
{
"epoch": 0.08562288295069627,
"grad_norm": 0.06119130976338784,
"learning_rate": 0.00025657894736842105,
"loss": 0.8795,
"step": 455
},
{
"epoch": 0.08656379375235228,
"grad_norm": 0.05851515205626573,
"learning_rate": 0.0002593984962406015,
"loss": 0.8466,
"step": 460
},
{
"epoch": 0.08750470455400829,
"grad_norm": 0.05891589185564019,
"learning_rate": 0.00026221804511278193,
"loss": 0.8508,
"step": 465
},
{
"epoch": 0.08844561535566428,
"grad_norm": 0.06568583634178668,
"learning_rate": 0.0002650375939849624,
"loss": 0.8887,
"step": 470
},
{
"epoch": 0.08938652615732029,
"grad_norm": 0.06559964245515083,
"learning_rate": 0.00026785714285714287,
"loss": 0.8517,
"step": 475
},
{
"epoch": 0.09032743695897628,
"grad_norm": 0.06489995786075839,
"learning_rate": 0.00027067669172932326,
"loss": 0.876,
"step": 480
},
{
"epoch": 0.09126834776063229,
"grad_norm": 0.059994926099869166,
"learning_rate": 0.0002734962406015037,
"loss": 0.8905,
"step": 485
},
{
"epoch": 0.0922092585622883,
"grad_norm": 0.05813814908142138,
"learning_rate": 0.0002763157894736842,
"loss": 0.8935,
"step": 490
},
{
"epoch": 0.0931501693639443,
"grad_norm": 0.06773262538564226,
"learning_rate": 0.00027913533834586463,
"loss": 0.8724,
"step": 495
},
{
"epoch": 0.0940910801656003,
"grad_norm": 0.06172481466819506,
"learning_rate": 0.0002819548872180451,
"loss": 0.8488,
"step": 500
},
{
"epoch": 0.0950319909672563,
"grad_norm": 0.05892455966797863,
"learning_rate": 0.00028477443609022557,
"loss": 0.8969,
"step": 505
},
{
"epoch": 0.09597290176891231,
"grad_norm": 0.07375087726120111,
"learning_rate": 0.000287593984962406,
"loss": 0.8699,
"step": 510
},
{
"epoch": 0.09691381257056832,
"grad_norm": 0.06156991814488305,
"learning_rate": 0.00029041353383458645,
"loss": 0.871,
"step": 515
},
{
"epoch": 0.09785472337222431,
"grad_norm": 0.062492373867649643,
"learning_rate": 0.0002932330827067669,
"loss": 0.8804,
"step": 520
},
{
"epoch": 0.09879563417388032,
"grad_norm": 0.05942165284065285,
"learning_rate": 0.00029605263157894733,
"loss": 0.8812,
"step": 525
},
{
"epoch": 0.09973654497553631,
"grad_norm": 0.059197976767297204,
"learning_rate": 0.0002988721804511278,
"loss": 0.888,
"step": 530
},
{
"epoch": 0.10067745577719232,
"grad_norm": 0.060788308115763814,
"learning_rate": 0.00029999970867065386,
"loss": 0.8777,
"step": 535
},
{
"epoch": 0.10161836657884833,
"grad_norm": 0.06549563064661894,
"learning_rate": 0.0002999979283287479,
"loss": 0.8973,
"step": 540
},
{
"epoch": 0.10255927738050433,
"grad_norm": 0.05637984655649405,
"learning_rate": 0.0002999945295137593,
"loss": 0.864,
"step": 545
},
{
"epoch": 0.10350018818216034,
"grad_norm": 0.05419448738098367,
"learning_rate": 0.00029998951226236113,
"loss": 0.8528,
"step": 550
},
{
"epoch": 0.10444109898381633,
"grad_norm": 0.055413842575868566,
"learning_rate": 0.0002999828766286894,
"loss": 0.8721,
"step": 555
},
{
"epoch": 0.10538200978547234,
"grad_norm": 0.05124313574780934,
"learning_rate": 0.0002999746226843424,
"loss": 0.8867,
"step": 560
},
{
"epoch": 0.10632292058712833,
"grad_norm": 0.054834397457305846,
"learning_rate": 0.0002999647505183799,
"loss": 0.861,
"step": 565
},
{
"epoch": 0.10726383138878434,
"grad_norm": 0.05532784610616103,
"learning_rate": 0.00029995326023732235,
"loss": 0.896,
"step": 570
},
{
"epoch": 0.10820474219044035,
"grad_norm": 0.056119336353656256,
"learning_rate": 0.00029994015196514945,
"loss": 0.8922,
"step": 575
},
{
"epoch": 0.10914565299209635,
"grad_norm": 0.054803911827884036,
"learning_rate": 0.00029992542584329914,
"loss": 0.8748,
"step": 580
},
{
"epoch": 0.11008656379375235,
"grad_norm": 0.0575432175173428,
"learning_rate": 0.00029990908203066574,
"loss": 0.8804,
"step": 585
},
{
"epoch": 0.11102747459540835,
"grad_norm": 0.0522611031206057,
"learning_rate": 0.00029989112070359853,
"loss": 0.8619,
"step": 590
},
{
"epoch": 0.11196838539706436,
"grad_norm": 0.053009150267980906,
"learning_rate": 0.0002998715420558997,
"loss": 0.8849,
"step": 595
},
{
"epoch": 0.11290929619872037,
"grad_norm": 0.05254302852123155,
"learning_rate": 0.0002998503462988222,
"loss": 0.8939,
"step": 600
},
{
"epoch": 0.11385020700037636,
"grad_norm": 0.05929685311259276,
"learning_rate": 0.0002998275336610677,
"loss": 0.8651,
"step": 605
},
{
"epoch": 0.11479111780203237,
"grad_norm": 0.055640971519342844,
"learning_rate": 0.0002998031043887838,
"loss": 0.9132,
"step": 610
},
{
"epoch": 0.11573202860368836,
"grad_norm": 0.06515281749731798,
"learning_rate": 0.0002997770587455616,
"loss": 0.8854,
"step": 615
},
{
"epoch": 0.11667293940534437,
"grad_norm": 0.05226198007608227,
"learning_rate": 0.00029974939701243284,
"loss": 0.8936,
"step": 620
},
{
"epoch": 0.11761385020700038,
"grad_norm": 0.05246222413311401,
"learning_rate": 0.00029972011948786677,
"loss": 0.8709,
"step": 625
},
{
"epoch": 0.11855476100865638,
"grad_norm": 0.05531148888103947,
"learning_rate": 0.0002996892264877669,
"loss": 0.8652,
"step": 630
},
{
"epoch": 0.11949567181031238,
"grad_norm": 0.05499488496867662,
"learning_rate": 0.00029965671834546794,
"loss": 0.8678,
"step": 635
},
{
"epoch": 0.12043658261196838,
"grad_norm": 0.05755758230328177,
"learning_rate": 0.0002996225954117316,
"loss": 0.8574,
"step": 640
},
{
"epoch": 0.12137749341362439,
"grad_norm": 0.05120044581677358,
"learning_rate": 0.0002995868580547434,
"loss": 0.8647,
"step": 645
},
{
"epoch": 0.1223184042152804,
"grad_norm": 0.05183325596536476,
"learning_rate": 0.00029954950666010827,
"loss": 0.8572,
"step": 650
},
{
"epoch": 0.12325931501693639,
"grad_norm": 0.05295080137171756,
"learning_rate": 0.0002995105416308466,
"loss": 0.8766,
"step": 655
},
{
"epoch": 0.1242002258185924,
"grad_norm": 0.05489170605693284,
"learning_rate": 0.0002994699633873899,
"loss": 0.8791,
"step": 660
},
{
"epoch": 0.1251411366202484,
"grad_norm": 0.05376647275139867,
"learning_rate": 0.00029942777236757626,
"loss": 0.8876,
"step": 665
},
{
"epoch": 0.1260820474219044,
"grad_norm": 0.05253119730747103,
"learning_rate": 0.0002993839690266454,
"loss": 0.8817,
"step": 670
},
{
"epoch": 0.1270229582235604,
"grad_norm": 0.0517706614738255,
"learning_rate": 0.00029933855383723406,
"loss": 0.8762,
"step": 675
},
{
"epoch": 0.12796386902521642,
"grad_norm": 0.05981220672493737,
"learning_rate": 0.00029929152728937067,
"loss": 0.8721,
"step": 680
},
{
"epoch": 0.1289047798268724,
"grad_norm": 0.052639270170391564,
"learning_rate": 0.00029924288989047047,
"loss": 0.8645,
"step": 685
},
{
"epoch": 0.1298456906285284,
"grad_norm": 0.04870706752581317,
"learning_rate": 0.0002991926421653293,
"loss": 0.8406,
"step": 690
},
{
"epoch": 0.13078660143018442,
"grad_norm": 0.051293819330064536,
"learning_rate": 0.00029914078465611866,
"loss": 0.8571,
"step": 695
},
{
"epoch": 0.13172751223184043,
"grad_norm": 0.05202337778091494,
"learning_rate": 0.0002990873179223796,
"loss": 0.8337,
"step": 700
},
{
"epoch": 0.13266842303349644,
"grad_norm": 0.05886832645070351,
"learning_rate": 0.00029903224254101637,
"loss": 0.8948,
"step": 705
},
{
"epoch": 0.13360933383515242,
"grad_norm": 0.05784259384372831,
"learning_rate": 0.00029897555910629077,
"loss": 0.8893,
"step": 710
},
{
"epoch": 0.13455024463680842,
"grad_norm": 0.049720758505550806,
"learning_rate": 0.0002989172682298153,
"loss": 0.8561,
"step": 715
},
{
"epoch": 0.13549115543846443,
"grad_norm": 0.057567182371576854,
"learning_rate": 0.00029885737054054673,
"loss": 0.853,
"step": 720
},
{
"epoch": 0.13643206624012044,
"grad_norm": 0.05157693817638803,
"learning_rate": 0.00029879586668477936,
"loss": 0.8359,
"step": 725
},
{
"epoch": 0.13737297704177645,
"grad_norm": 0.0533272964269685,
"learning_rate": 0.0002987327573261379,
"loss": 0.908,
"step": 730
},
{
"epoch": 0.13831388784343243,
"grad_norm": 0.05233796692330438,
"learning_rate": 0.00029866804314557043,
"loss": 0.8819,
"step": 735
},
{
"epoch": 0.13925479864508844,
"grad_norm": 0.0573014910728429,
"learning_rate": 0.0002986017248413409,
"loss": 0.8973,
"step": 740
},
{
"epoch": 0.14019570944674445,
"grad_norm": 0.052849475942016416,
"learning_rate": 0.00029853380312902186,
"loss": 0.8419,
"step": 745
},
{
"epoch": 0.14113662024840046,
"grad_norm": 0.06011994209298975,
"learning_rate": 0.0002984642787414865,
"loss": 0.8804,
"step": 750
},
{
"epoch": 0.14207753105005647,
"grad_norm": 0.05499188180020471,
"learning_rate": 0.00029839315242890087,
"loss": 0.8634,
"step": 755
},
{
"epoch": 0.14301844185171245,
"grad_norm": 0.05499133991676239,
"learning_rate": 0.00029832042495871576,
"loss": 0.8583,
"step": 760
},
{
"epoch": 0.14395935265336846,
"grad_norm": 0.05426291762052608,
"learning_rate": 0.00029824609711565824,
"loss": 0.8725,
"step": 765
},
{
"epoch": 0.14490026345502446,
"grad_norm": 0.047322871594277255,
"learning_rate": 0.0002981701697017236,
"loss": 0.8561,
"step": 770
},
{
"epoch": 0.14584117425668047,
"grad_norm": 0.05466867627510976,
"learning_rate": 0.0002980926435361662,
"loss": 0.8737,
"step": 775
},
{
"epoch": 0.14678208505833648,
"grad_norm": 0.051544041730181685,
"learning_rate": 0.0002980135194554911,
"loss": 0.8562,
"step": 780
},
{
"epoch": 0.14772299585999246,
"grad_norm": 0.0512691349349204,
"learning_rate": 0.00029793279831344475,
"loss": 0.8512,
"step": 785
},
{
"epoch": 0.14866390666164847,
"grad_norm": 0.04756716736330136,
"learning_rate": 0.0002978504809810057,
"loss": 0.8612,
"step": 790
},
{
"epoch": 0.14960481746330448,
"grad_norm": 0.04882280569803268,
"learning_rate": 0.00029776656834637553,
"loss": 0.8514,
"step": 795
},
{
"epoch": 0.1505457282649605,
"grad_norm": 0.054783783095015,
"learning_rate": 0.00029768106131496905,
"loss": 0.8796,
"step": 800
},
{
"epoch": 0.1514866390666165,
"grad_norm": 0.0531473951393396,
"learning_rate": 0.0002975939608094045,
"loss": 0.8565,
"step": 805
},
{
"epoch": 0.15242754986827248,
"grad_norm": 0.05185385237557039,
"learning_rate": 0.00029750526776949364,
"loss": 0.8371,
"step": 810
},
{
"epoch": 0.15336846066992849,
"grad_norm": 0.0505734130462192,
"learning_rate": 0.00029741498315223174,
"loss": 0.8626,
"step": 815
},
{
"epoch": 0.1543093714715845,
"grad_norm": 0.055795428961054876,
"learning_rate": 0.000297323107931787,
"loss": 0.8642,
"step": 820
},
{
"epoch": 0.1552502822732405,
"grad_norm": 0.048819133510848926,
"learning_rate": 0.0002972296430994903,
"loss": 0.8873,
"step": 825
},
{
"epoch": 0.1561911930748965,
"grad_norm": 0.05322784832445891,
"learning_rate": 0.00029713458966382434,
"loss": 0.8578,
"step": 830
},
{
"epoch": 0.1571321038765525,
"grad_norm": 0.05681519640146074,
"learning_rate": 0.00029703794865041283,
"loss": 0.8791,
"step": 835
},
{
"epoch": 0.1580730146782085,
"grad_norm": 0.05999945592046972,
"learning_rate": 0.0002969397211020093,
"loss": 0.849,
"step": 840
},
{
"epoch": 0.1590139254798645,
"grad_norm": 0.04571088658610958,
"learning_rate": 0.00029683990807848596,
"loss": 0.8483,
"step": 845
},
{
"epoch": 0.15995483628152052,
"grad_norm": 0.06089448054645934,
"learning_rate": 0.00029673851065682244,
"loss": 0.863,
"step": 850
},
{
"epoch": 0.16089574708317653,
"grad_norm": 0.06061988663918843,
"learning_rate": 0.00029663552993109375,
"loss": 0.8794,
"step": 855
},
{
"epoch": 0.1618366578848325,
"grad_norm": 0.05182284732026814,
"learning_rate": 0.0002965309670124588,
"loss": 0.8607,
"step": 860
},
{
"epoch": 0.16277756868648852,
"grad_norm": 0.04524287407933527,
"learning_rate": 0.0002964248230291483,
"loss": 0.844,
"step": 865
},
{
"epoch": 0.16371847948814452,
"grad_norm": 0.051529772912372,
"learning_rate": 0.0002963170991264526,
"loss": 0.851,
"step": 870
},
{
"epoch": 0.16465939028980053,
"grad_norm": 0.04587024085305175,
"learning_rate": 0.0002962077964667093,
"loss": 0.8595,
"step": 875
},
{
"epoch": 0.16560030109145654,
"grad_norm": 0.05060611998562655,
"learning_rate": 0.0002960969162292908,
"loss": 0.8677,
"step": 880
},
{
"epoch": 0.16654121189311252,
"grad_norm": 0.04499302822555919,
"learning_rate": 0.00029598445961059156,
"loss": 0.8271,
"step": 885
},
{
"epoch": 0.16748212269476853,
"grad_norm": 0.05129556847823224,
"learning_rate": 0.000295870427824015,
"loss": 0.8881,
"step": 890
},
{
"epoch": 0.16842303349642454,
"grad_norm": 0.04871115226254147,
"learning_rate": 0.00029575482209996055,
"loss": 0.8336,
"step": 895
},
{
"epoch": 0.16936394429808055,
"grad_norm": 0.0566108024774645,
"learning_rate": 0.0002956376436858106,
"loss": 0.8494,
"step": 900
},
{
"epoch": 0.17030485509973656,
"grad_norm": 0.048845770084260345,
"learning_rate": 0.00029551889384591665,
"loss": 0.8986,
"step": 905
},
{
"epoch": 0.17124576590139254,
"grad_norm": 0.048327818597270665,
"learning_rate": 0.0002953985738615858,
"loss": 0.8693,
"step": 910
},
{
"epoch": 0.17218667670304855,
"grad_norm": 0.05247614713322588,
"learning_rate": 0.000295276685031067,
"loss": 0.8485,
"step": 915
},
{
"epoch": 0.17312758750470456,
"grad_norm": 0.047710655547393406,
"learning_rate": 0.0002951532286695371,
"loss": 0.8581,
"step": 920
},
{
"epoch": 0.17406849830636056,
"grad_norm": 0.048026071717255266,
"learning_rate": 0.0002950282061090864,
"loss": 0.8417,
"step": 925
},
{
"epoch": 0.17500940910801657,
"grad_norm": 0.047035746855751244,
"learning_rate": 0.0002949016186987046,
"loss": 0.877,
"step": 930
},
{
"epoch": 0.17595031990967255,
"grad_norm": 0.046963321258380666,
"learning_rate": 0.00029477346780426605,
"loss": 0.8792,
"step": 935
},
{
"epoch": 0.17689123071132856,
"grad_norm": 0.04969923101676293,
"learning_rate": 0.0002946437548085148,
"loss": 0.8598,
"step": 940
},
{
"epoch": 0.17783214151298457,
"grad_norm": 0.04621552171294473,
"learning_rate": 0.0002945124811110504,
"loss": 0.8335,
"step": 945
},
{
"epoch": 0.17877305231464058,
"grad_norm": 0.051617110249976034,
"learning_rate": 0.0002943796481283118,
"loss": 0.8719,
"step": 950
},
{
"epoch": 0.1797139631162966,
"grad_norm": 0.043058880781635334,
"learning_rate": 0.000294245257293563,
"loss": 0.8558,
"step": 955
},
{
"epoch": 0.18065487391795257,
"grad_norm": 0.05388724186288067,
"learning_rate": 0.00029410931005687696,
"loss": 0.8519,
"step": 960
},
{
"epoch": 0.18159578471960858,
"grad_norm": 0.04857900819806878,
"learning_rate": 0.00029397180788512026,
"loss": 0.8527,
"step": 965
},
{
"epoch": 0.18253669552126459,
"grad_norm": 0.05424964878908578,
"learning_rate": 0.0002938327522619371,
"loss": 0.8568,
"step": 970
},
{
"epoch": 0.1834776063229206,
"grad_norm": 0.05227365290335698,
"learning_rate": 0.0002936921446877334,
"loss": 0.8633,
"step": 975
},
{
"epoch": 0.1844185171245766,
"grad_norm": 0.051325801180247675,
"learning_rate": 0.0002935499866796607,
"loss": 0.8848,
"step": 980
},
{
"epoch": 0.18535942792623258,
"grad_norm": 0.05574668624448867,
"learning_rate": 0.00029340627977159957,
"loss": 0.8387,
"step": 985
},
{
"epoch": 0.1863003387278886,
"grad_norm": 0.05634617811405129,
"learning_rate": 0.0002932610255141431,
"loss": 0.8611,
"step": 990
},
{
"epoch": 0.1872412495295446,
"grad_norm": 0.04490025753044342,
"learning_rate": 0.0002931142254745804,
"loss": 0.8696,
"step": 995
},
{
"epoch": 0.1881821603312006,
"grad_norm": 0.04554083227376874,
"learning_rate": 0.0002929658812368794,
"loss": 0.8518,
"step": 1000
},
{
"epoch": 0.18912307113285662,
"grad_norm": 0.04781802172474072,
"learning_rate": 0.0002928159944016698,
"loss": 0.8371,
"step": 1005
},
{
"epoch": 0.1900639819345126,
"grad_norm": 0.0489899852900461,
"learning_rate": 0.00029266456658622617,
"loss": 0.8826,
"step": 1010
},
{
"epoch": 0.1910048927361686,
"grad_norm": 0.047291616554045394,
"learning_rate": 0.0002925115994244499,
"loss": 0.8193,
"step": 1015
},
{
"epoch": 0.19194580353782462,
"grad_norm": 0.04800411582285608,
"learning_rate": 0.000292357094566852,
"loss": 0.8566,
"step": 1020
},
{
"epoch": 0.19288671433948062,
"grad_norm": 0.05017688668028036,
"learning_rate": 0.00029220105368053535,
"loss": 0.8278,
"step": 1025
},
{
"epoch": 0.19382762514113663,
"grad_norm": 0.05255812021463712,
"learning_rate": 0.0002920434784491762,
"loss": 0.8518,
"step": 1030
},
{
"epoch": 0.19476853594279261,
"grad_norm": 0.04644990001608543,
"learning_rate": 0.00029188437057300654,
"loss": 0.8295,
"step": 1035
},
{
"epoch": 0.19570944674444862,
"grad_norm": 0.051889984533039224,
"learning_rate": 0.00029172373176879554,
"loss": 0.8511,
"step": 1040
},
{
"epoch": 0.19665035754610463,
"grad_norm": 0.05006046094748636,
"learning_rate": 0.000291561563769831,
"loss": 0.8699,
"step": 1045
},
{
"epoch": 0.19759126834776064,
"grad_norm": 0.04471198696865864,
"learning_rate": 0.00029139786832590075,
"loss": 0.8204,
"step": 1050
},
{
"epoch": 0.19853217914941665,
"grad_norm": 0.046864689834249275,
"learning_rate": 0.00029123264720327355,
"loss": 0.8482,
"step": 1055
},
{
"epoch": 0.19947308995107263,
"grad_norm": 0.05135087246231078,
"learning_rate": 0.0002910659021846803,
"loss": 0.8483,
"step": 1060
},
{
"epoch": 0.20041400075272864,
"grad_norm": 0.04785073870168607,
"learning_rate": 0.00029089763506929476,
"loss": 0.8503,
"step": 1065
},
{
"epoch": 0.20135491155438465,
"grad_norm": 0.05489000507522068,
"learning_rate": 0.0002907278476727139,
"loss": 0.8273,
"step": 1070
},
{
"epoch": 0.20229582235604066,
"grad_norm": 0.04772968129164812,
"learning_rate": 0.0002905565418269386,
"loss": 0.869,
"step": 1075
},
{
"epoch": 0.20323673315769666,
"grad_norm": 0.0468360392868491,
"learning_rate": 0.0002903837193803537,
"loss": 0.8593,
"step": 1080
},
{
"epoch": 0.20417764395935264,
"grad_norm": 0.05048292893225177,
"learning_rate": 0.00029020938219770815,
"loss": 0.8509,
"step": 1085
},
{
"epoch": 0.20511855476100865,
"grad_norm": 0.050674095814671125,
"learning_rate": 0.0002900335321600949,
"loss": 0.8687,
"step": 1090
},
{
"epoch": 0.20605946556266466,
"grad_norm": 0.04788733400998382,
"learning_rate": 0.00028985617116493044,
"loss": 0.8472,
"step": 1095
},
{
"epoch": 0.20700037636432067,
"grad_norm": 0.04582231610895905,
"learning_rate": 0.0002896773011259345,
"loss": 0.8471,
"step": 1100
},
{
"epoch": 0.20794128716597668,
"grad_norm": 0.0443686344112356,
"learning_rate": 0.0002894969239731094,
"loss": 0.8372,
"step": 1105
},
{
"epoch": 0.20888219796763266,
"grad_norm": 0.06909605924612522,
"learning_rate": 0.00028931504165271915,
"loss": 0.8449,
"step": 1110
},
{
"epoch": 0.20982310876928867,
"grad_norm": 0.053698108677639904,
"learning_rate": 0.0002891316561272684,
"loss": 0.8488,
"step": 1115
},
{
"epoch": 0.21076401957094468,
"grad_norm": 0.04652295229508992,
"learning_rate": 0.0002889467693754814,
"loss": 0.8185,
"step": 1120
},
{
"epoch": 0.21170493037260069,
"grad_norm": 0.048539262759809874,
"learning_rate": 0.0002887603833922806,
"loss": 0.8384,
"step": 1125
},
{
"epoch": 0.21264584117425667,
"grad_norm": 0.04776277603446453,
"learning_rate": 0.00028857250018876504,
"loss": 0.8314,
"step": 1130
},
{
"epoch": 0.21358675197591268,
"grad_norm": 0.04923483629023402,
"learning_rate": 0.0002883831217921889,
"loss": 0.8522,
"step": 1135
},
{
"epoch": 0.21452766277756868,
"grad_norm": 0.04258834829093326,
"learning_rate": 0.00028819225024593915,
"loss": 0.8478,
"step": 1140
},
{
"epoch": 0.2154685735792247,
"grad_norm": 0.04700949749104464,
"learning_rate": 0.00028799988760951404,
"loss": 0.8493,
"step": 1145
},
{
"epoch": 0.2164094843808807,
"grad_norm": 0.05210570592265317,
"learning_rate": 0.00028780603595850054,
"loss": 0.8432,
"step": 1150
},
{
"epoch": 0.21735039518253668,
"grad_norm": 0.04967685491585681,
"learning_rate": 0.0002876106973845521,
"loss": 0.8752,
"step": 1155
},
{
"epoch": 0.2182913059841927,
"grad_norm": 0.04829576891783422,
"learning_rate": 0.00028741387399536597,
"loss": 0.8335,
"step": 1160
},
{
"epoch": 0.2192322167858487,
"grad_norm": 0.04740474919455731,
"learning_rate": 0.00028721556791466056,
"loss": 0.8192,
"step": 1165
},
{
"epoch": 0.2201731275875047,
"grad_norm": 0.048080208780232926,
"learning_rate": 0.0002870157812821525,
"loss": 0.8368,
"step": 1170
},
{
"epoch": 0.22111403838916072,
"grad_norm": 0.044197737136251934,
"learning_rate": 0.0002868145162535333,
"loss": 0.8462,
"step": 1175
},
{
"epoch": 0.2220549491908167,
"grad_norm": 0.0469168238676528,
"learning_rate": 0.0002866117750004466,
"loss": 0.8455,
"step": 1180
},
{
"epoch": 0.2229958599924727,
"grad_norm": 0.058277013619873835,
"learning_rate": 0.00028640755971046436,
"loss": 0.8337,
"step": 1185
},
{
"epoch": 0.22393677079412871,
"grad_norm": 0.04233910712545471,
"learning_rate": 0.00028620187258706335,
"loss": 0.8564,
"step": 1190
},
{
"epoch": 0.22487768159578472,
"grad_norm": 0.04697864084139272,
"learning_rate": 0.00028599471584960136,
"loss": 0.824,
"step": 1195
},
{
"epoch": 0.22581859239744073,
"grad_norm": 0.04788326739922656,
"learning_rate": 0.0002857860917332933,
"loss": 0.8129,
"step": 1200
},
{
"epoch": 0.2267595031990967,
"grad_norm": 0.04394998686275356,
"learning_rate": 0.0002855760024891869,
"loss": 0.8271,
"step": 1205
},
{
"epoch": 0.22770041400075272,
"grad_norm": 0.04704392654736644,
"learning_rate": 0.0002853644503841389,
"loss": 0.844,
"step": 1210
},
{
"epoch": 0.22864132480240873,
"grad_norm": 0.05122535529682313,
"learning_rate": 0.0002851514377007901,
"loss": 0.8494,
"step": 1215
},
{
"epoch": 0.22958223560406474,
"grad_norm": 0.0502260796215441,
"learning_rate": 0.00028493696673754067,
"loss": 0.8433,
"step": 1220
},
{
"epoch": 0.23052314640572075,
"grad_norm": 0.043494324671621835,
"learning_rate": 0.0002847210398085259,
"loss": 0.8443,
"step": 1225
},
{
"epoch": 0.23146405720737673,
"grad_norm": 0.05049651139793639,
"learning_rate": 0.00028450365924359073,
"loss": 0.8042,
"step": 1230
},
{
"epoch": 0.23240496800903274,
"grad_norm": 0.05610756684714545,
"learning_rate": 0.000284284827388265,
"loss": 0.8256,
"step": 1235
},
{
"epoch": 0.23334587881068874,
"grad_norm": 0.047968557913656215,
"learning_rate": 0.00028406454660373753,
"loss": 0.8365,
"step": 1240
},
{
"epoch": 0.23428678961234475,
"grad_norm": 0.042261765294726174,
"learning_rate": 0.0002838428192668315,
"loss": 0.855,
"step": 1245
},
{
"epoch": 0.23522770041400076,
"grad_norm": 0.05021581504261803,
"learning_rate": 0.00028361964776997794,
"loss": 0.8191,
"step": 1250
},
{
"epoch": 0.23616861121565674,
"grad_norm": 0.05030219142064272,
"learning_rate": 0.00028339503452119063,
"loss": 0.8325,
"step": 1255
},
{
"epoch": 0.23710952201731275,
"grad_norm": 0.04874399457655963,
"learning_rate": 0.0002831689819440397,
"loss": 0.8423,
"step": 1260
},
{
"epoch": 0.23805043281896876,
"grad_norm": 0.04744286047223305,
"learning_rate": 0.00028294149247762545,
"loss": 0.859,
"step": 1265
},
{
"epoch": 0.23899134362062477,
"grad_norm": 0.04338685105449176,
"learning_rate": 0.00028271256857655244,
"loss": 0.8065,
"step": 1270
},
{
"epoch": 0.23993225442228078,
"grad_norm": 0.047153540286283435,
"learning_rate": 0.0002824822127109026,
"loss": 0.85,
"step": 1275
},
{
"epoch": 0.24087316522393676,
"grad_norm": 0.050020086051956694,
"learning_rate": 0.0002822504273662086,
"loss": 0.8611,
"step": 1280
},
{
"epoch": 0.24181407602559277,
"grad_norm": 0.04356866414442012,
"learning_rate": 0.0002820172150434274,
"loss": 0.8375,
"step": 1285
},
{
"epoch": 0.24275498682724878,
"grad_norm": 0.04859564580518782,
"learning_rate": 0.0002817825782589127,
"loss": 0.851,
"step": 1290
},
{
"epoch": 0.24369589762890478,
"grad_norm": 0.04736842332804661,
"learning_rate": 0.0002815465195443884,
"loss": 0.8232,
"step": 1295
},
{
"epoch": 0.2446368084305608,
"grad_norm": 0.04546754699344671,
"learning_rate": 0.0002813090414469208,
"loss": 0.802,
"step": 1300
},
{
"epoch": 0.24557771923221677,
"grad_norm": 0.04668790834139243,
"learning_rate": 0.0002810701465288913,
"loss": 0.8164,
"step": 1305
},
{
"epoch": 0.24651863003387278,
"grad_norm": 0.04806767256497892,
"learning_rate": 0.0002808298373679688,
"loss": 0.844,
"step": 1310
},
{
"epoch": 0.2474595408355288,
"grad_norm": 0.04430706443035298,
"learning_rate": 0.00028058811655708193,
"loss": 0.8402,
"step": 1315
},
{
"epoch": 0.2484004516371848,
"grad_norm": 0.046991071291030626,
"learning_rate": 0.00028034498670439085,
"loss": 0.8165,
"step": 1320
},
{
"epoch": 0.2493413624388408,
"grad_norm": 0.043876851948307184,
"learning_rate": 0.00028010045043325925,
"loss": 0.8373,
"step": 1325
},
{
"epoch": 0.2502822732404968,
"grad_norm": 0.04464800453485912,
"learning_rate": 0.000279854510382226,
"loss": 0.8491,
"step": 1330
},
{
"epoch": 0.2512231840421528,
"grad_norm": 0.05208965977018001,
"learning_rate": 0.0002796071692049769,
"loss": 0.8519,
"step": 1335
},
{
"epoch": 0.2521640948438088,
"grad_norm": 0.0464137513178153,
"learning_rate": 0.00027935842957031563,
"loss": 0.8362,
"step": 1340
},
{
"epoch": 0.2531050056454648,
"grad_norm": 0.048796355664881524,
"learning_rate": 0.00027910829416213527,
"loss": 0.8505,
"step": 1345
},
{
"epoch": 0.2540459164471208,
"grad_norm": 0.048709351068813245,
"learning_rate": 0.0002788567656793893,
"loss": 0.8544,
"step": 1350
},
{
"epoch": 0.2549868272487768,
"grad_norm": 0.045892021177835134,
"learning_rate": 0.00027860384683606236,
"loss": 0.8271,
"step": 1355
},
{
"epoch": 0.25592773805043284,
"grad_norm": 0.05265800000922237,
"learning_rate": 0.00027834954036114114,
"loss": 0.885,
"step": 1360
},
{
"epoch": 0.2568686488520888,
"grad_norm": 0.04551430213945619,
"learning_rate": 0.00027809384899858474,
"loss": 0.8225,
"step": 1365
},
{
"epoch": 0.2578095596537448,
"grad_norm": 0.04793102535540072,
"learning_rate": 0.00027783677550729515,
"loss": 0.8339,
"step": 1370
},
{
"epoch": 0.25875047045540084,
"grad_norm": 0.04205095375417017,
"learning_rate": 0.0002775783226610875,
"loss": 0.808,
"step": 1375
},
{
"epoch": 0.2596913812570568,
"grad_norm": 0.047025422686677955,
"learning_rate": 0.00027731849324866026,
"loss": 0.8525,
"step": 1380
},
{
"epoch": 0.26063229205871286,
"grad_norm": 0.043984335241227406,
"learning_rate": 0.00027705729007356476,
"loss": 0.8241,
"step": 1385
},
{
"epoch": 0.26157320286036884,
"grad_norm": 0.04490799345162232,
"learning_rate": 0.00027679471595417536,
"loss": 0.843,
"step": 1390
},
{
"epoch": 0.2625141136620248,
"grad_norm": 0.050208847690443474,
"learning_rate": 0.00027653077372365886,
"loss": 0.8233,
"step": 1395
},
{
"epoch": 0.26345502446368085,
"grad_norm": 0.04338736887570603,
"learning_rate": 0.00027626546622994374,
"loss": 0.8177,
"step": 1400
},
{
"epoch": 0.26439593526533683,
"grad_norm": 0.043029065881549575,
"learning_rate": 0.00027599879633568994,
"loss": 0.8201,
"step": 1405
},
{
"epoch": 0.26533684606699287,
"grad_norm": 0.04528220902743319,
"learning_rate": 0.0002757307669182575,
"loss": 0.8182,
"step": 1410
},
{
"epoch": 0.26627775686864885,
"grad_norm": 0.04189572516578376,
"learning_rate": 0.0002754613808696756,
"loss": 0.8093,
"step": 1415
},
{
"epoch": 0.26721866767030483,
"grad_norm": 0.050018443121759355,
"learning_rate": 0.00027519064109661153,
"loss": 0.8408,
"step": 1420
},
{
"epoch": 0.26815957847196087,
"grad_norm": 0.04477497802076473,
"learning_rate": 0.00027491855052033925,
"loss": 0.8037,
"step": 1425
},
{
"epoch": 0.26910048927361685,
"grad_norm": 0.04392548330312801,
"learning_rate": 0.00027464511207670773,
"loss": 0.8507,
"step": 1430
},
{
"epoch": 0.2700414000752729,
"grad_norm": 0.049777060488396405,
"learning_rate": 0.0002743703287161095,
"loss": 0.7992,
"step": 1435
},
{
"epoch": 0.27098231087692887,
"grad_norm": 0.042140230036610965,
"learning_rate": 0.00027409420340344866,
"loss": 0.8167,
"step": 1440
},
{
"epoch": 0.27192322167858485,
"grad_norm": 0.05002427399165945,
"learning_rate": 0.00027381673911810897,
"loss": 0.8121,
"step": 1445
},
{
"epoch": 0.2728641324802409,
"grad_norm": 0.04163684703416787,
"learning_rate": 0.00027353793885392155,
"loss": 0.8244,
"step": 1450
},
{
"epoch": 0.27380504328189686,
"grad_norm": 0.044059819695211666,
"learning_rate": 0.00027325780561913277,
"loss": 0.8164,
"step": 1455
},
{
"epoch": 0.2747459540835529,
"grad_norm": 0.04563812665390404,
"learning_rate": 0.00027297634243637176,
"loss": 0.8366,
"step": 1460
},
{
"epoch": 0.2756868648852089,
"grad_norm": 0.04827848887354726,
"learning_rate": 0.00027269355234261773,
"loss": 0.8281,
"step": 1465
},
{
"epoch": 0.27662777568686486,
"grad_norm": 0.04999760434208785,
"learning_rate": 0.000272409438389167,
"loss": 0.8467,
"step": 1470
},
{
"epoch": 0.2775686864885209,
"grad_norm": 0.05162297961569586,
"learning_rate": 0.00027212400364160075,
"loss": 0.8487,
"step": 1475
},
{
"epoch": 0.2785095972901769,
"grad_norm": 0.04646244942560704,
"learning_rate": 0.000271837251179751,
"loss": 0.8084,
"step": 1480
},
{
"epoch": 0.2794505080918329,
"grad_norm": 0.05039420456957765,
"learning_rate": 0.0002715491840976682,
"loss": 0.8471,
"step": 1485
},
{
"epoch": 0.2803914188934889,
"grad_norm": 0.042544806912826326,
"learning_rate": 0.00027125980550358743,
"loss": 0.7844,
"step": 1490
},
{
"epoch": 0.2813323296951449,
"grad_norm": 0.045410393598617226,
"learning_rate": 0.0002709691185198948,
"loss": 0.8618,
"step": 1495
},
{
"epoch": 0.2822732404968009,
"grad_norm": 0.04318140661358377,
"learning_rate": 0.0002706771262830941,
"loss": 0.8097,
"step": 1500
},
{
"epoch": 0.2832141512984569,
"grad_norm": 0.03985460964577001,
"learning_rate": 0.0002703838319437727,
"loss": 0.8328,
"step": 1505
},
{
"epoch": 0.28415506210011293,
"grad_norm": 0.045705481658739236,
"learning_rate": 0.0002700892386665675,
"loss": 0.8194,
"step": 1510
},
{
"epoch": 0.2850959729017689,
"grad_norm": 0.048028831085993116,
"learning_rate": 0.0002697933496301311,
"loss": 0.8338,
"step": 1515
},
{
"epoch": 0.2860368837034249,
"grad_norm": 0.04693051612093597,
"learning_rate": 0.00026949616802709716,
"loss": 0.8425,
"step": 1520
},
{
"epoch": 0.28697779450508093,
"grad_norm": 0.04357129292417143,
"learning_rate": 0.0002691976970640461,
"loss": 0.8246,
"step": 1525
},
{
"epoch": 0.2879187053067369,
"grad_norm": 0.04698363226395185,
"learning_rate": 0.00026889793996147057,
"loss": 0.8242,
"step": 1530
},
{
"epoch": 0.28885961610839295,
"grad_norm": 0.046155464725970596,
"learning_rate": 0.00026859689995374056,
"loss": 0.8137,
"step": 1535
},
{
"epoch": 0.28980052691004893,
"grad_norm": 0.05204306070924651,
"learning_rate": 0.0002682945802890686,
"loss": 0.8264,
"step": 1540
},
{
"epoch": 0.2907414377117049,
"grad_norm": 0.043611637100207754,
"learning_rate": 0.00026799098422947474,
"loss": 0.821,
"step": 1545
},
{
"epoch": 0.29168234851336095,
"grad_norm": 0.05503848520813138,
"learning_rate": 0.00026768611505075115,
"loss": 0.8291,
"step": 1550
},
{
"epoch": 0.2926232593150169,
"grad_norm": 0.05240199900772415,
"learning_rate": 0.000267379976042427,
"loss": 0.8345,
"step": 1555
},
{
"epoch": 0.29356417011667296,
"grad_norm": 0.04555420445769968,
"learning_rate": 0.0002670725705077329,
"loss": 0.8361,
"step": 1560
},
{
"epoch": 0.29450508091832894,
"grad_norm": 0.04207672123987573,
"learning_rate": 0.0002667639017635651,
"loss": 0.8083,
"step": 1565
},
{
"epoch": 0.2954459917199849,
"grad_norm": 0.043154758358048634,
"learning_rate": 0.0002664539731404502,
"loss": 0.8507,
"step": 1570
},
{
"epoch": 0.29638690252164096,
"grad_norm": 0.048023978799794254,
"learning_rate": 0.0002661427879825084,
"loss": 0.8311,
"step": 1575
},
{
"epoch": 0.29732781332329694,
"grad_norm": 0.04479903851737126,
"learning_rate": 0.0002658303496474182,
"loss": 0.8177,
"step": 1580
},
{
"epoch": 0.298268724124953,
"grad_norm": 0.04075712688100381,
"learning_rate": 0.0002655166615063797,
"loss": 0.8231,
"step": 1585
},
{
"epoch": 0.29920963492660896,
"grad_norm": 0.04516519211583324,
"learning_rate": 0.00026520172694407835,
"loss": 0.8342,
"step": 1590
},
{
"epoch": 0.30015054572826494,
"grad_norm": 0.04163573857623555,
"learning_rate": 0.0002648855493586485,
"loss": 0.8217,
"step": 1595
},
{
"epoch": 0.301091456529921,
"grad_norm": 0.04703202421649344,
"learning_rate": 0.00026456813216163674,
"loss": 0.8375,
"step": 1600
},
{
"epoch": 0.30203236733157696,
"grad_norm": 0.0435527583985287,
"learning_rate": 0.0002642494787779649,
"loss": 0.7929,
"step": 1605
},
{
"epoch": 0.302973278133233,
"grad_norm": 0.04112659440879095,
"learning_rate": 0.0002639295926458934,
"loss": 0.8409,
"step": 1610
},
{
"epoch": 0.303914188934889,
"grad_norm": 0.0455623778767916,
"learning_rate": 0.0002636084772169838,
"loss": 0.8398,
"step": 1615
},
{
"epoch": 0.30485509973654495,
"grad_norm": 0.04205050261795213,
"learning_rate": 0.00026328613595606173,
"loss": 0.8041,
"step": 1620
},
{
"epoch": 0.305796010538201,
"grad_norm": 0.04850951164010859,
"learning_rate": 0.0002629625723411797,
"loss": 0.7985,
"step": 1625
},
{
"epoch": 0.30673692133985697,
"grad_norm": 0.046499953964171024,
"learning_rate": 0.0002626377898635792,
"loss": 0.8404,
"step": 1630
},
{
"epoch": 0.307677832141513,
"grad_norm": 0.05087753755535387,
"learning_rate": 0.00026231179202765336,
"loss": 0.8247,
"step": 1635
},
{
"epoch": 0.308618742943169,
"grad_norm": 0.05107289807504844,
"learning_rate": 0.00026198458235090886,
"loss": 0.8373,
"step": 1640
},
{
"epoch": 0.30955965374482497,
"grad_norm": 0.047849970649591435,
"learning_rate": 0.00026165616436392815,
"loss": 0.8526,
"step": 1645
},
{
"epoch": 0.310500564546481,
"grad_norm": 0.0452846113299356,
"learning_rate": 0.00026132654161033133,
"loss": 0.8257,
"step": 1650
},
{
"epoch": 0.311441475348137,
"grad_norm": 0.046584198634579535,
"learning_rate": 0.00026099571764673786,
"loss": 0.8272,
"step": 1655
},
{
"epoch": 0.312382386149793,
"grad_norm": 0.04195956869333814,
"learning_rate": 0.00026066369604272835,
"loss": 0.8295,
"step": 1660
},
{
"epoch": 0.313323296951449,
"grad_norm": 0.047276897040517386,
"learning_rate": 0.00026033048038080563,
"loss": 0.851,
"step": 1665
},
{
"epoch": 0.314264207753105,
"grad_norm": 0.04611674028446184,
"learning_rate": 0.0002599960742563566,
"loss": 0.8269,
"step": 1670
},
{
"epoch": 0.315205118554761,
"grad_norm": 0.04599603285571535,
"learning_rate": 0.0002596604812776133,
"loss": 0.8253,
"step": 1675
},
{
"epoch": 0.316146029356417,
"grad_norm": 0.04173568003131483,
"learning_rate": 0.00025932370506561364,
"loss": 0.8123,
"step": 1680
},
{
"epoch": 0.31708694015807304,
"grad_norm": 0.0425349576791169,
"learning_rate": 0.0002589857492541627,
"loss": 0.8343,
"step": 1685
},
{
"epoch": 0.318027850959729,
"grad_norm": 0.04610564920795079,
"learning_rate": 0.0002586466174897934,
"loss": 0.8215,
"step": 1690
},
{
"epoch": 0.318968761761385,
"grad_norm": 0.04293233769403105,
"learning_rate": 0.00025830631343172727,
"loss": 0.7962,
"step": 1695
},
{
"epoch": 0.31990967256304104,
"grad_norm": 0.040726277131415696,
"learning_rate": 0.00025796484075183465,
"loss": 0.8119,
"step": 1700
},
{
"epoch": 0.320850583364697,
"grad_norm": 0.0444419737096656,
"learning_rate": 0.00025762220313459535,
"loss": 0.7843,
"step": 1705
},
{
"epoch": 0.32179149416635305,
"grad_norm": 0.048133240264347225,
"learning_rate": 0.0002572784042770588,
"loss": 0.8307,
"step": 1710
},
{
"epoch": 0.32273240496800903,
"grad_norm": 0.045535597036081774,
"learning_rate": 0.0002569334478888044,
"loss": 0.8228,
"step": 1715
},
{
"epoch": 0.323673315769665,
"grad_norm": 0.04973249883926558,
"learning_rate": 0.0002565873376919008,
"loss": 0.8245,
"step": 1720
},
{
"epoch": 0.32461422657132105,
"grad_norm": 0.044317466816763997,
"learning_rate": 0.0002562400774208668,
"loss": 0.7937,
"step": 1725
},
{
"epoch": 0.32555513737297703,
"grad_norm": 0.04422454108768711,
"learning_rate": 0.00025589167082263,
"loss": 0.8492,
"step": 1730
},
{
"epoch": 0.32649604817463307,
"grad_norm": 0.057793214248518535,
"learning_rate": 0.0002555421216564869,
"loss": 0.838,
"step": 1735
},
{
"epoch": 0.32743695897628905,
"grad_norm": 0.050491165924182006,
"learning_rate": 0.00025519143369406253,
"loss": 0.8136,
"step": 1740
},
{
"epoch": 0.32837786977794503,
"grad_norm": 0.039453461402937505,
"learning_rate": 0.00025483961071926924,
"loss": 0.833,
"step": 1745
},
{
"epoch": 0.32931878057960107,
"grad_norm": 0.0427804162848213,
"learning_rate": 0.00025448665652826627,
"loss": 0.8182,
"step": 1750
},
{
"epoch": 0.33025969138125705,
"grad_norm": 0.04278324709181072,
"learning_rate": 0.0002541325749294186,
"loss": 0.8501,
"step": 1755
},
{
"epoch": 0.3312006021829131,
"grad_norm": 0.04007795533618747,
"learning_rate": 0.0002537773697432559,
"loss": 0.8122,
"step": 1760
},
{
"epoch": 0.33214151298456907,
"grad_norm": 0.043309227799558356,
"learning_rate": 0.0002534210448024313,
"loss": 0.8096,
"step": 1765
},
{
"epoch": 0.33308242378622505,
"grad_norm": 0.03941333694707204,
"learning_rate": 0.0002530636039516801,
"loss": 0.7938,
"step": 1770
},
{
"epoch": 0.3340233345878811,
"grad_norm": 0.10747125716613212,
"learning_rate": 0.0002527050510477782,
"loss": 0.8307,
"step": 1775
},
{
"epoch": 0.33496424538953706,
"grad_norm": 0.04590988600337092,
"learning_rate": 0.00025234538995950047,
"loss": 0.8122,
"step": 1780
},
{
"epoch": 0.3359051561911931,
"grad_norm": 0.04172494378663382,
"learning_rate": 0.00025198462456757915,
"loss": 0.8345,
"step": 1785
},
{
"epoch": 0.3368460669928491,
"grad_norm": 0.045093200155212766,
"learning_rate": 0.0002516227587646619,
"loss": 0.8347,
"step": 1790
},
{
"epoch": 0.33778697779450506,
"grad_norm": 0.04401063743206211,
"learning_rate": 0.0002512597964552696,
"loss": 0.8285,
"step": 1795
},
{
"epoch": 0.3387278885961611,
"grad_norm": 0.04213320681777121,
"learning_rate": 0.00025089574155575463,
"loss": 0.8113,
"step": 1800
},
{
"epoch": 0.3396687993978171,
"grad_norm": 0.041757171559943365,
"learning_rate": 0.0002505305979942582,
"loss": 0.8252,
"step": 1805
},
{
"epoch": 0.3406097101994731,
"grad_norm": 0.04786009269846987,
"learning_rate": 0.00025016436971066837,
"loss": 0.8228,
"step": 1810
},
{
"epoch": 0.3415506210011291,
"grad_norm": 0.041726632525803914,
"learning_rate": 0.000249797060656577,
"loss": 0.824,
"step": 1815
},
{
"epoch": 0.3424915318027851,
"grad_norm": 0.04595397683798158,
"learning_rate": 0.00024942867479523764,
"loss": 0.8391,
"step": 1820
},
{
"epoch": 0.3434324426044411,
"grad_norm": 0.046502559025392236,
"learning_rate": 0.0002490592161015227,
"loss": 0.7999,
"step": 1825
},
{
"epoch": 0.3443733534060971,
"grad_norm": 0.042052936652114195,
"learning_rate": 0.00024868868856188,
"loss": 0.8248,
"step": 1830
},
{
"epoch": 0.34531426420775313,
"grad_norm": 0.05203965598763733,
"learning_rate": 0.0002483170961742905,
"loss": 0.812,
"step": 1835
},
{
"epoch": 0.3462551750094091,
"grad_norm": 0.044682898482156765,
"learning_rate": 0.00024794444294822486,
"loss": 0.8477,
"step": 1840
},
{
"epoch": 0.3471960858110651,
"grad_norm": 0.04216691856239276,
"learning_rate": 0.0002475707329046,
"loss": 0.7979,
"step": 1845
},
{
"epoch": 0.34813699661272113,
"grad_norm": 0.04345355127718824,
"learning_rate": 0.0002471959700757358,
"loss": 0.8135,
"step": 1850
},
{
"epoch": 0.3490779074143771,
"grad_norm": 0.042957408510867594,
"learning_rate": 0.00024682015850531193,
"loss": 0.8064,
"step": 1855
},
{
"epoch": 0.35001881821603315,
"grad_norm": 0.04072622346447232,
"learning_rate": 0.00024644330224832375,
"loss": 0.8111,
"step": 1860
},
{
"epoch": 0.3509597290176891,
"grad_norm": 0.045585863196081676,
"learning_rate": 0.0002460654053710388,
"loss": 0.8015,
"step": 1865
},
{
"epoch": 0.3519006398193451,
"grad_norm": 0.04805889924747525,
"learning_rate": 0.0002456864719509529,
"loss": 0.8289,
"step": 1870
},
{
"epoch": 0.35284155062100114,
"grad_norm": 0.040060429859775856,
"learning_rate": 0.0002453065060767461,
"loss": 0.8073,
"step": 1875
},
{
"epoch": 0.3537824614226571,
"grad_norm": 0.04018127215217534,
"learning_rate": 0.0002449255118482386,
"loss": 0.798,
"step": 1880
},
{
"epoch": 0.35472337222431316,
"grad_norm": 0.04853936791390769,
"learning_rate": 0.0002445434933763466,
"loss": 0.8419,
"step": 1885
},
{
"epoch": 0.35566428302596914,
"grad_norm": 0.04367744556341427,
"learning_rate": 0.0002441604547830378,
"loss": 0.838,
"step": 1890
},
{
"epoch": 0.3566051938276251,
"grad_norm": 0.04205522207764387,
"learning_rate": 0.0002437764002012868,
"loss": 0.8158,
"step": 1895
},
{
"epoch": 0.35754610462928116,
"grad_norm": 0.04343667094277572,
"learning_rate": 0.00024339133377503103,
"loss": 0.8208,
"step": 1900
},
{
"epoch": 0.35848701543093714,
"grad_norm": 0.03959475872321939,
"learning_rate": 0.0002430052596591255,
"loss": 0.814,
"step": 1905
},
{
"epoch": 0.3594279262325932,
"grad_norm": 0.04256159840011562,
"learning_rate": 0.00024261818201929813,
"loss": 0.8369,
"step": 1910
},
{
"epoch": 0.36036883703424916,
"grad_norm": 0.04672817842045811,
"learning_rate": 0.00024223010503210483,
"loss": 0.8023,
"step": 1915
},
{
"epoch": 0.36130974783590514,
"grad_norm": 0.04766444087870317,
"learning_rate": 0.00024184103288488456,
"loss": 0.7827,
"step": 1920
},
{
"epoch": 0.3622506586375612,
"grad_norm": 0.04922825160196635,
"learning_rate": 0.0002414509697757139,
"loss": 0.8146,
"step": 1925
},
{
"epoch": 0.36319156943921715,
"grad_norm": 0.04305093896719169,
"learning_rate": 0.00024105991991336197,
"loss": 0.8519,
"step": 1930
},
{
"epoch": 0.3641324802408732,
"grad_norm": 0.0509218525892877,
"learning_rate": 0.00024066788751724483,
"loss": 0.8323,
"step": 1935
},
{
"epoch": 0.36507339104252917,
"grad_norm": 0.041068187949031935,
"learning_rate": 0.00024027487681738016,
"loss": 0.8015,
"step": 1940
},
{
"epoch": 0.36601430184418515,
"grad_norm": 0.04156745894982358,
"learning_rate": 0.0002398808920543414,
"loss": 0.804,
"step": 1945
},
{
"epoch": 0.3669552126458412,
"grad_norm": 0.03811372093805523,
"learning_rate": 0.00023948593747921226,
"loss": 0.7874,
"step": 1950
},
{
"epoch": 0.36789612344749717,
"grad_norm": 0.04142805534519082,
"learning_rate": 0.0002390900173535405,
"loss": 0.8635,
"step": 1955
},
{
"epoch": 0.3688370342491532,
"grad_norm": 0.041550575720770686,
"learning_rate": 0.00023869313594929222,
"loss": 0.813,
"step": 1960
},
{
"epoch": 0.3697779450508092,
"grad_norm": 0.04523950919170376,
"learning_rate": 0.00023829529754880574,
"loss": 0.8025,
"step": 1965
},
{
"epoch": 0.37071885585246517,
"grad_norm": 0.04169165852779439,
"learning_rate": 0.00023789650644474527,
"loss": 0.8074,
"step": 1970
},
{
"epoch": 0.3716597666541212,
"grad_norm": 0.04010793628530859,
"learning_rate": 0.0002374967669400547,
"loss": 0.8093,
"step": 1975
},
{
"epoch": 0.3726006774557772,
"grad_norm": 0.04324010723990065,
"learning_rate": 0.00023709608334791113,
"loss": 0.8072,
"step": 1980
},
{
"epoch": 0.3735415882574332,
"grad_norm": 0.042840112648990035,
"learning_rate": 0.00023669445999167834,
"loss": 0.8213,
"step": 1985
},
{
"epoch": 0.3744824990590892,
"grad_norm": 0.04684866437996544,
"learning_rate": 0.00023629190120486002,
"loss": 0.8238,
"step": 1990
},
{
"epoch": 0.3754234098607452,
"grad_norm": 0.0411724284111032,
"learning_rate": 0.00023588841133105338,
"loss": 0.8117,
"step": 1995
},
{
"epoch": 0.3763643206624012,
"grad_norm": 0.04450924105296537,
"learning_rate": 0.00023548399472390178,
"loss": 0.8218,
"step": 2000
},
{
"epoch": 0.3773052314640572,
"grad_norm": 0.04011920691264243,
"learning_rate": 0.0002350786557470482,
"loss": 0.8244,
"step": 2005
},
{
"epoch": 0.37824614226571324,
"grad_norm": 0.04712123262049659,
"learning_rate": 0.00023467239877408773,
"loss": 0.8227,
"step": 2010
},
{
"epoch": 0.3791870530673692,
"grad_norm": 0.044712637958609615,
"learning_rate": 0.00023426522818852086,
"loss": 0.8216,
"step": 2015
},
{
"epoch": 0.3801279638690252,
"grad_norm": 0.04150493938713221,
"learning_rate": 0.00023385714838370582,
"loss": 0.8148,
"step": 2020
},
{
"epoch": 0.38106887467068123,
"grad_norm": 0.048742142112492115,
"learning_rate": 0.0002334481637628112,
"loss": 0.8348,
"step": 2025
},
{
"epoch": 0.3820097854723372,
"grad_norm": 0.0415305433820995,
"learning_rate": 0.0002330382787387687,
"loss": 0.8151,
"step": 2030
},
{
"epoch": 0.38295069627399325,
"grad_norm": 0.0453536583022637,
"learning_rate": 0.00023262749773422518,
"loss": 0.8157,
"step": 2035
},
{
"epoch": 0.38389160707564923,
"grad_norm": 0.044005851429759116,
"learning_rate": 0.00023221582518149526,
"loss": 0.7728,
"step": 2040
},
{
"epoch": 0.3848325178773052,
"grad_norm": 0.04867846228714434,
"learning_rate": 0.00023180326552251323,
"loss": 0.7984,
"step": 2045
},
{
"epoch": 0.38577342867896125,
"grad_norm": 0.04136667249456415,
"learning_rate": 0.0002313898232087852,
"loss": 0.799,
"step": 2050
},
{
"epoch": 0.38671433948061723,
"grad_norm": 0.0405221793452781,
"learning_rate": 0.00023097550270134124,
"loss": 0.8146,
"step": 2055
},
{
"epoch": 0.38765525028227327,
"grad_norm": 0.04778677014197557,
"learning_rate": 0.000230560308470687,
"loss": 0.8383,
"step": 2060
},
{
"epoch": 0.38859616108392925,
"grad_norm": 0.04391158932493037,
"learning_rate": 0.00023014424499675555,
"loss": 0.8648,
"step": 2065
},
{
"epoch": 0.38953707188558523,
"grad_norm": 0.046018778807677026,
"learning_rate": 0.00022972731676885913,
"loss": 0.8043,
"step": 2070
},
{
"epoch": 0.39047798268724127,
"grad_norm": 0.04560887374781656,
"learning_rate": 0.00022930952828564073,
"loss": 0.7997,
"step": 2075
},
{
"epoch": 0.39141889348889725,
"grad_norm": 0.040583749331980695,
"learning_rate": 0.00022889088405502522,
"loss": 0.8008,
"step": 2080
},
{
"epoch": 0.3923598042905533,
"grad_norm": 0.040731714295921484,
"learning_rate": 0.00022847138859417114,
"loss": 0.7964,
"step": 2085
},
{
"epoch": 0.39330071509220926,
"grad_norm": 0.0428768624710387,
"learning_rate": 0.00022805104642942186,
"loss": 0.8211,
"step": 2090
},
{
"epoch": 0.39424162589386524,
"grad_norm": 0.03600894815100002,
"learning_rate": 0.00022762986209625644,
"loss": 0.8043,
"step": 2095
},
{
"epoch": 0.3951825366955213,
"grad_norm": 0.03886248110556022,
"learning_rate": 0.00022720784013924102,
"loss": 0.7894,
"step": 2100
},
{
"epoch": 0.39612344749717726,
"grad_norm": 0.04550262987220453,
"learning_rate": 0.00022678498511197976,
"loss": 0.8123,
"step": 2105
},
{
"epoch": 0.3970643582988333,
"grad_norm": 0.04253985842426856,
"learning_rate": 0.0002263613015770655,
"loss": 0.8094,
"step": 2110
},
{
"epoch": 0.3980052691004893,
"grad_norm": 0.04139895688562898,
"learning_rate": 0.00022593679410603062,
"loss": 0.8101,
"step": 2115
},
{
"epoch": 0.39894617990214526,
"grad_norm": 0.04611695349566767,
"learning_rate": 0.00022551146727929793,
"loss": 0.7969,
"step": 2120
},
{
"epoch": 0.3998870907038013,
"grad_norm": 0.03767361228914821,
"learning_rate": 0.00022508532568613087,
"loss": 0.777,
"step": 2125
},
{
"epoch": 0.4008280015054573,
"grad_norm": 0.03917014861794376,
"learning_rate": 0.0002246583739245843,
"loss": 0.8062,
"step": 2130
},
{
"epoch": 0.4017689123071133,
"grad_norm": 0.047039791387090205,
"learning_rate": 0.00022423061660145467,
"loss": 0.8055,
"step": 2135
},
{
"epoch": 0.4027098231087693,
"grad_norm": 0.03980335720604302,
"learning_rate": 0.00022380205833223062,
"loss": 0.8114,
"step": 2140
},
{
"epoch": 0.4036507339104253,
"grad_norm": 0.040218404097981254,
"learning_rate": 0.00022337270374104268,
"loss": 0.7878,
"step": 2145
},
{
"epoch": 0.4045916447120813,
"grad_norm": 0.03770658683678954,
"learning_rate": 0.0002229425574606139,
"loss": 0.7626,
"step": 2150
},
{
"epoch": 0.4055325555137373,
"grad_norm": 0.044949683935009095,
"learning_rate": 0.00022251162413220956,
"loss": 0.8187,
"step": 2155
},
{
"epoch": 0.40647346631539333,
"grad_norm": 0.03983952278617522,
"learning_rate": 0.0002220799084055872,
"loss": 0.7852,
"step": 2160
},
{
"epoch": 0.4074143771170493,
"grad_norm": 0.05174431759947435,
"learning_rate": 0.00022164741493894624,
"loss": 0.8323,
"step": 2165
},
{
"epoch": 0.4083552879187053,
"grad_norm": 0.04243001886382076,
"learning_rate": 0.00022121414839887813,
"loss": 0.7876,
"step": 2170
},
{
"epoch": 0.4092961987203613,
"grad_norm": 0.037037966238162086,
"learning_rate": 0.00022078011346031572,
"loss": 0.7927,
"step": 2175
},
{
"epoch": 0.4102371095220173,
"grad_norm": 0.04184803898066816,
"learning_rate": 0.0002203453148064826,
"loss": 0.7871,
"step": 2180
},
{
"epoch": 0.41117802032367334,
"grad_norm": 0.0428590620625003,
"learning_rate": 0.00021990975712884322,
"loss": 0.8127,
"step": 2185
},
{
"epoch": 0.4121189311253293,
"grad_norm": 0.03582098140612815,
"learning_rate": 0.0002194734451270515,
"loss": 0.7809,
"step": 2190
},
{
"epoch": 0.4130598419269853,
"grad_norm": 0.039671091493742434,
"learning_rate": 0.00021903638350890078,
"loss": 0.8145,
"step": 2195
},
{
"epoch": 0.41400075272864134,
"grad_norm": 0.043079186328936876,
"learning_rate": 0.00021859857699027256,
"loss": 0.8163,
"step": 2200
},
{
"epoch": 0.4149416635302973,
"grad_norm": 0.04821222253606962,
"learning_rate": 0.00021816003029508587,
"loss": 0.8108,
"step": 2205
},
{
"epoch": 0.41588257433195336,
"grad_norm": 0.04643606468121251,
"learning_rate": 0.0002177207481552462,
"loss": 0.8143,
"step": 2210
},
{
"epoch": 0.41682348513360934,
"grad_norm": 0.05088614638476515,
"learning_rate": 0.0002172807353105945,
"loss": 0.8093,
"step": 2215
},
{
"epoch": 0.4177643959352653,
"grad_norm": 0.04275749284477858,
"learning_rate": 0.00021683999650885598,
"loss": 0.8043,
"step": 2220
},
{
"epoch": 0.41870530673692136,
"grad_norm": 0.04036391126072423,
"learning_rate": 0.00021639853650558884,
"loss": 0.7994,
"step": 2225
},
{
"epoch": 0.41964621753857734,
"grad_norm": 0.0431659963056544,
"learning_rate": 0.00021595636006413308,
"loss": 0.8197,
"step": 2230
},
{
"epoch": 0.4205871283402333,
"grad_norm": 0.04225940473147677,
"learning_rate": 0.00021551347195555916,
"loss": 0.7891,
"step": 2235
},
{
"epoch": 0.42152803914188935,
"grad_norm": 0.04116212403602597,
"learning_rate": 0.00021506987695861618,
"loss": 0.8418,
"step": 2240
},
{
"epoch": 0.42246894994354534,
"grad_norm": 0.04039520834463586,
"learning_rate": 0.00021462557985968075,
"loss": 0.7952,
"step": 2245
},
{
"epoch": 0.42340986074520137,
"grad_norm": 0.048662820756663734,
"learning_rate": 0.000214180585452705,
"loss": 0.8019,
"step": 2250
},
{
"epoch": 0.42435077154685735,
"grad_norm": 0.045423020413219564,
"learning_rate": 0.00021373489853916499,
"loss": 0.7975,
"step": 2255
},
{
"epoch": 0.42529168234851333,
"grad_norm": 0.04640593668762107,
"learning_rate": 0.00021328852392800906,
"loss": 0.8312,
"step": 2260
},
{
"epoch": 0.42623259315016937,
"grad_norm": 0.04277782133320467,
"learning_rate": 0.00021284146643560562,
"loss": 0.8104,
"step": 2265
},
{
"epoch": 0.42717350395182535,
"grad_norm": 0.04040549088992285,
"learning_rate": 0.00021239373088569142,
"loss": 0.7887,
"step": 2270
},
{
"epoch": 0.4281144147534814,
"grad_norm": 0.03889124568442978,
"learning_rate": 0.00021194532210931945,
"loss": 0.8061,
"step": 2275
},
{
"epoch": 0.42905532555513737,
"grad_norm": 0.043868472214118666,
"learning_rate": 0.00021149624494480674,
"loss": 0.7947,
"step": 2280
},
{
"epoch": 0.42999623635679335,
"grad_norm": 0.03981679327310523,
"learning_rate": 0.00021104650423768218,
"loss": 0.7891,
"step": 2285
},
{
"epoch": 0.4309371471584494,
"grad_norm": 0.04037233565136616,
"learning_rate": 0.00021059610484063437,
"loss": 0.8016,
"step": 2290
},
{
"epoch": 0.43187805796010537,
"grad_norm": 0.041805266956483865,
"learning_rate": 0.00021014505161345915,
"loss": 0.8037,
"step": 2295
},
{
"epoch": 0.4328189687617614,
"grad_norm": 0.03748382253401212,
"learning_rate": 0.00020969334942300702,
"loss": 0.8073,
"step": 2300
},
{
"epoch": 0.4337598795634174,
"grad_norm": 0.06578354317476753,
"learning_rate": 0.00020924100314313092,
"loss": 0.8075,
"step": 2305
},
{
"epoch": 0.43470079036507336,
"grad_norm": 0.04239179218818634,
"learning_rate": 0.00020878801765463343,
"loss": 0.8027,
"step": 2310
},
{
"epoch": 0.4356417011667294,
"grad_norm": 0.042779310945398136,
"learning_rate": 0.00020833439784521423,
"loss": 0.8102,
"step": 2315
},
{
"epoch": 0.4365826119683854,
"grad_norm": 0.04206757037096745,
"learning_rate": 0.00020788014860941717,
"loss": 0.8008,
"step": 2320
},
{
"epoch": 0.4375235227700414,
"grad_norm": 0.04286947710250131,
"learning_rate": 0.00020742527484857778,
"loss": 0.8038,
"step": 2325
},
{
"epoch": 0.4384644335716974,
"grad_norm": 0.04156846594623706,
"learning_rate": 0.00020696978147077006,
"loss": 0.7946,
"step": 2330
},
{
"epoch": 0.4394053443733534,
"grad_norm": 0.041413097020774656,
"learning_rate": 0.00020651367339075366,
"loss": 0.7986,
"step": 2335
},
{
"epoch": 0.4403462551750094,
"grad_norm": 0.04220729277505702,
"learning_rate": 0.00020605695552992093,
"loss": 0.8294,
"step": 2340
},
{
"epoch": 0.4412871659766654,
"grad_norm": 0.03874549638262173,
"learning_rate": 0.00020559963281624376,
"loss": 0.7638,
"step": 2345
},
{
"epoch": 0.44222807677832143,
"grad_norm": 0.04109206825097649,
"learning_rate": 0.00020514171018422015,
"loss": 0.8068,
"step": 2350
},
{
"epoch": 0.4431689875799774,
"grad_norm": 0.043772113393048005,
"learning_rate": 0.0002046831925748215,
"loss": 0.8087,
"step": 2355
},
{
"epoch": 0.4441098983816334,
"grad_norm": 0.04314130063844934,
"learning_rate": 0.00020422408493543878,
"loss": 0.8176,
"step": 2360
},
{
"epoch": 0.44505080918328943,
"grad_norm": 0.04227709213575997,
"learning_rate": 0.00020376439221982953,
"loss": 0.7848,
"step": 2365
},
{
"epoch": 0.4459917199849454,
"grad_norm": 0.047483555049165516,
"learning_rate": 0.0002033041193880641,
"loss": 0.7906,
"step": 2370
},
{
"epoch": 0.44693263078660145,
"grad_norm": 0.04328503293360676,
"learning_rate": 0.00020284327140647238,
"loss": 0.8087,
"step": 2375
},
{
"epoch": 0.44787354158825743,
"grad_norm": 0.043229994930452834,
"learning_rate": 0.00020238185324759005,
"loss": 0.7977,
"step": 2380
},
{
"epoch": 0.4488144523899134,
"grad_norm": 0.04138845464473692,
"learning_rate": 0.00020191986989010497,
"loss": 0.805,
"step": 2385
},
{
"epoch": 0.44975536319156945,
"grad_norm": 0.042452324658930944,
"learning_rate": 0.0002014573263188036,
"loss": 0.7953,
"step": 2390
},
{
"epoch": 0.4506962739932254,
"grad_norm": 0.044241588342132626,
"learning_rate": 0.0002009942275245169,
"loss": 0.8349,
"step": 2395
},
{
"epoch": 0.45163718479488146,
"grad_norm": 0.04179408805570128,
"learning_rate": 0.00020053057850406687,
"loss": 0.7995,
"step": 2400
},
{
"epoch": 0.45257809559653744,
"grad_norm": 0.048094796140699454,
"learning_rate": 0.00020006638426021226,
"loss": 0.811,
"step": 2405
},
{
"epoch": 0.4535190063981934,
"grad_norm": 0.041380019778814475,
"learning_rate": 0.00019960164980159484,
"loss": 0.8142,
"step": 2410
},
{
"epoch": 0.45445991719984946,
"grad_norm": 0.04287459814218423,
"learning_rate": 0.0001991363801426853,
"loss": 0.7988,
"step": 2415
},
{
"epoch": 0.45540082800150544,
"grad_norm": 0.050121488389194564,
"learning_rate": 0.00019867058030372916,
"loss": 0.8017,
"step": 2420
},
{
"epoch": 0.4563417388031615,
"grad_norm": 0.04424030813075273,
"learning_rate": 0.00019820425531069235,
"loss": 0.7994,
"step": 2425
},
{
"epoch": 0.45728264960481746,
"grad_norm": 0.03912669330614894,
"learning_rate": 0.0001977374101952075,
"loss": 0.8172,
"step": 2430
},
{
"epoch": 0.45822356040647344,
"grad_norm": 0.04228385679812487,
"learning_rate": 0.00019727004999451917,
"loss": 0.7907,
"step": 2435
},
{
"epoch": 0.4591644712081295,
"grad_norm": 0.04975994860517981,
"learning_rate": 0.00019680217975142963,
"loss": 0.816,
"step": 2440
},
{
"epoch": 0.46010538200978546,
"grad_norm": 0.044510558468411905,
"learning_rate": 0.00019633380451424473,
"loss": 0.7835,
"step": 2445
},
{
"epoch": 0.4610462928114415,
"grad_norm": 0.04486213312302885,
"learning_rate": 0.00019586492933671885,
"loss": 0.8223,
"step": 2450
},
{
"epoch": 0.4619872036130975,
"grad_norm": 0.04563165216828993,
"learning_rate": 0.00019539555927800098,
"loss": 0.795,
"step": 2455
},
{
"epoch": 0.46292811441475346,
"grad_norm": 0.040840435500794685,
"learning_rate": 0.00019492569940257972,
"loss": 0.7782,
"step": 2460
},
{
"epoch": 0.4638690252164095,
"grad_norm": 0.04957991037956064,
"learning_rate": 0.0001944553547802289,
"loss": 0.8075,
"step": 2465
},
{
"epoch": 0.4648099360180655,
"grad_norm": 0.042607119092549925,
"learning_rate": 0.00019398453048595268,
"loss": 0.8056,
"step": 2470
},
{
"epoch": 0.4657508468197215,
"grad_norm": 0.04192235755301655,
"learning_rate": 0.00019351323159993083,
"loss": 0.8347,
"step": 2475
},
{
"epoch": 0.4666917576213775,
"grad_norm": 0.03928466435364261,
"learning_rate": 0.00019304146320746397,
"loss": 0.7899,
"step": 2480
},
{
"epoch": 0.46763266842303347,
"grad_norm": 0.040044034579578336,
"learning_rate": 0.00019256923039891877,
"loss": 0.7933,
"step": 2485
},
{
"epoch": 0.4685735792246895,
"grad_norm": 0.04312279650442732,
"learning_rate": 0.00019209653826967276,
"loss": 0.7883,
"step": 2490
},
{
"epoch": 0.4695144900263455,
"grad_norm": 0.04085403683152945,
"learning_rate": 0.00019162339192005972,
"loss": 0.7579,
"step": 2495
},
{
"epoch": 0.4704554008280015,
"grad_norm": 0.043873630317500224,
"learning_rate": 0.00019114979645531437,
"loss": 0.7929,
"step": 2500
},
{
"epoch": 0.4713963116296575,
"grad_norm": 0.040154803681733534,
"learning_rate": 0.00019067575698551728,
"loss": 0.79,
"step": 2505
},
{
"epoch": 0.4723372224313135,
"grad_norm": 0.03684855703789701,
"learning_rate": 0.00019020127862554,
"loss": 0.7897,
"step": 2510
},
{
"epoch": 0.4732781332329695,
"grad_norm": 0.04422268883476745,
"learning_rate": 0.0001897263664949896,
"loss": 0.792,
"step": 2515
},
{
"epoch": 0.4742190440346255,
"grad_norm": 0.043857810173637005,
"learning_rate": 0.00018925102571815344,
"loss": 0.7851,
"step": 2520
},
{
"epoch": 0.47515995483628154,
"grad_norm": 0.042539435296578026,
"learning_rate": 0.00018877526142394404,
"loss": 0.7886,
"step": 2525
},
{
"epoch": 0.4761008656379375,
"grad_norm": 0.04214932919499374,
"learning_rate": 0.00018829907874584376,
"loss": 0.7906,
"step": 2530
},
{
"epoch": 0.4770417764395935,
"grad_norm": 0.0384479257214575,
"learning_rate": 0.0001878224828218491,
"loss": 0.7825,
"step": 2535
},
{
"epoch": 0.47798268724124954,
"grad_norm": 0.04162096424062071,
"learning_rate": 0.0001873454787944156,
"loss": 0.7957,
"step": 2540
},
{
"epoch": 0.4789235980429055,
"grad_norm": 0.03883485962874196,
"learning_rate": 0.0001868680718104023,
"loss": 0.7909,
"step": 2545
},
{
"epoch": 0.47986450884456155,
"grad_norm": 0.03749476318685997,
"learning_rate": 0.0001863902670210159,
"loss": 0.7817,
"step": 2550
},
{
"epoch": 0.48080541964621754,
"grad_norm": 0.03910242002103014,
"learning_rate": 0.0001859120695817556,
"loss": 0.7817,
"step": 2555
},
{
"epoch": 0.4817463304478735,
"grad_norm": 0.038950913555839455,
"learning_rate": 0.0001854334846523572,
"loss": 0.7796,
"step": 2560
},
{
"epoch": 0.48268724124952955,
"grad_norm": 0.04324847711598021,
"learning_rate": 0.00018495451739673757,
"loss": 0.797,
"step": 2565
},
{
"epoch": 0.48362815205118553,
"grad_norm": 0.039000544800866135,
"learning_rate": 0.0001844751729829388,
"loss": 0.7777,
"step": 2570
},
{
"epoch": 0.48456906285284157,
"grad_norm": 0.040232908364916414,
"learning_rate": 0.0001839954565830725,
"loss": 0.7885,
"step": 2575
},
{
"epoch": 0.48550997365449755,
"grad_norm": 0.03871796270192426,
"learning_rate": 0.00018351537337326404,
"loss": 0.8016,
"step": 2580
},
{
"epoch": 0.48645088445615353,
"grad_norm": 0.037364272818028524,
"learning_rate": 0.0001830349285335967,
"loss": 0.7796,
"step": 2585
},
{
"epoch": 0.48739179525780957,
"grad_norm": 0.03990561450519654,
"learning_rate": 0.00018255412724805557,
"loss": 0.7863,
"step": 2590
},
{
"epoch": 0.48833270605946555,
"grad_norm": 0.04185146560129521,
"learning_rate": 0.00018207297470447206,
"loss": 0.8122,
"step": 2595
},
{
"epoch": 0.4892736168611216,
"grad_norm": 0.040063511223070405,
"learning_rate": 0.00018159147609446728,
"loss": 0.7948,
"step": 2600
},
{
"epoch": 0.49021452766277757,
"grad_norm": 0.044541898554158056,
"learning_rate": 0.00018110963661339675,
"loss": 0.7987,
"step": 2605
},
{
"epoch": 0.49115543846443355,
"grad_norm": 0.04209970580203627,
"learning_rate": 0.00018062746146029374,
"loss": 0.7982,
"step": 2610
},
{
"epoch": 0.4920963492660896,
"grad_norm": 0.03954771319272124,
"learning_rate": 0.00018014495583781344,
"loss": 0.7631,
"step": 2615
},
{
"epoch": 0.49303726006774556,
"grad_norm": 0.043084260115585415,
"learning_rate": 0.00017966212495217686,
"loss": 0.77,
"step": 2620
},
{
"epoch": 0.4939781708694016,
"grad_norm": 0.045997910488941696,
"learning_rate": 0.00017917897401311465,
"loss": 0.8029,
"step": 2625
},
{
"epoch": 0.4949190816710576,
"grad_norm": 0.04061225203519024,
"learning_rate": 0.0001786955082338106,
"loss": 0.7776,
"step": 2630
},
{
"epoch": 0.49585999247271356,
"grad_norm": 0.03975991003304855,
"learning_rate": 0.00017821173283084584,
"loss": 0.7995,
"step": 2635
},
{
"epoch": 0.4968009032743696,
"grad_norm": 0.04073622084361536,
"learning_rate": 0.00017772765302414228,
"loss": 0.7881,
"step": 2640
},
{
"epoch": 0.4977418140760256,
"grad_norm": 0.04246319256025743,
"learning_rate": 0.0001772432740369062,
"loss": 0.7851,
"step": 2645
},
{
"epoch": 0.4986827248776816,
"grad_norm": 0.040200342333617425,
"learning_rate": 0.00017675860109557225,
"loss": 0.7874,
"step": 2650
},
{
"epoch": 0.4996236356793376,
"grad_norm": 0.04313427466781039,
"learning_rate": 0.00017627363942974663,
"loss": 0.7662,
"step": 2655
},
{
"epoch": 0.5005645464809936,
"grad_norm": 0.04470911106937541,
"learning_rate": 0.00017578839427215102,
"loss": 0.777,
"step": 2660
},
{
"epoch": 0.5015054572826496,
"grad_norm": 0.04174530809793399,
"learning_rate": 0.00017530287085856583,
"loss": 0.7591,
"step": 2665
},
{
"epoch": 0.5024463680843057,
"grad_norm": 0.04344420655267131,
"learning_rate": 0.00017481707442777402,
"loss": 0.7915,
"step": 2670
},
{
"epoch": 0.5033872788859616,
"grad_norm": 0.04411942787559072,
"learning_rate": 0.0001743310102215042,
"loss": 0.8101,
"step": 2675
},
{
"epoch": 0.5043281896876176,
"grad_norm": 0.03995728857524497,
"learning_rate": 0.00017384468348437447,
"loss": 0.7676,
"step": 2680
},
{
"epoch": 0.5052691004892736,
"grad_norm": 0.04144099448309982,
"learning_rate": 0.00017335809946383542,
"loss": 0.7839,
"step": 2685
},
{
"epoch": 0.5062100112909296,
"grad_norm": 0.03635972182712348,
"learning_rate": 0.00017287126341011396,
"loss": 0.7652,
"step": 2690
},
{
"epoch": 0.5071509220925856,
"grad_norm": 0.04144440650346267,
"learning_rate": 0.00017238418057615611,
"loss": 0.7691,
"step": 2695
},
{
"epoch": 0.5080918328942416,
"grad_norm": 0.0388799423795147,
"learning_rate": 0.000171896856217571,
"loss": 0.7745,
"step": 2700
},
{
"epoch": 0.5090327436958977,
"grad_norm": 0.04098955077677498,
"learning_rate": 0.0001714092955925735,
"loss": 0.7638,
"step": 2705
},
{
"epoch": 0.5099736544975536,
"grad_norm": 0.05176471641470045,
"learning_rate": 0.000170921503961928,
"loss": 0.7915,
"step": 2710
},
{
"epoch": 0.5109145652992096,
"grad_norm": 0.03903056061857688,
"learning_rate": 0.00017043348658889133,
"loss": 0.7582,
"step": 2715
},
{
"epoch": 0.5118554761008657,
"grad_norm": 0.03918100612671428,
"learning_rate": 0.00016994524873915613,
"loss": 0.7876,
"step": 2720
},
{
"epoch": 0.5127963869025216,
"grad_norm": 0.04124280214103889,
"learning_rate": 0.0001694567956807939,
"loss": 0.7761,
"step": 2725
},
{
"epoch": 0.5137372977041776,
"grad_norm": 0.042496128838645876,
"learning_rate": 0.00016896813268419824,
"loss": 0.7563,
"step": 2730
},
{
"epoch": 0.5146782085058337,
"grad_norm": 0.042727195504920386,
"learning_rate": 0.00016847926502202814,
"loss": 0.7715,
"step": 2735
},
{
"epoch": 0.5156191193074896,
"grad_norm": 0.0453239652902863,
"learning_rate": 0.00016799019796915067,
"loss": 0.7786,
"step": 2740
},
{
"epoch": 0.5165600301091456,
"grad_norm": 0.045118257069757896,
"learning_rate": 0.00016750093680258454,
"loss": 0.7824,
"step": 2745
},
{
"epoch": 0.5175009409108017,
"grad_norm": 0.038825006570890906,
"learning_rate": 0.00016701148680144277,
"loss": 0.7749,
"step": 2750
},
{
"epoch": 0.5184418517124577,
"grad_norm": 0.041921133819207645,
"learning_rate": 0.00016652185324687605,
"loss": 0.7762,
"step": 2755
},
{
"epoch": 0.5193827625141136,
"grad_norm": 0.04014068407855972,
"learning_rate": 0.0001660320414220155,
"loss": 0.7965,
"step": 2760
},
{
"epoch": 0.5203236733157697,
"grad_norm": 0.0389817444179057,
"learning_rate": 0.0001655420566119158,
"loss": 0.7803,
"step": 2765
},
{
"epoch": 0.5212645841174257,
"grad_norm": 0.04084074773574928,
"learning_rate": 0.00016505190410349817,
"loss": 0.7728,
"step": 2770
},
{
"epoch": 0.5222054949190816,
"grad_norm": 0.03931726740857862,
"learning_rate": 0.00016456158918549328,
"loss": 0.7673,
"step": 2775
},
{
"epoch": 0.5231464057207377,
"grad_norm": 0.04263330287791737,
"learning_rate": 0.00016407111714838407,
"loss": 0.7768,
"step": 2780
},
{
"epoch": 0.5240873165223937,
"grad_norm": 0.03868707213111586,
"learning_rate": 0.00016358049328434903,
"loss": 0.7975,
"step": 2785
},
{
"epoch": 0.5250282273240496,
"grad_norm": 0.04065607527535718,
"learning_rate": 0.00016308972288720466,
"loss": 0.793,
"step": 2790
},
{
"epoch": 0.5259691381257057,
"grad_norm": 0.03586574957051923,
"learning_rate": 0.00016259881125234863,
"loss": 0.7704,
"step": 2795
},
{
"epoch": 0.5269100489273617,
"grad_norm": 0.04097298758383249,
"learning_rate": 0.00016210776367670253,
"loss": 0.7503,
"step": 2800
},
{
"epoch": 0.5278509597290177,
"grad_norm": 0.044803423222512996,
"learning_rate": 0.00016161658545865473,
"loss": 0.806,
"step": 2805
},
{
"epoch": 0.5287918705306737,
"grad_norm": 0.04270347978524801,
"learning_rate": 0.00016112528189800334,
"loss": 0.8056,
"step": 2810
},
{
"epoch": 0.5297327813323297,
"grad_norm": 0.04047987808565142,
"learning_rate": 0.00016063385829589874,
"loss": 0.754,
"step": 2815
},
{
"epoch": 0.5306736921339857,
"grad_norm": 0.037879635517597286,
"learning_rate": 0.0001601423199547867,
"loss": 0.7573,
"step": 2820
},
{
"epoch": 0.5316146029356417,
"grad_norm": 0.03784641405503706,
"learning_rate": 0.00015965067217835093,
"loss": 0.7672,
"step": 2825
},
{
"epoch": 0.5325555137372977,
"grad_norm": 0.04190542796765804,
"learning_rate": 0.00015915892027145603,
"loss": 0.7646,
"step": 2830
},
{
"epoch": 0.5334964245389537,
"grad_norm": 0.0410546092280587,
"learning_rate": 0.00015866706954009005,
"loss": 0.7823,
"step": 2835
},
{
"epoch": 0.5344373353406097,
"grad_norm": 0.040014735565657274,
"learning_rate": 0.00015817512529130748,
"loss": 0.7619,
"step": 2840
},
{
"epoch": 0.5353782461422657,
"grad_norm": 0.04109448908128682,
"learning_rate": 0.00015768309283317175,
"loss": 0.7749,
"step": 2845
},
{
"epoch": 0.5363191569439217,
"grad_norm": 0.042386121106642534,
"learning_rate": 0.0001571909774746981,
"loss": 0.759,
"step": 2850
},
{
"epoch": 0.5372600677455778,
"grad_norm": 0.03948724800684583,
"learning_rate": 0.00015669878452579633,
"loss": 0.8001,
"step": 2855
},
{
"epoch": 0.5382009785472337,
"grad_norm": 0.039488312405531356,
"learning_rate": 0.00015620651929721335,
"loss": 0.7394,
"step": 2860
},
{
"epoch": 0.5391418893488897,
"grad_norm": 0.04561518902409399,
"learning_rate": 0.00015571418710047597,
"loss": 0.8038,
"step": 2865
},
{
"epoch": 0.5400828001505458,
"grad_norm": 0.042232271266558645,
"learning_rate": 0.00015522179324783364,
"loss": 0.7848,
"step": 2870
},
{
"epoch": 0.5410237109522017,
"grad_norm": 0.042459217752880335,
"learning_rate": 0.0001547293430522011,
"loss": 0.787,
"step": 2875
},
{
"epoch": 0.5419646217538577,
"grad_norm": 0.04563613956360711,
"learning_rate": 0.0001542368418271009,
"loss": 0.7922,
"step": 2880
},
{
"epoch": 0.5429055325555138,
"grad_norm": 0.04471760423803294,
"learning_rate": 0.0001537442948866063,
"loss": 0.7768,
"step": 2885
},
{
"epoch": 0.5438464433571697,
"grad_norm": 0.04367786244918379,
"learning_rate": 0.00015325170754528376,
"loss": 0.7551,
"step": 2890
},
{
"epoch": 0.5447873541588257,
"grad_norm": 0.04050477501250885,
"learning_rate": 0.00015275908511813583,
"loss": 0.7591,
"step": 2895
},
{
"epoch": 0.5457282649604818,
"grad_norm": 0.03457399407540036,
"learning_rate": 0.00015226643292054335,
"loss": 0.7867,
"step": 2900
},
{
"epoch": 0.5466691757621378,
"grad_norm": 0.03851340729404049,
"learning_rate": 0.00015177375626820866,
"loss": 0.7514,
"step": 2905
},
{
"epoch": 0.5476100865637937,
"grad_norm": 0.04344603262075698,
"learning_rate": 0.00015128106047709782,
"loss": 0.7818,
"step": 2910
},
{
"epoch": 0.5485509973654498,
"grad_norm": 0.0385972605358632,
"learning_rate": 0.00015078835086338333,
"loss": 0.776,
"step": 2915
},
{
"epoch": 0.5494919081671058,
"grad_norm": 0.04105948951389995,
"learning_rate": 0.00015029563274338711,
"loss": 0.7799,
"step": 2920
},
{
"epoch": 0.5504328189687617,
"grad_norm": 0.0373428818729792,
"learning_rate": 0.00014980291143352253,
"loss": 0.787,
"step": 2925
},
{
"epoch": 0.5513737297704178,
"grad_norm": 0.04145957845710554,
"learning_rate": 0.00014931019225023764,
"loss": 0.7903,
"step": 2930
},
{
"epoch": 0.5523146405720738,
"grad_norm": 0.03832821967374709,
"learning_rate": 0.0001488174805099573,
"loss": 0.7743,
"step": 2935
},
{
"epoch": 0.5532555513737297,
"grad_norm": 0.04371721164045727,
"learning_rate": 0.00014832478152902633,
"loss": 0.7917,
"step": 2940
},
{
"epoch": 0.5541964621753858,
"grad_norm": 0.045192576709012396,
"learning_rate": 0.0001478321006236517,
"loss": 0.819,
"step": 2945
},
{
"epoch": 0.5551373729770418,
"grad_norm": 0.041868786627357824,
"learning_rate": 0.00014733944310984533,
"loss": 0.7836,
"step": 2950
},
{
"epoch": 0.5560782837786977,
"grad_norm": 0.040054306103734574,
"learning_rate": 0.00014684681430336688,
"loss": 0.7581,
"step": 2955
},
{
"epoch": 0.5570191945803538,
"grad_norm": 0.04498485518590034,
"learning_rate": 0.00014635421951966613,
"loss": 0.7804,
"step": 2960
},
{
"epoch": 0.5579601053820098,
"grad_norm": 0.03795169554357177,
"learning_rate": 0.00014586166407382585,
"loss": 0.76,
"step": 2965
},
{
"epoch": 0.5589010161836658,
"grad_norm": 0.039089524876466074,
"learning_rate": 0.0001453691532805043,
"loss": 0.7692,
"step": 2970
},
{
"epoch": 0.5598419269853218,
"grad_norm": 0.03791366653189066,
"learning_rate": 0.00014487669245387793,
"loss": 0.7979,
"step": 2975
},
{
"epoch": 0.5607828377869778,
"grad_norm": 0.048004979557690994,
"learning_rate": 0.00014438428690758415,
"loss": 0.746,
"step": 2980
},
{
"epoch": 0.5617237485886338,
"grad_norm": 0.040058739496838455,
"learning_rate": 0.00014389194195466373,
"loss": 0.7711,
"step": 2985
},
{
"epoch": 0.5626646593902898,
"grad_norm": 0.03652076270611612,
"learning_rate": 0.00014339966290750374,
"loss": 0.7344,
"step": 2990
},
{
"epoch": 0.5636055701919458,
"grad_norm": 0.042636228941696684,
"learning_rate": 0.00014290745507778018,
"loss": 0.7819,
"step": 2995
},
{
"epoch": 0.5645464809936018,
"grad_norm": 0.03580210308957461,
"learning_rate": 0.00014241532377640056,
"loss": 0.7699,
"step": 3000
},
{
"epoch": 0.5654873917952578,
"grad_norm": 0.038702251641132904,
"learning_rate": 0.00014192327431344654,
"loss": 0.7587,
"step": 3005
},
{
"epoch": 0.5664283025969138,
"grad_norm": 0.043869218575217146,
"learning_rate": 0.00014143131199811695,
"loss": 0.7598,
"step": 3010
},
{
"epoch": 0.5673692133985698,
"grad_norm": 0.03942521728979009,
"learning_rate": 0.00014093944213867027,
"loss": 0.7779,
"step": 3015
},
{
"epoch": 0.5683101242002259,
"grad_norm": 0.04379986996024926,
"learning_rate": 0.00014044767004236708,
"loss": 0.7721,
"step": 3020
},
{
"epoch": 0.5692510350018818,
"grad_norm": 0.041493273088253166,
"learning_rate": 0.00013995600101541358,
"loss": 0.7864,
"step": 3025
},
{
"epoch": 0.5701919458035378,
"grad_norm": 0.04809266982465616,
"learning_rate": 0.0001394644403629035,
"loss": 0.7826,
"step": 3030
},
{
"epoch": 0.5711328566051939,
"grad_norm": 0.03877586910463833,
"learning_rate": 0.0001389729933887613,
"loss": 0.7555,
"step": 3035
},
{
"epoch": 0.5720737674068498,
"grad_norm": 0.0424860755202435,
"learning_rate": 0.00013848166539568495,
"loss": 0.749,
"step": 3040
},
{
"epoch": 0.5730146782085058,
"grad_norm": 0.04037436043604233,
"learning_rate": 0.00013799046168508851,
"loss": 0.7779,
"step": 3045
},
{
"epoch": 0.5739555890101619,
"grad_norm": 0.04095161246609761,
"learning_rate": 0.00013749938755704504,
"loss": 0.7682,
"step": 3050
},
{
"epoch": 0.5748964998118178,
"grad_norm": 0.03996299861070224,
"learning_rate": 0.00013700844831022948,
"loss": 0.7495,
"step": 3055
},
{
"epoch": 0.5758374106134738,
"grad_norm": 0.04127396019541893,
"learning_rate": 0.00013651764924186142,
"loss": 0.8048,
"step": 3060
},
{
"epoch": 0.5767783214151299,
"grad_norm": 0.03925183560100803,
"learning_rate": 0.0001360269956476477,
"loss": 0.7614,
"step": 3065
},
{
"epoch": 0.5777192322167859,
"grad_norm": 0.04104317482073666,
"learning_rate": 0.00013553649282172588,
"loss": 0.7875,
"step": 3070
},
{
"epoch": 0.5786601430184418,
"grad_norm": 0.04047442818024267,
"learning_rate": 0.00013504614605660642,
"loss": 0.8087,
"step": 3075
},
{
"epoch": 0.5796010538200979,
"grad_norm": 0.03790235486672248,
"learning_rate": 0.00013455596064311593,
"loss": 0.7414,
"step": 3080
},
{
"epoch": 0.5805419646217539,
"grad_norm": 0.039394611832599055,
"learning_rate": 0.00013406594187034026,
"loss": 0.7511,
"step": 3085
},
{
"epoch": 0.5814828754234098,
"grad_norm": 0.04013793427774893,
"learning_rate": 0.00013357609502556697,
"loss": 0.7666,
"step": 3090
},
{
"epoch": 0.5824237862250659,
"grad_norm": 0.04085601620139591,
"learning_rate": 0.00013308642539422858,
"loss": 0.7471,
"step": 3095
},
{
"epoch": 0.5833646970267219,
"grad_norm": 0.04129620694156995,
"learning_rate": 0.00013259693825984562,
"loss": 0.8013,
"step": 3100
},
{
"epoch": 0.5843056078283778,
"grad_norm": 0.03631629884354763,
"learning_rate": 0.0001321076389039693,
"loss": 0.7534,
"step": 3105
},
{
"epoch": 0.5852465186300339,
"grad_norm": 0.08206893663995983,
"learning_rate": 0.00013161853260612474,
"loss": 0.7885,
"step": 3110
},
{
"epoch": 0.5861874294316899,
"grad_norm": 0.046338835239627396,
"learning_rate": 0.0001311296246437541,
"loss": 0.7386,
"step": 3115
},
{
"epoch": 0.5871283402333459,
"grad_norm": 0.04213777780082758,
"learning_rate": 0.0001306409202921594,
"loss": 0.7798,
"step": 3120
},
{
"epoch": 0.5880692510350018,
"grad_norm": 0.04362316942259357,
"learning_rate": 0.00013015242482444564,
"loss": 0.8252,
"step": 3125
},
{
"epoch": 0.5890101618366579,
"grad_norm": 0.041766857211591846,
"learning_rate": 0.0001296641435114642,
"loss": 0.7943,
"step": 3130
},
{
"epoch": 0.5899510726383139,
"grad_norm": 0.04441519901114515,
"learning_rate": 0.0001291760816217555,
"loss": 0.7652,
"step": 3135
},
{
"epoch": 0.5908919834399698,
"grad_norm": 0.038449411700647515,
"learning_rate": 0.00012868824442149242,
"loss": 0.7802,
"step": 3140
},
{
"epoch": 0.5918328942416259,
"grad_norm": 0.03939930370264386,
"learning_rate": 0.00012820063717442366,
"loss": 0.7715,
"step": 3145
},
{
"epoch": 0.5927738050432819,
"grad_norm": 0.040400026135115875,
"learning_rate": 0.00012771326514181646,
"loss": 0.7749,
"step": 3150
},
{
"epoch": 0.5937147158449378,
"grad_norm": 0.04110702660939482,
"learning_rate": 0.00012722613358240022,
"loss": 0.7564,
"step": 3155
},
{
"epoch": 0.5946556266465939,
"grad_norm": 0.03784848807322662,
"learning_rate": 0.00012673924775230972,
"loss": 0.7424,
"step": 3160
},
{
"epoch": 0.5955965374482499,
"grad_norm": 0.04018189304197721,
"learning_rate": 0.00012625261290502823,
"loss": 0.7974,
"step": 3165
},
{
"epoch": 0.596537448249906,
"grad_norm": 0.03834107117175049,
"learning_rate": 0.00012576623429133089,
"loss": 0.7441,
"step": 3170
},
{
"epoch": 0.5974783590515619,
"grad_norm": 0.04188273882175529,
"learning_rate": 0.00012528011715922822,
"loss": 0.7666,
"step": 3175
},
{
"epoch": 0.5984192698532179,
"grad_norm": 0.04048234489441125,
"learning_rate": 0.0001247942667539092,
"loss": 0.7686,
"step": 3180
},
{
"epoch": 0.599360180654874,
"grad_norm": 0.042753366871858815,
"learning_rate": 0.00012430868831768505,
"loss": 0.7692,
"step": 3185
},
{
"epoch": 0.6003010914565299,
"grad_norm": 0.03965154904470419,
"learning_rate": 0.0001238233870899322,
"loss": 0.7621,
"step": 3190
},
{
"epoch": 0.6012420022581859,
"grad_norm": 0.040895100280079935,
"learning_rate": 0.00012333836830703615,
"loss": 0.7711,
"step": 3195
},
{
"epoch": 0.602182913059842,
"grad_norm": 0.039036996904750175,
"learning_rate": 0.00012285363720233484,
"loss": 0.7403,
"step": 3200
},
{
"epoch": 0.6031238238614979,
"grad_norm": 0.03984475123838604,
"learning_rate": 0.00012236919900606214,
"loss": 0.8089,
"step": 3205
},
{
"epoch": 0.6040647346631539,
"grad_norm": 0.03602768052095753,
"learning_rate": 0.0001218850589452914,
"loss": 0.772,
"step": 3210
},
{
"epoch": 0.60500564546481,
"grad_norm": 0.03775213464617447,
"learning_rate": 0.00012140122224387924,
"loss": 0.7517,
"step": 3215
},
{
"epoch": 0.605946556266466,
"grad_norm": 0.03568230380993887,
"learning_rate": 0.00012091769412240889,
"loss": 0.7606,
"step": 3220
},
{
"epoch": 0.6068874670681219,
"grad_norm": 0.03614446877998065,
"learning_rate": 0.000120434479798134,
"loss": 0.7758,
"step": 3225
},
{
"epoch": 0.607828377869778,
"grad_norm": 0.04101207650613775,
"learning_rate": 0.00011995158448492257,
"loss": 0.7326,
"step": 3230
},
{
"epoch": 0.608769288671434,
"grad_norm": 0.03872714799207714,
"learning_rate": 0.00011946901339320025,
"loss": 0.7583,
"step": 3235
},
{
"epoch": 0.6097101994730899,
"grad_norm": 0.04499673060260799,
"learning_rate": 0.0001189867717298944,
"loss": 0.7808,
"step": 3240
},
{
"epoch": 0.610651110274746,
"grad_norm": 0.037363074205504686,
"learning_rate": 0.000118504864698378,
"loss": 0.7854,
"step": 3245
},
{
"epoch": 0.611592021076402,
"grad_norm": 0.037838990173753896,
"learning_rate": 0.00011802329749841316,
"loss": 0.7946,
"step": 3250
},
{
"epoch": 0.6125329318780579,
"grad_norm": 0.04050006759000257,
"learning_rate": 0.00011754207532609534,
"loss": 0.7541,
"step": 3255
},
{
"epoch": 0.6134738426797139,
"grad_norm": 0.038242266145084615,
"learning_rate": 0.00011706120337379718,
"loss": 0.7483,
"step": 3260
},
{
"epoch": 0.61441475348137,
"grad_norm": 0.0392400144645598,
"learning_rate": 0.00011658068683011241,
"loss": 0.7575,
"step": 3265
},
{
"epoch": 0.615355664283026,
"grad_norm": 0.045110458493318586,
"learning_rate": 0.0001161005308797998,
"loss": 0.7429,
"step": 3270
},
{
"epoch": 0.6162965750846819,
"grad_norm": 0.038540030132832,
"learning_rate": 0.00011562074070372764,
"loss": 0.7321,
"step": 3275
},
{
"epoch": 0.617237485886338,
"grad_norm": 0.037506266917269986,
"learning_rate": 0.00011514132147881717,
"loss": 0.7405,
"step": 3280
},
{
"epoch": 0.618178396687994,
"grad_norm": 0.03913471723109424,
"learning_rate": 0.0001146622783779873,
"loss": 0.7495,
"step": 3285
},
{
"epoch": 0.6191193074896499,
"grad_norm": 0.04109286833361118,
"learning_rate": 0.0001141836165700985,
"loss": 0.7286,
"step": 3290
},
{
"epoch": 0.620060218291306,
"grad_norm": 0.03854391729454944,
"learning_rate": 0.00011370534121989713,
"loss": 0.772,
"step": 3295
},
{
"epoch": 0.621001129092962,
"grad_norm": 0.0393768039777761,
"learning_rate": 0.00011322745748795964,
"loss": 0.7605,
"step": 3300
},
{
"epoch": 0.6219420398946179,
"grad_norm": 0.04333935406275173,
"learning_rate": 0.00011274997053063702,
"loss": 0.7883,
"step": 3305
},
{
"epoch": 0.622882950696274,
"grad_norm": 0.0390615079213823,
"learning_rate": 0.00011227288549999894,
"loss": 0.7809,
"step": 3310
},
{
"epoch": 0.62382386149793,
"grad_norm": 0.045178121155615474,
"learning_rate": 0.00011179620754377833,
"loss": 0.7541,
"step": 3315
},
{
"epoch": 0.624764772299586,
"grad_norm": 0.038276191870259636,
"learning_rate": 0.00011131994180531597,
"loss": 0.7668,
"step": 3320
},
{
"epoch": 0.625705683101242,
"grad_norm": 0.04593621357066484,
"learning_rate": 0.00011084409342350458,
"loss": 0.7778,
"step": 3325
},
{
"epoch": 0.626646593902898,
"grad_norm": 0.042697008069698866,
"learning_rate": 0.00011036866753273372,
"loss": 0.7812,
"step": 3330
},
{
"epoch": 0.627587504704554,
"grad_norm": 0.038210651549027225,
"learning_rate": 0.00010989366926283435,
"loss": 0.7645,
"step": 3335
},
{
"epoch": 0.62852841550621,
"grad_norm": 0.04016328258057881,
"learning_rate": 0.00010941910373902334,
"loss": 0.7447,
"step": 3340
},
{
"epoch": 0.629469326307866,
"grad_norm": 0.03979873617240781,
"learning_rate": 0.00010894497608184814,
"loss": 0.7739,
"step": 3345
},
{
"epoch": 0.630410237109522,
"grad_norm": 0.03825888432646367,
"learning_rate": 0.00010847129140713192,
"loss": 0.7654,
"step": 3350
},
{
"epoch": 0.631351147911178,
"grad_norm": 0.04160212266379006,
"learning_rate": 0.00010799805482591778,
"loss": 0.8068,
"step": 3355
},
{
"epoch": 0.632292058712834,
"grad_norm": 0.040102831786925454,
"learning_rate": 0.00010752527144441405,
"loss": 0.7676,
"step": 3360
},
{
"epoch": 0.63323296951449,
"grad_norm": 0.04073264579453533,
"learning_rate": 0.00010705294636393908,
"loss": 0.7605,
"step": 3365
},
{
"epoch": 0.6341738803161461,
"grad_norm": 0.03930481140797053,
"learning_rate": 0.00010658108468086611,
"loss": 0.7421,
"step": 3370
},
{
"epoch": 0.635114791117802,
"grad_norm": 0.04129180654967124,
"learning_rate": 0.00010610969148656824,
"loss": 0.7459,
"step": 3375
},
{
"epoch": 0.636055701919458,
"grad_norm": 0.04025751491566716,
"learning_rate": 0.00010563877186736384,
"loss": 0.7532,
"step": 3380
},
{
"epoch": 0.6369966127211141,
"grad_norm": 0.04130711339590857,
"learning_rate": 0.00010516833090446123,
"loss": 0.7692,
"step": 3385
},
{
"epoch": 0.63793752352277,
"grad_norm": 0.04249719108175661,
"learning_rate": 0.000104698373673904,
"loss": 0.7721,
"step": 3390
},
{
"epoch": 0.638878434324426,
"grad_norm": 0.03793225541937712,
"learning_rate": 0.00010422890524651647,
"loss": 0.7605,
"step": 3395
},
{
"epoch": 0.6398193451260821,
"grad_norm": 0.037043396185056116,
"learning_rate": 0.0001037599306878486,
"loss": 0.7335,
"step": 3400
},
{
"epoch": 0.640760255927738,
"grad_norm": 0.03812826423492339,
"learning_rate": 0.0001032914550581217,
"loss": 0.7342,
"step": 3405
},
{
"epoch": 0.641701166729394,
"grad_norm": 0.03967669170673079,
"learning_rate": 0.00010282348341217352,
"loss": 0.7574,
"step": 3410
},
{
"epoch": 0.6426420775310501,
"grad_norm": 0.03737016200455732,
"learning_rate": 0.00010235602079940385,
"loss": 0.7766,
"step": 3415
},
{
"epoch": 0.6435829883327061,
"grad_norm": 0.03898453606920356,
"learning_rate": 0.0001018890722637201,
"loss": 0.7677,
"step": 3420
},
{
"epoch": 0.644523899134362,
"grad_norm": 0.042695495342005334,
"learning_rate": 0.00010142264284348278,
"loss": 0.8001,
"step": 3425
},
{
"epoch": 0.6454648099360181,
"grad_norm": 0.0396946259433925,
"learning_rate": 0.00010095673757145103,
"loss": 0.7716,
"step": 3430
},
{
"epoch": 0.6464057207376741,
"grad_norm": 0.04406355047386435,
"learning_rate": 0.00010049136147472874,
"loss": 0.7759,
"step": 3435
},
{
"epoch": 0.64734663153933,
"grad_norm": 0.043764052802193115,
"learning_rate": 0.00010002651957470968,
"loss": 0.8051,
"step": 3440
},
{
"epoch": 0.6482875423409861,
"grad_norm": 0.041926281617999266,
"learning_rate": 9.956221688702384e-05,
"loss": 0.7647,
"step": 3445
},
{
"epoch": 0.6492284531426421,
"grad_norm": 0.03895521821927712,
"learning_rate": 9.909845842148313e-05,
"loss": 0.7602,
"step": 3450
},
{
"epoch": 0.650169363944298,
"grad_norm": 0.03914173750008508,
"learning_rate": 9.863524918202729e-05,
"loss": 0.7627,
"step": 3455
},
{
"epoch": 0.6511102747459541,
"grad_norm": 0.03993016779305327,
"learning_rate": 9.817259416666985e-05,
"loss": 0.7646,
"step": 3460
},
{
"epoch": 0.6520511855476101,
"grad_norm": 0.03980886857877244,
"learning_rate": 9.77104983674444e-05,
"loss": 0.7425,
"step": 3465
},
{
"epoch": 0.6529920963492661,
"grad_norm": 0.0401924210782212,
"learning_rate": 9.724896677035061e-05,
"loss": 0.6987,
"step": 3470
},
{
"epoch": 0.6539330071509221,
"grad_norm": 0.03664837733116737,
"learning_rate": 9.67880043553002e-05,
"loss": 0.7429,
"step": 3475
},
{
"epoch": 0.6548739179525781,
"grad_norm": 0.03777329875630416,
"learning_rate": 9.632761609606382e-05,
"loss": 0.7661,
"step": 3480
},
{
"epoch": 0.6558148287542341,
"grad_norm": 0.03939981829447013,
"learning_rate": 9.586780696021662e-05,
"loss": 0.7475,
"step": 3485
},
{
"epoch": 0.6567557395558901,
"grad_norm": 0.04466284968667148,
"learning_rate": 9.540858190908521e-05,
"loss": 0.7922,
"step": 3490
},
{
"epoch": 0.6576966503575461,
"grad_norm": 0.038955559682471524,
"learning_rate": 9.494994589769395e-05,
"loss": 0.7738,
"step": 3495
},
{
"epoch": 0.6586375611592021,
"grad_norm": 0.0411424120473035,
"learning_rate": 9.449190387471146e-05,
"loss": 0.7914,
"step": 3500
},
{
"epoch": 0.6595784719608581,
"grad_norm": 0.03649617721160762,
"learning_rate": 9.40344607823972e-05,
"loss": 0.7332,
"step": 3505
},
{
"epoch": 0.6605193827625141,
"grad_norm": 0.045697152332815814,
"learning_rate": 9.357762155654826e-05,
"loss": 0.736,
"step": 3510
},
{
"epoch": 0.6614602935641701,
"grad_norm": 0.04346134712991159,
"learning_rate": 9.312139112644593e-05,
"loss": 0.7753,
"step": 3515
},
{
"epoch": 0.6624012043658262,
"grad_norm": 0.03920510438440831,
"learning_rate": 9.266577441480266e-05,
"loss": 0.778,
"step": 3520
},
{
"epoch": 0.6633421151674821,
"grad_norm": 0.040739403735231945,
"learning_rate": 9.221077633770898e-05,
"loss": 0.7421,
"step": 3525
},
{
"epoch": 0.6642830259691381,
"grad_norm": 0.03703007602048622,
"learning_rate": 9.175640180458026e-05,
"loss": 0.7625,
"step": 3530
},
{
"epoch": 0.6652239367707942,
"grad_norm": 0.04195682496632696,
"learning_rate": 9.130265571810383e-05,
"loss": 0.7675,
"step": 3535
},
{
"epoch": 0.6661648475724501,
"grad_norm": 0.038248688786009566,
"learning_rate": 9.084954297418625e-05,
"loss": 0.7286,
"step": 3540
},
{
"epoch": 0.6671057583741061,
"grad_norm": 0.04263400445476122,
"learning_rate": 9.039706846190026e-05,
"loss": 0.7723,
"step": 3545
},
{
"epoch": 0.6680466691757622,
"grad_norm": 0.04018110083160048,
"learning_rate": 8.99452370634319e-05,
"loss": 0.7595,
"step": 3550
},
{
"epoch": 0.6689875799774181,
"grad_norm": 0.04129649016774401,
"learning_rate": 8.949405365402843e-05,
"loss": 0.7386,
"step": 3555
},
{
"epoch": 0.6699284907790741,
"grad_norm": 0.03730989500271481,
"learning_rate": 8.904352310194497e-05,
"loss": 0.7762,
"step": 3560
},
{
"epoch": 0.6708694015807302,
"grad_norm": 0.04233708639216568,
"learning_rate": 8.85936502683925e-05,
"loss": 0.7552,
"step": 3565
},
{
"epoch": 0.6718103123823862,
"grad_norm": 0.03856143493937795,
"learning_rate": 8.814444000748523e-05,
"loss": 0.7539,
"step": 3570
},
{
"epoch": 0.6727512231840421,
"grad_norm": 0.041220356746205916,
"learning_rate": 8.76958971661882e-05,
"loss": 0.7583,
"step": 3575
},
{
"epoch": 0.6736921339856982,
"grad_norm": 0.03885156037305439,
"learning_rate": 8.724802658426502e-05,
"loss": 0.7376,
"step": 3580
},
{
"epoch": 0.6746330447873542,
"grad_norm": 0.03874853523714315,
"learning_rate": 8.68008330942256e-05,
"loss": 0.7656,
"step": 3585
},
{
"epoch": 0.6755739555890101,
"grad_norm": 0.04028531730729849,
"learning_rate": 8.635432152127418e-05,
"loss": 0.7825,
"step": 3590
},
{
"epoch": 0.6765148663906662,
"grad_norm": 0.04280053960563239,
"learning_rate": 8.590849668325693e-05,
"loss": 0.754,
"step": 3595
},
{
"epoch": 0.6774557771923222,
"grad_norm": 0.03794786579840221,
"learning_rate": 8.546336339061036e-05,
"loss": 0.734,
"step": 3600
},
{
"epoch": 0.6783966879939781,
"grad_norm": 0.0402090640543888,
"learning_rate": 8.501892644630921e-05,
"loss": 0.7534,
"step": 3605
},
{
"epoch": 0.6793375987956342,
"grad_norm": 0.043552980145466276,
"learning_rate": 8.457519064581444e-05,
"loss": 0.7616,
"step": 3610
},
{
"epoch": 0.6802785095972902,
"grad_norm": 0.04080500922486465,
"learning_rate": 8.413216077702196e-05,
"loss": 0.7479,
"step": 3615
},
{
"epoch": 0.6812194203989462,
"grad_norm": 0.041554874325141766,
"learning_rate": 8.368984162021043e-05,
"loss": 0.7544,
"step": 3620
},
{
"epoch": 0.6821603312006022,
"grad_norm": 0.042784250483365954,
"learning_rate": 8.324823794799032e-05,
"loss": 0.7553,
"step": 3625
},
{
"epoch": 0.6831012420022582,
"grad_norm": 0.038641859923428605,
"learning_rate": 8.280735452525167e-05,
"loss": 0.7607,
"step": 3630
},
{
"epoch": 0.6840421528039142,
"grad_norm": 0.043016436368934555,
"learning_rate": 8.236719610911314e-05,
"loss": 0.7391,
"step": 3635
},
{
"epoch": 0.6849830636055702,
"grad_norm": 0.03879736605977712,
"learning_rate": 8.192776744887076e-05,
"loss": 0.7313,
"step": 3640
},
{
"epoch": 0.6859239744072262,
"grad_norm": 0.04080074250182663,
"learning_rate": 8.14890732859464e-05,
"loss": 0.7391,
"step": 3645
},
{
"epoch": 0.6868648852088822,
"grad_norm": 0.03990822597090894,
"learning_rate": 8.105111835383663e-05,
"loss": 0.7714,
"step": 3650
},
{
"epoch": 0.6878057960105382,
"grad_norm": 0.039537316806122595,
"learning_rate": 8.061390737806198e-05,
"loss": 0.7504,
"step": 3655
},
{
"epoch": 0.6887467068121942,
"grad_norm": 0.03939074291969639,
"learning_rate": 8.017744507611544e-05,
"loss": 0.715,
"step": 3660
},
{
"epoch": 0.6896876176138502,
"grad_norm": 0.0366601266910447,
"learning_rate": 7.974173615741204e-05,
"loss": 0.7386,
"step": 3665
},
{
"epoch": 0.6906285284155063,
"grad_norm": 0.038454118392867886,
"learning_rate": 7.930678532323778e-05,
"loss": 0.7742,
"step": 3670
},
{
"epoch": 0.6915694392171622,
"grad_norm": 0.03849044618209459,
"learning_rate": 7.887259726669884e-05,
"loss": 0.7567,
"step": 3675
},
{
"epoch": 0.6925103500188182,
"grad_norm": 0.03823520672805604,
"learning_rate": 7.84391766726712e-05,
"loss": 0.76,
"step": 3680
},
{
"epoch": 0.6934512608204743,
"grad_norm": 0.041103142624627866,
"learning_rate": 7.800652821774995e-05,
"loss": 0.737,
"step": 3685
},
{
"epoch": 0.6943921716221302,
"grad_norm": 0.04060986038183799,
"learning_rate": 7.757465657019864e-05,
"loss": 0.7447,
"step": 3690
},
{
"epoch": 0.6953330824237862,
"grad_norm": 0.04010014279749571,
"learning_rate": 7.714356638989914e-05,
"loss": 0.7119,
"step": 3695
},
{
"epoch": 0.6962739932254423,
"grad_norm": 0.03873233135487847,
"learning_rate": 7.67132623283016e-05,
"loss": 0.7429,
"step": 3700
},
{
"epoch": 0.6972149040270982,
"grad_norm": 0.04007347045017312,
"learning_rate": 7.628374902837363e-05,
"loss": 0.7385,
"step": 3705
},
{
"epoch": 0.6981558148287542,
"grad_norm": 0.040519533492342004,
"learning_rate": 7.585503112455062e-05,
"loss": 0.7534,
"step": 3710
},
{
"epoch": 0.6990967256304103,
"grad_norm": 0.04109428606589565,
"learning_rate": 7.542711324268576e-05,
"loss": 0.7538,
"step": 3715
},
{
"epoch": 0.7000376364320663,
"grad_norm": 0.03658743414096176,
"learning_rate": 7.500000000000002e-05,
"loss": 0.7326,
"step": 3720
},
{
"epoch": 0.7009785472337222,
"grad_norm": 0.03980166365958357,
"learning_rate": 7.45736960050322e-05,
"loss": 0.7409,
"step": 3725
},
{
"epoch": 0.7019194580353783,
"grad_norm": 0.03656884698195228,
"learning_rate": 7.414820585758949e-05,
"loss": 0.7699,
"step": 3730
},
{
"epoch": 0.7028603688370343,
"grad_norm": 0.03618243018140937,
"learning_rate": 7.372353414869766e-05,
"loss": 0.7493,
"step": 3735
},
{
"epoch": 0.7038012796386902,
"grad_norm": 0.0400626300749975,
"learning_rate": 7.329968546055144e-05,
"loss": 0.7505,
"step": 3740
},
{
"epoch": 0.7047421904403463,
"grad_norm": 0.03697501490735565,
"learning_rate": 7.287666436646539e-05,
"loss": 0.7508,
"step": 3745
},
{
"epoch": 0.7056831012420023,
"grad_norm": 0.03865403994396552,
"learning_rate": 7.245447543082414e-05,
"loss": 0.7348,
"step": 3750
},
{
"epoch": 0.7066240120436582,
"grad_norm": 0.039335397836340956,
"learning_rate": 7.20331232090335e-05,
"loss": 0.7493,
"step": 3755
},
{
"epoch": 0.7075649228453142,
"grad_norm": 0.03793808471305577,
"learning_rate": 7.161261224747119e-05,
"loss": 0.7339,
"step": 3760
},
{
"epoch": 0.7085058336469703,
"grad_norm": 0.037488809656223004,
"learning_rate": 7.119294708343755e-05,
"loss": 0.7477,
"step": 3765
},
{
"epoch": 0.7094467444486263,
"grad_norm": 0.038730154299705596,
"learning_rate": 7.077413224510702e-05,
"loss": 0.7421,
"step": 3770
},
{
"epoch": 0.7103876552502822,
"grad_norm": 0.04047318420968464,
"learning_rate": 7.0356172251479e-05,
"loss": 0.786,
"step": 3775
},
{
"epoch": 0.7113285660519383,
"grad_norm": 0.04012985610521384,
"learning_rate": 6.993907161232907e-05,
"loss": 0.7564,
"step": 3780
},
{
"epoch": 0.7122694768535943,
"grad_norm": 0.040195166286305296,
"learning_rate": 6.952283482816037e-05,
"loss": 0.7452,
"step": 3785
},
{
"epoch": 0.7132103876552502,
"grad_norm": 0.036013934570515216,
"learning_rate": 6.910746639015518e-05,
"loss": 0.7416,
"step": 3790
},
{
"epoch": 0.7141512984569063,
"grad_norm": 0.039404706768370185,
"learning_rate": 6.869297078012636e-05,
"loss": 0.7566,
"step": 3795
},
{
"epoch": 0.7150922092585623,
"grad_norm": 0.03794240394451752,
"learning_rate": 6.827935247046883e-05,
"loss": 0.7405,
"step": 3800
},
{
"epoch": 0.7160331200602182,
"grad_norm": 0.03817108747506884,
"learning_rate": 6.786661592411162e-05,
"loss": 0.7466,
"step": 3805
},
{
"epoch": 0.7169740308618743,
"grad_norm": 0.03872769520991303,
"learning_rate": 6.745476559446956e-05,
"loss": 0.7595,
"step": 3810
},
{
"epoch": 0.7179149416635303,
"grad_norm": 0.03884711036444254,
"learning_rate": 6.704380592539508e-05,
"loss": 0.7578,
"step": 3815
},
{
"epoch": 0.7188558524651864,
"grad_norm": 0.03780978653067677,
"learning_rate": 6.663374135113059e-05,
"loss": 0.7311,
"step": 3820
},
{
"epoch": 0.7197967632668423,
"grad_norm": 0.040555453463663634,
"learning_rate": 6.622457629626027e-05,
"loss": 0.7664,
"step": 3825
},
{
"epoch": 0.7207376740684983,
"grad_norm": 0.03892829872566159,
"learning_rate": 6.581631517566268e-05,
"loss": 0.7515,
"step": 3830
},
{
"epoch": 0.7216785848701543,
"grad_norm": 0.03899925932302288,
"learning_rate": 6.540896239446293e-05,
"loss": 0.7431,
"step": 3835
},
{
"epoch": 0.7226194956718103,
"grad_norm": 0.03724943309525145,
"learning_rate": 6.500252234798503e-05,
"loss": 0.7305,
"step": 3840
},
{
"epoch": 0.7235604064734663,
"grad_norm": 0.03821337427785788,
"learning_rate": 6.459699942170475e-05,
"loss": 0.745,
"step": 3845
},
{
"epoch": 0.7245013172751223,
"grad_norm": 0.04022918959169577,
"learning_rate": 6.419239799120222e-05,
"loss": 0.7662,
"step": 3850
},
{
"epoch": 0.7254422280767783,
"grad_norm": 0.03852879372490541,
"learning_rate": 6.378872242211443e-05,
"loss": 0.7675,
"step": 3855
},
{
"epoch": 0.7263831388784343,
"grad_norm": 0.03781259503267875,
"learning_rate": 6.338597707008859e-05,
"loss": 0.7308,
"step": 3860
},
{
"epoch": 0.7273240496800903,
"grad_norm": 0.03951092581019206,
"learning_rate": 6.29841662807347e-05,
"loss": 0.7387,
"step": 3865
},
{
"epoch": 0.7282649604817464,
"grad_norm": 0.040290629672598395,
"learning_rate": 6.258329438957899e-05,
"loss": 0.7406,
"step": 3870
},
{
"epoch": 0.7292058712834023,
"grad_norm": 0.03694512622132562,
"learning_rate": 6.218336572201705e-05,
"loss": 0.7193,
"step": 3875
},
{
"epoch": 0.7301467820850583,
"grad_norm": 0.03646511173753941,
"learning_rate": 6.178438459326689e-05,
"loss": 0.7402,
"step": 3880
},
{
"epoch": 0.7310876928867144,
"grad_norm": 0.0406395023903416,
"learning_rate": 6.138635530832283e-05,
"loss": 0.7414,
"step": 3885
},
{
"epoch": 0.7320286036883703,
"grad_norm": 0.036014218987116424,
"learning_rate": 6.09892821619088e-05,
"loss": 0.725,
"step": 3890
},
{
"epoch": 0.7329695144900263,
"grad_norm": 0.040755514238358985,
"learning_rate": 6.059316943843189e-05,
"loss": 0.7587,
"step": 3895
},
{
"epoch": 0.7339104252916824,
"grad_norm": 0.040411548204649615,
"learning_rate": 6.019802141193625e-05,
"loss": 0.7646,
"step": 3900
},
{
"epoch": 0.7348513360933383,
"grad_norm": 0.038968355232819656,
"learning_rate": 5.980384234605726e-05,
"loss": 0.776,
"step": 3905
},
{
"epoch": 0.7357922468949943,
"grad_norm": 0.042090530618439974,
"learning_rate": 5.941063649397495e-05,
"loss": 0.7758,
"step": 3910
},
{
"epoch": 0.7367331576966504,
"grad_norm": 0.037713848383341686,
"learning_rate": 5.901840809836844e-05,
"loss": 0.7538,
"step": 3915
},
{
"epoch": 0.7376740684983064,
"grad_norm": 0.03686538623291845,
"learning_rate": 5.8627161391370245e-05,
"loss": 0.7419,
"step": 3920
},
{
"epoch": 0.7386149792999623,
"grad_norm": 0.0412323805290299,
"learning_rate": 5.823690059452049e-05,
"loss": 0.7286,
"step": 3925
},
{
"epoch": 0.7395558901016184,
"grad_norm": 0.04681091382814438,
"learning_rate": 5.7847629918721165e-05,
"loss": 0.7392,
"step": 3930
},
{
"epoch": 0.7404968009032744,
"grad_norm": 0.038102594611252705,
"learning_rate": 5.7459353564191095e-05,
"loss": 0.7581,
"step": 3935
},
{
"epoch": 0.7414377117049303,
"grad_norm": 0.03890825582383635,
"learning_rate": 5.707207572042037e-05,
"loss": 0.7455,
"step": 3940
},
{
"epoch": 0.7423786225065864,
"grad_norm": 0.039387553938316465,
"learning_rate": 5.668580056612504e-05,
"loss": 0.7285,
"step": 3945
},
{
"epoch": 0.7433195333082424,
"grad_norm": 0.03946818985958446,
"learning_rate": 5.630053226920239e-05,
"loss": 0.7248,
"step": 3950
},
{
"epoch": 0.7442604441098983,
"grad_norm": 0.03936387385845415,
"learning_rate": 5.591627498668548e-05,
"loss": 0.766,
"step": 3955
},
{
"epoch": 0.7452013549115544,
"grad_norm": 0.03472976147552407,
"learning_rate": 5.5533032864698754e-05,
"loss": 0.7205,
"step": 3960
},
{
"epoch": 0.7461422657132104,
"grad_norm": 0.03912387994170675,
"learning_rate": 5.515081003841315e-05,
"loss": 0.7593,
"step": 3965
},
{
"epoch": 0.7470831765148664,
"grad_norm": 0.041833487223668316,
"learning_rate": 5.4769610632001164e-05,
"loss": 0.7618,
"step": 3970
},
{
"epoch": 0.7480240873165224,
"grad_norm": 0.03700071263990363,
"learning_rate": 5.4389438758592884e-05,
"loss": 0.7354,
"step": 3975
},
{
"epoch": 0.7489649981181784,
"grad_norm": 0.039575292853940104,
"learning_rate": 5.401029852023129e-05,
"loss": 0.7493,
"step": 3980
},
{
"epoch": 0.7499059089198344,
"grad_norm": 0.034510383462628254,
"learning_rate": 5.363219400782798e-05,
"loss": 0.7379,
"step": 3985
},
{
"epoch": 0.7508468197214904,
"grad_norm": 0.04074525021374615,
"learning_rate": 5.325512930111907e-05,
"loss": 0.7327,
"step": 3990
},
{
"epoch": 0.7517877305231464,
"grad_norm": 0.03972661994649692,
"learning_rate": 5.2879108468621346e-05,
"loss": 0.7632,
"step": 3995
},
{
"epoch": 0.7527286413248024,
"grad_norm": 0.040751426623560565,
"learning_rate": 5.250413556758819e-05,
"loss": 0.7573,
"step": 4000
},
{
"epoch": 0.7536695521264584,
"grad_norm": 0.037165299818227866,
"learning_rate": 5.2130214643965685e-05,
"loss": 0.7703,
"step": 4005
},
{
"epoch": 0.7546104629281144,
"grad_norm": 0.039192953238953,
"learning_rate": 5.175734973234927e-05,
"loss": 0.7383,
"step": 4010
},
{
"epoch": 0.7555513737297704,
"grad_norm": 0.03861148502718981,
"learning_rate": 5.1385544855940066e-05,
"loss": 0.7483,
"step": 4015
},
{
"epoch": 0.7564922845314265,
"grad_norm": 0.03866162098831916,
"learning_rate": 5.1014804026501244e-05,
"loss": 0.7212,
"step": 4020
},
{
"epoch": 0.7574331953330824,
"grad_norm": 0.0403684213936758,
"learning_rate": 5.0645131244315214e-05,
"loss": 0.7466,
"step": 4025
},
{
"epoch": 0.7583741061347384,
"grad_norm": 0.036399521831994304,
"learning_rate": 5.027653049813991e-05,
"loss": 0.7327,
"step": 4030
},
{
"epoch": 0.7593150169363945,
"grad_norm": 0.03824476127393509,
"learning_rate": 4.990900576516625e-05,
"loss": 0.7304,
"step": 4035
},
{
"epoch": 0.7602559277380504,
"grad_norm": 0.04226316377265603,
"learning_rate": 4.954256101097494e-05,
"loss": 0.7592,
"step": 4040
},
{
"epoch": 0.7611968385397064,
"grad_norm": 0.0353431684108597,
"learning_rate": 4.917720018949364e-05,
"loss": 0.7101,
"step": 4045
},
{
"epoch": 0.7621377493413625,
"grad_norm": 0.03822443909312598,
"learning_rate": 4.8812927242954564e-05,
"loss": 0.7392,
"step": 4050
},
{
"epoch": 0.7630786601430184,
"grad_norm": 0.03835749701797319,
"learning_rate": 4.844974610185173e-05,
"loss": 0.7323,
"step": 4055
},
{
"epoch": 0.7640195709446744,
"grad_norm": 0.037684888204025845,
"learning_rate": 4.808766068489855e-05,
"loss": 0.7392,
"step": 4060
},
{
"epoch": 0.7649604817463305,
"grad_norm": 0.03795012722182954,
"learning_rate": 4.772667489898572e-05,
"loss": 0.7261,
"step": 4065
},
{
"epoch": 0.7659013925479865,
"grad_norm": 0.03846412848813938,
"learning_rate": 4.736679263913881e-05,
"loss": 0.7456,
"step": 4070
},
{
"epoch": 0.7668423033496424,
"grad_norm": 0.04277670399034575,
"learning_rate": 4.7008017788476476e-05,
"loss": 0.7676,
"step": 4075
},
{
"epoch": 0.7677832141512985,
"grad_norm": 0.03733090710113088,
"learning_rate": 4.665035421816852e-05,
"loss": 0.7586,
"step": 4080
},
{
"epoch": 0.7687241249529545,
"grad_norm": 0.036470579624849335,
"learning_rate": 4.629380578739385e-05,
"loss": 0.7254,
"step": 4085
},
{
"epoch": 0.7696650357546104,
"grad_norm": 0.03975957425764092,
"learning_rate": 4.593837634329928e-05,
"loss": 0.7569,
"step": 4090
},
{
"epoch": 0.7706059465562665,
"grad_norm": 0.040549018273597276,
"learning_rate": 4.558406972095771e-05,
"loss": 0.7307,
"step": 4095
},
{
"epoch": 0.7715468573579225,
"grad_norm": 0.03801305901578898,
"learning_rate": 4.523088974332676e-05,
"loss": 0.7249,
"step": 4100
},
{
"epoch": 0.7724877681595784,
"grad_norm": 0.04200138565229912,
"learning_rate": 4.487884022120758e-05,
"loss": 0.7567,
"step": 4105
},
{
"epoch": 0.7734286789612345,
"grad_norm": 0.03950895201351385,
"learning_rate": 4.452792495320396e-05,
"loss": 0.7046,
"step": 4110
},
{
"epoch": 0.7743695897628905,
"grad_norm": 0.036898557922398705,
"learning_rate": 4.41781477256809e-05,
"loss": 0.7263,
"step": 4115
},
{
"epoch": 0.7753105005645465,
"grad_norm": 0.03802666125569989,
"learning_rate": 4.382951231272397e-05,
"loss": 0.7588,
"step": 4120
},
{
"epoch": 0.7762514113662025,
"grad_norm": 0.037620561058654495,
"learning_rate": 4.3482022476098736e-05,
"loss": 0.7518,
"step": 4125
},
{
"epoch": 0.7771923221678585,
"grad_norm": 0.042625621645339215,
"learning_rate": 4.313568196520998e-05,
"loss": 0.7467,
"step": 4130
},
{
"epoch": 0.7781332329695145,
"grad_norm": 0.03933008556723823,
"learning_rate": 4.27904945170612e-05,
"loss": 0.74,
"step": 4135
},
{
"epoch": 0.7790741437711705,
"grad_norm": 0.03976842759714889,
"learning_rate": 4.244646385621451e-05,
"loss": 0.7475,
"step": 4140
},
{
"epoch": 0.7800150545728265,
"grad_norm": 0.03768428813268079,
"learning_rate": 4.2103593694750324e-05,
"loss": 0.7295,
"step": 4145
},
{
"epoch": 0.7809559653744825,
"grad_norm": 0.03876815133956591,
"learning_rate": 4.176188773222715e-05,
"loss": 0.7232,
"step": 4150
},
{
"epoch": 0.7818968761761385,
"grad_norm": 0.03668277473358429,
"learning_rate": 4.1421349655641994e-05,
"loss": 0.7589,
"step": 4155
},
{
"epoch": 0.7828377869777945,
"grad_norm": 0.04030783885886016,
"learning_rate": 4.108198313939029e-05,
"loss": 0.7473,
"step": 4160
},
{
"epoch": 0.7837786977794505,
"grad_norm": 0.03763435879728677,
"learning_rate": 4.0743791845226446e-05,
"loss": 0.7461,
"step": 4165
},
{
"epoch": 0.7847196085811066,
"grad_norm": 0.03854446021480888,
"learning_rate": 4.04067794222243e-05,
"loss": 0.7435,
"step": 4170
},
{
"epoch": 0.7856605193827625,
"grad_norm": 0.037540640945074796,
"learning_rate": 4.007094950673753e-05,
"loss": 0.7447,
"step": 4175
},
{
"epoch": 0.7866014301844185,
"grad_norm": 0.04173529963643009,
"learning_rate": 3.973630572236075e-05,
"loss": 0.7272,
"step": 4180
},
{
"epoch": 0.7875423409860746,
"grad_norm": 0.03950203020150416,
"learning_rate": 3.940285167989028e-05,
"loss": 0.7385,
"step": 4185
},
{
"epoch": 0.7884832517877305,
"grad_norm": 0.03744463784853737,
"learning_rate": 3.9070590977285016e-05,
"loss": 0.7408,
"step": 4190
},
{
"epoch": 0.7894241625893865,
"grad_norm": 0.0415749729661611,
"learning_rate": 3.873952719962781e-05,
"loss": 0.7553,
"step": 4195
},
{
"epoch": 0.7903650733910426,
"grad_norm": 0.04023668652299723,
"learning_rate": 3.840966391908678e-05,
"loss": 0.7393,
"step": 4200
},
{
"epoch": 0.7913059841926985,
"grad_norm": 0.03985163930669438,
"learning_rate": 3.808100469487674e-05,
"loss": 0.7433,
"step": 4205
},
{
"epoch": 0.7922468949943545,
"grad_norm": 0.042655408743132714,
"learning_rate": 3.775355307322063e-05,
"loss": 0.7815,
"step": 4210
},
{
"epoch": 0.7931878057960106,
"grad_norm": 0.037507990065317645,
"learning_rate": 3.742731258731152e-05,
"loss": 0.763,
"step": 4215
},
{
"epoch": 0.7941287165976666,
"grad_norm": 0.04151808695349708,
"learning_rate": 3.7102286757274364e-05,
"loss": 0.7298,
"step": 4220
},
{
"epoch": 0.7950696273993225,
"grad_norm": 0.03871917007808031,
"learning_rate": 3.6778479090127913e-05,
"loss": 0.7179,
"step": 4225
},
{
"epoch": 0.7960105382009786,
"grad_norm": 0.0395273004111852,
"learning_rate": 3.6455893079747114e-05,
"loss": 0.7215,
"step": 4230
},
{
"epoch": 0.7969514490026346,
"grad_norm": 0.037341829211501365,
"learning_rate": 3.6134532206825136e-05,
"loss": 0.7518,
"step": 4235
},
{
"epoch": 0.7978923598042905,
"grad_norm": 0.039938364583883966,
"learning_rate": 3.581439993883604e-05,
"loss": 0.7498,
"step": 4240
},
{
"epoch": 0.7988332706059466,
"grad_norm": 0.040057519903909024,
"learning_rate": 3.5495499729997304e-05,
"loss": 0.7071,
"step": 4245
},
{
"epoch": 0.7997741814076026,
"grad_norm": 0.037110178091548936,
"learning_rate": 3.5177835021232395e-05,
"loss": 0.7041,
"step": 4250
},
{
"epoch": 0.8007150922092585,
"grad_norm": 0.03888319458444373,
"learning_rate": 3.486140924013391e-05,
"loss": 0.7312,
"step": 4255
},
{
"epoch": 0.8016560030109146,
"grad_norm": 0.03743060676550586,
"learning_rate": 3.4546225800926416e-05,
"loss": 0.7128,
"step": 4260
},
{
"epoch": 0.8025969138125706,
"grad_norm": 0.038536328098299384,
"learning_rate": 3.4232288104429636e-05,
"loss": 0.7449,
"step": 4265
},
{
"epoch": 0.8035378246142266,
"grad_norm": 0.03878605489785658,
"learning_rate": 3.3919599538021664e-05,
"loss": 0.7497,
"step": 4270
},
{
"epoch": 0.8044787354158826,
"grad_norm": 0.03793631840075618,
"learning_rate": 3.3608163475602684e-05,
"loss": 0.7377,
"step": 4275
},
{
"epoch": 0.8054196462175386,
"grad_norm": 0.03968152466748523,
"learning_rate": 3.329798327755835e-05,
"loss": 0.7307,
"step": 4280
},
{
"epoch": 0.8063605570191946,
"grad_norm": 0.03749528471492435,
"learning_rate": 3.298906229072357e-05,
"loss": 0.7644,
"step": 4285
},
{
"epoch": 0.8073014678208505,
"grad_norm": 0.03784045958987088,
"learning_rate": 3.268140384834633e-05,
"loss": 0.7269,
"step": 4290
},
{
"epoch": 0.8082423786225066,
"grad_norm": 0.037631543918115885,
"learning_rate": 3.237501127005192e-05,
"loss": 0.7341,
"step": 4295
},
{
"epoch": 0.8091832894241626,
"grad_norm": 0.0390771932444786,
"learning_rate": 3.206988786180693e-05,
"loss": 0.7328,
"step": 4300
},
{
"epoch": 0.8101242002258185,
"grad_norm": 0.03647087521210094,
"learning_rate": 3.176603691588365e-05,
"loss": 0.7395,
"step": 4305
},
{
"epoch": 0.8110651110274746,
"grad_norm": 0.03694803246663371,
"learning_rate": 3.146346171082445e-05,
"loss": 0.7527,
"step": 4310
},
{
"epoch": 0.8120060218291306,
"grad_norm": 0.040278247753399284,
"learning_rate": 3.1162165511406756e-05,
"loss": 0.7317,
"step": 4315
},
{
"epoch": 0.8129469326307867,
"grad_norm": 0.03898944326441212,
"learning_rate": 3.086215156860729e-05,
"loss": 0.7222,
"step": 4320
},
{
"epoch": 0.8138878434324426,
"grad_norm": 0.04051272560722607,
"learning_rate": 3.056342311956735e-05,
"loss": 0.7484,
"step": 4325
},
{
"epoch": 0.8148287542340986,
"grad_norm": 0.03796686896436505,
"learning_rate": 3.026598338755783e-05,
"loss": 0.737,
"step": 4330
},
{
"epoch": 0.8157696650357547,
"grad_norm": 0.03901772952940146,
"learning_rate": 2.9969835581944423e-05,
"loss": 0.7275,
"step": 4335
},
{
"epoch": 0.8167105758374106,
"grad_norm": 0.03789607099708822,
"learning_rate": 2.9674982898152904e-05,
"loss": 0.7522,
"step": 4340
},
{
"epoch": 0.8176514866390666,
"grad_norm": 0.04437902229228163,
"learning_rate": 2.938142851763476e-05,
"loss": 0.748,
"step": 4345
},
{
"epoch": 0.8185923974407227,
"grad_norm": 0.040534612356349066,
"learning_rate": 2.908917560783286e-05,
"loss": 0.745,
"step": 4350
},
{
"epoch": 0.8195333082423786,
"grad_norm": 0.04165229631859438,
"learning_rate": 2.8798227322147167e-05,
"loss": 0.754,
"step": 4355
},
{
"epoch": 0.8204742190440346,
"grad_norm": 0.04285176631929181,
"learning_rate": 2.8508586799900878e-05,
"loss": 0.7279,
"step": 4360
},
{
"epoch": 0.8214151298456907,
"grad_norm": 0.04179797544417384,
"learning_rate": 2.8220257166306338e-05,
"loss": 0.7461,
"step": 4365
},
{
"epoch": 0.8223560406473467,
"grad_norm": 0.04019820707217107,
"learning_rate": 2.7933241532431576e-05,
"loss": 0.7481,
"step": 4370
},
{
"epoch": 0.8232969514490026,
"grad_norm": 0.03603648651130401,
"learning_rate": 2.7647542995166576e-05,
"loss": 0.7335,
"step": 4375
},
{
"epoch": 0.8242378622506586,
"grad_norm": 0.03979622492169847,
"learning_rate": 2.736316463718978e-05,
"loss": 0.7483,
"step": 4380
},
{
"epoch": 0.8251787730523147,
"grad_norm": 0.03939493805725078,
"learning_rate": 2.7080109526935083e-05,
"loss": 0.7187,
"step": 4385
},
{
"epoch": 0.8261196838539706,
"grad_norm": 0.036865190585736146,
"learning_rate": 2.6798380718558526e-05,
"loss": 0.759,
"step": 4390
},
{
"epoch": 0.8270605946556266,
"grad_norm": 0.03612023659473949,
"learning_rate": 2.6517981251905336e-05,
"loss": 0.7388,
"step": 4395
},
{
"epoch": 0.8280015054572827,
"grad_norm": 0.03764904252746977,
"learning_rate": 2.623891415247721e-05,
"loss": 0.7146,
"step": 4400
},
{
"epoch": 0.8289424162589386,
"grad_norm": 0.03996839095480978,
"learning_rate": 2.596118243139968e-05,
"loss": 0.7382,
"step": 4405
},
{
"epoch": 0.8298833270605946,
"grad_norm": 0.041170050061391135,
"learning_rate": 2.5684789085389607e-05,
"loss": 0.7324,
"step": 4410
},
{
"epoch": 0.8308242378622507,
"grad_norm": 0.03765734472228694,
"learning_rate": 2.5409737096722716e-05,
"loss": 0.7205,
"step": 4415
},
{
"epoch": 0.8317651486639067,
"grad_norm": 0.03910602217381,
"learning_rate": 2.5136029433201625e-05,
"loss": 0.712,
"step": 4420
},
{
"epoch": 0.8327060594655626,
"grad_norm": 0.043314895360783305,
"learning_rate": 2.4863669048123746e-05,
"loss": 0.7252,
"step": 4425
},
{
"epoch": 0.8336469702672187,
"grad_norm": 0.038416515119674845,
"learning_rate": 2.4592658880249244e-05,
"loss": 0.7342,
"step": 4430
},
{
"epoch": 0.8345878810688747,
"grad_norm": 0.03966310990660385,
"learning_rate": 2.4323001853769692e-05,
"loss": 0.7278,
"step": 4435
},
{
"epoch": 0.8355287918705306,
"grad_norm": 0.039679489923308116,
"learning_rate": 2.4054700878276122e-05,
"loss": 0.7268,
"step": 4440
},
{
"epoch": 0.8364697026721867,
"grad_norm": 0.03897089451361406,
"learning_rate": 2.3787758848727912e-05,
"loss": 0.7233,
"step": 4445
},
{
"epoch": 0.8374106134738427,
"grad_norm": 0.03825105631460866,
"learning_rate": 2.352217864542149e-05,
"loss": 0.7186,
"step": 4450
},
{
"epoch": 0.8383515242754986,
"grad_norm": 0.037516278264590745,
"learning_rate": 2.3257963133959086e-05,
"loss": 0.7359,
"step": 4455
},
{
"epoch": 0.8392924350771547,
"grad_norm": 0.03803680908475784,
"learning_rate": 2.2995115165218076e-05,
"loss": 0.7324,
"step": 4460
},
{
"epoch": 0.8402333458788107,
"grad_norm": 0.041405405174316776,
"learning_rate": 2.2733637575320085e-05,
"loss": 0.7328,
"step": 4465
},
{
"epoch": 0.8411742566804666,
"grad_norm": 0.03654086979510833,
"learning_rate": 2.2473533185600295e-05,
"loss": 0.7187,
"step": 4470
},
{
"epoch": 0.8421151674821227,
"grad_norm": 0.039987056977315666,
"learning_rate": 2.2214804802577108e-05,
"loss": 0.741,
"step": 4475
},
{
"epoch": 0.8430560782837787,
"grad_norm": 0.04099954115508159,
"learning_rate": 2.1957455217922033e-05,
"loss": 0.7247,
"step": 4480
},
{
"epoch": 0.8439969890854347,
"grad_norm": 0.038398613227579474,
"learning_rate": 2.1701487208429197e-05,
"loss": 0.7382,
"step": 4485
},
{
"epoch": 0.8449378998870907,
"grad_norm": 0.03836989471820563,
"learning_rate": 2.1446903535985587e-05,
"loss": 0.7134,
"step": 4490
},
{
"epoch": 0.8458788106887467,
"grad_norm": 0.036465783338229904,
"learning_rate": 2.119370694754132e-05,
"loss": 0.7,
"step": 4495
},
{
"epoch": 0.8468197214904027,
"grad_norm": 0.0385801606753652,
"learning_rate": 2.094190017507989e-05,
"loss": 0.7475,
"step": 4500
},
{
"epoch": 0.8477606322920587,
"grad_norm": 0.035990420555936965,
"learning_rate": 2.0691485935588743e-05,
"loss": 0.7334,
"step": 4505
},
{
"epoch": 0.8487015430937147,
"grad_norm": 0.04380741523278768,
"learning_rate": 2.0442466931029867e-05,
"loss": 0.7237,
"step": 4510
},
{
"epoch": 0.8496424538953707,
"grad_norm": 0.037151691736236815,
"learning_rate": 2.0194845848310674e-05,
"loss": 0.7276,
"step": 4515
},
{
"epoch": 0.8505833646970267,
"grad_norm": 0.036431746414973216,
"learning_rate": 1.9948625359255248e-05,
"loss": 0.7031,
"step": 4520
},
{
"epoch": 0.8515242754986827,
"grad_norm": 0.03727873723258803,
"learning_rate": 1.970380812057512e-05,
"loss": 0.7229,
"step": 4525
},
{
"epoch": 0.8524651863003387,
"grad_norm": 0.03479466063222127,
"learning_rate": 1.9460396773840786e-05,
"loss": 0.7544,
"step": 4530
},
{
"epoch": 0.8534060971019948,
"grad_norm": 0.04104017949784562,
"learning_rate": 1.9218393945453327e-05,
"loss": 0.7197,
"step": 4535
},
{
"epoch": 0.8543470079036507,
"grad_norm": 0.03790070545246109,
"learning_rate": 1.8977802246615908e-05,
"loss": 0.7431,
"step": 4540
},
{
"epoch": 0.8552879187053067,
"grad_norm": 0.03729217061921706,
"learning_rate": 1.8738624273305602e-05,
"loss": 0.7533,
"step": 4545
},
{
"epoch": 0.8562288295069628,
"grad_norm": 0.03931349035555177,
"learning_rate": 1.8500862606245476e-05,
"loss": 0.7394,
"step": 4550
},
{
"epoch": 0.8571697403086187,
"grad_norm": 0.0394433321374659,
"learning_rate": 1.8264519810876722e-05,
"loss": 0.7326,
"step": 4555
},
{
"epoch": 0.8581106511102747,
"grad_norm": 0.035368518092189605,
"learning_rate": 1.802959843733086e-05,
"loss": 0.7472,
"step": 4560
},
{
"epoch": 0.8590515619119308,
"grad_norm": 0.037582766906246075,
"learning_rate": 1.7796101020402405e-05,
"loss": 0.7061,
"step": 4565
},
{
"epoch": 0.8599924727135867,
"grad_norm": 0.038386275083281136,
"learning_rate": 1.7564030079521312e-05,
"loss": 0.7281,
"step": 4570
},
{
"epoch": 0.8609333835152427,
"grad_norm": 0.03798225357409595,
"learning_rate": 1.7333388118726033e-05,
"loss": 0.7417,
"step": 4575
},
{
"epoch": 0.8618742943168988,
"grad_norm": 0.03952864589268583,
"learning_rate": 1.7104177626636308e-05,
"loss": 0.7386,
"step": 4580
},
{
"epoch": 0.8628152051185548,
"grad_norm": 0.039175178129552325,
"learning_rate": 1.6876401076426332e-05,
"loss": 0.7348,
"step": 4585
},
{
"epoch": 0.8637561159202107,
"grad_norm": 0.04213404267588905,
"learning_rate": 1.665006092579817e-05,
"loss": 0.7451,
"step": 4590
},
{
"epoch": 0.8646970267218668,
"grad_norm": 0.036904029064067076,
"learning_rate": 1.6425159616955208e-05,
"loss": 0.732,
"step": 4595
},
{
"epoch": 0.8656379375235228,
"grad_norm": 0.04055058687139153,
"learning_rate": 1.620169957657567e-05,
"loss": 0.7336,
"step": 4600
},
{
"epoch": 0.8665788483251787,
"grad_norm": 0.03633816516730953,
"learning_rate": 1.5979683215786575e-05,
"loss": 0.7253,
"step": 4605
},
{
"epoch": 0.8675197591268348,
"grad_norm": 0.03780409397880549,
"learning_rate": 1.575911293013773e-05,
"loss": 0.7493,
"step": 4610
},
{
"epoch": 0.8684606699284908,
"grad_norm": 0.036721595492597056,
"learning_rate": 1.5539991099575854e-05,
"loss": 0.7202,
"step": 4615
},
{
"epoch": 0.8694015807301467,
"grad_norm": 0.037482839459661806,
"learning_rate": 1.5322320088418725e-05,
"loss": 0.7112,
"step": 4620
},
{
"epoch": 0.8703424915318028,
"grad_norm": 0.04032680309979362,
"learning_rate": 1.510610224533001e-05,
"loss": 0.7459,
"step": 4625
},
{
"epoch": 0.8712834023334588,
"grad_norm": 0.039375359663647816,
"learning_rate": 1.489133990329366e-05,
"loss": 0.7459,
"step": 4630
},
{
"epoch": 0.8722243131351148,
"grad_norm": 0.03878387865856684,
"learning_rate": 1.467803537958876e-05,
"loss": 0.7314,
"step": 4635
},
{
"epoch": 0.8731652239367708,
"grad_norm": 0.03820312498729265,
"learning_rate": 1.446619097576468e-05,
"loss": 0.7081,
"step": 4640
},
{
"epoch": 0.8741061347384268,
"grad_norm": 0.04141438212898032,
"learning_rate": 1.425580897761604e-05,
"loss": 0.7524,
"step": 4645
},
{
"epoch": 0.8750470455400828,
"grad_norm": 0.03963965690705673,
"learning_rate": 1.4046891655158233e-05,
"loss": 0.7289,
"step": 4650
},
{
"epoch": 0.8759879563417388,
"grad_norm": 0.04111262551595695,
"learning_rate": 1.383944126260284e-05,
"loss": 0.7293,
"step": 4655
},
{
"epoch": 0.8769288671433948,
"grad_norm": 0.04061469675303553,
"learning_rate": 1.3633460038333211e-05,
"loss": 0.7525,
"step": 4660
},
{
"epoch": 0.8778697779450508,
"grad_norm": 0.0376892938395166,
"learning_rate": 1.3428950204880534e-05,
"loss": 0.7311,
"step": 4665
},
{
"epoch": 0.8788106887467068,
"grad_norm": 0.03891564478991497,
"learning_rate": 1.3225913968899705e-05,
"loss": 0.7366,
"step": 4670
},
{
"epoch": 0.8797515995483628,
"grad_norm": 0.03795076957307728,
"learning_rate": 1.3024353521145515e-05,
"loss": 0.74,
"step": 4675
},
{
"epoch": 0.8806925103500188,
"grad_norm": 0.03879491364323159,
"learning_rate": 1.2824271036449013e-05,
"loss": 0.7569,
"step": 4680
},
{
"epoch": 0.8816334211516749,
"grad_norm": 0.03648855134440867,
"learning_rate": 1.2625668673694206e-05,
"loss": 0.7261,
"step": 4685
},
{
"epoch": 0.8825743319533308,
"grad_norm": 0.039114744946843395,
"learning_rate": 1.2428548575794506e-05,
"loss": 0.7276,
"step": 4690
},
{
"epoch": 0.8835152427549868,
"grad_norm": 0.03922278721510972,
"learning_rate": 1.2232912869669753e-05,
"loss": 0.7412,
"step": 4695
},
{
"epoch": 0.8844561535566429,
"grad_norm": 0.0413220284359729,
"learning_rate": 1.2038763666223283e-05,
"loss": 0.6961,
"step": 4700
},
{
"epoch": 0.8853970643582988,
"grad_norm": 0.03774504327378547,
"learning_rate": 1.1846103060319112e-05,
"loss": 0.7616,
"step": 4705
},
{
"epoch": 0.8863379751599548,
"grad_norm": 0.03719254603141807,
"learning_rate": 1.1654933130759269e-05,
"loss": 0.7295,
"step": 4710
},
{
"epoch": 0.8872788859616109,
"grad_norm": 0.04039683465612545,
"learning_rate": 1.1465255940261536e-05,
"loss": 0.7315,
"step": 4715
},
{
"epoch": 0.8882197967632668,
"grad_norm": 0.038907342623597366,
"learning_rate": 1.1277073535436943e-05,
"loss": 0.7424,
"step": 4720
},
{
"epoch": 0.8891607075649228,
"grad_norm": 0.03788244107623249,
"learning_rate": 1.1090387946768003e-05,
"loss": 0.7515,
"step": 4725
},
{
"epoch": 0.8901016183665789,
"grad_norm": 0.038838422890715006,
"learning_rate": 1.090520118858652e-05,
"loss": 0.7368,
"step": 4730
},
{
"epoch": 0.8910425291682349,
"grad_norm": 0.03725542123628333,
"learning_rate": 1.0721515259051916e-05,
"loss": 0.7201,
"step": 4735
},
{
"epoch": 0.8919834399698908,
"grad_norm": 0.03589707805750827,
"learning_rate": 1.053933214012983e-05,
"loss": 0.7129,
"step": 4740
},
{
"epoch": 0.8929243507715469,
"grad_norm": 0.038642857374594816,
"learning_rate": 1.0358653797570593e-05,
"loss": 0.7541,
"step": 4745
},
{
"epoch": 0.8938652615732029,
"grad_norm": 0.039982874321707856,
"learning_rate": 1.017948218088797e-05,
"loss": 0.7061,
"step": 4750
},
{
"epoch": 0.8948061723748588,
"grad_norm": 0.0370503487079274,
"learning_rate": 1.0001819223338287e-05,
"loss": 0.7363,
"step": 4755
},
{
"epoch": 0.8957470831765149,
"grad_norm": 0.03536457438950792,
"learning_rate": 9.825666841899465e-06,
"loss": 0.7415,
"step": 4760
},
{
"epoch": 0.8966879939781709,
"grad_norm": 0.03707956825731724,
"learning_rate": 9.651026937250288e-06,
"loss": 0.7211,
"step": 4765
},
{
"epoch": 0.8976289047798268,
"grad_norm": 0.03963652644770756,
"learning_rate": 9.477901393750076e-06,
"loss": 0.7391,
"step": 4770
},
{
"epoch": 0.8985698155814829,
"grad_norm": 0.038563286201813984,
"learning_rate": 9.306292079418115e-06,
"loss": 0.6965,
"step": 4775
},
{
"epoch": 0.8995107263831389,
"grad_norm": 0.037576451289906165,
"learning_rate": 9.136200845913716e-06,
"loss": 0.7404,
"step": 4780
},
{
"epoch": 0.9004516371847949,
"grad_norm": 0.03920865759024056,
"learning_rate": 8.967629528516141e-06,
"loss": 0.7214,
"step": 4785
},
{
"epoch": 0.9013925479864509,
"grad_norm": 0.03869578048793669,
"learning_rate": 8.800579946104702e-06,
"loss": 0.7287,
"step": 4790
},
{
"epoch": 0.9023334587881069,
"grad_norm": 0.041785756799252886,
"learning_rate": 8.635053901139367e-06,
"loss": 0.7647,
"step": 4795
},
{
"epoch": 0.9032743695897629,
"grad_norm": 0.03868915487772746,
"learning_rate": 8.471053179641147e-06,
"loss": 0.7223,
"step": 4800
},
{
"epoch": 0.9042152803914189,
"grad_norm": 0.03662076107336638,
"learning_rate": 8.30857955117279e-06,
"loss": 0.7399,
"step": 4805
},
{
"epoch": 0.9051561911930749,
"grad_norm": 0.03359384020617572,
"learning_rate": 8.147634768819788e-06,
"loss": 0.7046,
"step": 4810
},
{
"epoch": 0.9060971019947309,
"grad_norm": 0.03666121162864638,
"learning_rate": 7.988220569171467e-06,
"loss": 0.7387,
"step": 4815
},
{
"epoch": 0.9070380127963868,
"grad_norm": 0.036559492972736925,
"learning_rate": 7.830338672302223e-06,
"loss": 0.7398,
"step": 4820
},
{
"epoch": 0.9079789235980429,
"grad_norm": 0.03695997811728329,
"learning_rate": 7.673990781752881e-06,
"loss": 0.7133,
"step": 4825
},
{
"epoch": 0.9089198343996989,
"grad_norm": 0.038686133865653574,
"learning_rate": 7.5191785845124255e-06,
"loss": 0.7275,
"step": 4830
},
{
"epoch": 0.909860745201355,
"grad_norm": 0.034693594406954034,
"learning_rate": 7.365903750999791e-06,
"loss": 0.7157,
"step": 4835
},
{
"epoch": 0.9108016560030109,
"grad_norm": 0.0388397518590927,
"learning_rate": 7.2141679350457175e-06,
"loss": 0.7216,
"step": 4840
},
{
"epoch": 0.9117425668046669,
"grad_norm": 0.03748596193447337,
"learning_rate": 7.063972773875076e-06,
"loss": 0.7359,
"step": 4845
},
{
"epoch": 0.912683477606323,
"grad_norm": 0.03888928393947248,
"learning_rate": 6.915319888089055e-06,
"loss": 0.7296,
"step": 4850
},
{
"epoch": 0.9136243884079789,
"grad_norm": 0.03873257173734812,
"learning_rate": 6.768210881647784e-06,
"loss": 0.7514,
"step": 4855
},
{
"epoch": 0.9145652992096349,
"grad_norm": 0.03709779612823535,
"learning_rate": 6.622647341853005e-06,
"loss": 0.7421,
"step": 4860
},
{
"epoch": 0.915506210011291,
"grad_norm": 0.03587682055067527,
"learning_rate": 6.478630839330828e-06,
"loss": 0.7205,
"step": 4865
},
{
"epoch": 0.9164471208129469,
"grad_norm": 0.04029055825522218,
"learning_rate": 6.336162928014937e-06,
"loss": 0.737,
"step": 4870
},
{
"epoch": 0.9173880316146029,
"grad_norm": 0.037073343055570404,
"learning_rate": 6.195245145129812e-06,
"loss": 0.7347,
"step": 4875
},
{
"epoch": 0.918328942416259,
"grad_norm": 0.036440160856527605,
"learning_rate": 6.055879011173998e-06,
"loss": 0.7387,
"step": 4880
},
{
"epoch": 0.919269853217915,
"grad_norm": 0.0377819594467133,
"learning_rate": 5.918066029903812e-06,
"loss": 0.7215,
"step": 4885
},
{
"epoch": 0.9202107640195709,
"grad_norm": 0.0384362688453867,
"learning_rate": 5.781807688317214e-06,
"loss": 0.7101,
"step": 4890
},
{
"epoch": 0.921151674821227,
"grad_norm": 0.04042614908872056,
"learning_rate": 5.6471054566374965e-06,
"loss": 0.726,
"step": 4895
},
{
"epoch": 0.922092585622883,
"grad_norm": 0.037566662938396556,
"learning_rate": 5.5139607882976666e-06,
"loss": 0.7312,
"step": 4900
},
{
"epoch": 0.9230334964245389,
"grad_norm": 0.03800770208223098,
"learning_rate": 5.382375119924626e-06,
"loss": 0.7431,
"step": 4905
},
{
"epoch": 0.923974407226195,
"grad_norm": 0.03619717959439674,
"learning_rate": 5.252349871323747e-06,
"loss": 0.7284,
"step": 4910
},
{
"epoch": 0.924915318027851,
"grad_norm": 0.03753904354868467,
"learning_rate": 5.123886445463504e-06,
"loss": 0.7567,
"step": 4915
},
{
"epoch": 0.9258562288295069,
"grad_norm": 0.038858725721177816,
"learning_rate": 4.99698622846037e-06,
"loss": 0.7093,
"step": 4920
},
{
"epoch": 0.926797139631163,
"grad_norm": 0.03824992710079084,
"learning_rate": 4.871650589563775e-06,
"loss": 0.7451,
"step": 4925
},
{
"epoch": 0.927738050432819,
"grad_norm": 0.04100741519395592,
"learning_rate": 4.747880881141502e-06,
"loss": 0.7291,
"step": 4930
},
{
"epoch": 0.928678961234475,
"grad_norm": 0.03950366551892632,
"learning_rate": 4.62567843866492e-06,
"loss": 0.7362,
"step": 4935
},
{
"epoch": 0.929619872036131,
"grad_norm": 0.03655294044802147,
"learning_rate": 4.5050445806946555e-06,
"loss": 0.7318,
"step": 4940
},
{
"epoch": 0.930560782837787,
"grad_norm": 0.03893513792284634,
"learning_rate": 4.385980608866374e-06,
"loss": 0.751,
"step": 4945
},
{
"epoch": 0.931501693639443,
"grad_norm": 0.03535458990197148,
"learning_rate": 4.268487807876725e-06,
"loss": 0.7318,
"step": 4950
},
{
"epoch": 0.9324426044410989,
"grad_norm": 0.041789838223581545,
"learning_rate": 4.152567445469418e-06,
"loss": 0.7276,
"step": 4955
},
{
"epoch": 0.933383515242755,
"grad_norm": 0.03906894325898849,
"learning_rate": 4.038220772421668e-06,
"loss": 0.7163,
"step": 4960
},
{
"epoch": 0.934324426044411,
"grad_norm": 0.03989311989542537,
"learning_rate": 3.9254490225305915e-06,
"loss": 0.7326,
"step": 4965
},
{
"epoch": 0.9352653368460669,
"grad_norm": 0.03578498133734959,
"learning_rate": 3.814253412599927e-06,
"loss": 0.6939,
"step": 4970
},
{
"epoch": 0.936206247647723,
"grad_norm": 0.03670755137788207,
"learning_rate": 3.704635142426937e-06,
"loss": 0.7555,
"step": 4975
},
{
"epoch": 0.937147158449379,
"grad_norm": 0.03938777341705205,
"learning_rate": 3.5965953947894144e-06,
"loss": 0.7263,
"step": 4980
},
{
"epoch": 0.938088069251035,
"grad_norm": 0.03628876749133803,
"learning_rate": 3.490135335432942e-06,
"loss": 0.7009,
"step": 4985
},
{
"epoch": 0.939028980052691,
"grad_norm": 0.03826742122227512,
"learning_rate": 3.3852561130583376e-06,
"loss": 0.7432,
"step": 4990
},
{
"epoch": 0.939969890854347,
"grad_norm": 0.03883574703669586,
"learning_rate": 3.281958859309197e-06,
"loss": 0.7406,
"step": 4995
},
{
"epoch": 0.940910801656003,
"grad_norm": 0.038660784217541634,
"learning_rate": 3.18024468875977e-06,
"loss": 0.7288,
"step": 5000
},
{
"epoch": 0.941851712457659,
"grad_norm": 0.03986660657950191,
"learning_rate": 3.0801146989028525e-06,
"loss": 0.7233,
"step": 5005
},
{
"epoch": 0.942792623259315,
"grad_norm": 0.03698570319860795,
"learning_rate": 2.9815699701379813e-06,
"loss": 0.7249,
"step": 5010
},
{
"epoch": 0.943733534060971,
"grad_norm": 0.035194532189499236,
"learning_rate": 2.884611565759792e-06,
"loss": 0.724,
"step": 5015
},
{
"epoch": 0.944674444862627,
"grad_norm": 0.03724239049887656,
"learning_rate": 2.7892405319464963e-06,
"loss": 0.7434,
"step": 5020
},
{
"epoch": 0.945615355664283,
"grad_norm": 0.041330518568329144,
"learning_rate": 2.6954578977486707e-06,
"loss": 0.7232,
"step": 5025
},
{
"epoch": 0.946556266465939,
"grad_norm": 0.037368253372142564,
"learning_rate": 2.6032646750780706e-06,
"loss": 0.7389,
"step": 5030
},
{
"epoch": 0.9474971772675951,
"grad_norm": 0.03493182990339746,
"learning_rate": 2.5126618586967685e-06,
"loss": 0.7497,
"step": 5035
},
{
"epoch": 0.948438088069251,
"grad_norm": 0.03889842265077455,
"learning_rate": 2.4236504262064136e-06,
"loss": 0.7089,
"step": 5040
},
{
"epoch": 0.949378998870907,
"grad_norm": 0.0359201656454254,
"learning_rate": 2.3362313380376253e-06,
"loss": 0.7094,
"step": 5045
},
{
"epoch": 0.9503199096725631,
"grad_norm": 0.0353292313883741,
"learning_rate": 2.2504055374397144e-06,
"loss": 0.7348,
"step": 5050
},
{
"epoch": 0.951260820474219,
"grad_norm": 0.03966754193538584,
"learning_rate": 2.1661739504704623e-06,
"loss": 0.7268,
"step": 5055
},
{
"epoch": 0.952201731275875,
"grad_norm": 0.03702888456410481,
"learning_rate": 2.0835374859861255e-06,
"loss": 0.7224,
"step": 5060
},
{
"epoch": 0.9531426420775311,
"grad_norm": 0.037585415673704874,
"learning_rate": 2.0024970356316615e-06,
"loss": 0.7313,
"step": 5065
},
{
"epoch": 0.954083552879187,
"grad_norm": 0.04180203300864954,
"learning_rate": 1.9230534738310375e-06,
"loss": 0.7107,
"step": 5070
},
{
"epoch": 0.955024463680843,
"grad_norm": 0.037393099939864984,
"learning_rate": 1.8452076577778696e-06,
"loss": 0.7181,
"step": 5075
},
{
"epoch": 0.9559653744824991,
"grad_norm": 0.03553043089641243,
"learning_rate": 1.7689604274261637e-06,
"loss": 0.7303,
"step": 5080
},
{
"epoch": 0.9569062852841551,
"grad_norm": 0.037063309388190024,
"learning_rate": 1.6943126054811906e-06,
"loss": 0.7582,
"step": 5085
},
{
"epoch": 0.957847196085811,
"grad_norm": 0.03908640307451864,
"learning_rate": 1.621264997390692e-06,
"loss": 0.7409,
"step": 5090
},
{
"epoch": 0.9587881068874671,
"grad_norm": 0.03940471571019541,
"learning_rate": 1.5498183913361383e-06,
"loss": 0.7608,
"step": 5095
},
{
"epoch": 0.9597290176891231,
"grad_norm": 0.03659422309678058,
"learning_rate": 1.4799735582242344e-06,
"loss": 0.7184,
"step": 5100
},
{
"epoch": 0.960669928490779,
"grad_norm": 0.03897811850405322,
"learning_rate": 1.4117312516785938e-06,
"loss": 0.7231,
"step": 5105
},
{
"epoch": 0.9616108392924351,
"grad_norm": 0.03672154149071319,
"learning_rate": 1.345092208031645e-06,
"loss": 0.7052,
"step": 5110
},
{
"epoch": 0.9625517500940911,
"grad_norm": 0.03491112077981314,
"learning_rate": 1.280057146316621e-06,
"loss": 0.7194,
"step": 5115
},
{
"epoch": 0.963492660895747,
"grad_norm": 0.03847928744181442,
"learning_rate": 1.2166267682598818e-06,
"loss": 0.7348,
"step": 5120
},
{
"epoch": 0.9644335716974031,
"grad_norm": 0.03929152091074595,
"learning_rate": 1.154801758273255e-06,
"loss": 0.7096,
"step": 5125
},
{
"epoch": 0.9653744824990591,
"grad_norm": 0.037421477773637336,
"learning_rate": 1.0945827834467402e-06,
"loss": 0.7433,
"step": 5130
},
{
"epoch": 0.9663153933007151,
"grad_norm": 0.03988682711853078,
"learning_rate": 1.035970493541216e-06,
"loss": 0.744,
"step": 5135
},
{
"epoch": 0.9672563041023711,
"grad_norm": 0.03687421910742082,
"learning_rate": 9.789655209815284e-07,
"loss": 0.7039,
"step": 5140
},
{
"epoch": 0.9681972149040271,
"grad_norm": 0.0401166521279677,
"learning_rate": 9.235684808495792e-07,
"loss": 0.7078,
"step": 5145
},
{
"epoch": 0.9691381257056831,
"grad_norm": 0.036933913690302404,
"learning_rate": 8.697799708777653e-07,
"loss": 0.7205,
"step": 5150
},
{
"epoch": 0.9700790365073391,
"grad_norm": 0.03862870548258374,
"learning_rate": 8.176005714424671e-07,
"loss": 0.7427,
"step": 5155
},
{
"epoch": 0.9710199473089951,
"grad_norm": 0.037195912476703265,
"learning_rate": 7.670308455578034e-07,
"loss": 0.7095,
"step": 5160
},
{
"epoch": 0.9719608581106511,
"grad_norm": 0.0387790773069963,
"learning_rate": 7.180713388695858e-07,
"loss": 0.7335,
"step": 5165
},
{
"epoch": 0.9729017689123071,
"grad_norm": 0.038473815659207836,
"learning_rate": 6.707225796494076e-07,
"loss": 0.7406,
"step": 5170
},
{
"epoch": 0.9738426797139631,
"grad_norm": 0.038220935568139616,
"learning_rate": 6.249850787889477e-07,
"loss": 0.7328,
"step": 5175
},
{
"epoch": 0.9747835905156191,
"grad_norm": 0.03577405006427383,
"learning_rate": 5.808593297944253e-07,
"loss": 0.6966,
"step": 5180
},
{
"epoch": 0.9757245013172752,
"grad_norm": 0.03926752579673784,
"learning_rate": 5.383458087813375e-07,
"loss": 0.7144,
"step": 5185
},
{
"epoch": 0.9766654121189311,
"grad_norm": 0.03907926405272728,
"learning_rate": 4.974449744692966e-07,
"loss": 0.7429,
"step": 5190
},
{
"epoch": 0.9776063229205871,
"grad_norm": 0.038899452851094404,
"learning_rate": 4.5815726817705065e-07,
"loss": 0.7677,
"step": 5195
},
{
"epoch": 0.9785472337222432,
"grad_norm": 0.0423749591733099,
"learning_rate": 4.204831138177378e-07,
"loss": 0.7252,
"step": 5200
},
{
"epoch": 0.9794881445238991,
"grad_norm": 0.03698062230023023,
"learning_rate": 3.844229178943725e-07,
"loss": 0.7439,
"step": 5205
},
{
"epoch": 0.9804290553255551,
"grad_norm": 0.03715944964729997,
"learning_rate": 3.4997706949534966e-07,
"loss": 0.7431,
"step": 5210
},
{
"epoch": 0.9813699661272112,
"grad_norm": 0.03833525661199143,
"learning_rate": 3.171459402903309e-07,
"loss": 0.7506,
"step": 5215
},
{
"epoch": 0.9823108769288671,
"grad_norm": 0.04029829091509341,
"learning_rate": 2.859298845261815e-07,
"loss": 0.7206,
"step": 5220
},
{
"epoch": 0.9832517877305231,
"grad_norm": 0.04317581049694436,
"learning_rate": 2.56329239023223e-07,
"loss": 0.7442,
"step": 5225
},
{
"epoch": 0.9841926985321792,
"grad_norm": 0.03947040991708499,
"learning_rate": 2.2834432317151986e-07,
"loss": 0.7467,
"step": 5230
},
{
"epoch": 0.9851336093338352,
"grad_norm": 0.038225453966465185,
"learning_rate": 2.0197543892743195e-07,
"loss": 0.71,
"step": 5235
},
{
"epoch": 0.9860745201354911,
"grad_norm": 0.03981803197968165,
"learning_rate": 1.772228708104506e-07,
"loss": 0.7083,
"step": 5240
},
{
"epoch": 0.9870154309371472,
"grad_norm": 0.03706590784392966,
"learning_rate": 1.5408688590000107e-07,
"loss": 0.7193,
"step": 5245
},
{
"epoch": 0.9879563417388032,
"grad_norm": 0.03776800734619944,
"learning_rate": 1.325677338326947e-07,
"loss": 0.7093,
"step": 5250
},
{
"epoch": 0.9888972525404591,
"grad_norm": 0.03617798634178418,
"learning_rate": 1.1266564679949797e-07,
"loss": 0.7014,
"step": 5255
},
{
"epoch": 0.9898381633421152,
"grad_norm": 0.04052107735340787,
"learning_rate": 9.43808395433343e-08,
"loss": 0.7344,
"step": 5260
},
{
"epoch": 0.9907790741437712,
"grad_norm": 0.03811509221337354,
"learning_rate": 7.771350935670274e-08,
"loss": 0.7027,
"step": 5265
},
{
"epoch": 0.9917199849454271,
"grad_norm": 0.03795993990734393,
"learning_rate": 6.266383607961278e-08,
"loss": 0.7107,
"step": 5270
},
{
"epoch": 0.9926608957470832,
"grad_norm": 0.03844604892644717,
"learning_rate": 4.9231982097586164e-08,
"loss": 0.7499,
"step": 5275
},
{
"epoch": 0.9936018065487392,
"grad_norm": 0.03756964997959224,
"learning_rate": 3.741809233989146e-08,
"loss": 0.7194,
"step": 5280
},
{
"epoch": 0.9945427173503952,
"grad_norm": 0.03744567511221886,
"learning_rate": 2.7222294278045343e-08,
"loss": 0.7258,
"step": 5285
},
{
"epoch": 0.9954836281520512,
"grad_norm": 0.03534047578763023,
"learning_rate": 1.8644697924413697e-08,
"loss": 0.7428,
"step": 5290
},
{
"epoch": 0.9964245389537072,
"grad_norm": 0.03693996603556449,
"learning_rate": 1.1685395830979271e-08,
"loss": 0.7315,
"step": 5295
},
{
"epoch": 0.9973654497553632,
"grad_norm": 0.03684649235727902,
"learning_rate": 6.344463088425733e-09,
"loss": 0.7008,
"step": 5300
},
{
"epoch": 0.9983063605570192,
"grad_norm": 0.03615359821424916,
"learning_rate": 2.6219573252383995e-09,
"loss": 0.7311,
"step": 5305
},
{
"epoch": 0.9992472713586752,
"grad_norm": 0.03782149225904557,
"learning_rate": 5.179187071546742e-10,
"loss": 0.7642,
"step": 5310
},
{
"epoch": 1.0,
"eval_loss": 1.0969747304916382,
"eval_runtime": 1105.2258,
"eval_samples_per_second": 191.917,
"eval_steps_per_second": 5.998,
"step": 5314
},
{
"epoch": 1.0,
"step": 5314,
"total_flos": 773266676056064.0,
"train_loss": 0.8050324304417017,
"train_runtime": 21832.2284,
"train_samples_per_second": 31.155,
"train_steps_per_second": 0.243
}
],
"logging_steps": 5,
"max_steps": 5314,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 773266676056064.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}