Files
qwen3-8b-sft/trainer_state.json

3373 lines
82 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 4717,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00211999152003392,
"grad_norm": 32.50752996739497,
"learning_rate": 1.6949152542372883e-07,
"loss": 3.7461,
"step": 10
},
{
"epoch": 0.00423998304006784,
"grad_norm": 19.7783715335098,
"learning_rate": 5.93220338983051e-07,
"loss": 3.5757,
"step": 20
},
{
"epoch": 0.006359974560101759,
"grad_norm": 9.354428743861847,
"learning_rate": 1.016949152542373e-06,
"loss": 2.7484,
"step": 30
},
{
"epoch": 0.00847996608013568,
"grad_norm": 5.7293369283524935,
"learning_rate": 1.4406779661016951e-06,
"loss": 1.9415,
"step": 40
},
{
"epoch": 0.0105999576001696,
"grad_norm": 6.878907941943918,
"learning_rate": 1.8644067796610171e-06,
"loss": 1.2673,
"step": 50
},
{
"epoch": 0.012719949120203519,
"grad_norm": 8.215857672525798,
"learning_rate": 2.288135593220339e-06,
"loss": 0.8349,
"step": 60
},
{
"epoch": 0.014839940640237439,
"grad_norm": 2.0985232088184698,
"learning_rate": 2.7118644067796613e-06,
"loss": 0.6788,
"step": 70
},
{
"epoch": 0.01695993216027136,
"grad_norm": 4.181552201854574,
"learning_rate": 3.135593220338983e-06,
"loss": 0.5306,
"step": 80
},
{
"epoch": 0.01907992368030528,
"grad_norm": 1.3578290731521812,
"learning_rate": 3.5593220338983053e-06,
"loss": 0.428,
"step": 90
},
{
"epoch": 0.0211999152003392,
"grad_norm": 8.019239361403569,
"learning_rate": 3.9830508474576275e-06,
"loss": 0.3659,
"step": 100
},
{
"epoch": 0.02331990672037312,
"grad_norm": 1.0371743917362437,
"learning_rate": 4.40677966101695e-06,
"loss": 0.3098,
"step": 110
},
{
"epoch": 0.025439898240407037,
"grad_norm": 1.0725520171241911,
"learning_rate": 4.830508474576272e-06,
"loss": 0.272,
"step": 120
},
{
"epoch": 0.027559889760440957,
"grad_norm": 4.320619473460736,
"learning_rate": 5.254237288135594e-06,
"loss": 0.2408,
"step": 130
},
{
"epoch": 0.029679881280474878,
"grad_norm": 9.318265850779325,
"learning_rate": 5.677966101694916e-06,
"loss": 0.2177,
"step": 140
},
{
"epoch": 0.0317998728005088,
"grad_norm": 0.9802919759217678,
"learning_rate": 6.1016949152542385e-06,
"loss": 0.2049,
"step": 150
},
{
"epoch": 0.03391986432054272,
"grad_norm": 2.4459592955831337,
"learning_rate": 6.52542372881356e-06,
"loss": 0.1998,
"step": 160
},
{
"epoch": 0.03603985584057664,
"grad_norm": 1.5882881176464134,
"learning_rate": 6.949152542372882e-06,
"loss": 0.1803,
"step": 170
},
{
"epoch": 0.03815984736061056,
"grad_norm": 3.036655132788213,
"learning_rate": 7.372881355932204e-06,
"loss": 0.1643,
"step": 180
},
{
"epoch": 0.04027983888064448,
"grad_norm": 0.4963099670959003,
"learning_rate": 7.796610169491526e-06,
"loss": 0.1595,
"step": 190
},
{
"epoch": 0.0423998304006784,
"grad_norm": 0.8389581242220279,
"learning_rate": 8.220338983050849e-06,
"loss": 0.1556,
"step": 200
},
{
"epoch": 0.04451982192071232,
"grad_norm": 0.5257252909788135,
"learning_rate": 8.64406779661017e-06,
"loss": 0.1471,
"step": 210
},
{
"epoch": 0.04663981344074624,
"grad_norm": 0.3592742330584675,
"learning_rate": 9.067796610169493e-06,
"loss": 0.1374,
"step": 220
},
{
"epoch": 0.048759804960780154,
"grad_norm": 0.3187507665176873,
"learning_rate": 9.491525423728815e-06,
"loss": 0.1398,
"step": 230
},
{
"epoch": 0.050879796480814074,
"grad_norm": 0.3482668638560697,
"learning_rate": 9.915254237288137e-06,
"loss": 0.1293,
"step": 240
},
{
"epoch": 0.052999788000847994,
"grad_norm": 0.44242263127368797,
"learning_rate": 9.999921355437334e-06,
"loss": 0.1281,
"step": 250
},
{
"epoch": 0.055119779520881915,
"grad_norm": 0.43066449617717567,
"learning_rate": 9.999601866141578e-06,
"loss": 0.1236,
"step": 260
},
{
"epoch": 0.057239771040915835,
"grad_norm": 0.2632241978204935,
"learning_rate": 9.999036632519274e-06,
"loss": 0.1198,
"step": 270
},
{
"epoch": 0.059359762560949755,
"grad_norm": 0.5107458203310429,
"learning_rate": 9.998225682353224e-06,
"loss": 0.1219,
"step": 280
},
{
"epoch": 0.061479754080983676,
"grad_norm": 0.4495473347031151,
"learning_rate": 9.997169055503885e-06,
"loss": 0.1215,
"step": 290
},
{
"epoch": 0.0635997456010176,
"grad_norm": 0.852881887488011,
"learning_rate": 9.995866803907402e-06,
"loss": 0.1113,
"step": 300
},
{
"epoch": 0.06571973712105152,
"grad_norm": 0.3612657381641809,
"learning_rate": 9.99431899157306e-06,
"loss": 0.1111,
"step": 310
},
{
"epoch": 0.06783972864108544,
"grad_norm": 0.44988476523435184,
"learning_rate": 9.992525694580135e-06,
"loss": 0.1072,
"step": 320
},
{
"epoch": 0.06995972016111936,
"grad_norm": 0.29666583291365195,
"learning_rate": 9.990487001074161e-06,
"loss": 0.1124,
"step": 330
},
{
"epoch": 0.07207971168115328,
"grad_norm": 0.28345381185455476,
"learning_rate": 9.988203011262589e-06,
"loss": 0.1075,
"step": 340
},
{
"epoch": 0.0741997032011872,
"grad_norm": 0.3252373191017647,
"learning_rate": 9.985673837409865e-06,
"loss": 0.1012,
"step": 350
},
{
"epoch": 0.07631969472122112,
"grad_norm": 0.34083940392900397,
"learning_rate": 9.982899603831912e-06,
"loss": 0.1031,
"step": 360
},
{
"epoch": 0.07843968624125504,
"grad_norm": 0.4836055260534099,
"learning_rate": 9.979880446890025e-06,
"loss": 0.0996,
"step": 370
},
{
"epoch": 0.08055967776128896,
"grad_norm": 0.3984011129854127,
"learning_rate": 9.976616514984152e-06,
"loss": 0.1009,
"step": 380
},
{
"epoch": 0.08267966928132288,
"grad_norm": 0.35365486849367245,
"learning_rate": 9.973107968545623e-06,
"loss": 0.0976,
"step": 390
},
{
"epoch": 0.0847996608013568,
"grad_norm": 0.30027024887187254,
"learning_rate": 9.969354980029243e-06,
"loss": 0.0969,
"step": 400
},
{
"epoch": 0.08691965232139072,
"grad_norm": 0.25355683531168066,
"learning_rate": 9.96535773390483e-06,
"loss": 0.1002,
"step": 410
},
{
"epoch": 0.08903964384142464,
"grad_norm": 0.20391383326557452,
"learning_rate": 9.961116426648138e-06,
"loss": 0.0995,
"step": 420
},
{
"epoch": 0.09115963536145856,
"grad_norm": 0.29085751915140184,
"learning_rate": 9.956631266731207e-06,
"loss": 0.0992,
"step": 430
},
{
"epoch": 0.09327962688149248,
"grad_norm": 0.2774668505220891,
"learning_rate": 9.951902474612112e-06,
"loss": 0.0981,
"step": 440
},
{
"epoch": 0.09539961840152639,
"grad_norm": 0.23046127720501614,
"learning_rate": 9.946930282724128e-06,
"loss": 0.0946,
"step": 450
},
{
"epoch": 0.09751960992156031,
"grad_norm": 0.21609489095737885,
"learning_rate": 9.941714935464303e-06,
"loss": 0.0903,
"step": 460
},
{
"epoch": 0.09963960144159423,
"grad_norm": 0.3223533405901417,
"learning_rate": 9.936256689181454e-06,
"loss": 0.0996,
"step": 470
},
{
"epoch": 0.10175959296162815,
"grad_norm": 0.33768759021085587,
"learning_rate": 9.930555812163552e-06,
"loss": 0.094,
"step": 480
},
{
"epoch": 0.10387958448166207,
"grad_norm": 0.22549215526677444,
"learning_rate": 9.924612584624545e-06,
"loss": 0.094,
"step": 490
},
{
"epoch": 0.10599957600169599,
"grad_norm": 0.20645922834306707,
"learning_rate": 9.918427298690585e-06,
"loss": 0.0909,
"step": 500
},
{
"epoch": 0.10811956752172991,
"grad_norm": 0.3341574730006992,
"learning_rate": 9.912000258385669e-06,
"loss": 0.0873,
"step": 510
},
{
"epoch": 0.11023955904176383,
"grad_norm": 0.22497042484649127,
"learning_rate": 9.905331779616683e-06,
"loss": 0.091,
"step": 520
},
{
"epoch": 0.11235955056179775,
"grad_norm": 0.25629246939970207,
"learning_rate": 9.898422190157897e-06,
"loss": 0.0908,
"step": 530
},
{
"epoch": 0.11447954208183167,
"grad_norm": 0.3221978930825859,
"learning_rate": 9.891271829634837e-06,
"loss": 0.0958,
"step": 540
},
{
"epoch": 0.11659953360186559,
"grad_norm": 0.24323805543228721,
"learning_rate": 9.883881049507592e-06,
"loss": 0.0931,
"step": 550
},
{
"epoch": 0.11871952512189951,
"grad_norm": 1.4298692529820525,
"learning_rate": 9.876250213053542e-06,
"loss": 0.0899,
"step": 560
},
{
"epoch": 0.12083951664193343,
"grad_norm": 0.2343862275786179,
"learning_rate": 9.868379695349514e-06,
"loss": 0.0954,
"step": 570
},
{
"epoch": 0.12295950816196735,
"grad_norm": 0.2786132714201901,
"learning_rate": 9.860269883253321e-06,
"loss": 0.0909,
"step": 580
},
{
"epoch": 0.12507949968200127,
"grad_norm": 0.3051144098888441,
"learning_rate": 9.851921175384769e-06,
"loss": 0.0875,
"step": 590
},
{
"epoch": 0.1271994912020352,
"grad_norm": 0.19169696652783166,
"learning_rate": 9.843333982106052e-06,
"loss": 0.0877,
"step": 600
},
{
"epoch": 0.1293194827220691,
"grad_norm": 0.17535067812063176,
"learning_rate": 9.834508725501584e-06,
"loss": 0.088,
"step": 610
},
{
"epoch": 0.13143947424210303,
"grad_norm": 0.24821593393490932,
"learning_rate": 9.825445839357256e-06,
"loss": 0.0869,
"step": 620
},
{
"epoch": 0.13355946576213695,
"grad_norm": 0.25345703968329003,
"learning_rate": 9.816145769139107e-06,
"loss": 0.0882,
"step": 630
},
{
"epoch": 0.13567945728217087,
"grad_norm": 0.2081953809171809,
"learning_rate": 9.806608971971436e-06,
"loss": 0.0862,
"step": 640
},
{
"epoch": 0.1377994488022048,
"grad_norm": 0.18784011735994216,
"learning_rate": 9.796835916614329e-06,
"loss": 0.0872,
"step": 650
},
{
"epoch": 0.13991944032223871,
"grad_norm": 0.24880072040487056,
"learning_rate": 9.786827083440616e-06,
"loss": 0.0845,
"step": 660
},
{
"epoch": 0.14203943184227263,
"grad_norm": 0.24058730019755548,
"learning_rate": 9.776582964412267e-06,
"loss": 0.0862,
"step": 670
},
{
"epoch": 0.14415942336230655,
"grad_norm": 0.2565530024882537,
"learning_rate": 9.766104063056201e-06,
"loss": 0.0867,
"step": 680
},
{
"epoch": 0.14627941488234047,
"grad_norm": 0.2716229887520234,
"learning_rate": 9.75539089443954e-06,
"loss": 0.0847,
"step": 690
},
{
"epoch": 0.1483994064023744,
"grad_norm": 0.18039433691787665,
"learning_rate": 9.7444439851443e-06,
"loss": 0.084,
"step": 700
},
{
"epoch": 0.15051939792240832,
"grad_norm": 0.19975307362406894,
"learning_rate": 9.733263873241494e-06,
"loss": 0.085,
"step": 710
},
{
"epoch": 0.15263938944244224,
"grad_norm": 0.20672720831030839,
"learning_rate": 9.721851108264692e-06,
"loss": 0.0854,
"step": 720
},
{
"epoch": 0.15475938096247616,
"grad_norm": 0.22685728560638968,
"learning_rate": 9.710206251183015e-06,
"loss": 0.0822,
"step": 730
},
{
"epoch": 0.15687937248251008,
"grad_norm": 0.19518820234055553,
"learning_rate": 9.698329874373547e-06,
"loss": 0.0841,
"step": 740
},
{
"epoch": 0.158999364002544,
"grad_norm": 0.1711051072902191,
"learning_rate": 9.686222561593218e-06,
"loss": 0.0813,
"step": 750
},
{
"epoch": 0.16111935552257792,
"grad_norm": 0.21782045297391434,
"learning_rate": 9.6738849079501e-06,
"loss": 0.0811,
"step": 760
},
{
"epoch": 0.16323934704261184,
"grad_norm": 0.24001365708386638,
"learning_rate": 9.661317519874156e-06,
"loss": 0.0839,
"step": 770
},
{
"epoch": 0.16535933856264576,
"grad_norm": 0.24788434673856172,
"learning_rate": 9.648521015087437e-06,
"loss": 0.0821,
"step": 780
},
{
"epoch": 0.16747933008267968,
"grad_norm": 0.2361652593535425,
"learning_rate": 9.63549602257372e-06,
"loss": 0.0815,
"step": 790
},
{
"epoch": 0.1695993216027136,
"grad_norm": 0.20131220073696676,
"learning_rate": 9.622243182547584e-06,
"loss": 0.0814,
"step": 800
},
{
"epoch": 0.17171931312274752,
"grad_norm": 0.1910839922084592,
"learning_rate": 9.608763146422947e-06,
"loss": 0.0805,
"step": 810
},
{
"epoch": 0.17383930464278144,
"grad_norm": 0.19343246908459,
"learning_rate": 9.59505657678105e-06,
"loss": 0.0817,
"step": 820
},
{
"epoch": 0.17595929616281536,
"grad_norm": 0.20795991387602794,
"learning_rate": 9.581124147337886e-06,
"loss": 0.0829,
"step": 830
},
{
"epoch": 0.17807928768284928,
"grad_norm": 0.2384907168932879,
"learning_rate": 9.566966542911079e-06,
"loss": 0.0828,
"step": 840
},
{
"epoch": 0.1801992792028832,
"grad_norm": 0.2661800926148604,
"learning_rate": 9.552584459386234e-06,
"loss": 0.0807,
"step": 850
},
{
"epoch": 0.18231927072291712,
"grad_norm": 0.16793437681402207,
"learning_rate": 9.537978603682728e-06,
"loss": 0.0808,
"step": 860
},
{
"epoch": 0.18443926224295104,
"grad_norm": 0.17934828150494173,
"learning_rate": 9.52314969371896e-06,
"loss": 0.084,
"step": 870
},
{
"epoch": 0.18655925376298496,
"grad_norm": 0.2375412871314736,
"learning_rate": 9.50809845837707e-06,
"loss": 0.0816,
"step": 880
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.19298906022439435,
"learning_rate": 9.492825637467103e-06,
"loss": 0.0823,
"step": 890
},
{
"epoch": 0.19079923680305277,
"grad_norm": 0.21813774555943088,
"learning_rate": 9.47733198169065e-06,
"loss": 0.0783,
"step": 900
},
{
"epoch": 0.1929192283230867,
"grad_norm": 0.20165689985494703,
"learning_rate": 9.461618252603956e-06,
"loss": 0.0799,
"step": 910
},
{
"epoch": 0.19503921984312061,
"grad_norm": 0.19889073544853322,
"learning_rate": 9.44568522258048e-06,
"loss": 0.0824,
"step": 920
},
{
"epoch": 0.19715921136315454,
"grad_norm": 0.1677587269136254,
"learning_rate": 9.42953367477292e-06,
"loss": 0.0817,
"step": 930
},
{
"epoch": 0.19927920288318846,
"grad_norm": 0.27029655878775144,
"learning_rate": 9.413164403074744e-06,
"loss": 0.0771,
"step": 940
},
{
"epoch": 0.20139919440322238,
"grad_norm": 0.20752899646185938,
"learning_rate": 9.398246569397352e-06,
"loss": 0.083,
"step": 950
},
{
"epoch": 0.2035191859232563,
"grad_norm": 0.1755855686731489,
"learning_rate": 9.381465847779896e-06,
"loss": 0.0773,
"step": 960
},
{
"epoch": 0.20563917744329022,
"grad_norm": 0.20944110815210132,
"learning_rate": 9.364469764939109e-06,
"loss": 0.0856,
"step": 970
},
{
"epoch": 0.20775916896332414,
"grad_norm": 0.20414341977081546,
"learning_rate": 9.347259156279697e-06,
"loss": 0.0814,
"step": 980
},
{
"epoch": 0.20987916048335806,
"grad_norm": 0.23534197904825535,
"learning_rate": 9.329834867750912e-06,
"loss": 0.0782,
"step": 990
},
{
"epoch": 0.21199915200339198,
"grad_norm": 0.19706741061553582,
"learning_rate": 9.312197755804957e-06,
"loss": 0.0813,
"step": 1000
},
{
"epoch": 0.21199915200339198,
"eval_loss": 0.07808271795511246,
"eval_runtime": 489.1656,
"eval_samples_per_second": 4.183,
"eval_steps_per_second": 0.301,
"step": 1000
},
{
"epoch": 0.2141191435234259,
"grad_norm": 0.20255197961584584,
"learning_rate": 9.294348687354899e-06,
"loss": 0.0786,
"step": 1010
},
{
"epoch": 0.21623913504345982,
"grad_norm": 0.18756797354546684,
"learning_rate": 9.278104027838603e-06,
"loss": 0.0904,
"step": 1020
},
{
"epoch": 0.21835912656349374,
"grad_norm": 0.1803465926904638,
"learning_rate": 9.259854667654485e-06,
"loss": 0.0794,
"step": 1030
},
{
"epoch": 0.22047911808352766,
"grad_norm": 0.23334153257650003,
"learning_rate": 9.24139592377452e-06,
"loss": 0.0787,
"step": 1040
},
{
"epoch": 0.22259910960356158,
"grad_norm": 0.2179720848023895,
"learning_rate": 9.222728703497267e-06,
"loss": 0.082,
"step": 1050
},
{
"epoch": 0.2247191011235955,
"grad_norm": 0.19658693620368076,
"learning_rate": 9.203853924368488e-06,
"loss": 0.0774,
"step": 1060
},
{
"epoch": 0.22683909264362942,
"grad_norm": 0.19207266252736302,
"learning_rate": 9.18477251413603e-06,
"loss": 0.075,
"step": 1070
},
{
"epoch": 0.22895908416366334,
"grad_norm": 0.26026047947148445,
"learning_rate": 9.165485410704238e-06,
"loss": 0.0767,
"step": 1080
},
{
"epoch": 0.23107907568369726,
"grad_norm": 0.20808377225507274,
"learning_rate": 9.145993562087848e-06,
"loss": 0.0784,
"step": 1090
},
{
"epoch": 0.23319906720373118,
"grad_norm": 0.18546577944881873,
"learning_rate": 9.12629792636539e-06,
"loss": 0.0761,
"step": 1100
},
{
"epoch": 0.2353190587237651,
"grad_norm": 0.17056106678355273,
"learning_rate": 9.1063994716321e-06,
"loss": 0.079,
"step": 1110
},
{
"epoch": 0.23743905024379902,
"grad_norm": 0.23748473849259574,
"learning_rate": 9.086299175952327e-06,
"loss": 0.0769,
"step": 1120
},
{
"epoch": 0.23955904176383294,
"grad_norm": 0.21686782311110087,
"learning_rate": 9.065998027311467e-06,
"loss": 0.0783,
"step": 1130
},
{
"epoch": 0.24167903328386686,
"grad_norm": 0.18252100885280442,
"learning_rate": 9.045497023567396e-06,
"loss": 0.08,
"step": 1140
},
{
"epoch": 0.24379902480390078,
"grad_norm": 0.24544524196615075,
"learning_rate": 9.024797172401426e-06,
"loss": 0.08,
"step": 1150
},
{
"epoch": 0.2459190163239347,
"grad_norm": 0.19645758097319682,
"learning_rate": 9.003899491268768e-06,
"loss": 0.0798,
"step": 1160
},
{
"epoch": 0.24803900784396862,
"grad_norm": 0.21351292431439828,
"learning_rate": 8.982805007348531e-06,
"loss": 0.0754,
"step": 1170
},
{
"epoch": 0.25015899936400254,
"grad_norm": 0.17911187274119753,
"learning_rate": 8.961514757493224e-06,
"loss": 0.0772,
"step": 1180
},
{
"epoch": 0.2522789908840365,
"grad_norm": 0.22385841458052336,
"learning_rate": 8.940029788177795e-06,
"loss": 0.0773,
"step": 1190
},
{
"epoch": 0.2543989824040704,
"grad_norm": 0.1683385451298309,
"learning_rate": 8.9183511554482e-06,
"loss": 0.0747,
"step": 1200
},
{
"epoch": 0.2565189739241043,
"grad_norm": 0.2372277626069385,
"learning_rate": 8.896479924869483e-06,
"loss": 0.076,
"step": 1210
},
{
"epoch": 0.2586389654441382,
"grad_norm": 0.2022762684424488,
"learning_rate": 8.874417171473415e-06,
"loss": 0.074,
"step": 1220
},
{
"epoch": 0.2607589569641721,
"grad_norm": 0.19827726532798173,
"learning_rate": 8.852163979705639e-06,
"loss": 0.0782,
"step": 1230
},
{
"epoch": 0.26287894848420607,
"grad_norm": 0.2343304853478425,
"learning_rate": 8.829721443372378e-06,
"loss": 0.0756,
"step": 1240
},
{
"epoch": 0.26499894000423996,
"grad_norm": 0.2217975820152699,
"learning_rate": 8.807090665586664e-06,
"loss": 0.0777,
"step": 1250
},
{
"epoch": 0.2671189315242739,
"grad_norm": 0.20059042051568582,
"learning_rate": 8.784272758714118e-06,
"loss": 0.0738,
"step": 1260
},
{
"epoch": 0.2692389230443078,
"grad_norm": 0.18668008819406168,
"learning_rate": 8.761268844318282e-06,
"loss": 0.0757,
"step": 1270
},
{
"epoch": 0.27135891456434175,
"grad_norm": 0.24287290051699115,
"learning_rate": 8.73808005310548e-06,
"loss": 0.0762,
"step": 1280
},
{
"epoch": 0.27347890608437564,
"grad_norm": 0.18654864916400832,
"learning_rate": 8.714707524869245e-06,
"loss": 0.0795,
"step": 1290
},
{
"epoch": 0.2755988976044096,
"grad_norm": 0.2341640233496617,
"learning_rate": 8.691152408434296e-06,
"loss": 0.0732,
"step": 1300
},
{
"epoch": 0.2777188891244435,
"grad_norm": 0.2082849165240921,
"learning_rate": 8.66741586160007e-06,
"loss": 0.0774,
"step": 1310
},
{
"epoch": 0.27983888064447743,
"grad_norm": 0.22099657211974957,
"learning_rate": 8.643499051083812e-06,
"loss": 0.0738,
"step": 1320
},
{
"epoch": 0.2819588721645113,
"grad_norm": 0.20266362710361852,
"learning_rate": 8.619403152463231e-06,
"loss": 0.0765,
"step": 1330
},
{
"epoch": 0.28407886368454527,
"grad_norm": 0.2061785256600231,
"learning_rate": 8.595129350118707e-06,
"loss": 0.0743,
"step": 1340
},
{
"epoch": 0.28619885520457916,
"grad_norm": 0.16451845890324332,
"learning_rate": 8.570678837175089e-06,
"loss": 0.0731,
"step": 1350
},
{
"epoch": 0.2883188467246131,
"grad_norm": 0.2039051062049253,
"learning_rate": 8.546052815443041e-06,
"loss": 0.075,
"step": 1360
},
{
"epoch": 0.290438838244647,
"grad_norm": 0.20907838865950076,
"learning_rate": 8.521252495359971e-06,
"loss": 0.0779,
"step": 1370
},
{
"epoch": 0.29255882976468095,
"grad_norm": 0.24041556209946813,
"learning_rate": 8.496279095930535e-06,
"loss": 0.0752,
"step": 1380
},
{
"epoch": 0.29467882128471484,
"grad_norm": 0.749878332883915,
"learning_rate": 8.471133844666721e-06,
"loss": 0.0736,
"step": 1390
},
{
"epoch": 0.2967988128047488,
"grad_norm": 0.20890550228302898,
"learning_rate": 8.445817977527513e-06,
"loss": 0.075,
"step": 1400
},
{
"epoch": 0.2989188043247827,
"grad_norm": 0.19623374633823948,
"learning_rate": 8.420332738858136e-06,
"loss": 0.0764,
"step": 1410
},
{
"epoch": 0.30103879584481663,
"grad_norm": 0.1803239880391424,
"learning_rate": 8.394679381328904e-06,
"loss": 0.0782,
"step": 1420
},
{
"epoch": 0.3031587873648505,
"grad_norm": 0.20782215083834218,
"learning_rate": 8.368859165873629e-06,
"loss": 0.075,
"step": 1430
},
{
"epoch": 0.30527877888488447,
"grad_norm": 0.15864327444836024,
"learning_rate": 8.342873361627663e-06,
"loss": 0.0736,
"step": 1440
},
{
"epoch": 0.30739877040491836,
"grad_norm": 0.16758113802075347,
"learning_rate": 8.316723245865503e-06,
"loss": 0.0743,
"step": 1450
},
{
"epoch": 0.3095187619249523,
"grad_norm": 0.18187491458078606,
"learning_rate": 8.290410103938015e-06,
"loss": 0.0763,
"step": 1460
},
{
"epoch": 0.3116387534449862,
"grad_norm": 0.21015841421612225,
"learning_rate": 8.263935229209255e-06,
"loss": 0.0778,
"step": 1470
},
{
"epoch": 0.31375874496502015,
"grad_norm": 0.18123545189165477,
"learning_rate": 8.237299922992894e-06,
"loss": 0.0737,
"step": 1480
},
{
"epoch": 0.31587873648505405,
"grad_norm": 0.22166407582198208,
"learning_rate": 8.210505494488257e-06,
"loss": 0.0747,
"step": 1490
},
{
"epoch": 0.317998728005088,
"grad_norm": 0.20907147507686014,
"learning_rate": 8.183553260715971e-06,
"loss": 0.0753,
"step": 1500
},
{
"epoch": 0.3201187195251219,
"grad_norm": 0.18137783305550276,
"learning_rate": 8.15644454645323e-06,
"loss": 0.076,
"step": 1510
},
{
"epoch": 0.32223871104515583,
"grad_norm": 0.18468699366411487,
"learning_rate": 8.129180684168683e-06,
"loss": 0.0756,
"step": 1520
},
{
"epoch": 0.3243587025651897,
"grad_norm": 0.25213749529106116,
"learning_rate": 8.101763013956933e-06,
"loss": 0.0746,
"step": 1530
},
{
"epoch": 0.3264786940852237,
"grad_norm": 0.2161642309592381,
"learning_rate": 8.074192883472667e-06,
"loss": 0.0759,
"step": 1540
},
{
"epoch": 0.32859868560525757,
"grad_norm": 0.20176374830183055,
"learning_rate": 8.04647164786442e-06,
"loss": 0.0731,
"step": 1550
},
{
"epoch": 0.3307186771252915,
"grad_norm": 0.1969763529692524,
"learning_rate": 8.01860066970797e-06,
"loss": 0.0747,
"step": 1560
},
{
"epoch": 0.3328386686453254,
"grad_norm": 0.2060140719520926,
"learning_rate": 7.990581318939346e-06,
"loss": 0.0776,
"step": 1570
},
{
"epoch": 0.33495866016535936,
"grad_norm": 0.19063310891295998,
"learning_rate": 7.962414972787513e-06,
"loss": 0.0732,
"step": 1580
},
{
"epoch": 0.33707865168539325,
"grad_norm": 0.32170213151067406,
"learning_rate": 7.934103015706665e-06,
"loss": 0.0718,
"step": 1590
},
{
"epoch": 0.3391986432054272,
"grad_norm": 0.3017816089089381,
"learning_rate": 7.905646839308171e-06,
"loss": 0.0713,
"step": 1600
},
{
"epoch": 0.3413186347254611,
"grad_norm": 0.23008639936227068,
"learning_rate": 7.877047842292193e-06,
"loss": 0.0761,
"step": 1610
},
{
"epoch": 0.34343862624549504,
"grad_norm": 0.21592149981422365,
"learning_rate": 7.84830743037891e-06,
"loss": 0.0743,
"step": 1620
},
{
"epoch": 0.34555861776552893,
"grad_norm": 0.19186122210621318,
"learning_rate": 7.819427016239447e-06,
"loss": 0.0727,
"step": 1630
},
{
"epoch": 0.3476786092855629,
"grad_norm": 0.19028094584781277,
"learning_rate": 7.790408019426424e-06,
"loss": 0.0732,
"step": 1640
},
{
"epoch": 0.34979860080559677,
"grad_norm": 0.18613264864125317,
"learning_rate": 7.761251866304176e-06,
"loss": 0.0735,
"step": 1650
},
{
"epoch": 0.3519185923256307,
"grad_norm": 0.25693606830513216,
"learning_rate": 7.731959989978667e-06,
"loss": 0.0761,
"step": 1660
},
{
"epoch": 0.3540385838456646,
"grad_norm": 0.21174972493824354,
"learning_rate": 7.702533830227024e-06,
"loss": 0.073,
"step": 1670
},
{
"epoch": 0.35615857536569856,
"grad_norm": 0.17143400242703105,
"learning_rate": 7.672974833426779e-06,
"loss": 0.0737,
"step": 1680
},
{
"epoch": 0.35827856688573245,
"grad_norm": 0.1896798767328968,
"learning_rate": 7.643284452484773e-06,
"loss": 0.0725,
"step": 1690
},
{
"epoch": 0.3603985584057664,
"grad_norm": 0.20989812969482,
"learning_rate": 7.613464146765748e-06,
"loss": 0.0728,
"step": 1700
},
{
"epoch": 0.3625185499258003,
"grad_norm": 0.19528230055400297,
"learning_rate": 7.583515382020603e-06,
"loss": 0.0732,
"step": 1710
},
{
"epoch": 0.36463854144583424,
"grad_norm": 0.21719433509159808,
"learning_rate": 7.5534396303143605e-06,
"loss": 0.0704,
"step": 1720
},
{
"epoch": 0.36675853296586813,
"grad_norm": 0.17922009396269356,
"learning_rate": 7.523238369953802e-06,
"loss": 0.0683,
"step": 1730
},
{
"epoch": 0.3688785244859021,
"grad_norm": 0.43299284172403135,
"learning_rate": 7.4929130854148105e-06,
"loss": 0.0724,
"step": 1740
},
{
"epoch": 0.370998516005936,
"grad_norm": 0.17140377305963891,
"learning_rate": 7.4624652672693984e-06,
"loss": 0.0748,
"step": 1750
},
{
"epoch": 0.3731185075259699,
"grad_norm": 0.21895097629186294,
"learning_rate": 7.43189641211245e-06,
"loss": 0.0731,
"step": 1760
},
{
"epoch": 0.3752384990460038,
"grad_norm": 0.217978788931641,
"learning_rate": 7.401208022488152e-06,
"loss": 0.0742,
"step": 1770
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.1778572661187928,
"learning_rate": 7.370401606816142e-06,
"loss": 0.0699,
"step": 1780
},
{
"epoch": 0.37947848208607166,
"grad_norm": 0.22879874120541474,
"learning_rate": 7.339478679317369e-06,
"loss": 0.0762,
"step": 1790
},
{
"epoch": 0.38159847360610555,
"grad_norm": 0.20685054508670736,
"learning_rate": 7.308440759939659e-06,
"loss": 0.0717,
"step": 1800
},
{
"epoch": 0.3837184651261395,
"grad_norm": 0.23785935863776791,
"learning_rate": 7.277289374283009e-06,
"loss": 0.0736,
"step": 1810
},
{
"epoch": 0.3858384566461734,
"grad_norm": 0.25317055966913016,
"learning_rate": 7.246026053524603e-06,
"loss": 0.0729,
"step": 1820
},
{
"epoch": 0.38795844816620734,
"grad_norm": 0.17578925874031257,
"learning_rate": 7.214652334343539e-06,
"loss": 0.0736,
"step": 1830
},
{
"epoch": 0.39007843968624123,
"grad_norm": 0.20527746180283624,
"learning_rate": 7.183169758845308e-06,
"loss": 0.0738,
"step": 1840
},
{
"epoch": 0.3921984312062752,
"grad_norm": 0.17762579012389196,
"learning_rate": 7.151579874485995e-06,
"loss": 0.0713,
"step": 1850
},
{
"epoch": 0.39431842272630907,
"grad_norm": 0.200855955585812,
"learning_rate": 7.119884233996208e-06,
"loss": 0.0712,
"step": 1860
},
{
"epoch": 0.396438414246343,
"grad_norm": 0.18153714925752748,
"learning_rate": 7.088084395304765e-06,
"loss": 0.0716,
"step": 1870
},
{
"epoch": 0.3985584057663769,
"grad_norm": 0.30888829292022363,
"learning_rate": 7.0561819214621186e-06,
"loss": 0.0709,
"step": 1880
},
{
"epoch": 0.40067839728641086,
"grad_norm": 0.1677914580623982,
"learning_rate": 7.024178380563517e-06,
"loss": 0.0686,
"step": 1890
},
{
"epoch": 0.40279838880644475,
"grad_norm": 0.23978151499288752,
"learning_rate": 6.99207534567194e-06,
"loss": 0.0733,
"step": 1900
},
{
"epoch": 0.4049183803264787,
"grad_norm": 0.1496936015385523,
"learning_rate": 6.959874394740775e-06,
"loss": 0.0703,
"step": 1910
},
{
"epoch": 0.4070383718465126,
"grad_norm": 0.17861927332885064,
"learning_rate": 6.927577110536251e-06,
"loss": 0.0709,
"step": 1920
},
{
"epoch": 0.40915836336654654,
"grad_norm": 0.20321672481515973,
"learning_rate": 6.895185080559649e-06,
"loss": 0.0718,
"step": 1930
},
{
"epoch": 0.41127835488658043,
"grad_norm": 0.1702613475343327,
"learning_rate": 6.862699896969262e-06,
"loss": 0.0726,
"step": 1940
},
{
"epoch": 0.4133983464066144,
"grad_norm": 0.19820271991017135,
"learning_rate": 6.830123156502147e-06,
"loss": 0.0722,
"step": 1950
},
{
"epoch": 0.4155183379266483,
"grad_norm": 0.21659807580282067,
"learning_rate": 6.7974564603956395e-06,
"loss": 0.072,
"step": 1960
},
{
"epoch": 0.4176383294466822,
"grad_norm": 0.2484602203487257,
"learning_rate": 6.7647014143086334e-06,
"loss": 0.0707,
"step": 1970
},
{
"epoch": 0.4197583209667161,
"grad_norm": 0.2428781266233325,
"learning_rate": 6.7318596282426796e-06,
"loss": 0.0726,
"step": 1980
},
{
"epoch": 0.42187831248675006,
"grad_norm": 0.19890188197759706,
"learning_rate": 6.6989327164628375e-06,
"loss": 0.0735,
"step": 1990
},
{
"epoch": 0.42399830400678395,
"grad_norm": 0.17618893341728534,
"learning_rate": 6.665922297418328e-06,
"loss": 0.0717,
"step": 2000
},
{
"epoch": 0.42399830400678395,
"eval_loss": 0.07119767367839813,
"eval_runtime": 489.4309,
"eval_samples_per_second": 4.18,
"eval_steps_per_second": 0.3,
"step": 2000
},
{
"epoch": 0.4261182955268179,
"grad_norm": 0.2226486798390119,
"learning_rate": 6.632829993662994e-06,
"loss": 0.0698,
"step": 2010
},
{
"epoch": 0.4282382870468518,
"grad_norm": 0.15591714439754456,
"learning_rate": 6.599657431775529e-06,
"loss": 0.073,
"step": 2020
},
{
"epoch": 0.43035827856688574,
"grad_norm": 0.18569107867432982,
"learning_rate": 6.566406242279546e-06,
"loss": 0.0701,
"step": 2030
},
{
"epoch": 0.43247827008691964,
"grad_norm": 0.2044929271888512,
"learning_rate": 6.53307805956342e-06,
"loss": 0.0684,
"step": 2040
},
{
"epoch": 0.4345982616069536,
"grad_norm": 0.1591048247213101,
"learning_rate": 6.4996745217999566e-06,
"loss": 0.0712,
"step": 2050
},
{
"epoch": 0.4367182531269875,
"grad_norm": 0.18457570554796743,
"learning_rate": 6.4661972708658715e-06,
"loss": 0.0682,
"step": 2060
},
{
"epoch": 0.4388382446470214,
"grad_norm": 0.18024866180958676,
"learning_rate": 6.4326479522610855e-06,
"loss": 0.0703,
"step": 2070
},
{
"epoch": 0.4409582361670553,
"grad_norm": 0.17393779181333482,
"learning_rate": 6.399028215027849e-06,
"loss": 0.0677,
"step": 2080
},
{
"epoch": 0.44307822768708927,
"grad_norm": 0.15822079895374294,
"learning_rate": 6.365339711669687e-06,
"loss": 0.0696,
"step": 2090
},
{
"epoch": 0.44519821920712316,
"grad_norm": 0.17783185791820674,
"learning_rate": 6.331584098070159e-06,
"loss": 0.0729,
"step": 2100
},
{
"epoch": 0.4473182107271571,
"grad_norm": 0.17784945554783102,
"learning_rate": 6.2977630334114904e-06,
"loss": 0.0706,
"step": 2110
},
{
"epoch": 0.449438202247191,
"grad_norm": 0.21655542057286598,
"learning_rate": 6.263878180093004e-06,
"loss": 0.0734,
"step": 2120
},
{
"epoch": 0.45155819376722495,
"grad_norm": 0.1933797514771672,
"learning_rate": 6.2299312036494134e-06,
"loss": 0.069,
"step": 2130
},
{
"epoch": 0.45367818528725884,
"grad_norm": 0.15757976242950295,
"learning_rate": 6.195923772668955e-06,
"loss": 0.0722,
"step": 2140
},
{
"epoch": 0.4557981768072928,
"grad_norm": 0.20409740685630307,
"learning_rate": 6.161857558711372e-06,
"loss": 0.0705,
"step": 2150
},
{
"epoch": 0.4579181683273267,
"grad_norm": 0.15041431962094184,
"learning_rate": 6.12773423622576e-06,
"loss": 0.0695,
"step": 2160
},
{
"epoch": 0.46003815984736063,
"grad_norm": 0.281897607782115,
"learning_rate": 6.0935554824682556e-06,
"loss": 0.0704,
"step": 2170
},
{
"epoch": 0.4621581513673945,
"grad_norm": 0.22084672726453938,
"learning_rate": 6.059322977419591e-06,
"loss": 0.0705,
"step": 2180
},
{
"epoch": 0.46427814288742847,
"grad_norm": 0.20019812476026203,
"learning_rate": 6.02503840370253e-06,
"loss": 0.0703,
"step": 2190
},
{
"epoch": 0.46639813440746236,
"grad_norm": 0.17909334136517222,
"learning_rate": 5.990703446499153e-06,
"loss": 0.0706,
"step": 2200
},
{
"epoch": 0.4685181259274963,
"grad_norm": 0.16644185623431462,
"learning_rate": 5.9563197934680325e-06,
"loss": 0.0746,
"step": 2210
},
{
"epoch": 0.4706381174475302,
"grad_norm": 0.23611788687622157,
"learning_rate": 5.921889134661272e-06,
"loss": 0.0715,
"step": 2220
},
{
"epoch": 0.47275810896756415,
"grad_norm": 0.1692697227784412,
"learning_rate": 5.887413162441438e-06,
"loss": 0.0703,
"step": 2230
},
{
"epoch": 0.47487810048759804,
"grad_norm": 0.16272992258196417,
"learning_rate": 5.852893571398385e-06,
"loss": 0.0703,
"step": 2240
},
{
"epoch": 0.476998092007632,
"grad_norm": 0.16602591153652455,
"learning_rate": 5.818332058265948e-06,
"loss": 0.0682,
"step": 2250
},
{
"epoch": 0.4791180835276659,
"grad_norm": 0.15187588978068958,
"learning_rate": 5.783730321838548e-06,
"loss": 0.0658,
"step": 2260
},
{
"epoch": 0.48123807504769983,
"grad_norm": 0.21228071370192056,
"learning_rate": 5.749090062887697e-06,
"loss": 0.07,
"step": 2270
},
{
"epoch": 0.4833580665677337,
"grad_norm": 0.1935655119130272,
"learning_rate": 5.714412984078393e-06,
"loss": 0.0699,
"step": 2280
},
{
"epoch": 0.48547805808776767,
"grad_norm": 0.1611360908597304,
"learning_rate": 5.679700789885436e-06,
"loss": 0.0715,
"step": 2290
},
{
"epoch": 0.48759804960780156,
"grad_norm": 0.2436477600612657,
"learning_rate": 5.644955186509641e-06,
"loss": 0.0689,
"step": 2300
},
{
"epoch": 0.4897180411278355,
"grad_norm": 0.24133950450204542,
"learning_rate": 5.610177881793976e-06,
"loss": 0.0693,
"step": 2310
},
{
"epoch": 0.4918380326478694,
"grad_norm": 0.20263042804166118,
"learning_rate": 5.5753705851396236e-06,
"loss": 0.0692,
"step": 2320
},
{
"epoch": 0.49395802416790335,
"grad_norm": 0.1758643154419978,
"learning_rate": 5.54053500742195e-06,
"loss": 0.0717,
"step": 2330
},
{
"epoch": 0.49607801568793725,
"grad_norm": 0.17041444204200845,
"learning_rate": 5.505672860906412e-06,
"loss": 0.0731,
"step": 2340
},
{
"epoch": 0.4981980072079712,
"grad_norm": 0.16318236116620452,
"learning_rate": 5.470785859164402e-06,
"loss": 0.0717,
"step": 2350
},
{
"epoch": 0.5003179987280051,
"grad_norm": 0.1684480788354608,
"learning_rate": 5.435875716989013e-06,
"loss": 0.0731,
"step": 2360
},
{
"epoch": 0.502437990248039,
"grad_norm": 0.16940752138117054,
"learning_rate": 5.400944150310754e-06,
"loss": 0.0686,
"step": 2370
},
{
"epoch": 0.504557981768073,
"grad_norm": 0.18543062436184285,
"learning_rate": 5.3659928761132084e-06,
"loss": 0.0712,
"step": 2380
},
{
"epoch": 0.5066779732881068,
"grad_norm": 0.18981591633920203,
"learning_rate": 5.3310236123486396e-06,
"loss": 0.0713,
"step": 2390
},
{
"epoch": 0.5087979648081408,
"grad_norm": 0.20107039697147697,
"learning_rate": 5.296038077853545e-06,
"loss": 0.0724,
"step": 2400
},
{
"epoch": 0.5109179563281747,
"grad_norm": 0.15561965976521763,
"learning_rate": 5.261037992264182e-06,
"loss": 0.0691,
"step": 2410
},
{
"epoch": 0.5130379478482086,
"grad_norm": 0.18814302974879546,
"learning_rate": 5.226025075932024e-06,
"loss": 0.0725,
"step": 2420
},
{
"epoch": 0.5151579393682425,
"grad_norm": 0.19409510196146995,
"learning_rate": 5.191001049839218e-06,
"loss": 0.0718,
"step": 2430
},
{
"epoch": 0.5172779308882764,
"grad_norm": 0.1948905204885732,
"learning_rate": 5.155967635513985e-06,
"loss": 0.0689,
"step": 2440
},
{
"epoch": 0.5193979224083104,
"grad_norm": 0.15910287909271553,
"learning_rate": 5.120926554946003e-06,
"loss": 0.07,
"step": 2450
},
{
"epoch": 0.5215179139283442,
"grad_norm": 0.16754971258212684,
"learning_rate": 5.0858795305017696e-06,
"loss": 0.0697,
"step": 2460
},
{
"epoch": 0.5236379054483782,
"grad_norm": 0.19912027070603852,
"learning_rate": 5.050828284839936e-06,
"loss": 0.0707,
"step": 2470
},
{
"epoch": 0.5257578969684121,
"grad_norm": 0.1770839557299797,
"learning_rate": 5.015774540826639e-06,
"loss": 0.0708,
"step": 2480
},
{
"epoch": 0.5278778884884461,
"grad_norm": 0.1879664250171856,
"learning_rate": 4.980720021450822e-06,
"loss": 0.0719,
"step": 2490
},
{
"epoch": 0.5299978800084799,
"grad_norm": 0.2038214395747643,
"learning_rate": 4.945666449739534e-06,
"loss": 0.0724,
"step": 2500
},
{
"epoch": 0.5321178715285139,
"grad_norm": 0.14313855897723543,
"learning_rate": 4.910615548673245e-06,
"loss": 0.0671,
"step": 2510
},
{
"epoch": 0.5342378630485478,
"grad_norm": 0.1667988114624785,
"learning_rate": 4.875569041101152e-06,
"loss": 0.0704,
"step": 2520
},
{
"epoch": 0.5363578545685818,
"grad_norm": 0.15027735583780358,
"learning_rate": 4.840528649656507e-06,
"loss": 0.0683,
"step": 2530
},
{
"epoch": 0.5384778460886156,
"grad_norm": 0.18256318111022773,
"learning_rate": 4.805496096671933e-06,
"loss": 0.0723,
"step": 2540
},
{
"epoch": 0.5405978376086495,
"grad_norm": 0.1581087386884916,
"learning_rate": 4.77047310409477e-06,
"loss": 0.0678,
"step": 2550
},
{
"epoch": 0.5427178291286835,
"grad_norm": 0.15440122313131024,
"learning_rate": 4.735461393402437e-06,
"loss": 0.0683,
"step": 2560
},
{
"epoch": 0.5448378206487174,
"grad_norm": 0.16903444896901648,
"learning_rate": 4.700462685517822e-06,
"loss": 0.069,
"step": 2570
},
{
"epoch": 0.5469578121687513,
"grad_norm": 0.16890011975539956,
"learning_rate": 4.665478700724684e-06,
"loss": 0.0684,
"step": 2580
},
{
"epoch": 0.5490778036887852,
"grad_norm": 0.20866405931584792,
"learning_rate": 4.630511158583102e-06,
"loss": 0.0698,
"step": 2590
},
{
"epoch": 0.5511977952088192,
"grad_norm": 0.17297937706938452,
"learning_rate": 4.595561777844954e-06,
"loss": 0.0683,
"step": 2600
},
{
"epoch": 0.5533177867288531,
"grad_norm": 0.19334365079463428,
"learning_rate": 4.560632276369436e-06,
"loss": 0.071,
"step": 2610
},
{
"epoch": 0.555437778248887,
"grad_norm": 0.1566806985390742,
"learning_rate": 4.525724371038616e-06,
"loss": 0.0681,
"step": 2620
},
{
"epoch": 0.5575577697689209,
"grad_norm": 0.19883045801104277,
"learning_rate": 4.4908397776730634e-06,
"loss": 0.0693,
"step": 2630
},
{
"epoch": 0.5596777612889549,
"grad_norm": 0.18015525710269312,
"learning_rate": 4.455980210947488e-06,
"loss": 0.0694,
"step": 2640
},
{
"epoch": 0.5617977528089888,
"grad_norm": 0.19996437083442065,
"learning_rate": 4.421147384306476e-06,
"loss": 0.0724,
"step": 2650
},
{
"epoch": 0.5639177443290226,
"grad_norm": 0.1576506824802755,
"learning_rate": 4.3863430098802674e-06,
"loss": 0.0676,
"step": 2660
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.15643885696863916,
"learning_rate": 4.35156879840059e-06,
"loss": 0.0711,
"step": 2670
},
{
"epoch": 0.5681577273690905,
"grad_norm": 0.1810361041257664,
"learning_rate": 4.3168264591165825e-06,
"loss": 0.0673,
"step": 2680
},
{
"epoch": 0.5702777188891245,
"grad_norm": 0.18342485320088614,
"learning_rate": 4.282117699710775e-06,
"loss": 0.0693,
"step": 2690
},
{
"epoch": 0.5723977104091583,
"grad_norm": 0.1715806871966692,
"learning_rate": 4.247444226215157e-06,
"loss": 0.0663,
"step": 2700
},
{
"epoch": 0.5745177019291923,
"grad_norm": 0.182483141924479,
"learning_rate": 4.212807742927315e-06,
"loss": 0.0679,
"step": 2710
},
{
"epoch": 0.5766376934492262,
"grad_norm": 0.17017972956968302,
"learning_rate": 4.178209952326659e-06,
"loss": 0.0708,
"step": 2720
},
{
"epoch": 0.5787576849692602,
"grad_norm": 0.17249912512947316,
"learning_rate": 4.143652554990756e-06,
"loss": 0.0665,
"step": 2730
},
{
"epoch": 0.580877676489294,
"grad_norm": 0.16601147330223942,
"learning_rate": 4.109137249511726e-06,
"loss": 0.0663,
"step": 2740
},
{
"epoch": 0.582997668009328,
"grad_norm": 0.18185554052245853,
"learning_rate": 4.074665732412753e-06,
"loss": 0.0678,
"step": 2750
},
{
"epoch": 0.5851176595293619,
"grad_norm": 0.16710135698081338,
"learning_rate": 4.040239698064712e-06,
"loss": 0.0679,
"step": 2760
},
{
"epoch": 0.5872376510493958,
"grad_norm": 0.14000708323466857,
"learning_rate": 4.005860838602863e-06,
"loss": 0.0697,
"step": 2770
},
{
"epoch": 0.5893576425694297,
"grad_norm": 0.14927867611572637,
"learning_rate": 3.971530843843694e-06,
"loss": 0.0688,
"step": 2780
},
{
"epoch": 0.5914776340894636,
"grad_norm": 0.15171093238665134,
"learning_rate": 3.9372514012018596e-06,
"loss": 0.0699,
"step": 2790
},
{
"epoch": 0.5935976256094976,
"grad_norm": 0.1804183937869093,
"learning_rate": 3.903024195607232e-06,
"loss": 0.0716,
"step": 2800
},
{
"epoch": 0.5957176171295315,
"grad_norm": 0.15215129472796332,
"learning_rate": 3.868850909422092e-06,
"loss": 0.0698,
"step": 2810
},
{
"epoch": 0.5978376086495654,
"grad_norm": 0.15088502756447672,
"learning_rate": 3.834733222358427e-06,
"loss": 0.0687,
"step": 2820
},
{
"epoch": 0.5999576001695993,
"grad_norm": 0.18595108870060142,
"learning_rate": 3.80067281139538e-06,
"loss": 0.0724,
"step": 2830
},
{
"epoch": 0.6020775916896333,
"grad_norm": 0.16954135988504473,
"learning_rate": 3.7666713506968052e-06,
"loss": 0.0691,
"step": 2840
},
{
"epoch": 0.6041975832096672,
"grad_norm": 0.18097495350034157,
"learning_rate": 3.7327305115289938e-06,
"loss": 0.066,
"step": 2850
},
{
"epoch": 0.606317574729701,
"grad_norm": 0.15243124437822092,
"learning_rate": 3.69885196217852e-06,
"loss": 0.0682,
"step": 2860
},
{
"epoch": 0.608437566249735,
"grad_norm": 0.1614824984725212,
"learning_rate": 3.66503736787024e-06,
"loss": 0.0637,
"step": 2870
},
{
"epoch": 0.6105575577697689,
"grad_norm": 0.16257796428966814,
"learning_rate": 3.6312883906854376e-06,
"loss": 0.0674,
"step": 2880
},
{
"epoch": 0.6126775492898029,
"grad_norm": 0.1786290065781706,
"learning_rate": 3.5976066894801386e-06,
"loss": 0.0657,
"step": 2890
},
{
"epoch": 0.6147975408098367,
"grad_norm": 0.1489676922818998,
"learning_rate": 3.5639939198035655e-06,
"loss": 0.0662,
"step": 2900
},
{
"epoch": 0.6169175323298707,
"grad_norm": 0.15203380554832843,
"learning_rate": 3.530451733816762e-06,
"loss": 0.0682,
"step": 2910
},
{
"epoch": 0.6190375238499046,
"grad_norm": 0.20295326197958097,
"learning_rate": 3.496981780211392e-06,
"loss": 0.0685,
"step": 2920
},
{
"epoch": 0.6211575153699386,
"grad_norm": 0.18783757751530197,
"learning_rate": 3.4635857041286922e-06,
"loss": 0.0696,
"step": 2930
},
{
"epoch": 0.6232775068899724,
"grad_norm": 0.14570978880022487,
"learning_rate": 3.430265147078616e-06,
"loss": 0.0702,
"step": 2940
},
{
"epoch": 0.6253974984100064,
"grad_norm": 0.14375379036873775,
"learning_rate": 3.3970217468591486e-06,
"loss": 0.0664,
"step": 2950
},
{
"epoch": 0.6275174899300403,
"grad_norm": 0.173702914525196,
"learning_rate": 3.3638571374758e-06,
"loss": 0.0657,
"step": 2960
},
{
"epoch": 0.6296374814500743,
"grad_norm": 0.15569914868699147,
"learning_rate": 3.3307729490612896e-06,
"loss": 0.0659,
"step": 2970
},
{
"epoch": 0.6317574729701081,
"grad_norm": 0.18252127530194195,
"learning_rate": 3.297770807795425e-06,
"loss": 0.0665,
"step": 2980
},
{
"epoch": 0.633877464490142,
"grad_norm": 0.18473089844858295,
"learning_rate": 3.2648523358251726e-06,
"loss": 0.068,
"step": 2990
},
{
"epoch": 0.635997456010176,
"grad_norm": 0.156014132437691,
"learning_rate": 3.232019151184913e-06,
"loss": 0.0664,
"step": 3000
},
{
"epoch": 0.635997456010176,
"eval_loss": 0.06693108379840851,
"eval_runtime": 487.8882,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 0.301,
"step": 3000
},
{
"epoch": 0.6381174475302098,
"grad_norm": 0.17571197418197826,
"learning_rate": 3.1992728677169214e-06,
"loss": 0.0688,
"step": 3010
},
{
"epoch": 0.6402374390502438,
"grad_norm": 0.14947601720845555,
"learning_rate": 3.1666150949920393e-06,
"loss": 0.0665,
"step": 3020
},
{
"epoch": 0.6423574305702777,
"grad_norm": 0.15331032877554068,
"learning_rate": 3.1340474382305585e-06,
"loss": 0.0655,
"step": 3030
},
{
"epoch": 0.6444774220903117,
"grad_norm": 0.18933623167552627,
"learning_rate": 3.101571498223317e-06,
"loss": 0.0649,
"step": 3040
},
{
"epoch": 0.6465974136103455,
"grad_norm": 0.15247439973376195,
"learning_rate": 3.069188871253026e-06,
"loss": 0.0649,
"step": 3050
},
{
"epoch": 0.6487174051303795,
"grad_norm": 0.16943772711604502,
"learning_rate": 3.0369011490157984e-06,
"loss": 0.0692,
"step": 3060
},
{
"epoch": 0.6508373966504134,
"grad_norm": 0.15521523385110902,
"learning_rate": 3.0047099185429142e-06,
"loss": 0.0654,
"step": 3070
},
{
"epoch": 0.6529573881704474,
"grad_norm": 0.14728105383234777,
"learning_rate": 2.9726167621228187e-06,
"loss": 0.0657,
"step": 3080
},
{
"epoch": 0.6550773796904812,
"grad_norm": 0.1832509363427932,
"learning_rate": 2.940623257223341e-06,
"loss": 0.0665,
"step": 3090
},
{
"epoch": 0.6571973712105151,
"grad_norm": 0.15168423601274655,
"learning_rate": 2.9087309764141613e-06,
"loss": 0.0665,
"step": 3100
},
{
"epoch": 0.6593173627305491,
"grad_norm": 0.1483275502933062,
"learning_rate": 2.876941487289522e-06,
"loss": 0.072,
"step": 3110
},
{
"epoch": 0.661437354250583,
"grad_norm": 0.15452416173310596,
"learning_rate": 2.845256352391157e-06,
"loss": 0.0687,
"step": 3120
},
{
"epoch": 0.6635573457706169,
"grad_norm": 0.16759174006680952,
"learning_rate": 2.8136771291315063e-06,
"loss": 0.0669,
"step": 3130
},
{
"epoch": 0.6656773372906508,
"grad_norm": 0.14998494541872762,
"learning_rate": 2.7822053697171588e-06,
"loss": 0.0666,
"step": 3140
},
{
"epoch": 0.6677973288106848,
"grad_norm": 0.17131639340630408,
"learning_rate": 2.7508426210725546e-06,
"loss": 0.0672,
"step": 3150
},
{
"epoch": 0.6699173203307187,
"grad_norm": 0.19399216153317256,
"learning_rate": 2.7195904247639544e-06,
"loss": 0.0662,
"step": 3160
},
{
"epoch": 0.6720373118507526,
"grad_norm": 0.15393012051599972,
"learning_rate": 2.68845031692366e-06,
"loss": 0.0685,
"step": 3170
},
{
"epoch": 0.6741573033707865,
"grad_norm": 0.1761419745993989,
"learning_rate": 2.657423828174518e-06,
"loss": 0.0644,
"step": 3180
},
{
"epoch": 0.6762772948908204,
"grad_norm": 0.16292970391303543,
"learning_rate": 2.626512483554678e-06,
"loss": 0.0673,
"step": 3190
},
{
"epoch": 0.6783972864108544,
"grad_norm": 0.15248743923822936,
"learning_rate": 2.595717802442636e-06,
"loss": 0.0636,
"step": 3200
},
{
"epoch": 0.6805172779308882,
"grad_norm": 0.17164291620759312,
"learning_rate": 2.5650412984825535e-06,
"loss": 0.0661,
"step": 3210
},
{
"epoch": 0.6826372694509222,
"grad_norm": 0.14003403542018764,
"learning_rate": 2.5344844795098577e-06,
"loss": 0.0644,
"step": 3220
},
{
"epoch": 0.6847572609709561,
"grad_norm": 0.13906331383996035,
"learning_rate": 2.5040488474771183e-06,
"loss": 0.0664,
"step": 3230
},
{
"epoch": 0.6868772524909901,
"grad_norm": 0.1654974091386292,
"learning_rate": 2.4737358983802417e-06,
"loss": 0.0657,
"step": 3240
},
{
"epoch": 0.6889972440110239,
"grad_norm": 0.17123238672779562,
"learning_rate": 2.443547122184921e-06,
"loss": 0.0684,
"step": 3250
},
{
"epoch": 0.6911172355310579,
"grad_norm": 0.13771748743849033,
"learning_rate": 2.416484617979397e-06,
"loss": 0.0718,
"step": 3260
},
{
"epoch": 0.6932372270510918,
"grad_norm": 0.14445999863423453,
"learning_rate": 2.386535853234254e-06,
"loss": 0.0703,
"step": 3270
},
{
"epoch": 0.6953572185711258,
"grad_norm": 0.15853968355485656,
"learning_rate": 2.356715547515228e-06,
"loss": 0.071,
"step": 3280
},
{
"epoch": 0.6974772100911596,
"grad_norm": 0.16059029746896103,
"learning_rate": 2.3270251665732236e-06,
"loss": 0.0682,
"step": 3290
},
{
"epoch": 0.6995972016111935,
"grad_norm": 0.1311794653898363,
"learning_rate": 2.2974661697729777e-06,
"loss": 0.0656,
"step": 3300
},
{
"epoch": 0.7017171931312275,
"grad_norm": 0.14003109623808868,
"learning_rate": 2.268040010021334e-06,
"loss": 0.0658,
"step": 3310
},
{
"epoch": 0.7038371846512614,
"grad_norm": 0.13679184892364368,
"learning_rate": 2.2387481336958243e-06,
"loss": 0.0676,
"step": 3320
},
{
"epoch": 0.7059571761712953,
"grad_norm": 0.1553274432738983,
"learning_rate": 2.2095919805735786e-06,
"loss": 0.0654,
"step": 3330
},
{
"epoch": 0.7080771676913292,
"grad_norm": 0.16561059697374547,
"learning_rate": 2.1805729837605533e-06,
"loss": 0.0677,
"step": 3340
},
{
"epoch": 0.7101971592113632,
"grad_norm": 0.1504235594519663,
"learning_rate": 2.1516925696210917e-06,
"loss": 0.0666,
"step": 3350
},
{
"epoch": 0.7123171507313971,
"grad_norm": 0.15286590047529391,
"learning_rate": 2.122952157707808e-06,
"loss": 0.0684,
"step": 3360
},
{
"epoch": 0.714437142251431,
"grad_norm": 0.1598473142296576,
"learning_rate": 2.0943531606918304e-06,
"loss": 0.0665,
"step": 3370
},
{
"epoch": 0.7165571337714649,
"grad_norm": 0.14455546823633267,
"learning_rate": 2.0658969842933386e-06,
"loss": 0.0694,
"step": 3380
},
{
"epoch": 0.7186771252914989,
"grad_norm": 0.14684140177879562,
"learning_rate": 2.0375850272124865e-06,
"loss": 0.063,
"step": 3390
},
{
"epoch": 0.7207971168115328,
"grad_norm": 0.1558774790348137,
"learning_rate": 2.0094186810606553e-06,
"loss": 0.0664,
"step": 3400
},
{
"epoch": 0.7229171083315666,
"grad_norm": 0.14875490426420004,
"learning_rate": 1.9813993302920325e-06,
"loss": 0.065,
"step": 3410
},
{
"epoch": 0.7250370998516006,
"grad_norm": 0.14376290785675833,
"learning_rate": 1.9535283521355807e-06,
"loss": 0.0645,
"step": 3420
},
{
"epoch": 0.7271570913716345,
"grad_norm": 0.1774603135257143,
"learning_rate": 1.925807116527336e-06,
"loss": 0.0628,
"step": 3430
},
{
"epoch": 0.7292770828916685,
"grad_norm": 0.15648377782580034,
"learning_rate": 1.8982369860430693e-06,
"loss": 0.0669,
"step": 3440
},
{
"epoch": 0.7313970744117023,
"grad_norm": 0.1410744866971639,
"learning_rate": 1.8708193158313175e-06,
"loss": 0.0652,
"step": 3450
},
{
"epoch": 0.7335170659317363,
"grad_norm": 0.13848781419789238,
"learning_rate": 1.8435554535467709e-06,
"loss": 0.0668,
"step": 3460
},
{
"epoch": 0.7356370574517702,
"grad_norm": 0.1565698593715598,
"learning_rate": 1.8164467392840306e-06,
"loss": 0.065,
"step": 3470
},
{
"epoch": 0.7377570489718042,
"grad_norm": 0.14483319470448863,
"learning_rate": 1.7894945055117462e-06,
"loss": 0.0689,
"step": 3480
},
{
"epoch": 0.739877040491838,
"grad_norm": 0.14690155750172293,
"learning_rate": 1.7627000770071062e-06,
"loss": 0.0643,
"step": 3490
},
{
"epoch": 0.741997032011872,
"grad_norm": 0.1523880735964551,
"learning_rate": 1.7360647707907447e-06,
"loss": 0.0666,
"step": 3500
},
{
"epoch": 0.7441170235319059,
"grad_norm": 0.16195859504926405,
"learning_rate": 1.7095898960619862e-06,
"loss": 0.0657,
"step": 3510
},
{
"epoch": 0.7462370150519398,
"grad_norm": 0.13638473122547523,
"learning_rate": 1.6832767541344974e-06,
"loss": 0.0655,
"step": 3520
},
{
"epoch": 0.7483570065719737,
"grad_norm": 0.13278921458936405,
"learning_rate": 1.6571266383723388e-06,
"loss": 0.0672,
"step": 3530
},
{
"epoch": 0.7504769980920076,
"grad_norm": 0.14406828983312037,
"learning_rate": 1.631140834126373e-06,
"loss": 0.066,
"step": 3540
},
{
"epoch": 0.7525969896120416,
"grad_norm": 0.1395384360254768,
"learning_rate": 1.6053206186710967e-06,
"loss": 0.0652,
"step": 3550
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.1579964196169218,
"learning_rate": 1.5796672611418645e-06,
"loss": 0.0656,
"step": 3560
},
{
"epoch": 0.7568369726521094,
"grad_norm": 0.1539176914379727,
"learning_rate": 1.5541820224724884e-06,
"loss": 0.0659,
"step": 3570
},
{
"epoch": 0.7589569641721433,
"grad_norm": 0.1432268965723713,
"learning_rate": 1.5288661553332802e-06,
"loss": 0.068,
"step": 3580
},
{
"epoch": 0.7610769556921773,
"grad_norm": 0.1475776868236256,
"learning_rate": 1.5037209040694668e-06,
"loss": 0.0674,
"step": 3590
},
{
"epoch": 0.7631969472122111,
"grad_norm": 0.13942686520284647,
"learning_rate": 1.4787475046400307e-06,
"loss": 0.0658,
"step": 3600
},
{
"epoch": 0.765316938732245,
"grad_norm": 0.16612542851996417,
"learning_rate": 1.4539471845569598e-06,
"loss": 0.0673,
"step": 3610
},
{
"epoch": 0.767436930252279,
"grad_norm": 0.13347560560880484,
"learning_rate": 1.4293211628249115e-06,
"loss": 0.0651,
"step": 3620
},
{
"epoch": 0.7695569217723129,
"grad_norm": 0.14290885311257007,
"learning_rate": 1.4048706498812936e-06,
"loss": 0.0632,
"step": 3630
},
{
"epoch": 0.7716769132923468,
"grad_norm": 0.15900916314465804,
"learning_rate": 1.380596847536772e-06,
"loss": 0.0662,
"step": 3640
},
{
"epoch": 0.7737969048123807,
"grad_norm": 0.15826198491620722,
"learning_rate": 1.3565009489161878e-06,
"loss": 0.0669,
"step": 3650
},
{
"epoch": 0.7759168963324147,
"grad_norm": 0.1338916105091316,
"learning_rate": 1.3325841383999321e-06,
"loss": 0.0661,
"step": 3660
},
{
"epoch": 0.7780368878524486,
"grad_norm": 0.14647123286090982,
"learning_rate": 1.3088475915657066e-06,
"loss": 0.0653,
"step": 3670
},
{
"epoch": 0.7801568793724825,
"grad_norm": 0.12519200539181277,
"learning_rate": 1.2852924751307555e-06,
"loss": 0.065,
"step": 3680
},
{
"epoch": 0.7822768708925164,
"grad_norm": 0.15737674167435736,
"learning_rate": 1.2619199468945215e-06,
"loss": 0.0647,
"step": 3690
},
{
"epoch": 0.7843968624125504,
"grad_norm": 0.14864208572307655,
"learning_rate": 1.2387311556817183e-06,
"loss": 0.0671,
"step": 3700
},
{
"epoch": 0.7865168539325843,
"grad_norm": 0.14386823191288503,
"learning_rate": 1.2157272412858811e-06,
"loss": 0.0672,
"step": 3710
},
{
"epoch": 0.7886368454526181,
"grad_norm": 0.15384247542083423,
"learning_rate": 1.192909334413338e-06,
"loss": 0.0654,
"step": 3720
},
{
"epoch": 0.7907568369726521,
"grad_norm": 0.14067508984359764,
"learning_rate": 1.1702785566276236e-06,
"loss": 0.0644,
"step": 3730
},
{
"epoch": 0.792876828492686,
"grad_norm": 0.1437217497105591,
"learning_rate": 1.1478360202943618e-06,
"loss": 0.0645,
"step": 3740
},
{
"epoch": 0.79499682001272,
"grad_norm": 0.15519712474428182,
"learning_rate": 1.1255828285265862e-06,
"loss": 0.0649,
"step": 3750
},
{
"epoch": 0.7971168115327538,
"grad_norm": 0.14145423148178207,
"learning_rate": 1.1035200751305176e-06,
"loss": 0.0653,
"step": 3760
},
{
"epoch": 0.7992368030527878,
"grad_norm": 0.13536631332448693,
"learning_rate": 1.0816488445518014e-06,
"loss": 0.0663,
"step": 3770
},
{
"epoch": 0.8013567945728217,
"grad_norm": 0.17054020151205723,
"learning_rate": 1.0599702118222054e-06,
"loss": 0.072,
"step": 3780
},
{
"epoch": 0.8034767860928557,
"grad_norm": 0.15685256143417375,
"learning_rate": 1.038485242506777e-06,
"loss": 0.0656,
"step": 3790
},
{
"epoch": 0.8055967776128895,
"grad_norm": 0.14095243780908379,
"learning_rate": 1.0171949926514706e-06,
"loss": 0.0647,
"step": 3800
},
{
"epoch": 0.8077167691329235,
"grad_norm": 0.13455803530480603,
"learning_rate": 9.96100508731232e-07,
"loss": 0.0656,
"step": 3810
},
{
"epoch": 0.8098367606529574,
"grad_norm": 0.1334874446417613,
"learning_rate": 9.75202827598576e-07,
"loss": 0.0646,
"step": 3820
},
{
"epoch": 0.8119567521729913,
"grad_norm": 0.14692274148907183,
"learning_rate": 9.54502976432606e-07,
"loss": 0.069,
"step": 3830
},
{
"epoch": 0.8140767436930252,
"grad_norm": 0.15341984926535815,
"learning_rate": 9.340019726885341e-07,
"loss": 0.0673,
"step": 3840
},
{
"epoch": 0.8161967352130591,
"grad_norm": 0.13727695678950919,
"learning_rate": 9.137008240476752e-07,
"loss": 0.0644,
"step": 3850
},
{
"epoch": 0.8183167267330931,
"grad_norm": 0.1301351491962901,
"learning_rate": 8.936005283679022e-07,
"loss": 0.0653,
"step": 3860
},
{
"epoch": 0.820436718253127,
"grad_norm": 0.15417587256216225,
"learning_rate": 8.737020736346114e-07,
"loss": 0.0687,
"step": 3870
},
{
"epoch": 0.8225567097731609,
"grad_norm": 0.13849301619933713,
"learning_rate": 8.540064379121537e-07,
"loss": 0.0643,
"step": 3880
},
{
"epoch": 0.8246767012931948,
"grad_norm": 0.12751613533813724,
"learning_rate": 8.345145892957635e-07,
"loss": 0.0675,
"step": 3890
},
{
"epoch": 0.8267966928132288,
"grad_norm": 0.13641022859652724,
"learning_rate": 8.152274858639709e-07,
"loss": 0.0644,
"step": 3900
},
{
"epoch": 0.8289166843332627,
"grad_norm": 0.13498806105829741,
"learning_rate": 7.961460756315131e-07,
"loss": 0.0661,
"step": 3910
},
{
"epoch": 0.8310366758532965,
"grad_norm": 0.1649413393713791,
"learning_rate": 7.772712965027329e-07,
"loss": 0.0681,
"step": 3920
},
{
"epoch": 0.8331566673733305,
"grad_norm": 0.14352566951876747,
"learning_rate": 7.586040762254831e-07,
"loss": 0.0666,
"step": 3930
},
{
"epoch": 0.8352766588933644,
"grad_norm": 0.13644803212350157,
"learning_rate": 7.40145332345516e-07,
"loss": 0.0703,
"step": 3940
},
{
"epoch": 0.8373966504133984,
"grad_norm": 0.13207832198888683,
"learning_rate": 7.218959721613966e-07,
"loss": 0.0677,
"step": 3950
},
{
"epoch": 0.8395166419334322,
"grad_norm": 0.12801202011992016,
"learning_rate": 7.038568926798972e-07,
"loss": 0.0669,
"step": 3960
},
{
"epoch": 0.8416366334534662,
"grad_norm": 0.1446132283031493,
"learning_rate": 6.860289805719051e-07,
"loss": 0.0657,
"step": 3970
},
{
"epoch": 0.8437566249735001,
"grad_norm": 0.15537910760985132,
"learning_rate": 6.684131121288506e-07,
"loss": 0.0645,
"step": 3980
},
{
"epoch": 0.8458766164935341,
"grad_norm": 0.138745128040943,
"learning_rate": 6.510101532196228e-07,
"loss": 0.0663,
"step": 3990
},
{
"epoch": 0.8479966080135679,
"grad_norm": 0.1422532471037774,
"learning_rate": 6.338209592480187e-07,
"loss": 0.0659,
"step": 4000
},
{
"epoch": 0.8479966080135679,
"eval_loss": 0.06505845487117767,
"eval_runtime": 488.4948,
"eval_samples_per_second": 4.188,
"eval_steps_per_second": 0.301,
"step": 4000
},
{
"epoch": 0.8501165995336019,
"grad_norm": 0.13051273275584138,
"learning_rate": 6.168463751106973e-07,
"loss": 0.0676,
"step": 4010
},
{
"epoch": 0.8522365910536358,
"grad_norm": 0.1584048424567531,
"learning_rate": 6.000872351556402e-07,
"loss": 0.0647,
"step": 4020
},
{
"epoch": 0.8543565825736698,
"grad_norm": 0.14980989572464892,
"learning_rate": 5.835443631411548e-07,
"loss": 0.0656,
"step": 4030
},
{
"epoch": 0.8564765740937036,
"grad_norm": 0.14192254512990504,
"learning_rate": 5.672185721953761e-07,
"loss": 0.0664,
"step": 4040
},
{
"epoch": 0.8585965656137375,
"grad_norm": 0.14482703076509035,
"learning_rate": 5.51110664776302e-07,
"loss": 0.0672,
"step": 4050
},
{
"epoch": 0.8607165571337715,
"grad_norm": 0.134823932240745,
"learning_rate": 5.352214326323485e-07,
"loss": 0.0675,
"step": 4060
},
{
"epoch": 0.8628365486538054,
"grad_norm": 0.13549221587996976,
"learning_rate": 5.195516567634345e-07,
"loss": 0.0643,
"step": 4070
},
{
"epoch": 0.8649565401738393,
"grad_norm": 0.14157113823645306,
"learning_rate": 5.041021073825935e-07,
"loss": 0.0681,
"step": 4080
},
{
"epoch": 0.8670765316938732,
"grad_norm": 0.13268190112303166,
"learning_rate": 4.888735438781156e-07,
"loss": 0.0634,
"step": 4090
},
{
"epoch": 0.8691965232139072,
"grad_norm": 0.15044371596526965,
"learning_rate": 4.738667147762177e-07,
"loss": 0.0638,
"step": 4100
},
{
"epoch": 0.8713165147339411,
"grad_norm": 0.15554565316213642,
"learning_rate": 4.590823577042597e-07,
"loss": 0.0673,
"step": 4110
},
{
"epoch": 0.873436506253975,
"grad_norm": 0.13924045324828926,
"learning_rate": 4.4452119935447844e-07,
"loss": 0.0684,
"step": 4120
},
{
"epoch": 0.8755564977740089,
"grad_norm": 0.14738905954975795,
"learning_rate": 4.301839554482745e-07,
"loss": 0.0646,
"step": 4130
},
{
"epoch": 0.8776764892940428,
"grad_norm": 0.17734811072141277,
"learning_rate": 4.160713307010339e-07,
"loss": 0.0627,
"step": 4140
},
{
"epoch": 0.8797964808140768,
"grad_norm": 0.14667184777407763,
"learning_rate": 4.021840187874831e-07,
"loss": 0.0665,
"step": 4150
},
{
"epoch": 0.8819164723341106,
"grad_norm": 0.13247047796191325,
"learning_rate": 3.8852270230759715e-07,
"loss": 0.068,
"step": 4160
},
{
"epoch": 0.8840364638541446,
"grad_norm": 0.1318359100305846,
"learning_rate": 3.750880527530515e-07,
"loss": 0.0642,
"step": 4170
},
{
"epoch": 0.8861564553741785,
"grad_norm": 0.14660978947680608,
"learning_rate": 3.618807304742067e-07,
"loss": 0.064,
"step": 4180
},
{
"epoch": 0.8882764468942124,
"grad_norm": 0.16073812743121169,
"learning_rate": 3.4890138464765854e-07,
"loss": 0.0624,
"step": 4190
},
{
"epoch": 0.8903964384142463,
"grad_norm": 0.1317816842544379,
"learning_rate": 3.361506532443265e-07,
"loss": 0.0637,
"step": 4200
},
{
"epoch": 0.8925164299342803,
"grad_norm": 0.17027046123997486,
"learning_rate": 3.2362916299809643e-07,
"loss": 0.066,
"step": 4210
},
{
"epoch": 0.8946364214543142,
"grad_norm": 0.13836358324093678,
"learning_rate": 3.113375293750137e-07,
"loss": 0.0676,
"step": 4220
},
{
"epoch": 0.896756412974348,
"grad_norm": 0.13744563225532516,
"learning_rate": 2.992763565430301e-07,
"loss": 0.064,
"step": 4230
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.13017524095673055,
"learning_rate": 2.874462373423115e-07,
"loss": 0.0682,
"step": 4240
},
{
"epoch": 0.900996396014416,
"grad_norm": 0.13351598157626,
"learning_rate": 2.7584775325609546e-07,
"loss": 0.0684,
"step": 4250
},
{
"epoch": 0.9031163875344499,
"grad_norm": 0.14042699267228834,
"learning_rate": 2.6448147438210725e-07,
"loss": 0.0652,
"step": 4260
},
{
"epoch": 0.9052363790544837,
"grad_norm": 0.1347635143628056,
"learning_rate": 2.5334795940454514e-07,
"loss": 0.0687,
"step": 4270
},
{
"epoch": 0.9073563705745177,
"grad_norm": 0.14711313054197867,
"learning_rate": 2.424477555666105e-07,
"loss": 0.0642,
"step": 4280
},
{
"epoch": 0.9094763620945516,
"grad_norm": 0.12669127686809334,
"learning_rate": 2.3178139864361514e-07,
"loss": 0.0662,
"step": 4290
},
{
"epoch": 0.9115963536145856,
"grad_norm": 0.1482558394168092,
"learning_rate": 2.213494129166477e-07,
"loss": 0.0663,
"step": 4300
},
{
"epoch": 0.9137163451346194,
"grad_norm": 0.13767908522932615,
"learning_rate": 2.111523111467978e-07,
"loss": 0.0662,
"step": 4310
},
{
"epoch": 0.9158363366546534,
"grad_norm": 0.1307957796839651,
"learning_rate": 2.0119059454995705e-07,
"loss": 0.0637,
"step": 4320
},
{
"epoch": 0.9179563281746873,
"grad_norm": 0.1387365413487702,
"learning_rate": 1.9146475277218247e-07,
"loss": 0.066,
"step": 4330
},
{
"epoch": 0.9200763196947213,
"grad_norm": 0.13783938524006778,
"learning_rate": 1.8197526386562637e-07,
"loss": 0.0656,
"step": 4340
},
{
"epoch": 0.9221963112147551,
"grad_norm": 0.15258547100463352,
"learning_rate": 1.7272259426504178e-07,
"loss": 0.0635,
"step": 4350
},
{
"epoch": 0.924316302734789,
"grad_norm": 0.12836303549043818,
"learning_rate": 1.6370719876485474e-07,
"loss": 0.0654,
"step": 4360
},
{
"epoch": 0.926436294254823,
"grad_norm": 0.16082006996334058,
"learning_rate": 1.5492952049680987e-07,
"loss": 0.0665,
"step": 4370
},
{
"epoch": 0.9285562857748569,
"grad_norm": 0.15509787140465903,
"learning_rate": 1.463899909081884e-07,
"loss": 0.0701,
"step": 4380
},
{
"epoch": 0.9306762772948908,
"grad_norm": 0.15582542247867595,
"learning_rate": 1.3808902974060234e-07,
"loss": 0.0663,
"step": 4390
},
{
"epoch": 0.9327962688149247,
"grad_norm": 0.1285740918117715,
"learning_rate": 1.3002704500936324e-07,
"loss": 0.0666,
"step": 4400
},
{
"epoch": 0.9349162603349587,
"grad_norm": 0.12844195029643576,
"learning_rate": 1.222044329834271e-07,
"loss": 0.0649,
"step": 4410
},
{
"epoch": 0.9370362518549926,
"grad_norm": 0.14024654119089836,
"learning_rate": 1.1462157816591435e-07,
"loss": 0.0653,
"step": 4420
},
{
"epoch": 0.9391562433750265,
"grad_norm": 0.13569299209873553,
"learning_rate": 1.0727885327521448e-07,
"loss": 0.0636,
"step": 4430
},
{
"epoch": 0.9412762348950604,
"grad_norm": 0.1506002944720704,
"learning_rate": 1.0017661922666177e-07,
"loss": 0.0666,
"step": 4440
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.1454846226979092,
"learning_rate": 9.331522511479785e-08,
"loss": 0.0666,
"step": 4450
},
{
"epoch": 0.9455162179351283,
"grad_norm": 0.12532906503884286,
"learning_rate": 8.669500819621424e-08,
"loss": 0.0633,
"step": 4460
},
{
"epoch": 0.9476362094551621,
"grad_norm": 0.13843242796856053,
"learning_rate": 8.031629387296958e-08,
"loss": 0.065,
"step": 4470
},
{
"epoch": 0.9497562009751961,
"grad_norm": 0.14800544603658328,
"learning_rate": 7.41793956766007e-08,
"loss": 0.068,
"step": 4480
},
{
"epoch": 0.95187619249523,
"grad_norm": 0.1302604634301598,
"learning_rate": 6.828461525271057e-08,
"loss": 0.0669,
"step": 4490
},
{
"epoch": 0.953996184015264,
"grad_norm": 0.1317179792652165,
"learning_rate": 6.26322423461384e-08,
"loss": 0.0669,
"step": 4500
},
{
"epoch": 0.9561161755352978,
"grad_norm": 0.14912594874221324,
"learning_rate": 5.7222554786722784e-08,
"loss": 0.0656,
"step": 4510
},
{
"epoch": 0.9582361670553318,
"grad_norm": 0.14837310552453115,
"learning_rate": 5.20558184756409e-08,
"loss": 0.0637,
"step": 4520
},
{
"epoch": 0.9603561585753657,
"grad_norm": 0.13757532372594639,
"learning_rate": 4.7132287372341764e-08,
"loss": 0.0648,
"step": 4530
},
{
"epoch": 0.9624761500953997,
"grad_norm": 0.1412166618656987,
"learning_rate": 4.245220348206347e-08,
"loss": 0.0652,
"step": 4540
},
{
"epoch": 0.9645961416154335,
"grad_norm": 0.13183693219990808,
"learning_rate": 3.801579684393486e-08,
"loss": 0.0641,
"step": 4550
},
{
"epoch": 0.9667161331354674,
"grad_norm": 0.13113013926781358,
"learning_rate": 3.382328551967296e-08,
"loss": 0.062,
"step": 4560
},
{
"epoch": 0.9688361246555014,
"grad_norm": 0.1413505271426179,
"learning_rate": 2.9874875582860395e-08,
"loss": 0.0645,
"step": 4570
},
{
"epoch": 0.9709561161755353,
"grad_norm": 0.1412996190385206,
"learning_rate": 2.6170761108818554e-08,
"loss": 0.0663,
"step": 4580
},
{
"epoch": 0.9730761076955692,
"grad_norm": 0.12681619081672574,
"learning_rate": 2.2711124165069043e-08,
"loss": 0.0642,
"step": 4590
},
{
"epoch": 0.9751960992156031,
"grad_norm": 0.12962052445070302,
"learning_rate": 1.949613480238255e-08,
"loss": 0.069,
"step": 4600
},
{
"epoch": 0.9773160907356371,
"grad_norm": 0.12947117948159637,
"learning_rate": 1.652595104642052e-08,
"loss": 0.0664,
"step": 4610
},
{
"epoch": 0.979436082255671,
"grad_norm": 0.1306635495902112,
"learning_rate": 1.3800718889970255e-08,
"loss": 0.0631,
"step": 4620
},
{
"epoch": 0.9815560737757049,
"grad_norm": 0.14160335253570747,
"learning_rate": 1.1320572285765663e-08,
"loss": 0.0655,
"step": 4630
},
{
"epoch": 0.9836760652957388,
"grad_norm": 0.1331013714780897,
"learning_rate": 9.085633139905292e-09,
"loss": 0.0679,
"step": 4640
},
{
"epoch": 0.9857960568157728,
"grad_norm": 0.1521352520849485,
"learning_rate": 7.096011305859352e-09,
"loss": 0.0659,
"step": 4650
},
{
"epoch": 0.9879160483358067,
"grad_norm": 0.1490651261311961,
"learning_rate": 5.351804579070696e-09,
"loss": 0.0663,
"step": 4660
},
{
"epoch": 0.9900360398558405,
"grad_norm": 0.14100706470294022,
"learning_rate": 3.853098692147006e-09,
"loss": 0.0658,
"step": 4670
},
{
"epoch": 0.9921560313758745,
"grad_norm": 0.1375637676528597,
"learning_rate": 2.5999673106480438e-09,
"loss": 0.0638,
"step": 4680
},
{
"epoch": 0.9942760228959084,
"grad_norm": 0.1362445388806314,
"learning_rate": 1.5924720294641093e-09,
"loss": 0.0645,
"step": 4690
},
{
"epoch": 0.9963960144159424,
"grad_norm": 0.12875364924264648,
"learning_rate": 8.306623697884597e-10,
"loss": 0.0669,
"step": 4700
},
{
"epoch": 0.9985160059359762,
"grad_norm": 0.12918021464281548,
"learning_rate": 3.1457577668259074e-10,
"loss": 0.0658,
"step": 4710
},
{
"epoch": 1.0,
"step": 4717,
"total_flos": 3995069728161792.0,
"train_loss": 0.10921624170816473,
"train_runtime": 69121.8578,
"train_samples_per_second": 0.955,
"train_steps_per_second": 0.068
}
],
"logging_steps": 10,
"max_steps": 4717,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3995069728161792.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}