3373 lines
82 KiB
JSON
3373 lines
82 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 1000,
|
|
"global_step": 4717,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.00211999152003392,
|
|
"grad_norm": 32.50752996739497,
|
|
"learning_rate": 1.6949152542372883e-07,
|
|
"loss": 3.7461,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.00423998304006784,
|
|
"grad_norm": 19.7783715335098,
|
|
"learning_rate": 5.93220338983051e-07,
|
|
"loss": 3.5757,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.006359974560101759,
|
|
"grad_norm": 9.354428743861847,
|
|
"learning_rate": 1.016949152542373e-06,
|
|
"loss": 2.7484,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.00847996608013568,
|
|
"grad_norm": 5.7293369283524935,
|
|
"learning_rate": 1.4406779661016951e-06,
|
|
"loss": 1.9415,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0105999576001696,
|
|
"grad_norm": 6.878907941943918,
|
|
"learning_rate": 1.8644067796610171e-06,
|
|
"loss": 1.2673,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.012719949120203519,
|
|
"grad_norm": 8.215857672525798,
|
|
"learning_rate": 2.288135593220339e-06,
|
|
"loss": 0.8349,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.014839940640237439,
|
|
"grad_norm": 2.0985232088184698,
|
|
"learning_rate": 2.7118644067796613e-06,
|
|
"loss": 0.6788,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.01695993216027136,
|
|
"grad_norm": 4.181552201854574,
|
|
"learning_rate": 3.135593220338983e-06,
|
|
"loss": 0.5306,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.01907992368030528,
|
|
"grad_norm": 1.3578290731521812,
|
|
"learning_rate": 3.5593220338983053e-06,
|
|
"loss": 0.428,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.0211999152003392,
|
|
"grad_norm": 8.019239361403569,
|
|
"learning_rate": 3.9830508474576275e-06,
|
|
"loss": 0.3659,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.02331990672037312,
|
|
"grad_norm": 1.0371743917362437,
|
|
"learning_rate": 4.40677966101695e-06,
|
|
"loss": 0.3098,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.025439898240407037,
|
|
"grad_norm": 1.0725520171241911,
|
|
"learning_rate": 4.830508474576272e-06,
|
|
"loss": 0.272,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.027559889760440957,
|
|
"grad_norm": 4.320619473460736,
|
|
"learning_rate": 5.254237288135594e-06,
|
|
"loss": 0.2408,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.029679881280474878,
|
|
"grad_norm": 9.318265850779325,
|
|
"learning_rate": 5.677966101694916e-06,
|
|
"loss": 0.2177,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.0317998728005088,
|
|
"grad_norm": 0.9802919759217678,
|
|
"learning_rate": 6.1016949152542385e-06,
|
|
"loss": 0.2049,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.03391986432054272,
|
|
"grad_norm": 2.4459592955831337,
|
|
"learning_rate": 6.52542372881356e-06,
|
|
"loss": 0.1998,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.03603985584057664,
|
|
"grad_norm": 1.5882881176464134,
|
|
"learning_rate": 6.949152542372882e-06,
|
|
"loss": 0.1803,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.03815984736061056,
|
|
"grad_norm": 3.036655132788213,
|
|
"learning_rate": 7.372881355932204e-06,
|
|
"loss": 0.1643,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.04027983888064448,
|
|
"grad_norm": 0.4963099670959003,
|
|
"learning_rate": 7.796610169491526e-06,
|
|
"loss": 0.1595,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.0423998304006784,
|
|
"grad_norm": 0.8389581242220279,
|
|
"learning_rate": 8.220338983050849e-06,
|
|
"loss": 0.1556,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.04451982192071232,
|
|
"grad_norm": 0.5257252909788135,
|
|
"learning_rate": 8.64406779661017e-06,
|
|
"loss": 0.1471,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.04663981344074624,
|
|
"grad_norm": 0.3592742330584675,
|
|
"learning_rate": 9.067796610169493e-06,
|
|
"loss": 0.1374,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.048759804960780154,
|
|
"grad_norm": 0.3187507665176873,
|
|
"learning_rate": 9.491525423728815e-06,
|
|
"loss": 0.1398,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.050879796480814074,
|
|
"grad_norm": 0.3482668638560697,
|
|
"learning_rate": 9.915254237288137e-06,
|
|
"loss": 0.1293,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.052999788000847994,
|
|
"grad_norm": 0.44242263127368797,
|
|
"learning_rate": 9.999921355437334e-06,
|
|
"loss": 0.1281,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.055119779520881915,
|
|
"grad_norm": 0.43066449617717567,
|
|
"learning_rate": 9.999601866141578e-06,
|
|
"loss": 0.1236,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.057239771040915835,
|
|
"grad_norm": 0.2632241978204935,
|
|
"learning_rate": 9.999036632519274e-06,
|
|
"loss": 0.1198,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.059359762560949755,
|
|
"grad_norm": 0.5107458203310429,
|
|
"learning_rate": 9.998225682353224e-06,
|
|
"loss": 0.1219,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.061479754080983676,
|
|
"grad_norm": 0.4495473347031151,
|
|
"learning_rate": 9.997169055503885e-06,
|
|
"loss": 0.1215,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.0635997456010176,
|
|
"grad_norm": 0.852881887488011,
|
|
"learning_rate": 9.995866803907402e-06,
|
|
"loss": 0.1113,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.06571973712105152,
|
|
"grad_norm": 0.3612657381641809,
|
|
"learning_rate": 9.99431899157306e-06,
|
|
"loss": 0.1111,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.06783972864108544,
|
|
"grad_norm": 0.44988476523435184,
|
|
"learning_rate": 9.992525694580135e-06,
|
|
"loss": 0.1072,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.06995972016111936,
|
|
"grad_norm": 0.29666583291365195,
|
|
"learning_rate": 9.990487001074161e-06,
|
|
"loss": 0.1124,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.07207971168115328,
|
|
"grad_norm": 0.28345381185455476,
|
|
"learning_rate": 9.988203011262589e-06,
|
|
"loss": 0.1075,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.0741997032011872,
|
|
"grad_norm": 0.3252373191017647,
|
|
"learning_rate": 9.985673837409865e-06,
|
|
"loss": 0.1012,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.07631969472122112,
|
|
"grad_norm": 0.34083940392900397,
|
|
"learning_rate": 9.982899603831912e-06,
|
|
"loss": 0.1031,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.07843968624125504,
|
|
"grad_norm": 0.4836055260534099,
|
|
"learning_rate": 9.979880446890025e-06,
|
|
"loss": 0.0996,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.08055967776128896,
|
|
"grad_norm": 0.3984011129854127,
|
|
"learning_rate": 9.976616514984152e-06,
|
|
"loss": 0.1009,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.08267966928132288,
|
|
"grad_norm": 0.35365486849367245,
|
|
"learning_rate": 9.973107968545623e-06,
|
|
"loss": 0.0976,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.0847996608013568,
|
|
"grad_norm": 0.30027024887187254,
|
|
"learning_rate": 9.969354980029243e-06,
|
|
"loss": 0.0969,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.08691965232139072,
|
|
"grad_norm": 0.25355683531168066,
|
|
"learning_rate": 9.96535773390483e-06,
|
|
"loss": 0.1002,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.08903964384142464,
|
|
"grad_norm": 0.20391383326557452,
|
|
"learning_rate": 9.961116426648138e-06,
|
|
"loss": 0.0995,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.09115963536145856,
|
|
"grad_norm": 0.29085751915140184,
|
|
"learning_rate": 9.956631266731207e-06,
|
|
"loss": 0.0992,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.09327962688149248,
|
|
"grad_norm": 0.2774668505220891,
|
|
"learning_rate": 9.951902474612112e-06,
|
|
"loss": 0.0981,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.09539961840152639,
|
|
"grad_norm": 0.23046127720501614,
|
|
"learning_rate": 9.946930282724128e-06,
|
|
"loss": 0.0946,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.09751960992156031,
|
|
"grad_norm": 0.21609489095737885,
|
|
"learning_rate": 9.941714935464303e-06,
|
|
"loss": 0.0903,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.09963960144159423,
|
|
"grad_norm": 0.3223533405901417,
|
|
"learning_rate": 9.936256689181454e-06,
|
|
"loss": 0.0996,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.10175959296162815,
|
|
"grad_norm": 0.33768759021085587,
|
|
"learning_rate": 9.930555812163552e-06,
|
|
"loss": 0.094,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.10387958448166207,
|
|
"grad_norm": 0.22549215526677444,
|
|
"learning_rate": 9.924612584624545e-06,
|
|
"loss": 0.094,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.10599957600169599,
|
|
"grad_norm": 0.20645922834306707,
|
|
"learning_rate": 9.918427298690585e-06,
|
|
"loss": 0.0909,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.10811956752172991,
|
|
"grad_norm": 0.3341574730006992,
|
|
"learning_rate": 9.912000258385669e-06,
|
|
"loss": 0.0873,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.11023955904176383,
|
|
"grad_norm": 0.22497042484649127,
|
|
"learning_rate": 9.905331779616683e-06,
|
|
"loss": 0.091,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.11235955056179775,
|
|
"grad_norm": 0.25629246939970207,
|
|
"learning_rate": 9.898422190157897e-06,
|
|
"loss": 0.0908,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.11447954208183167,
|
|
"grad_norm": 0.3221978930825859,
|
|
"learning_rate": 9.891271829634837e-06,
|
|
"loss": 0.0958,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.11659953360186559,
|
|
"grad_norm": 0.24323805543228721,
|
|
"learning_rate": 9.883881049507592e-06,
|
|
"loss": 0.0931,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.11871952512189951,
|
|
"grad_norm": 1.4298692529820525,
|
|
"learning_rate": 9.876250213053542e-06,
|
|
"loss": 0.0899,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.12083951664193343,
|
|
"grad_norm": 0.2343862275786179,
|
|
"learning_rate": 9.868379695349514e-06,
|
|
"loss": 0.0954,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.12295950816196735,
|
|
"grad_norm": 0.2786132714201901,
|
|
"learning_rate": 9.860269883253321e-06,
|
|
"loss": 0.0909,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.12507949968200127,
|
|
"grad_norm": 0.3051144098888441,
|
|
"learning_rate": 9.851921175384769e-06,
|
|
"loss": 0.0875,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.1271994912020352,
|
|
"grad_norm": 0.19169696652783166,
|
|
"learning_rate": 9.843333982106052e-06,
|
|
"loss": 0.0877,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.1293194827220691,
|
|
"grad_norm": 0.17535067812063176,
|
|
"learning_rate": 9.834508725501584e-06,
|
|
"loss": 0.088,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.13143947424210303,
|
|
"grad_norm": 0.24821593393490932,
|
|
"learning_rate": 9.825445839357256e-06,
|
|
"loss": 0.0869,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.13355946576213695,
|
|
"grad_norm": 0.25345703968329003,
|
|
"learning_rate": 9.816145769139107e-06,
|
|
"loss": 0.0882,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.13567945728217087,
|
|
"grad_norm": 0.2081953809171809,
|
|
"learning_rate": 9.806608971971436e-06,
|
|
"loss": 0.0862,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.1377994488022048,
|
|
"grad_norm": 0.18784011735994216,
|
|
"learning_rate": 9.796835916614329e-06,
|
|
"loss": 0.0872,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.13991944032223871,
|
|
"grad_norm": 0.24880072040487056,
|
|
"learning_rate": 9.786827083440616e-06,
|
|
"loss": 0.0845,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.14203943184227263,
|
|
"grad_norm": 0.24058730019755548,
|
|
"learning_rate": 9.776582964412267e-06,
|
|
"loss": 0.0862,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.14415942336230655,
|
|
"grad_norm": 0.2565530024882537,
|
|
"learning_rate": 9.766104063056201e-06,
|
|
"loss": 0.0867,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.14627941488234047,
|
|
"grad_norm": 0.2716229887520234,
|
|
"learning_rate": 9.75539089443954e-06,
|
|
"loss": 0.0847,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.1483994064023744,
|
|
"grad_norm": 0.18039433691787665,
|
|
"learning_rate": 9.7444439851443e-06,
|
|
"loss": 0.084,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.15051939792240832,
|
|
"grad_norm": 0.19975307362406894,
|
|
"learning_rate": 9.733263873241494e-06,
|
|
"loss": 0.085,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.15263938944244224,
|
|
"grad_norm": 0.20672720831030839,
|
|
"learning_rate": 9.721851108264692e-06,
|
|
"loss": 0.0854,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.15475938096247616,
|
|
"grad_norm": 0.22685728560638968,
|
|
"learning_rate": 9.710206251183015e-06,
|
|
"loss": 0.0822,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.15687937248251008,
|
|
"grad_norm": 0.19518820234055553,
|
|
"learning_rate": 9.698329874373547e-06,
|
|
"loss": 0.0841,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.158999364002544,
|
|
"grad_norm": 0.1711051072902191,
|
|
"learning_rate": 9.686222561593218e-06,
|
|
"loss": 0.0813,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.16111935552257792,
|
|
"grad_norm": 0.21782045297391434,
|
|
"learning_rate": 9.6738849079501e-06,
|
|
"loss": 0.0811,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.16323934704261184,
|
|
"grad_norm": 0.24001365708386638,
|
|
"learning_rate": 9.661317519874156e-06,
|
|
"loss": 0.0839,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.16535933856264576,
|
|
"grad_norm": 0.24788434673856172,
|
|
"learning_rate": 9.648521015087437e-06,
|
|
"loss": 0.0821,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.16747933008267968,
|
|
"grad_norm": 0.2361652593535425,
|
|
"learning_rate": 9.63549602257372e-06,
|
|
"loss": 0.0815,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.1695993216027136,
|
|
"grad_norm": 0.20131220073696676,
|
|
"learning_rate": 9.622243182547584e-06,
|
|
"loss": 0.0814,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.17171931312274752,
|
|
"grad_norm": 0.1910839922084592,
|
|
"learning_rate": 9.608763146422947e-06,
|
|
"loss": 0.0805,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.17383930464278144,
|
|
"grad_norm": 0.19343246908459,
|
|
"learning_rate": 9.59505657678105e-06,
|
|
"loss": 0.0817,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.17595929616281536,
|
|
"grad_norm": 0.20795991387602794,
|
|
"learning_rate": 9.581124147337886e-06,
|
|
"loss": 0.0829,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.17807928768284928,
|
|
"grad_norm": 0.2384907168932879,
|
|
"learning_rate": 9.566966542911079e-06,
|
|
"loss": 0.0828,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.1801992792028832,
|
|
"grad_norm": 0.2661800926148604,
|
|
"learning_rate": 9.552584459386234e-06,
|
|
"loss": 0.0807,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.18231927072291712,
|
|
"grad_norm": 0.16793437681402207,
|
|
"learning_rate": 9.537978603682728e-06,
|
|
"loss": 0.0808,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.18443926224295104,
|
|
"grad_norm": 0.17934828150494173,
|
|
"learning_rate": 9.52314969371896e-06,
|
|
"loss": 0.084,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.18655925376298496,
|
|
"grad_norm": 0.2375412871314736,
|
|
"learning_rate": 9.50809845837707e-06,
|
|
"loss": 0.0816,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.18867924528301888,
|
|
"grad_norm": 0.19298906022439435,
|
|
"learning_rate": 9.492825637467103e-06,
|
|
"loss": 0.0823,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.19079923680305277,
|
|
"grad_norm": 0.21813774555943088,
|
|
"learning_rate": 9.47733198169065e-06,
|
|
"loss": 0.0783,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.1929192283230867,
|
|
"grad_norm": 0.20165689985494703,
|
|
"learning_rate": 9.461618252603956e-06,
|
|
"loss": 0.0799,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.19503921984312061,
|
|
"grad_norm": 0.19889073544853322,
|
|
"learning_rate": 9.44568522258048e-06,
|
|
"loss": 0.0824,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.19715921136315454,
|
|
"grad_norm": 0.1677587269136254,
|
|
"learning_rate": 9.42953367477292e-06,
|
|
"loss": 0.0817,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.19927920288318846,
|
|
"grad_norm": 0.27029655878775144,
|
|
"learning_rate": 9.413164403074744e-06,
|
|
"loss": 0.0771,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.20139919440322238,
|
|
"grad_norm": 0.20752899646185938,
|
|
"learning_rate": 9.398246569397352e-06,
|
|
"loss": 0.083,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.2035191859232563,
|
|
"grad_norm": 0.1755855686731489,
|
|
"learning_rate": 9.381465847779896e-06,
|
|
"loss": 0.0773,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.20563917744329022,
|
|
"grad_norm": 0.20944110815210132,
|
|
"learning_rate": 9.364469764939109e-06,
|
|
"loss": 0.0856,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.20775916896332414,
|
|
"grad_norm": 0.20414341977081546,
|
|
"learning_rate": 9.347259156279697e-06,
|
|
"loss": 0.0814,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.20987916048335806,
|
|
"grad_norm": 0.23534197904825535,
|
|
"learning_rate": 9.329834867750912e-06,
|
|
"loss": 0.0782,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.21199915200339198,
|
|
"grad_norm": 0.19706741061553582,
|
|
"learning_rate": 9.312197755804957e-06,
|
|
"loss": 0.0813,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.21199915200339198,
|
|
"eval_loss": 0.07808271795511246,
|
|
"eval_runtime": 489.1656,
|
|
"eval_samples_per_second": 4.183,
|
|
"eval_steps_per_second": 0.301,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.2141191435234259,
|
|
"grad_norm": 0.20255197961584584,
|
|
"learning_rate": 9.294348687354899e-06,
|
|
"loss": 0.0786,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.21623913504345982,
|
|
"grad_norm": 0.18756797354546684,
|
|
"learning_rate": 9.278104027838603e-06,
|
|
"loss": 0.0904,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.21835912656349374,
|
|
"grad_norm": 0.1803465926904638,
|
|
"learning_rate": 9.259854667654485e-06,
|
|
"loss": 0.0794,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.22047911808352766,
|
|
"grad_norm": 0.23334153257650003,
|
|
"learning_rate": 9.24139592377452e-06,
|
|
"loss": 0.0787,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.22259910960356158,
|
|
"grad_norm": 0.2179720848023895,
|
|
"learning_rate": 9.222728703497267e-06,
|
|
"loss": 0.082,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.2247191011235955,
|
|
"grad_norm": 0.19658693620368076,
|
|
"learning_rate": 9.203853924368488e-06,
|
|
"loss": 0.0774,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.22683909264362942,
|
|
"grad_norm": 0.19207266252736302,
|
|
"learning_rate": 9.18477251413603e-06,
|
|
"loss": 0.075,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.22895908416366334,
|
|
"grad_norm": 0.26026047947148445,
|
|
"learning_rate": 9.165485410704238e-06,
|
|
"loss": 0.0767,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.23107907568369726,
|
|
"grad_norm": 0.20808377225507274,
|
|
"learning_rate": 9.145993562087848e-06,
|
|
"loss": 0.0784,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.23319906720373118,
|
|
"grad_norm": 0.18546577944881873,
|
|
"learning_rate": 9.12629792636539e-06,
|
|
"loss": 0.0761,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.2353190587237651,
|
|
"grad_norm": 0.17056106678355273,
|
|
"learning_rate": 9.1063994716321e-06,
|
|
"loss": 0.079,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.23743905024379902,
|
|
"grad_norm": 0.23748473849259574,
|
|
"learning_rate": 9.086299175952327e-06,
|
|
"loss": 0.0769,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.23955904176383294,
|
|
"grad_norm": 0.21686782311110087,
|
|
"learning_rate": 9.065998027311467e-06,
|
|
"loss": 0.0783,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.24167903328386686,
|
|
"grad_norm": 0.18252100885280442,
|
|
"learning_rate": 9.045497023567396e-06,
|
|
"loss": 0.08,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.24379902480390078,
|
|
"grad_norm": 0.24544524196615075,
|
|
"learning_rate": 9.024797172401426e-06,
|
|
"loss": 0.08,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.2459190163239347,
|
|
"grad_norm": 0.19645758097319682,
|
|
"learning_rate": 9.003899491268768e-06,
|
|
"loss": 0.0798,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.24803900784396862,
|
|
"grad_norm": 0.21351292431439828,
|
|
"learning_rate": 8.982805007348531e-06,
|
|
"loss": 0.0754,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.25015899936400254,
|
|
"grad_norm": 0.17911187274119753,
|
|
"learning_rate": 8.961514757493224e-06,
|
|
"loss": 0.0772,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.2522789908840365,
|
|
"grad_norm": 0.22385841458052336,
|
|
"learning_rate": 8.940029788177795e-06,
|
|
"loss": 0.0773,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.2543989824040704,
|
|
"grad_norm": 0.1683385451298309,
|
|
"learning_rate": 8.9183511554482e-06,
|
|
"loss": 0.0747,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.2565189739241043,
|
|
"grad_norm": 0.2372277626069385,
|
|
"learning_rate": 8.896479924869483e-06,
|
|
"loss": 0.076,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.2586389654441382,
|
|
"grad_norm": 0.2022762684424488,
|
|
"learning_rate": 8.874417171473415e-06,
|
|
"loss": 0.074,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.2607589569641721,
|
|
"grad_norm": 0.19827726532798173,
|
|
"learning_rate": 8.852163979705639e-06,
|
|
"loss": 0.0782,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.26287894848420607,
|
|
"grad_norm": 0.2343304853478425,
|
|
"learning_rate": 8.829721443372378e-06,
|
|
"loss": 0.0756,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.26499894000423996,
|
|
"grad_norm": 0.2217975820152699,
|
|
"learning_rate": 8.807090665586664e-06,
|
|
"loss": 0.0777,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.2671189315242739,
|
|
"grad_norm": 0.20059042051568582,
|
|
"learning_rate": 8.784272758714118e-06,
|
|
"loss": 0.0738,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.2692389230443078,
|
|
"grad_norm": 0.18668008819406168,
|
|
"learning_rate": 8.761268844318282e-06,
|
|
"loss": 0.0757,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.27135891456434175,
|
|
"grad_norm": 0.24287290051699115,
|
|
"learning_rate": 8.73808005310548e-06,
|
|
"loss": 0.0762,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.27347890608437564,
|
|
"grad_norm": 0.18654864916400832,
|
|
"learning_rate": 8.714707524869245e-06,
|
|
"loss": 0.0795,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.2755988976044096,
|
|
"grad_norm": 0.2341640233496617,
|
|
"learning_rate": 8.691152408434296e-06,
|
|
"loss": 0.0732,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.2777188891244435,
|
|
"grad_norm": 0.2082849165240921,
|
|
"learning_rate": 8.66741586160007e-06,
|
|
"loss": 0.0774,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.27983888064447743,
|
|
"grad_norm": 0.22099657211974957,
|
|
"learning_rate": 8.643499051083812e-06,
|
|
"loss": 0.0738,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.2819588721645113,
|
|
"grad_norm": 0.20266362710361852,
|
|
"learning_rate": 8.619403152463231e-06,
|
|
"loss": 0.0765,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.28407886368454527,
|
|
"grad_norm": 0.2061785256600231,
|
|
"learning_rate": 8.595129350118707e-06,
|
|
"loss": 0.0743,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.28619885520457916,
|
|
"grad_norm": 0.16451845890324332,
|
|
"learning_rate": 8.570678837175089e-06,
|
|
"loss": 0.0731,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.2883188467246131,
|
|
"grad_norm": 0.2039051062049253,
|
|
"learning_rate": 8.546052815443041e-06,
|
|
"loss": 0.075,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.290438838244647,
|
|
"grad_norm": 0.20907838865950076,
|
|
"learning_rate": 8.521252495359971e-06,
|
|
"loss": 0.0779,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.29255882976468095,
|
|
"grad_norm": 0.24041556209946813,
|
|
"learning_rate": 8.496279095930535e-06,
|
|
"loss": 0.0752,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.29467882128471484,
|
|
"grad_norm": 0.749878332883915,
|
|
"learning_rate": 8.471133844666721e-06,
|
|
"loss": 0.0736,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.2967988128047488,
|
|
"grad_norm": 0.20890550228302898,
|
|
"learning_rate": 8.445817977527513e-06,
|
|
"loss": 0.075,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.2989188043247827,
|
|
"grad_norm": 0.19623374633823948,
|
|
"learning_rate": 8.420332738858136e-06,
|
|
"loss": 0.0764,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.30103879584481663,
|
|
"grad_norm": 0.1803239880391424,
|
|
"learning_rate": 8.394679381328904e-06,
|
|
"loss": 0.0782,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.3031587873648505,
|
|
"grad_norm": 0.20782215083834218,
|
|
"learning_rate": 8.368859165873629e-06,
|
|
"loss": 0.075,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.30527877888488447,
|
|
"grad_norm": 0.15864327444836024,
|
|
"learning_rate": 8.342873361627663e-06,
|
|
"loss": 0.0736,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.30739877040491836,
|
|
"grad_norm": 0.16758113802075347,
|
|
"learning_rate": 8.316723245865503e-06,
|
|
"loss": 0.0743,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.3095187619249523,
|
|
"grad_norm": 0.18187491458078606,
|
|
"learning_rate": 8.290410103938015e-06,
|
|
"loss": 0.0763,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.3116387534449862,
|
|
"grad_norm": 0.21015841421612225,
|
|
"learning_rate": 8.263935229209255e-06,
|
|
"loss": 0.0778,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.31375874496502015,
|
|
"grad_norm": 0.18123545189165477,
|
|
"learning_rate": 8.237299922992894e-06,
|
|
"loss": 0.0737,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.31587873648505405,
|
|
"grad_norm": 0.22166407582198208,
|
|
"learning_rate": 8.210505494488257e-06,
|
|
"loss": 0.0747,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.317998728005088,
|
|
"grad_norm": 0.20907147507686014,
|
|
"learning_rate": 8.183553260715971e-06,
|
|
"loss": 0.0753,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.3201187195251219,
|
|
"grad_norm": 0.18137783305550276,
|
|
"learning_rate": 8.15644454645323e-06,
|
|
"loss": 0.076,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.32223871104515583,
|
|
"grad_norm": 0.18468699366411487,
|
|
"learning_rate": 8.129180684168683e-06,
|
|
"loss": 0.0756,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.3243587025651897,
|
|
"grad_norm": 0.25213749529106116,
|
|
"learning_rate": 8.101763013956933e-06,
|
|
"loss": 0.0746,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.3264786940852237,
|
|
"grad_norm": 0.2161642309592381,
|
|
"learning_rate": 8.074192883472667e-06,
|
|
"loss": 0.0759,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.32859868560525757,
|
|
"grad_norm": 0.20176374830183055,
|
|
"learning_rate": 8.04647164786442e-06,
|
|
"loss": 0.0731,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.3307186771252915,
|
|
"grad_norm": 0.1969763529692524,
|
|
"learning_rate": 8.01860066970797e-06,
|
|
"loss": 0.0747,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.3328386686453254,
|
|
"grad_norm": 0.2060140719520926,
|
|
"learning_rate": 7.990581318939346e-06,
|
|
"loss": 0.0776,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.33495866016535936,
|
|
"grad_norm": 0.19063310891295998,
|
|
"learning_rate": 7.962414972787513e-06,
|
|
"loss": 0.0732,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.33707865168539325,
|
|
"grad_norm": 0.32170213151067406,
|
|
"learning_rate": 7.934103015706665e-06,
|
|
"loss": 0.0718,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.3391986432054272,
|
|
"grad_norm": 0.3017816089089381,
|
|
"learning_rate": 7.905646839308171e-06,
|
|
"loss": 0.0713,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.3413186347254611,
|
|
"grad_norm": 0.23008639936227068,
|
|
"learning_rate": 7.877047842292193e-06,
|
|
"loss": 0.0761,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.34343862624549504,
|
|
"grad_norm": 0.21592149981422365,
|
|
"learning_rate": 7.84830743037891e-06,
|
|
"loss": 0.0743,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.34555861776552893,
|
|
"grad_norm": 0.19186122210621318,
|
|
"learning_rate": 7.819427016239447e-06,
|
|
"loss": 0.0727,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.3476786092855629,
|
|
"grad_norm": 0.19028094584781277,
|
|
"learning_rate": 7.790408019426424e-06,
|
|
"loss": 0.0732,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.34979860080559677,
|
|
"grad_norm": 0.18613264864125317,
|
|
"learning_rate": 7.761251866304176e-06,
|
|
"loss": 0.0735,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.3519185923256307,
|
|
"grad_norm": 0.25693606830513216,
|
|
"learning_rate": 7.731959989978667e-06,
|
|
"loss": 0.0761,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.3540385838456646,
|
|
"grad_norm": 0.21174972493824354,
|
|
"learning_rate": 7.702533830227024e-06,
|
|
"loss": 0.073,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.35615857536569856,
|
|
"grad_norm": 0.17143400242703105,
|
|
"learning_rate": 7.672974833426779e-06,
|
|
"loss": 0.0737,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.35827856688573245,
|
|
"grad_norm": 0.1896798767328968,
|
|
"learning_rate": 7.643284452484773e-06,
|
|
"loss": 0.0725,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.3603985584057664,
|
|
"grad_norm": 0.20989812969482,
|
|
"learning_rate": 7.613464146765748e-06,
|
|
"loss": 0.0728,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.3625185499258003,
|
|
"grad_norm": 0.19528230055400297,
|
|
"learning_rate": 7.583515382020603e-06,
|
|
"loss": 0.0732,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.36463854144583424,
|
|
"grad_norm": 0.21719433509159808,
|
|
"learning_rate": 7.5534396303143605e-06,
|
|
"loss": 0.0704,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.36675853296586813,
|
|
"grad_norm": 0.17922009396269356,
|
|
"learning_rate": 7.523238369953802e-06,
|
|
"loss": 0.0683,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.3688785244859021,
|
|
"grad_norm": 0.43299284172403135,
|
|
"learning_rate": 7.4929130854148105e-06,
|
|
"loss": 0.0724,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.370998516005936,
|
|
"grad_norm": 0.17140377305963891,
|
|
"learning_rate": 7.4624652672693984e-06,
|
|
"loss": 0.0748,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.3731185075259699,
|
|
"grad_norm": 0.21895097629186294,
|
|
"learning_rate": 7.43189641211245e-06,
|
|
"loss": 0.0731,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.3752384990460038,
|
|
"grad_norm": 0.217978788931641,
|
|
"learning_rate": 7.401208022488152e-06,
|
|
"loss": 0.0742,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.37735849056603776,
|
|
"grad_norm": 0.1778572661187928,
|
|
"learning_rate": 7.370401606816142e-06,
|
|
"loss": 0.0699,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.37947848208607166,
|
|
"grad_norm": 0.22879874120541474,
|
|
"learning_rate": 7.339478679317369e-06,
|
|
"loss": 0.0762,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.38159847360610555,
|
|
"grad_norm": 0.20685054508670736,
|
|
"learning_rate": 7.308440759939659e-06,
|
|
"loss": 0.0717,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.3837184651261395,
|
|
"grad_norm": 0.23785935863776791,
|
|
"learning_rate": 7.277289374283009e-06,
|
|
"loss": 0.0736,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.3858384566461734,
|
|
"grad_norm": 0.25317055966913016,
|
|
"learning_rate": 7.246026053524603e-06,
|
|
"loss": 0.0729,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.38795844816620734,
|
|
"grad_norm": 0.17578925874031257,
|
|
"learning_rate": 7.214652334343539e-06,
|
|
"loss": 0.0736,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.39007843968624123,
|
|
"grad_norm": 0.20527746180283624,
|
|
"learning_rate": 7.183169758845308e-06,
|
|
"loss": 0.0738,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.3921984312062752,
|
|
"grad_norm": 0.17762579012389196,
|
|
"learning_rate": 7.151579874485995e-06,
|
|
"loss": 0.0713,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.39431842272630907,
|
|
"grad_norm": 0.200855955585812,
|
|
"learning_rate": 7.119884233996208e-06,
|
|
"loss": 0.0712,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.396438414246343,
|
|
"grad_norm": 0.18153714925752748,
|
|
"learning_rate": 7.088084395304765e-06,
|
|
"loss": 0.0716,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.3985584057663769,
|
|
"grad_norm": 0.30888829292022363,
|
|
"learning_rate": 7.0561819214621186e-06,
|
|
"loss": 0.0709,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.40067839728641086,
|
|
"grad_norm": 0.1677914580623982,
|
|
"learning_rate": 7.024178380563517e-06,
|
|
"loss": 0.0686,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.40279838880644475,
|
|
"grad_norm": 0.23978151499288752,
|
|
"learning_rate": 6.99207534567194e-06,
|
|
"loss": 0.0733,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.4049183803264787,
|
|
"grad_norm": 0.1496936015385523,
|
|
"learning_rate": 6.959874394740775e-06,
|
|
"loss": 0.0703,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.4070383718465126,
|
|
"grad_norm": 0.17861927332885064,
|
|
"learning_rate": 6.927577110536251e-06,
|
|
"loss": 0.0709,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.40915836336654654,
|
|
"grad_norm": 0.20321672481515973,
|
|
"learning_rate": 6.895185080559649e-06,
|
|
"loss": 0.0718,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.41127835488658043,
|
|
"grad_norm": 0.1702613475343327,
|
|
"learning_rate": 6.862699896969262e-06,
|
|
"loss": 0.0726,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.4133983464066144,
|
|
"grad_norm": 0.19820271991017135,
|
|
"learning_rate": 6.830123156502147e-06,
|
|
"loss": 0.0722,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.4155183379266483,
|
|
"grad_norm": 0.21659807580282067,
|
|
"learning_rate": 6.7974564603956395e-06,
|
|
"loss": 0.072,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.4176383294466822,
|
|
"grad_norm": 0.2484602203487257,
|
|
"learning_rate": 6.7647014143086334e-06,
|
|
"loss": 0.0707,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.4197583209667161,
|
|
"grad_norm": 0.2428781266233325,
|
|
"learning_rate": 6.7318596282426796e-06,
|
|
"loss": 0.0726,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.42187831248675006,
|
|
"grad_norm": 0.19890188197759706,
|
|
"learning_rate": 6.6989327164628375e-06,
|
|
"loss": 0.0735,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.42399830400678395,
|
|
"grad_norm": 0.17618893341728534,
|
|
"learning_rate": 6.665922297418328e-06,
|
|
"loss": 0.0717,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.42399830400678395,
|
|
"eval_loss": 0.07119767367839813,
|
|
"eval_runtime": 489.4309,
|
|
"eval_samples_per_second": 4.18,
|
|
"eval_steps_per_second": 0.3,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.4261182955268179,
|
|
"grad_norm": 0.2226486798390119,
|
|
"learning_rate": 6.632829993662994e-06,
|
|
"loss": 0.0698,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.4282382870468518,
|
|
"grad_norm": 0.15591714439754456,
|
|
"learning_rate": 6.599657431775529e-06,
|
|
"loss": 0.073,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.43035827856688574,
|
|
"grad_norm": 0.18569107867432982,
|
|
"learning_rate": 6.566406242279546e-06,
|
|
"loss": 0.0701,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.43247827008691964,
|
|
"grad_norm": 0.2044929271888512,
|
|
"learning_rate": 6.53307805956342e-06,
|
|
"loss": 0.0684,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.4345982616069536,
|
|
"grad_norm": 0.1591048247213101,
|
|
"learning_rate": 6.4996745217999566e-06,
|
|
"loss": 0.0712,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.4367182531269875,
|
|
"grad_norm": 0.18457570554796743,
|
|
"learning_rate": 6.4661972708658715e-06,
|
|
"loss": 0.0682,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.4388382446470214,
|
|
"grad_norm": 0.18024866180958676,
|
|
"learning_rate": 6.4326479522610855e-06,
|
|
"loss": 0.0703,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.4409582361670553,
|
|
"grad_norm": 0.17393779181333482,
|
|
"learning_rate": 6.399028215027849e-06,
|
|
"loss": 0.0677,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.44307822768708927,
|
|
"grad_norm": 0.15822079895374294,
|
|
"learning_rate": 6.365339711669687e-06,
|
|
"loss": 0.0696,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.44519821920712316,
|
|
"grad_norm": 0.17783185791820674,
|
|
"learning_rate": 6.331584098070159e-06,
|
|
"loss": 0.0729,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.4473182107271571,
|
|
"grad_norm": 0.17784945554783102,
|
|
"learning_rate": 6.2977630334114904e-06,
|
|
"loss": 0.0706,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.449438202247191,
|
|
"grad_norm": 0.21655542057286598,
|
|
"learning_rate": 6.263878180093004e-06,
|
|
"loss": 0.0734,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.45155819376722495,
|
|
"grad_norm": 0.1933797514771672,
|
|
"learning_rate": 6.2299312036494134e-06,
|
|
"loss": 0.069,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.45367818528725884,
|
|
"grad_norm": 0.15757976242950295,
|
|
"learning_rate": 6.195923772668955e-06,
|
|
"loss": 0.0722,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.4557981768072928,
|
|
"grad_norm": 0.20409740685630307,
|
|
"learning_rate": 6.161857558711372e-06,
|
|
"loss": 0.0705,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.4579181683273267,
|
|
"grad_norm": 0.15041431962094184,
|
|
"learning_rate": 6.12773423622576e-06,
|
|
"loss": 0.0695,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.46003815984736063,
|
|
"grad_norm": 0.281897607782115,
|
|
"learning_rate": 6.0935554824682556e-06,
|
|
"loss": 0.0704,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.4621581513673945,
|
|
"grad_norm": 0.22084672726453938,
|
|
"learning_rate": 6.059322977419591e-06,
|
|
"loss": 0.0705,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.46427814288742847,
|
|
"grad_norm": 0.20019812476026203,
|
|
"learning_rate": 6.02503840370253e-06,
|
|
"loss": 0.0703,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.46639813440746236,
|
|
"grad_norm": 0.17909334136517222,
|
|
"learning_rate": 5.990703446499153e-06,
|
|
"loss": 0.0706,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.4685181259274963,
|
|
"grad_norm": 0.16644185623431462,
|
|
"learning_rate": 5.9563197934680325e-06,
|
|
"loss": 0.0746,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.4706381174475302,
|
|
"grad_norm": 0.23611788687622157,
|
|
"learning_rate": 5.921889134661272e-06,
|
|
"loss": 0.0715,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.47275810896756415,
|
|
"grad_norm": 0.1692697227784412,
|
|
"learning_rate": 5.887413162441438e-06,
|
|
"loss": 0.0703,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.47487810048759804,
|
|
"grad_norm": 0.16272992258196417,
|
|
"learning_rate": 5.852893571398385e-06,
|
|
"loss": 0.0703,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.476998092007632,
|
|
"grad_norm": 0.16602591153652455,
|
|
"learning_rate": 5.818332058265948e-06,
|
|
"loss": 0.0682,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.4791180835276659,
|
|
"grad_norm": 0.15187588978068958,
|
|
"learning_rate": 5.783730321838548e-06,
|
|
"loss": 0.0658,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.48123807504769983,
|
|
"grad_norm": 0.21228071370192056,
|
|
"learning_rate": 5.749090062887697e-06,
|
|
"loss": 0.07,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.4833580665677337,
|
|
"grad_norm": 0.1935655119130272,
|
|
"learning_rate": 5.714412984078393e-06,
|
|
"loss": 0.0699,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.48547805808776767,
|
|
"grad_norm": 0.1611360908597304,
|
|
"learning_rate": 5.679700789885436e-06,
|
|
"loss": 0.0715,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.48759804960780156,
|
|
"grad_norm": 0.2436477600612657,
|
|
"learning_rate": 5.644955186509641e-06,
|
|
"loss": 0.0689,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.4897180411278355,
|
|
"grad_norm": 0.24133950450204542,
|
|
"learning_rate": 5.610177881793976e-06,
|
|
"loss": 0.0693,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.4918380326478694,
|
|
"grad_norm": 0.20263042804166118,
|
|
"learning_rate": 5.5753705851396236e-06,
|
|
"loss": 0.0692,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.49395802416790335,
|
|
"grad_norm": 0.1758643154419978,
|
|
"learning_rate": 5.54053500742195e-06,
|
|
"loss": 0.0717,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.49607801568793725,
|
|
"grad_norm": 0.17041444204200845,
|
|
"learning_rate": 5.505672860906412e-06,
|
|
"loss": 0.0731,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.4981980072079712,
|
|
"grad_norm": 0.16318236116620452,
|
|
"learning_rate": 5.470785859164402e-06,
|
|
"loss": 0.0717,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.5003179987280051,
|
|
"grad_norm": 0.1684480788354608,
|
|
"learning_rate": 5.435875716989013e-06,
|
|
"loss": 0.0731,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.502437990248039,
|
|
"grad_norm": 0.16940752138117054,
|
|
"learning_rate": 5.400944150310754e-06,
|
|
"loss": 0.0686,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.504557981768073,
|
|
"grad_norm": 0.18543062436184285,
|
|
"learning_rate": 5.3659928761132084e-06,
|
|
"loss": 0.0712,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.5066779732881068,
|
|
"grad_norm": 0.18981591633920203,
|
|
"learning_rate": 5.3310236123486396e-06,
|
|
"loss": 0.0713,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.5087979648081408,
|
|
"grad_norm": 0.20107039697147697,
|
|
"learning_rate": 5.296038077853545e-06,
|
|
"loss": 0.0724,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.5109179563281747,
|
|
"grad_norm": 0.15561965976521763,
|
|
"learning_rate": 5.261037992264182e-06,
|
|
"loss": 0.0691,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.5130379478482086,
|
|
"grad_norm": 0.18814302974879546,
|
|
"learning_rate": 5.226025075932024e-06,
|
|
"loss": 0.0725,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.5151579393682425,
|
|
"grad_norm": 0.19409510196146995,
|
|
"learning_rate": 5.191001049839218e-06,
|
|
"loss": 0.0718,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.5172779308882764,
|
|
"grad_norm": 0.1948905204885732,
|
|
"learning_rate": 5.155967635513985e-06,
|
|
"loss": 0.0689,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.5193979224083104,
|
|
"grad_norm": 0.15910287909271553,
|
|
"learning_rate": 5.120926554946003e-06,
|
|
"loss": 0.07,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.5215179139283442,
|
|
"grad_norm": 0.16754971258212684,
|
|
"learning_rate": 5.0858795305017696e-06,
|
|
"loss": 0.0697,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.5236379054483782,
|
|
"grad_norm": 0.19912027070603852,
|
|
"learning_rate": 5.050828284839936e-06,
|
|
"loss": 0.0707,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.5257578969684121,
|
|
"grad_norm": 0.1770839557299797,
|
|
"learning_rate": 5.015774540826639e-06,
|
|
"loss": 0.0708,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.5278778884884461,
|
|
"grad_norm": 0.1879664250171856,
|
|
"learning_rate": 4.980720021450822e-06,
|
|
"loss": 0.0719,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.5299978800084799,
|
|
"grad_norm": 0.2038214395747643,
|
|
"learning_rate": 4.945666449739534e-06,
|
|
"loss": 0.0724,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.5321178715285139,
|
|
"grad_norm": 0.14313855897723543,
|
|
"learning_rate": 4.910615548673245e-06,
|
|
"loss": 0.0671,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.5342378630485478,
|
|
"grad_norm": 0.1667988114624785,
|
|
"learning_rate": 4.875569041101152e-06,
|
|
"loss": 0.0704,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.5363578545685818,
|
|
"grad_norm": 0.15027735583780358,
|
|
"learning_rate": 4.840528649656507e-06,
|
|
"loss": 0.0683,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.5384778460886156,
|
|
"grad_norm": 0.18256318111022773,
|
|
"learning_rate": 4.805496096671933e-06,
|
|
"loss": 0.0723,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.5405978376086495,
|
|
"grad_norm": 0.1581087386884916,
|
|
"learning_rate": 4.77047310409477e-06,
|
|
"loss": 0.0678,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.5427178291286835,
|
|
"grad_norm": 0.15440122313131024,
|
|
"learning_rate": 4.735461393402437e-06,
|
|
"loss": 0.0683,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.5448378206487174,
|
|
"grad_norm": 0.16903444896901648,
|
|
"learning_rate": 4.700462685517822e-06,
|
|
"loss": 0.069,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.5469578121687513,
|
|
"grad_norm": 0.16890011975539956,
|
|
"learning_rate": 4.665478700724684e-06,
|
|
"loss": 0.0684,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.5490778036887852,
|
|
"grad_norm": 0.20866405931584792,
|
|
"learning_rate": 4.630511158583102e-06,
|
|
"loss": 0.0698,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.5511977952088192,
|
|
"grad_norm": 0.17297937706938452,
|
|
"learning_rate": 4.595561777844954e-06,
|
|
"loss": 0.0683,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.5533177867288531,
|
|
"grad_norm": 0.19334365079463428,
|
|
"learning_rate": 4.560632276369436e-06,
|
|
"loss": 0.071,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.555437778248887,
|
|
"grad_norm": 0.1566806985390742,
|
|
"learning_rate": 4.525724371038616e-06,
|
|
"loss": 0.0681,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.5575577697689209,
|
|
"grad_norm": 0.19883045801104277,
|
|
"learning_rate": 4.4908397776730634e-06,
|
|
"loss": 0.0693,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.5596777612889549,
|
|
"grad_norm": 0.18015525710269312,
|
|
"learning_rate": 4.455980210947488e-06,
|
|
"loss": 0.0694,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.5617977528089888,
|
|
"grad_norm": 0.19996437083442065,
|
|
"learning_rate": 4.421147384306476e-06,
|
|
"loss": 0.0724,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.5639177443290226,
|
|
"grad_norm": 0.1576506824802755,
|
|
"learning_rate": 4.3863430098802674e-06,
|
|
"loss": 0.0676,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.5660377358490566,
|
|
"grad_norm": 0.15643885696863916,
|
|
"learning_rate": 4.35156879840059e-06,
|
|
"loss": 0.0711,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.5681577273690905,
|
|
"grad_norm": 0.1810361041257664,
|
|
"learning_rate": 4.3168264591165825e-06,
|
|
"loss": 0.0673,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.5702777188891245,
|
|
"grad_norm": 0.18342485320088614,
|
|
"learning_rate": 4.282117699710775e-06,
|
|
"loss": 0.0693,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.5723977104091583,
|
|
"grad_norm": 0.1715806871966692,
|
|
"learning_rate": 4.247444226215157e-06,
|
|
"loss": 0.0663,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.5745177019291923,
|
|
"grad_norm": 0.182483141924479,
|
|
"learning_rate": 4.212807742927315e-06,
|
|
"loss": 0.0679,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.5766376934492262,
|
|
"grad_norm": 0.17017972956968302,
|
|
"learning_rate": 4.178209952326659e-06,
|
|
"loss": 0.0708,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.5787576849692602,
|
|
"grad_norm": 0.17249912512947316,
|
|
"learning_rate": 4.143652554990756e-06,
|
|
"loss": 0.0665,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.580877676489294,
|
|
"grad_norm": 0.16601147330223942,
|
|
"learning_rate": 4.109137249511726e-06,
|
|
"loss": 0.0663,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.582997668009328,
|
|
"grad_norm": 0.18185554052245853,
|
|
"learning_rate": 4.074665732412753e-06,
|
|
"loss": 0.0678,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.5851176595293619,
|
|
"grad_norm": 0.16710135698081338,
|
|
"learning_rate": 4.040239698064712e-06,
|
|
"loss": 0.0679,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.5872376510493958,
|
|
"grad_norm": 0.14000708323466857,
|
|
"learning_rate": 4.005860838602863e-06,
|
|
"loss": 0.0697,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.5893576425694297,
|
|
"grad_norm": 0.14927867611572637,
|
|
"learning_rate": 3.971530843843694e-06,
|
|
"loss": 0.0688,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.5914776340894636,
|
|
"grad_norm": 0.15171093238665134,
|
|
"learning_rate": 3.9372514012018596e-06,
|
|
"loss": 0.0699,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.5935976256094976,
|
|
"grad_norm": 0.1804183937869093,
|
|
"learning_rate": 3.903024195607232e-06,
|
|
"loss": 0.0716,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.5957176171295315,
|
|
"grad_norm": 0.15215129472796332,
|
|
"learning_rate": 3.868850909422092e-06,
|
|
"loss": 0.0698,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.5978376086495654,
|
|
"grad_norm": 0.15088502756447672,
|
|
"learning_rate": 3.834733222358427e-06,
|
|
"loss": 0.0687,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.5999576001695993,
|
|
"grad_norm": 0.18595108870060142,
|
|
"learning_rate": 3.80067281139538e-06,
|
|
"loss": 0.0724,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.6020775916896333,
|
|
"grad_norm": 0.16954135988504473,
|
|
"learning_rate": 3.7666713506968052e-06,
|
|
"loss": 0.0691,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.6041975832096672,
|
|
"grad_norm": 0.18097495350034157,
|
|
"learning_rate": 3.7327305115289938e-06,
|
|
"loss": 0.066,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.606317574729701,
|
|
"grad_norm": 0.15243124437822092,
|
|
"learning_rate": 3.69885196217852e-06,
|
|
"loss": 0.0682,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.608437566249735,
|
|
"grad_norm": 0.1614824984725212,
|
|
"learning_rate": 3.66503736787024e-06,
|
|
"loss": 0.0637,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.6105575577697689,
|
|
"grad_norm": 0.16257796428966814,
|
|
"learning_rate": 3.6312883906854376e-06,
|
|
"loss": 0.0674,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.6126775492898029,
|
|
"grad_norm": 0.1786290065781706,
|
|
"learning_rate": 3.5976066894801386e-06,
|
|
"loss": 0.0657,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.6147975408098367,
|
|
"grad_norm": 0.1489676922818998,
|
|
"learning_rate": 3.5639939198035655e-06,
|
|
"loss": 0.0662,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.6169175323298707,
|
|
"grad_norm": 0.15203380554832843,
|
|
"learning_rate": 3.530451733816762e-06,
|
|
"loss": 0.0682,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.6190375238499046,
|
|
"grad_norm": 0.20295326197958097,
|
|
"learning_rate": 3.496981780211392e-06,
|
|
"loss": 0.0685,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.6211575153699386,
|
|
"grad_norm": 0.18783757751530197,
|
|
"learning_rate": 3.4635857041286922e-06,
|
|
"loss": 0.0696,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.6232775068899724,
|
|
"grad_norm": 0.14570978880022487,
|
|
"learning_rate": 3.430265147078616e-06,
|
|
"loss": 0.0702,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.6253974984100064,
|
|
"grad_norm": 0.14375379036873775,
|
|
"learning_rate": 3.3970217468591486e-06,
|
|
"loss": 0.0664,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.6275174899300403,
|
|
"grad_norm": 0.173702914525196,
|
|
"learning_rate": 3.3638571374758e-06,
|
|
"loss": 0.0657,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.6296374814500743,
|
|
"grad_norm": 0.15569914868699147,
|
|
"learning_rate": 3.3307729490612896e-06,
|
|
"loss": 0.0659,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.6317574729701081,
|
|
"grad_norm": 0.18252127530194195,
|
|
"learning_rate": 3.297770807795425e-06,
|
|
"loss": 0.0665,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.633877464490142,
|
|
"grad_norm": 0.18473089844858295,
|
|
"learning_rate": 3.2648523358251726e-06,
|
|
"loss": 0.068,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.635997456010176,
|
|
"grad_norm": 0.156014132437691,
|
|
"learning_rate": 3.232019151184913e-06,
|
|
"loss": 0.0664,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.635997456010176,
|
|
"eval_loss": 0.06693108379840851,
|
|
"eval_runtime": 487.8882,
|
|
"eval_samples_per_second": 4.194,
|
|
"eval_steps_per_second": 0.301,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.6381174475302098,
|
|
"grad_norm": 0.17571197418197826,
|
|
"learning_rate": 3.1992728677169214e-06,
|
|
"loss": 0.0688,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.6402374390502438,
|
|
"grad_norm": 0.14947601720845555,
|
|
"learning_rate": 3.1666150949920393e-06,
|
|
"loss": 0.0665,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.6423574305702777,
|
|
"grad_norm": 0.15331032877554068,
|
|
"learning_rate": 3.1340474382305585e-06,
|
|
"loss": 0.0655,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.6444774220903117,
|
|
"grad_norm": 0.18933623167552627,
|
|
"learning_rate": 3.101571498223317e-06,
|
|
"loss": 0.0649,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.6465974136103455,
|
|
"grad_norm": 0.15247439973376195,
|
|
"learning_rate": 3.069188871253026e-06,
|
|
"loss": 0.0649,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.6487174051303795,
|
|
"grad_norm": 0.16943772711604502,
|
|
"learning_rate": 3.0369011490157984e-06,
|
|
"loss": 0.0692,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.6508373966504134,
|
|
"grad_norm": 0.15521523385110902,
|
|
"learning_rate": 3.0047099185429142e-06,
|
|
"loss": 0.0654,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.6529573881704474,
|
|
"grad_norm": 0.14728105383234777,
|
|
"learning_rate": 2.9726167621228187e-06,
|
|
"loss": 0.0657,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.6550773796904812,
|
|
"grad_norm": 0.1832509363427932,
|
|
"learning_rate": 2.940623257223341e-06,
|
|
"loss": 0.0665,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.6571973712105151,
|
|
"grad_norm": 0.15168423601274655,
|
|
"learning_rate": 2.9087309764141613e-06,
|
|
"loss": 0.0665,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.6593173627305491,
|
|
"grad_norm": 0.1483275502933062,
|
|
"learning_rate": 2.876941487289522e-06,
|
|
"loss": 0.072,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.661437354250583,
|
|
"grad_norm": 0.15452416173310596,
|
|
"learning_rate": 2.845256352391157e-06,
|
|
"loss": 0.0687,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.6635573457706169,
|
|
"grad_norm": 0.16759174006680952,
|
|
"learning_rate": 2.8136771291315063e-06,
|
|
"loss": 0.0669,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.6656773372906508,
|
|
"grad_norm": 0.14998494541872762,
|
|
"learning_rate": 2.7822053697171588e-06,
|
|
"loss": 0.0666,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.6677973288106848,
|
|
"grad_norm": 0.17131639340630408,
|
|
"learning_rate": 2.7508426210725546e-06,
|
|
"loss": 0.0672,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.6699173203307187,
|
|
"grad_norm": 0.19399216153317256,
|
|
"learning_rate": 2.7195904247639544e-06,
|
|
"loss": 0.0662,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.6720373118507526,
|
|
"grad_norm": 0.15393012051599972,
|
|
"learning_rate": 2.68845031692366e-06,
|
|
"loss": 0.0685,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.6741573033707865,
|
|
"grad_norm": 0.1761419745993989,
|
|
"learning_rate": 2.657423828174518e-06,
|
|
"loss": 0.0644,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.6762772948908204,
|
|
"grad_norm": 0.16292970391303543,
|
|
"learning_rate": 2.626512483554678e-06,
|
|
"loss": 0.0673,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.6783972864108544,
|
|
"grad_norm": 0.15248743923822936,
|
|
"learning_rate": 2.595717802442636e-06,
|
|
"loss": 0.0636,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.6805172779308882,
|
|
"grad_norm": 0.17164291620759312,
|
|
"learning_rate": 2.5650412984825535e-06,
|
|
"loss": 0.0661,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.6826372694509222,
|
|
"grad_norm": 0.14003403542018764,
|
|
"learning_rate": 2.5344844795098577e-06,
|
|
"loss": 0.0644,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.6847572609709561,
|
|
"grad_norm": 0.13906331383996035,
|
|
"learning_rate": 2.5040488474771183e-06,
|
|
"loss": 0.0664,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.6868772524909901,
|
|
"grad_norm": 0.1654974091386292,
|
|
"learning_rate": 2.4737358983802417e-06,
|
|
"loss": 0.0657,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.6889972440110239,
|
|
"grad_norm": 0.17123238672779562,
|
|
"learning_rate": 2.443547122184921e-06,
|
|
"loss": 0.0684,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.6911172355310579,
|
|
"grad_norm": 0.13771748743849033,
|
|
"learning_rate": 2.416484617979397e-06,
|
|
"loss": 0.0718,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.6932372270510918,
|
|
"grad_norm": 0.14445999863423453,
|
|
"learning_rate": 2.386535853234254e-06,
|
|
"loss": 0.0703,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.6953572185711258,
|
|
"grad_norm": 0.15853968355485656,
|
|
"learning_rate": 2.356715547515228e-06,
|
|
"loss": 0.071,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.6974772100911596,
|
|
"grad_norm": 0.16059029746896103,
|
|
"learning_rate": 2.3270251665732236e-06,
|
|
"loss": 0.0682,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.6995972016111935,
|
|
"grad_norm": 0.1311794653898363,
|
|
"learning_rate": 2.2974661697729777e-06,
|
|
"loss": 0.0656,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.7017171931312275,
|
|
"grad_norm": 0.14003109623808868,
|
|
"learning_rate": 2.268040010021334e-06,
|
|
"loss": 0.0658,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.7038371846512614,
|
|
"grad_norm": 0.13679184892364368,
|
|
"learning_rate": 2.2387481336958243e-06,
|
|
"loss": 0.0676,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.7059571761712953,
|
|
"grad_norm": 0.1553274432738983,
|
|
"learning_rate": 2.2095919805735786e-06,
|
|
"loss": 0.0654,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.7080771676913292,
|
|
"grad_norm": 0.16561059697374547,
|
|
"learning_rate": 2.1805729837605533e-06,
|
|
"loss": 0.0677,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.7101971592113632,
|
|
"grad_norm": 0.1504235594519663,
|
|
"learning_rate": 2.1516925696210917e-06,
|
|
"loss": 0.0666,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.7123171507313971,
|
|
"grad_norm": 0.15286590047529391,
|
|
"learning_rate": 2.122952157707808e-06,
|
|
"loss": 0.0684,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.714437142251431,
|
|
"grad_norm": 0.1598473142296576,
|
|
"learning_rate": 2.0943531606918304e-06,
|
|
"loss": 0.0665,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.7165571337714649,
|
|
"grad_norm": 0.14455546823633267,
|
|
"learning_rate": 2.0658969842933386e-06,
|
|
"loss": 0.0694,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.7186771252914989,
|
|
"grad_norm": 0.14684140177879562,
|
|
"learning_rate": 2.0375850272124865e-06,
|
|
"loss": 0.063,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.7207971168115328,
|
|
"grad_norm": 0.1558774790348137,
|
|
"learning_rate": 2.0094186810606553e-06,
|
|
"loss": 0.0664,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.7229171083315666,
|
|
"grad_norm": 0.14875490426420004,
|
|
"learning_rate": 1.9813993302920325e-06,
|
|
"loss": 0.065,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.7250370998516006,
|
|
"grad_norm": 0.14376290785675833,
|
|
"learning_rate": 1.9535283521355807e-06,
|
|
"loss": 0.0645,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.7271570913716345,
|
|
"grad_norm": 0.1774603135257143,
|
|
"learning_rate": 1.925807116527336e-06,
|
|
"loss": 0.0628,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.7292770828916685,
|
|
"grad_norm": 0.15648377782580034,
|
|
"learning_rate": 1.8982369860430693e-06,
|
|
"loss": 0.0669,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.7313970744117023,
|
|
"grad_norm": 0.1410744866971639,
|
|
"learning_rate": 1.8708193158313175e-06,
|
|
"loss": 0.0652,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.7335170659317363,
|
|
"grad_norm": 0.13848781419789238,
|
|
"learning_rate": 1.8435554535467709e-06,
|
|
"loss": 0.0668,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.7356370574517702,
|
|
"grad_norm": 0.1565698593715598,
|
|
"learning_rate": 1.8164467392840306e-06,
|
|
"loss": 0.065,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.7377570489718042,
|
|
"grad_norm": 0.14483319470448863,
|
|
"learning_rate": 1.7894945055117462e-06,
|
|
"loss": 0.0689,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.739877040491838,
|
|
"grad_norm": 0.14690155750172293,
|
|
"learning_rate": 1.7627000770071062e-06,
|
|
"loss": 0.0643,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.741997032011872,
|
|
"grad_norm": 0.1523880735964551,
|
|
"learning_rate": 1.7360647707907447e-06,
|
|
"loss": 0.0666,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.7441170235319059,
|
|
"grad_norm": 0.16195859504926405,
|
|
"learning_rate": 1.7095898960619862e-06,
|
|
"loss": 0.0657,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.7462370150519398,
|
|
"grad_norm": 0.13638473122547523,
|
|
"learning_rate": 1.6832767541344974e-06,
|
|
"loss": 0.0655,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.7483570065719737,
|
|
"grad_norm": 0.13278921458936405,
|
|
"learning_rate": 1.6571266383723388e-06,
|
|
"loss": 0.0672,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.7504769980920076,
|
|
"grad_norm": 0.14406828983312037,
|
|
"learning_rate": 1.631140834126373e-06,
|
|
"loss": 0.066,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.7525969896120416,
|
|
"grad_norm": 0.1395384360254768,
|
|
"learning_rate": 1.6053206186710967e-06,
|
|
"loss": 0.0652,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.7547169811320755,
|
|
"grad_norm": 0.1579964196169218,
|
|
"learning_rate": 1.5796672611418645e-06,
|
|
"loss": 0.0656,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.7568369726521094,
|
|
"grad_norm": 0.1539176914379727,
|
|
"learning_rate": 1.5541820224724884e-06,
|
|
"loss": 0.0659,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.7589569641721433,
|
|
"grad_norm": 0.1432268965723713,
|
|
"learning_rate": 1.5288661553332802e-06,
|
|
"loss": 0.068,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.7610769556921773,
|
|
"grad_norm": 0.1475776868236256,
|
|
"learning_rate": 1.5037209040694668e-06,
|
|
"loss": 0.0674,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.7631969472122111,
|
|
"grad_norm": 0.13942686520284647,
|
|
"learning_rate": 1.4787475046400307e-06,
|
|
"loss": 0.0658,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.765316938732245,
|
|
"grad_norm": 0.16612542851996417,
|
|
"learning_rate": 1.4539471845569598e-06,
|
|
"loss": 0.0673,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.767436930252279,
|
|
"grad_norm": 0.13347560560880484,
|
|
"learning_rate": 1.4293211628249115e-06,
|
|
"loss": 0.0651,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.7695569217723129,
|
|
"grad_norm": 0.14290885311257007,
|
|
"learning_rate": 1.4048706498812936e-06,
|
|
"loss": 0.0632,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.7716769132923468,
|
|
"grad_norm": 0.15900916314465804,
|
|
"learning_rate": 1.380596847536772e-06,
|
|
"loss": 0.0662,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.7737969048123807,
|
|
"grad_norm": 0.15826198491620722,
|
|
"learning_rate": 1.3565009489161878e-06,
|
|
"loss": 0.0669,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.7759168963324147,
|
|
"grad_norm": 0.1338916105091316,
|
|
"learning_rate": 1.3325841383999321e-06,
|
|
"loss": 0.0661,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.7780368878524486,
|
|
"grad_norm": 0.14647123286090982,
|
|
"learning_rate": 1.3088475915657066e-06,
|
|
"loss": 0.0653,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.7801568793724825,
|
|
"grad_norm": 0.12519200539181277,
|
|
"learning_rate": 1.2852924751307555e-06,
|
|
"loss": 0.065,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.7822768708925164,
|
|
"grad_norm": 0.15737674167435736,
|
|
"learning_rate": 1.2619199468945215e-06,
|
|
"loss": 0.0647,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.7843968624125504,
|
|
"grad_norm": 0.14864208572307655,
|
|
"learning_rate": 1.2387311556817183e-06,
|
|
"loss": 0.0671,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.7865168539325843,
|
|
"grad_norm": 0.14386823191288503,
|
|
"learning_rate": 1.2157272412858811e-06,
|
|
"loss": 0.0672,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.7886368454526181,
|
|
"grad_norm": 0.15384247542083423,
|
|
"learning_rate": 1.192909334413338e-06,
|
|
"loss": 0.0654,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.7907568369726521,
|
|
"grad_norm": 0.14067508984359764,
|
|
"learning_rate": 1.1702785566276236e-06,
|
|
"loss": 0.0644,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.792876828492686,
|
|
"grad_norm": 0.1437217497105591,
|
|
"learning_rate": 1.1478360202943618e-06,
|
|
"loss": 0.0645,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.79499682001272,
|
|
"grad_norm": 0.15519712474428182,
|
|
"learning_rate": 1.1255828285265862e-06,
|
|
"loss": 0.0649,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.7971168115327538,
|
|
"grad_norm": 0.14145423148178207,
|
|
"learning_rate": 1.1035200751305176e-06,
|
|
"loss": 0.0653,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.7992368030527878,
|
|
"grad_norm": 0.13536631332448693,
|
|
"learning_rate": 1.0816488445518014e-06,
|
|
"loss": 0.0663,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.8013567945728217,
|
|
"grad_norm": 0.17054020151205723,
|
|
"learning_rate": 1.0599702118222054e-06,
|
|
"loss": 0.072,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.8034767860928557,
|
|
"grad_norm": 0.15685256143417375,
|
|
"learning_rate": 1.038485242506777e-06,
|
|
"loss": 0.0656,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.8055967776128895,
|
|
"grad_norm": 0.14095243780908379,
|
|
"learning_rate": 1.0171949926514706e-06,
|
|
"loss": 0.0647,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.8077167691329235,
|
|
"grad_norm": 0.13455803530480603,
|
|
"learning_rate": 9.96100508731232e-07,
|
|
"loss": 0.0656,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.8098367606529574,
|
|
"grad_norm": 0.1334874446417613,
|
|
"learning_rate": 9.75202827598576e-07,
|
|
"loss": 0.0646,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.8119567521729913,
|
|
"grad_norm": 0.14692274148907183,
|
|
"learning_rate": 9.54502976432606e-07,
|
|
"loss": 0.069,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.8140767436930252,
|
|
"grad_norm": 0.15341984926535815,
|
|
"learning_rate": 9.340019726885341e-07,
|
|
"loss": 0.0673,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.8161967352130591,
|
|
"grad_norm": 0.13727695678950919,
|
|
"learning_rate": 9.137008240476752e-07,
|
|
"loss": 0.0644,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.8183167267330931,
|
|
"grad_norm": 0.1301351491962901,
|
|
"learning_rate": 8.936005283679022e-07,
|
|
"loss": 0.0653,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.820436718253127,
|
|
"grad_norm": 0.15417587256216225,
|
|
"learning_rate": 8.737020736346114e-07,
|
|
"loss": 0.0687,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.8225567097731609,
|
|
"grad_norm": 0.13849301619933713,
|
|
"learning_rate": 8.540064379121537e-07,
|
|
"loss": 0.0643,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.8246767012931948,
|
|
"grad_norm": 0.12751613533813724,
|
|
"learning_rate": 8.345145892957635e-07,
|
|
"loss": 0.0675,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.8267966928132288,
|
|
"grad_norm": 0.13641022859652724,
|
|
"learning_rate": 8.152274858639709e-07,
|
|
"loss": 0.0644,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.8289166843332627,
|
|
"grad_norm": 0.13498806105829741,
|
|
"learning_rate": 7.961460756315131e-07,
|
|
"loss": 0.0661,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.8310366758532965,
|
|
"grad_norm": 0.1649413393713791,
|
|
"learning_rate": 7.772712965027329e-07,
|
|
"loss": 0.0681,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.8331566673733305,
|
|
"grad_norm": 0.14352566951876747,
|
|
"learning_rate": 7.586040762254831e-07,
|
|
"loss": 0.0666,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.8352766588933644,
|
|
"grad_norm": 0.13644803212350157,
|
|
"learning_rate": 7.40145332345516e-07,
|
|
"loss": 0.0703,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.8373966504133984,
|
|
"grad_norm": 0.13207832198888683,
|
|
"learning_rate": 7.218959721613966e-07,
|
|
"loss": 0.0677,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.8395166419334322,
|
|
"grad_norm": 0.12801202011992016,
|
|
"learning_rate": 7.038568926798972e-07,
|
|
"loss": 0.0669,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.8416366334534662,
|
|
"grad_norm": 0.1446132283031493,
|
|
"learning_rate": 6.860289805719051e-07,
|
|
"loss": 0.0657,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.8437566249735001,
|
|
"grad_norm": 0.15537910760985132,
|
|
"learning_rate": 6.684131121288506e-07,
|
|
"loss": 0.0645,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.8458766164935341,
|
|
"grad_norm": 0.138745128040943,
|
|
"learning_rate": 6.510101532196228e-07,
|
|
"loss": 0.0663,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.8479966080135679,
|
|
"grad_norm": 0.1422532471037774,
|
|
"learning_rate": 6.338209592480187e-07,
|
|
"loss": 0.0659,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.8479966080135679,
|
|
"eval_loss": 0.06505845487117767,
|
|
"eval_runtime": 488.4948,
|
|
"eval_samples_per_second": 4.188,
|
|
"eval_steps_per_second": 0.301,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.8501165995336019,
|
|
"grad_norm": 0.13051273275584138,
|
|
"learning_rate": 6.168463751106973e-07,
|
|
"loss": 0.0676,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.8522365910536358,
|
|
"grad_norm": 0.1584048424567531,
|
|
"learning_rate": 6.000872351556402e-07,
|
|
"loss": 0.0647,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.8543565825736698,
|
|
"grad_norm": 0.14980989572464892,
|
|
"learning_rate": 5.835443631411548e-07,
|
|
"loss": 0.0656,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.8564765740937036,
|
|
"grad_norm": 0.14192254512990504,
|
|
"learning_rate": 5.672185721953761e-07,
|
|
"loss": 0.0664,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.8585965656137375,
|
|
"grad_norm": 0.14482703076509035,
|
|
"learning_rate": 5.51110664776302e-07,
|
|
"loss": 0.0672,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.8607165571337715,
|
|
"grad_norm": 0.134823932240745,
|
|
"learning_rate": 5.352214326323485e-07,
|
|
"loss": 0.0675,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.8628365486538054,
|
|
"grad_norm": 0.13549221587996976,
|
|
"learning_rate": 5.195516567634345e-07,
|
|
"loss": 0.0643,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.8649565401738393,
|
|
"grad_norm": 0.14157113823645306,
|
|
"learning_rate": 5.041021073825935e-07,
|
|
"loss": 0.0681,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.8670765316938732,
|
|
"grad_norm": 0.13268190112303166,
|
|
"learning_rate": 4.888735438781156e-07,
|
|
"loss": 0.0634,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.8691965232139072,
|
|
"grad_norm": 0.15044371596526965,
|
|
"learning_rate": 4.738667147762177e-07,
|
|
"loss": 0.0638,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.8713165147339411,
|
|
"grad_norm": 0.15554565316213642,
|
|
"learning_rate": 4.590823577042597e-07,
|
|
"loss": 0.0673,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.873436506253975,
|
|
"grad_norm": 0.13924045324828926,
|
|
"learning_rate": 4.4452119935447844e-07,
|
|
"loss": 0.0684,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.8755564977740089,
|
|
"grad_norm": 0.14738905954975795,
|
|
"learning_rate": 4.301839554482745e-07,
|
|
"loss": 0.0646,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.8776764892940428,
|
|
"grad_norm": 0.17734811072141277,
|
|
"learning_rate": 4.160713307010339e-07,
|
|
"loss": 0.0627,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.8797964808140768,
|
|
"grad_norm": 0.14667184777407763,
|
|
"learning_rate": 4.021840187874831e-07,
|
|
"loss": 0.0665,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.8819164723341106,
|
|
"grad_norm": 0.13247047796191325,
|
|
"learning_rate": 3.8852270230759715e-07,
|
|
"loss": 0.068,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.8840364638541446,
|
|
"grad_norm": 0.1318359100305846,
|
|
"learning_rate": 3.750880527530515e-07,
|
|
"loss": 0.0642,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.8861564553741785,
|
|
"grad_norm": 0.14660978947680608,
|
|
"learning_rate": 3.618807304742067e-07,
|
|
"loss": 0.064,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.8882764468942124,
|
|
"grad_norm": 0.16073812743121169,
|
|
"learning_rate": 3.4890138464765854e-07,
|
|
"loss": 0.0624,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.8903964384142463,
|
|
"grad_norm": 0.1317816842544379,
|
|
"learning_rate": 3.361506532443265e-07,
|
|
"loss": 0.0637,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.8925164299342803,
|
|
"grad_norm": 0.17027046123997486,
|
|
"learning_rate": 3.2362916299809643e-07,
|
|
"loss": 0.066,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.8946364214543142,
|
|
"grad_norm": 0.13836358324093678,
|
|
"learning_rate": 3.113375293750137e-07,
|
|
"loss": 0.0676,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.896756412974348,
|
|
"grad_norm": 0.13744563225532516,
|
|
"learning_rate": 2.992763565430301e-07,
|
|
"loss": 0.064,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.898876404494382,
|
|
"grad_norm": 0.13017524095673055,
|
|
"learning_rate": 2.874462373423115e-07,
|
|
"loss": 0.0682,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.900996396014416,
|
|
"grad_norm": 0.13351598157626,
|
|
"learning_rate": 2.7584775325609546e-07,
|
|
"loss": 0.0684,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.9031163875344499,
|
|
"grad_norm": 0.14042699267228834,
|
|
"learning_rate": 2.6448147438210725e-07,
|
|
"loss": 0.0652,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.9052363790544837,
|
|
"grad_norm": 0.1347635143628056,
|
|
"learning_rate": 2.5334795940454514e-07,
|
|
"loss": 0.0687,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.9073563705745177,
|
|
"grad_norm": 0.14711313054197867,
|
|
"learning_rate": 2.424477555666105e-07,
|
|
"loss": 0.0642,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.9094763620945516,
|
|
"grad_norm": 0.12669127686809334,
|
|
"learning_rate": 2.3178139864361514e-07,
|
|
"loss": 0.0662,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.9115963536145856,
|
|
"grad_norm": 0.1482558394168092,
|
|
"learning_rate": 2.213494129166477e-07,
|
|
"loss": 0.0663,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.9137163451346194,
|
|
"grad_norm": 0.13767908522932615,
|
|
"learning_rate": 2.111523111467978e-07,
|
|
"loss": 0.0662,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.9158363366546534,
|
|
"grad_norm": 0.1307957796839651,
|
|
"learning_rate": 2.0119059454995705e-07,
|
|
"loss": 0.0637,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.9179563281746873,
|
|
"grad_norm": 0.1387365413487702,
|
|
"learning_rate": 1.9146475277218247e-07,
|
|
"loss": 0.066,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.9200763196947213,
|
|
"grad_norm": 0.13783938524006778,
|
|
"learning_rate": 1.8197526386562637e-07,
|
|
"loss": 0.0656,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.9221963112147551,
|
|
"grad_norm": 0.15258547100463352,
|
|
"learning_rate": 1.7272259426504178e-07,
|
|
"loss": 0.0635,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.924316302734789,
|
|
"grad_norm": 0.12836303549043818,
|
|
"learning_rate": 1.6370719876485474e-07,
|
|
"loss": 0.0654,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.926436294254823,
|
|
"grad_norm": 0.16082006996334058,
|
|
"learning_rate": 1.5492952049680987e-07,
|
|
"loss": 0.0665,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.9285562857748569,
|
|
"grad_norm": 0.15509787140465903,
|
|
"learning_rate": 1.463899909081884e-07,
|
|
"loss": 0.0701,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.9306762772948908,
|
|
"grad_norm": 0.15582542247867595,
|
|
"learning_rate": 1.3808902974060234e-07,
|
|
"loss": 0.0663,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.9327962688149247,
|
|
"grad_norm": 0.1285740918117715,
|
|
"learning_rate": 1.3002704500936324e-07,
|
|
"loss": 0.0666,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.9349162603349587,
|
|
"grad_norm": 0.12844195029643576,
|
|
"learning_rate": 1.222044329834271e-07,
|
|
"loss": 0.0649,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.9370362518549926,
|
|
"grad_norm": 0.14024654119089836,
|
|
"learning_rate": 1.1462157816591435e-07,
|
|
"loss": 0.0653,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.9391562433750265,
|
|
"grad_norm": 0.13569299209873553,
|
|
"learning_rate": 1.0727885327521448e-07,
|
|
"loss": 0.0636,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.9412762348950604,
|
|
"grad_norm": 0.1506002944720704,
|
|
"learning_rate": 1.0017661922666177e-07,
|
|
"loss": 0.0666,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.9433962264150944,
|
|
"grad_norm": 0.1454846226979092,
|
|
"learning_rate": 9.331522511479785e-08,
|
|
"loss": 0.0666,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.9455162179351283,
|
|
"grad_norm": 0.12532906503884286,
|
|
"learning_rate": 8.669500819621424e-08,
|
|
"loss": 0.0633,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.9476362094551621,
|
|
"grad_norm": 0.13843242796856053,
|
|
"learning_rate": 8.031629387296958e-08,
|
|
"loss": 0.065,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.9497562009751961,
|
|
"grad_norm": 0.14800544603658328,
|
|
"learning_rate": 7.41793956766007e-08,
|
|
"loss": 0.068,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.95187619249523,
|
|
"grad_norm": 0.1302604634301598,
|
|
"learning_rate": 6.828461525271057e-08,
|
|
"loss": 0.0669,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.953996184015264,
|
|
"grad_norm": 0.1317179792652165,
|
|
"learning_rate": 6.26322423461384e-08,
|
|
"loss": 0.0669,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.9561161755352978,
|
|
"grad_norm": 0.14912594874221324,
|
|
"learning_rate": 5.7222554786722784e-08,
|
|
"loss": 0.0656,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.9582361670553318,
|
|
"grad_norm": 0.14837310552453115,
|
|
"learning_rate": 5.20558184756409e-08,
|
|
"loss": 0.0637,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.9603561585753657,
|
|
"grad_norm": 0.13757532372594639,
|
|
"learning_rate": 4.7132287372341764e-08,
|
|
"loss": 0.0648,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.9624761500953997,
|
|
"grad_norm": 0.1412166618656987,
|
|
"learning_rate": 4.245220348206347e-08,
|
|
"loss": 0.0652,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.9645961416154335,
|
|
"grad_norm": 0.13183693219990808,
|
|
"learning_rate": 3.801579684393486e-08,
|
|
"loss": 0.0641,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.9667161331354674,
|
|
"grad_norm": 0.13113013926781358,
|
|
"learning_rate": 3.382328551967296e-08,
|
|
"loss": 0.062,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.9688361246555014,
|
|
"grad_norm": 0.1413505271426179,
|
|
"learning_rate": 2.9874875582860395e-08,
|
|
"loss": 0.0645,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.9709561161755353,
|
|
"grad_norm": 0.1412996190385206,
|
|
"learning_rate": 2.6170761108818554e-08,
|
|
"loss": 0.0663,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.9730761076955692,
|
|
"grad_norm": 0.12681619081672574,
|
|
"learning_rate": 2.2711124165069043e-08,
|
|
"loss": 0.0642,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.9751960992156031,
|
|
"grad_norm": 0.12962052445070302,
|
|
"learning_rate": 1.949613480238255e-08,
|
|
"loss": 0.069,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.9773160907356371,
|
|
"grad_norm": 0.12947117948159637,
|
|
"learning_rate": 1.652595104642052e-08,
|
|
"loss": 0.0664,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.979436082255671,
|
|
"grad_norm": 0.1306635495902112,
|
|
"learning_rate": 1.3800718889970255e-08,
|
|
"loss": 0.0631,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.9815560737757049,
|
|
"grad_norm": 0.14160335253570747,
|
|
"learning_rate": 1.1320572285765663e-08,
|
|
"loss": 0.0655,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.9836760652957388,
|
|
"grad_norm": 0.1331013714780897,
|
|
"learning_rate": 9.085633139905292e-09,
|
|
"loss": 0.0679,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.9857960568157728,
|
|
"grad_norm": 0.1521352520849485,
|
|
"learning_rate": 7.096011305859352e-09,
|
|
"loss": 0.0659,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.9879160483358067,
|
|
"grad_norm": 0.1490651261311961,
|
|
"learning_rate": 5.351804579070696e-09,
|
|
"loss": 0.0663,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.9900360398558405,
|
|
"grad_norm": 0.14100706470294022,
|
|
"learning_rate": 3.853098692147006e-09,
|
|
"loss": 0.0658,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.9921560313758745,
|
|
"grad_norm": 0.1375637676528597,
|
|
"learning_rate": 2.5999673106480438e-09,
|
|
"loss": 0.0638,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.9942760228959084,
|
|
"grad_norm": 0.1362445388806314,
|
|
"learning_rate": 1.5924720294641093e-09,
|
|
"loss": 0.0645,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.9963960144159424,
|
|
"grad_norm": 0.12875364924264648,
|
|
"learning_rate": 8.306623697884597e-10,
|
|
"loss": 0.0669,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.9985160059359762,
|
|
"grad_norm": 0.12918021464281548,
|
|
"learning_rate": 3.1457577668259074e-10,
|
|
"loss": 0.0658,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 4717,
|
|
"total_flos": 3995069728161792.0,
|
|
"train_loss": 0.10921624170816473,
|
|
"train_runtime": 69121.8578,
|
|
"train_samples_per_second": 0.955,
|
|
"train_steps_per_second": 0.068
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 4717,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 3995069728161792.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|