8603 lines
211 KiB
JSON
8603 lines
211 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 1000,
|
|
"global_step": 12091,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0008270614506657845,
|
|
"grad_norm": 14.711603164672852,
|
|
"learning_rate": 1.487603305785124e-07,
|
|
"loss": 2.7309,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.001654122901331569,
|
|
"grad_norm": 14.97842025756836,
|
|
"learning_rate": 3.1404958677685957e-07,
|
|
"loss": 2.7435,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.0024811843519973536,
|
|
"grad_norm": 12.011658668518066,
|
|
"learning_rate": 4.793388429752067e-07,
|
|
"loss": 2.7037,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.003308245802663138,
|
|
"grad_norm": 11.382739067077637,
|
|
"learning_rate": 6.446280991735538e-07,
|
|
"loss": 2.5787,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0041353072533289225,
|
|
"grad_norm": 7.4454193115234375,
|
|
"learning_rate": 8.099173553719009e-07,
|
|
"loss": 2.2764,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.004962368703994707,
|
|
"grad_norm": 5.262652397155762,
|
|
"learning_rate": 9.75206611570248e-07,
|
|
"loss": 1.9743,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.005789430154660491,
|
|
"grad_norm": 3.370814323425293,
|
|
"learning_rate": 1.140495867768595e-06,
|
|
"loss": 1.5944,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.006616491605326276,
|
|
"grad_norm": 3.199523687362671,
|
|
"learning_rate": 1.3057851239669423e-06,
|
|
"loss": 1.3237,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.00744355305599206,
|
|
"grad_norm": 2.578493118286133,
|
|
"learning_rate": 1.4710743801652894e-06,
|
|
"loss": 1.0191,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.008270614506657845,
|
|
"grad_norm": 2.9655439853668213,
|
|
"learning_rate": 1.6363636363636365e-06,
|
|
"loss": 0.7199,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.00909767595732363,
|
|
"grad_norm": 5.320194244384766,
|
|
"learning_rate": 1.8016528925619835e-06,
|
|
"loss": 0.5692,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.009924737407989414,
|
|
"grad_norm": 1.893227458000183,
|
|
"learning_rate": 1.966942148760331e-06,
|
|
"loss": 0.4681,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.010751798858655199,
|
|
"grad_norm": 1.1032103300094604,
|
|
"learning_rate": 2.132231404958678e-06,
|
|
"loss": 0.3882,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.011578860309320982,
|
|
"grad_norm": 1.873787760734558,
|
|
"learning_rate": 2.297520661157025e-06,
|
|
"loss": 0.3182,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.012405921759986767,
|
|
"grad_norm": 3.0224239826202393,
|
|
"learning_rate": 2.462809917355372e-06,
|
|
"loss": 0.2801,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.013232983210652551,
|
|
"grad_norm": 1.4292343854904175,
|
|
"learning_rate": 2.628099173553719e-06,
|
|
"loss": 0.2074,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.014060044661318336,
|
|
"grad_norm": 5.8895368576049805,
|
|
"learning_rate": 2.7933884297520662e-06,
|
|
"loss": 0.2002,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.01488710611198412,
|
|
"grad_norm": 1.3134126663208008,
|
|
"learning_rate": 2.9586776859504133e-06,
|
|
"loss": 0.1672,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.015714167562649903,
|
|
"grad_norm": 0.5309329628944397,
|
|
"learning_rate": 3.123966942148761e-06,
|
|
"loss": 0.1528,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.01654122901331569,
|
|
"grad_norm": 0.7183464169502258,
|
|
"learning_rate": 3.289256198347108e-06,
|
|
"loss": 0.1324,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.017368290463981473,
|
|
"grad_norm": 0.7033889293670654,
|
|
"learning_rate": 3.454545454545455e-06,
|
|
"loss": 0.1214,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.01819535191464726,
|
|
"grad_norm": 0.39757004380226135,
|
|
"learning_rate": 3.619834710743802e-06,
|
|
"loss": 0.1116,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.019022413365313042,
|
|
"grad_norm": 1.0405199527740479,
|
|
"learning_rate": 3.785123966942149e-06,
|
|
"loss": 0.1011,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.01984947481597883,
|
|
"grad_norm": 1.6865506172180176,
|
|
"learning_rate": 3.950413223140496e-06,
|
|
"loss": 0.1023,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.02067653626664461,
|
|
"grad_norm": 0.746986985206604,
|
|
"learning_rate": 4.115702479338843e-06,
|
|
"loss": 0.092,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.021503597717310398,
|
|
"grad_norm": 0.482876718044281,
|
|
"learning_rate": 4.28099173553719e-06,
|
|
"loss": 0.0803,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.02233065916797618,
|
|
"grad_norm": 0.4853907525539398,
|
|
"learning_rate": 4.4462809917355374e-06,
|
|
"loss": 0.0782,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.023157720618641964,
|
|
"grad_norm": 0.2537175714969635,
|
|
"learning_rate": 4.6115702479338845e-06,
|
|
"loss": 0.0776,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.02398478206930775,
|
|
"grad_norm": 0.7867515683174133,
|
|
"learning_rate": 4.776859504132232e-06,
|
|
"loss": 0.073,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.024811843519973533,
|
|
"grad_norm": 0.43127694725990295,
|
|
"learning_rate": 4.942148760330579e-06,
|
|
"loss": 0.0734,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.02563890497063932,
|
|
"grad_norm": 0.20157092809677124,
|
|
"learning_rate": 5.107438016528926e-06,
|
|
"loss": 0.0737,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.026465966421305102,
|
|
"grad_norm": 0.5229440927505493,
|
|
"learning_rate": 5.272727272727273e-06,
|
|
"loss": 0.0692,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.02729302787197089,
|
|
"grad_norm": 0.7331608533859253,
|
|
"learning_rate": 5.438016528925621e-06,
|
|
"loss": 0.0688,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.028120089322636672,
|
|
"grad_norm": 0.2658148407936096,
|
|
"learning_rate": 5.603305785123967e-06,
|
|
"loss": 0.0603,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.028947150773302455,
|
|
"grad_norm": 0.3650042414665222,
|
|
"learning_rate": 5.768595041322315e-06,
|
|
"loss": 0.0593,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.02977421222396824,
|
|
"grad_norm": 0.2189350426197052,
|
|
"learning_rate": 5.933884297520661e-06,
|
|
"loss": 0.0561,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.030601273674634024,
|
|
"grad_norm": 0.2192901372909546,
|
|
"learning_rate": 6.099173553719009e-06,
|
|
"loss": 0.0624,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.03142833512529981,
|
|
"grad_norm": 0.2812904715538025,
|
|
"learning_rate": 6.264462809917355e-06,
|
|
"loss": 0.0556,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.03225539657596559,
|
|
"grad_norm": 0.6402881145477295,
|
|
"learning_rate": 6.429752066115703e-06,
|
|
"loss": 0.0572,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.03308245802663138,
|
|
"grad_norm": 0.17161770164966583,
|
|
"learning_rate": 6.5950413223140495e-06,
|
|
"loss": 0.0537,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.033909519477297166,
|
|
"grad_norm": 0.3633708357810974,
|
|
"learning_rate": 6.760330578512397e-06,
|
|
"loss": 0.0504,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.034736580927962946,
|
|
"grad_norm": 0.3227091133594513,
|
|
"learning_rate": 6.925619834710744e-06,
|
|
"loss": 0.0527,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.03556364237862873,
|
|
"grad_norm": 0.1883084774017334,
|
|
"learning_rate": 7.0909090909090916e-06,
|
|
"loss": 0.0496,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.03639070382929452,
|
|
"grad_norm": 0.404940664768219,
|
|
"learning_rate": 7.256198347107438e-06,
|
|
"loss": 0.0515,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.0372177652799603,
|
|
"grad_norm": 0.2766735553741455,
|
|
"learning_rate": 7.421487603305786e-06,
|
|
"loss": 0.0482,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.038044826730626084,
|
|
"grad_norm": 0.14233002066612244,
|
|
"learning_rate": 7.586776859504133e-06,
|
|
"loss": 0.0495,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.03887188818129187,
|
|
"grad_norm": 0.17358863353729248,
|
|
"learning_rate": 7.75206611570248e-06,
|
|
"loss": 0.0464,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.03969894963195766,
|
|
"grad_norm": 0.24469003081321716,
|
|
"learning_rate": 7.917355371900827e-06,
|
|
"loss": 0.0479,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.04052601108262344,
|
|
"grad_norm": 0.20702078938484192,
|
|
"learning_rate": 8.082644628099174e-06,
|
|
"loss": 0.042,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.04135307253328922,
|
|
"grad_norm": 0.38820740580558777,
|
|
"learning_rate": 8.247933884297521e-06,
|
|
"loss": 0.0486,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.04218013398395501,
|
|
"grad_norm": 0.17128099501132965,
|
|
"learning_rate": 8.413223140495868e-06,
|
|
"loss": 0.0432,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.043007195434620796,
|
|
"grad_norm": 0.15014755725860596,
|
|
"learning_rate": 8.578512396694215e-06,
|
|
"loss": 0.0471,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.043834256885286575,
|
|
"grad_norm": 0.31599992513656616,
|
|
"learning_rate": 8.743801652892562e-06,
|
|
"loss": 0.0431,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.04466131833595236,
|
|
"grad_norm": 0.2722884714603424,
|
|
"learning_rate": 8.90909090909091e-06,
|
|
"loss": 0.0422,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.04548837978661815,
|
|
"grad_norm": 0.2727777361869812,
|
|
"learning_rate": 9.074380165289256e-06,
|
|
"loss": 0.0411,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.04631544123728393,
|
|
"grad_norm": 0.11177966743707657,
|
|
"learning_rate": 9.239669421487604e-06,
|
|
"loss": 0.0422,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.047142502687949714,
|
|
"grad_norm": 0.603720486164093,
|
|
"learning_rate": 9.40495867768595e-06,
|
|
"loss": 0.0434,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.0479695641386155,
|
|
"grad_norm": 0.13153497874736786,
|
|
"learning_rate": 9.570247933884298e-06,
|
|
"loss": 0.0392,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.04879662558928129,
|
|
"grad_norm": 0.11294803768396378,
|
|
"learning_rate": 9.735537190082645e-06,
|
|
"loss": 0.0421,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.049623687039947066,
|
|
"grad_norm": 0.08017970621585846,
|
|
"learning_rate": 9.900826446280992e-06,
|
|
"loss": 0.0395,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.05045074849061285,
|
|
"grad_norm": 0.10552022606134415,
|
|
"learning_rate": 9.999997007583302e-06,
|
|
"loss": 0.0409,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.05127780994127864,
|
|
"grad_norm": 0.09761521220207214,
|
|
"learning_rate": 9.999963342936584e-06,
|
|
"loss": 0.041,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.05210487139194442,
|
|
"grad_norm": 0.19305112957954407,
|
|
"learning_rate": 9.999892273374958e-06,
|
|
"loss": 0.0387,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.052931932842610205,
|
|
"grad_norm": 0.09766830503940582,
|
|
"learning_rate": 9.999783799430103e-06,
|
|
"loss": 0.0407,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.05375899429327599,
|
|
"grad_norm": 0.14489105343818665,
|
|
"learning_rate": 9.999637921913512e-06,
|
|
"loss": 0.0389,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.05458605574394178,
|
|
"grad_norm": 0.32930856943130493,
|
|
"learning_rate": 9.999454641916505e-06,
|
|
"loss": 0.038,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.05541311719460756,
|
|
"grad_norm": 0.14028801023960114,
|
|
"learning_rate": 9.9992339608102e-06,
|
|
"loss": 0.0389,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.056240178645273343,
|
|
"grad_norm": 0.18841058015823364,
|
|
"learning_rate": 9.998975880245528e-06,
|
|
"loss": 0.0377,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.05706724009593913,
|
|
"grad_norm": 0.22034569084644318,
|
|
"learning_rate": 9.998680402153193e-06,
|
|
"loss": 0.0375,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.05789430154660491,
|
|
"grad_norm": 0.09396768361330032,
|
|
"learning_rate": 9.998347528743684e-06,
|
|
"loss": 0.0373,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.058721362997270696,
|
|
"grad_norm": 0.21181143820285797,
|
|
"learning_rate": 9.997977262507234e-06,
|
|
"loss": 0.0368,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.05954842444793648,
|
|
"grad_norm": 0.09176570922136307,
|
|
"learning_rate": 9.997569606213822e-06,
|
|
"loss": 0.0402,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.06037548589860227,
|
|
"grad_norm": 0.1015128493309021,
|
|
"learning_rate": 9.997124562913138e-06,
|
|
"loss": 0.037,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.06120254734926805,
|
|
"grad_norm": 0.10508076846599579,
|
|
"learning_rate": 9.996642135934571e-06,
|
|
"loss": 0.0359,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.062029608799933834,
|
|
"grad_norm": 0.1171686202287674,
|
|
"learning_rate": 9.996122328887173e-06,
|
|
"loss": 0.0355,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.06285667025059961,
|
|
"grad_norm": 0.0857829749584198,
|
|
"learning_rate": 9.99556514565964e-06,
|
|
"loss": 0.0373,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.0636837317012654,
|
|
"grad_norm": 0.10609547048807144,
|
|
"learning_rate": 9.994970590420284e-06,
|
|
"loss": 0.0358,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.06451079315193119,
|
|
"grad_norm": 0.08259180933237076,
|
|
"learning_rate": 9.994338667616989e-06,
|
|
"loss": 0.0357,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.06533785460259697,
|
|
"grad_norm": 0.09571733325719833,
|
|
"learning_rate": 9.9936693819772e-06,
|
|
"loss": 0.0384,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.06616491605326276,
|
|
"grad_norm": 0.15978851914405823,
|
|
"learning_rate": 9.992962738507862e-06,
|
|
"loss": 0.0365,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.06699197750392855,
|
|
"grad_norm": 0.14394982159137726,
|
|
"learning_rate": 9.992218742495409e-06,
|
|
"loss": 0.0371,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.06781903895459433,
|
|
"grad_norm": 0.09818094223737717,
|
|
"learning_rate": 9.991437399505697e-06,
|
|
"loss": 0.0375,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.0686461004052601,
|
|
"grad_norm": 0.23322197794914246,
|
|
"learning_rate": 9.990618715383985e-06,
|
|
"loss": 0.0349,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.06947316185592589,
|
|
"grad_norm": 0.09629665315151215,
|
|
"learning_rate": 9.98976269625488e-06,
|
|
"loss": 0.0349,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.07030022330659168,
|
|
"grad_norm": 0.1651093065738678,
|
|
"learning_rate": 9.988869348522293e-06,
|
|
"loss": 0.035,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.07112728475725746,
|
|
"grad_norm": 0.12863469123840332,
|
|
"learning_rate": 9.98793867886939e-06,
|
|
"loss": 0.0364,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.07195434620792325,
|
|
"grad_norm": 0.3346972167491913,
|
|
"learning_rate": 9.98697069425855e-06,
|
|
"loss": 0.0337,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.07278140765858904,
|
|
"grad_norm": 0.146303191781044,
|
|
"learning_rate": 9.9859654019313e-06,
|
|
"loss": 0.0363,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.07360846910925482,
|
|
"grad_norm": 0.14840497076511383,
|
|
"learning_rate": 9.984922809408272e-06,
|
|
"loss": 0.0349,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.0744355305599206,
|
|
"grad_norm": 0.10886628180742264,
|
|
"learning_rate": 9.983842924489137e-06,
|
|
"loss": 0.0344,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.07526259201058638,
|
|
"grad_norm": 0.09753160178661346,
|
|
"learning_rate": 9.982725755252557e-06,
|
|
"loss": 0.0327,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.07608965346125217,
|
|
"grad_norm": 0.09709116816520691,
|
|
"learning_rate": 9.981571310056116e-06,
|
|
"loss": 0.0361,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.07691671491191795,
|
|
"grad_norm": 0.08936941623687744,
|
|
"learning_rate": 9.980379597536263e-06,
|
|
"loss": 0.039,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.07774377636258374,
|
|
"grad_norm": 0.07184750586748123,
|
|
"learning_rate": 9.979150626608246e-06,
|
|
"loss": 0.034,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.07857083781324953,
|
|
"grad_norm": 0.07059776037931442,
|
|
"learning_rate": 9.97788440646604e-06,
|
|
"loss": 0.0314,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.07939789926391531,
|
|
"grad_norm": 0.07418540120124817,
|
|
"learning_rate": 9.976580946582289e-06,
|
|
"loss": 0.0338,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.08022496071458109,
|
|
"grad_norm": 0.14634068310260773,
|
|
"learning_rate": 9.975240256708222e-06,
|
|
"loss": 0.0344,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.08105202216524687,
|
|
"grad_norm": 0.10202177613973618,
|
|
"learning_rate": 9.973862346873594e-06,
|
|
"loss": 0.0312,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.08187908361591266,
|
|
"grad_norm": 0.08847320824861526,
|
|
"learning_rate": 9.9724472273866e-06,
|
|
"loss": 0.0335,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.08270614506657845,
|
|
"grad_norm": 0.1381935477256775,
|
|
"learning_rate": 9.9709949088338e-06,
|
|
"loss": 0.0399,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.08270614506657845,
|
|
"eval_loss": 0.0343376062810421,
|
|
"eval_runtime": 1220.1317,
|
|
"eval_samples_per_second": 4.917,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.08353320651724423,
|
|
"grad_norm": 0.15219560265541077,
|
|
"learning_rate": 9.969505402080044e-06,
|
|
"loss": 0.0337,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.08436026796791002,
|
|
"grad_norm": 0.20263217389583588,
|
|
"learning_rate": 9.967978718268391e-06,
|
|
"loss": 0.0315,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.0851873294185758,
|
|
"grad_norm": 0.10303157567977905,
|
|
"learning_rate": 9.966414868820022e-06,
|
|
"loss": 0.0354,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.08601439086924159,
|
|
"grad_norm": 0.10471872240304947,
|
|
"learning_rate": 9.964813865434149e-06,
|
|
"loss": 0.035,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.08684145231990736,
|
|
"grad_norm": 0.08253839612007141,
|
|
"learning_rate": 9.963175720087941e-06,
|
|
"loss": 0.0317,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.08766851377057315,
|
|
"grad_norm": 0.08755608648061752,
|
|
"learning_rate": 9.961500445036428e-06,
|
|
"loss": 0.0314,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.08849557522123894,
|
|
"grad_norm": 0.15729674696922302,
|
|
"learning_rate": 9.9597880528124e-06,
|
|
"loss": 0.0371,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.08932263667190472,
|
|
"grad_norm": 0.14116688072681427,
|
|
"learning_rate": 9.958038556226332e-06,
|
|
"loss": 0.0317,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.09014969812257051,
|
|
"grad_norm": 0.1923297643661499,
|
|
"learning_rate": 9.956251968366276e-06,
|
|
"loss": 0.035,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.0909767595732363,
|
|
"grad_norm": 0.07124887406826019,
|
|
"learning_rate": 9.954428302597759e-06,
|
|
"loss": 0.0308,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.09180382102390208,
|
|
"grad_norm": 0.07786933332681656,
|
|
"learning_rate": 9.952567572563696e-06,
|
|
"loss": 0.0304,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.09263088247456785,
|
|
"grad_norm": 0.15352018177509308,
|
|
"learning_rate": 9.950669792184279e-06,
|
|
"loss": 0.0332,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.09345794392523364,
|
|
"grad_norm": 0.0767594501376152,
|
|
"learning_rate": 9.948734975656874e-06,
|
|
"loss": 0.032,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.09428500537589943,
|
|
"grad_norm": 0.06958391517400742,
|
|
"learning_rate": 9.946763137455915e-06,
|
|
"loss": 0.032,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.09511206682656521,
|
|
"grad_norm": 0.09188707917928696,
|
|
"learning_rate": 9.944754292332802e-06,
|
|
"loss": 0.0318,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.095939128277231,
|
|
"grad_norm": 0.07205051183700562,
|
|
"learning_rate": 9.942708455315779e-06,
|
|
"loss": 0.03,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.09676618972789679,
|
|
"grad_norm": 0.07053325325250626,
|
|
"learning_rate": 9.94062564170983e-06,
|
|
"loss": 0.0314,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.09759325117856257,
|
|
"grad_norm": 0.09830465912818909,
|
|
"learning_rate": 9.938505867096563e-06,
|
|
"loss": 0.031,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.09842031262922835,
|
|
"grad_norm": 0.08823514729738235,
|
|
"learning_rate": 9.93634914733409e-06,
|
|
"loss": 0.0308,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.09924737407989413,
|
|
"grad_norm": 0.16881339251995087,
|
|
"learning_rate": 9.934155498556919e-06,
|
|
"loss": 0.0319,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.10007443553055992,
|
|
"grad_norm": 0.07580441236495972,
|
|
"learning_rate": 9.931924937175813e-06,
|
|
"loss": 0.0304,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.1009014969812257,
|
|
"grad_norm": 0.12182483077049255,
|
|
"learning_rate": 9.929657479877688e-06,
|
|
"loss": 0.03,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.10172855843189149,
|
|
"grad_norm": 0.1227443590760231,
|
|
"learning_rate": 9.92735314362548e-06,
|
|
"loss": 0.0297,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.10255561988255728,
|
|
"grad_norm": 0.1861189603805542,
|
|
"learning_rate": 9.925011945658012e-06,
|
|
"loss": 0.0298,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.10338268133322306,
|
|
"grad_norm": 0.07195472717285156,
|
|
"learning_rate": 9.922633903489878e-06,
|
|
"loss": 0.0348,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.10420974278388884,
|
|
"grad_norm": 0.08541199564933777,
|
|
"learning_rate": 9.9202190349113e-06,
|
|
"loss": 0.0329,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.10503680423455462,
|
|
"grad_norm": 0.07149334251880646,
|
|
"learning_rate": 9.917767357988e-06,
|
|
"loss": 0.0295,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.10586386568522041,
|
|
"grad_norm": 0.07702941447496414,
|
|
"learning_rate": 9.915278891061069e-06,
|
|
"loss": 0.0317,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.1066909271358862,
|
|
"grad_norm": 0.09982211887836456,
|
|
"learning_rate": 9.912753652746819e-06,
|
|
"loss": 0.0296,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.10751798858655198,
|
|
"grad_norm": 0.15653453767299652,
|
|
"learning_rate": 9.910191661936654e-06,
|
|
"loss": 0.0312,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.10834505003721777,
|
|
"grad_norm": 0.09917636215686798,
|
|
"learning_rate": 9.907592937796927e-06,
|
|
"loss": 0.0304,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.10917211148788356,
|
|
"grad_norm": 0.07035510987043381,
|
|
"learning_rate": 9.904957499768787e-06,
|
|
"loss": 0.0314,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.10999917293854933,
|
|
"grad_norm": 0.07983675599098206,
|
|
"learning_rate": 9.902285367568049e-06,
|
|
"loss": 0.0301,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.11082623438921511,
|
|
"grad_norm": 0.06464583426713943,
|
|
"learning_rate": 9.899576561185034e-06,
|
|
"loss": 0.0305,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.1116532958398809,
|
|
"grad_norm": 0.1578930765390396,
|
|
"learning_rate": 9.896831100884424e-06,
|
|
"loss": 0.0303,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.11248035729054669,
|
|
"grad_norm": 0.06734715402126312,
|
|
"learning_rate": 9.894049007205112e-06,
|
|
"loss": 0.0281,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.11330741874121247,
|
|
"grad_norm": 0.09131414443254471,
|
|
"learning_rate": 9.891230300960049e-06,
|
|
"loss": 0.0302,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.11413448019187826,
|
|
"grad_norm": 0.06612879037857056,
|
|
"learning_rate": 9.888375003236078e-06,
|
|
"loss": 0.032,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.11496154164254405,
|
|
"grad_norm": 0.07150176167488098,
|
|
"learning_rate": 9.885483135393792e-06,
|
|
"loss": 0.031,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.11578860309320982,
|
|
"grad_norm": 0.06837069243192673,
|
|
"learning_rate": 9.882554719067363e-06,
|
|
"loss": 0.0292,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.1166156645438756,
|
|
"grad_norm": 0.09854337573051453,
|
|
"learning_rate": 9.879589776164387e-06,
|
|
"loss": 0.0302,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.11744272599454139,
|
|
"grad_norm": 0.06270977109670639,
|
|
"learning_rate": 9.87658832886571e-06,
|
|
"loss": 0.0285,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.11826978744520718,
|
|
"grad_norm": 0.09647868573665619,
|
|
"learning_rate": 9.873550399625275e-06,
|
|
"loss": 0.0283,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.11909684889587296,
|
|
"grad_norm": 0.06852090358734131,
|
|
"learning_rate": 9.870476011169948e-06,
|
|
"loss": 0.0299,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.11992391034653875,
|
|
"grad_norm": 0.06666602194309235,
|
|
"learning_rate": 9.867365186499337e-06,
|
|
"loss": 0.0338,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.12075097179720454,
|
|
"grad_norm": 0.07159853726625443,
|
|
"learning_rate": 9.864217948885648e-06,
|
|
"loss": 0.0281,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.12157803324787032,
|
|
"grad_norm": 0.1366601437330246,
|
|
"learning_rate": 9.861034321873481e-06,
|
|
"loss": 0.0309,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.1224050946985361,
|
|
"grad_norm": 0.08372735232114792,
|
|
"learning_rate": 9.85781432927967e-06,
|
|
"loss": 0.0308,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.12323215614920188,
|
|
"grad_norm": 0.10882294178009033,
|
|
"learning_rate": 9.854557995193102e-06,
|
|
"loss": 0.0289,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.12405921759986767,
|
|
"grad_norm": 0.07682844996452332,
|
|
"learning_rate": 9.851265343974534e-06,
|
|
"loss": 0.031,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.12488627905053346,
|
|
"grad_norm": 0.12793898582458496,
|
|
"learning_rate": 9.847936400256415e-06,
|
|
"loss": 0.0291,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.12571334050119923,
|
|
"grad_norm": 0.07250412553548813,
|
|
"learning_rate": 9.844571188942701e-06,
|
|
"loss": 0.029,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.12654040195186503,
|
|
"grad_norm": 0.06386396288871765,
|
|
"learning_rate": 9.841169735208662e-06,
|
|
"loss": 0.0307,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.1273674634025308,
|
|
"grad_norm": 0.05723176896572113,
|
|
"learning_rate": 9.837732064500705e-06,
|
|
"loss": 0.0286,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.1281945248531966,
|
|
"grad_norm": 0.07307655364274979,
|
|
"learning_rate": 9.834258202536173e-06,
|
|
"loss": 0.0304,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.12902158630386237,
|
|
"grad_norm": 0.06836479902267456,
|
|
"learning_rate": 9.830748175303157e-06,
|
|
"loss": 0.0286,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.12984864775452817,
|
|
"grad_norm": 0.0659850612282753,
|
|
"learning_rate": 9.827202009060307e-06,
|
|
"loss": 0.0271,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.13067570920519395,
|
|
"grad_norm": 0.06328194588422775,
|
|
"learning_rate": 9.823619730336624e-06,
|
|
"loss": 0.028,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.13150277065585972,
|
|
"grad_norm": 0.05650272220373154,
|
|
"learning_rate": 9.820001365931273e-06,
|
|
"loss": 0.0279,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.13232983210652552,
|
|
"grad_norm": 0.10159070044755936,
|
|
"learning_rate": 9.816346942913376e-06,
|
|
"loss": 0.029,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.1331568935571913,
|
|
"grad_norm": 0.4464583098888397,
|
|
"learning_rate": 9.812656488621804e-06,
|
|
"loss": 0.0298,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.1339839550078571,
|
|
"grad_norm": 0.06621966511011124,
|
|
"learning_rate": 9.808930030664989e-06,
|
|
"loss": 0.0303,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.13481101645852286,
|
|
"grad_norm": 0.07061782479286194,
|
|
"learning_rate": 9.805167596920707e-06,
|
|
"loss": 0.0283,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.13563807790918866,
|
|
"grad_norm": 0.06339192390441895,
|
|
"learning_rate": 9.80136921553586e-06,
|
|
"loss": 0.0274,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.13646513935985444,
|
|
"grad_norm": 0.09378033131361008,
|
|
"learning_rate": 9.797534914926289e-06,
|
|
"loss": 0.028,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.1372922008105202,
|
|
"grad_norm": 0.11731720715761185,
|
|
"learning_rate": 9.793664723776539e-06,
|
|
"loss": 0.0289,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.138119262261186,
|
|
"grad_norm": 0.07038633525371552,
|
|
"learning_rate": 9.789758671039658e-06,
|
|
"loss": 0.0279,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.13894632371185178,
|
|
"grad_norm": 0.08343333005905151,
|
|
"learning_rate": 9.785816785936973e-06,
|
|
"loss": 0.0278,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.13977338516251758,
|
|
"grad_norm": 0.08339129388332367,
|
|
"learning_rate": 9.781839097957875e-06,
|
|
"loss": 0.0302,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.14060044661318336,
|
|
"grad_norm": 0.15373483300209045,
|
|
"learning_rate": 9.777825636859599e-06,
|
|
"loss": 0.0293,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.14142750806384916,
|
|
"grad_norm": 0.07383430004119873,
|
|
"learning_rate": 9.773776432667e-06,
|
|
"loss": 0.0295,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.14225456951451493,
|
|
"grad_norm": 0.07228893786668777,
|
|
"learning_rate": 9.769691515672328e-06,
|
|
"loss": 0.0276,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.1430816309651807,
|
|
"grad_norm": 0.09278323501348495,
|
|
"learning_rate": 9.765570916434998e-06,
|
|
"loss": 0.0289,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.1439086924158465,
|
|
"grad_norm": 0.09062926471233368,
|
|
"learning_rate": 9.761414665781374e-06,
|
|
"loss": 0.028,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.14473575386651227,
|
|
"grad_norm": 0.06459420919418335,
|
|
"learning_rate": 9.757222794804522e-06,
|
|
"loss": 0.0279,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.14556281531717807,
|
|
"grad_norm": 0.06160286068916321,
|
|
"learning_rate": 9.752995334863985e-06,
|
|
"loss": 0.028,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.14638987676784385,
|
|
"grad_norm": 0.07651007920503616,
|
|
"learning_rate": 9.748732317585557e-06,
|
|
"loss": 0.0295,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.14721693821850965,
|
|
"grad_norm": 0.08549734950065613,
|
|
"learning_rate": 9.744433774861024e-06,
|
|
"loss": 0.028,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.14804399966917542,
|
|
"grad_norm": 0.06986084580421448,
|
|
"learning_rate": 9.74009973884795e-06,
|
|
"loss": 0.029,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.1488710611198412,
|
|
"grad_norm": 0.0717945545911789,
|
|
"learning_rate": 9.735730241969425e-06,
|
|
"loss": 0.0287,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.149698122570507,
|
|
"grad_norm": 0.055828921496868134,
|
|
"learning_rate": 9.731325316913816e-06,
|
|
"loss": 0.0279,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.15052518402117276,
|
|
"grad_norm": 0.09485841542482376,
|
|
"learning_rate": 9.726884996634535e-06,
|
|
"loss": 0.0288,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.15135224547183856,
|
|
"grad_norm": 0.09525461494922638,
|
|
"learning_rate": 9.72240931434979e-06,
|
|
"loss": 0.0266,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.15217930692250434,
|
|
"grad_norm": 0.058921415358781815,
|
|
"learning_rate": 9.717898303542324e-06,
|
|
"loss": 0.0278,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.15300636837317014,
|
|
"grad_norm": 0.08154033869504929,
|
|
"learning_rate": 9.713351997959184e-06,
|
|
"loss": 0.0348,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.1538334298238359,
|
|
"grad_norm": 0.059776682406663895,
|
|
"learning_rate": 9.70877043161145e-06,
|
|
"loss": 0.0275,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.15466049127450168,
|
|
"grad_norm": 0.089345782995224,
|
|
"learning_rate": 9.704153638773996e-06,
|
|
"loss": 0.0253,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.15548755272516748,
|
|
"grad_norm": 0.0736837238073349,
|
|
"learning_rate": 9.699501653985223e-06,
|
|
"loss": 0.0263,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.15631461417583326,
|
|
"grad_norm": 0.09357444941997528,
|
|
"learning_rate": 9.694814512046805e-06,
|
|
"loss": 0.0278,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.15714167562649906,
|
|
"grad_norm": 0.06266128271818161,
|
|
"learning_rate": 9.690092248023428e-06,
|
|
"loss": 0.0277,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.15796873707716483,
|
|
"grad_norm": 0.081548310816288,
|
|
"learning_rate": 9.68533489724253e-06,
|
|
"loss": 0.0307,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.15879579852783063,
|
|
"grad_norm": 0.05208978429436684,
|
|
"learning_rate": 9.680542495294027e-06,
|
|
"loss": 0.0277,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.1596228599784964,
|
|
"grad_norm": 0.055970244109630585,
|
|
"learning_rate": 9.675715078030063e-06,
|
|
"loss": 0.027,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.16044992142916217,
|
|
"grad_norm": 0.05580395460128784,
|
|
"learning_rate": 9.67085268156473e-06,
|
|
"loss": 0.0277,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.16127698287982797,
|
|
"grad_norm": 0.058690398931503296,
|
|
"learning_rate": 9.665955342273799e-06,
|
|
"loss": 0.0274,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.16210404433049375,
|
|
"grad_norm": 0.06293683499097824,
|
|
"learning_rate": 9.661023096794449e-06,
|
|
"loss": 0.0267,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.16293110578115955,
|
|
"grad_norm": 0.08121343702077866,
|
|
"learning_rate": 9.656055982024995e-06,
|
|
"loss": 0.0279,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.16375816723182532,
|
|
"grad_norm": 0.07257858663797379,
|
|
"learning_rate": 9.651054035124614e-06,
|
|
"loss": 0.0264,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.16458522868249112,
|
|
"grad_norm": 0.06639426201581955,
|
|
"learning_rate": 9.646017293513056e-06,
|
|
"loss": 0.0265,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.1654122901331569,
|
|
"grad_norm": 0.06022842600941658,
|
|
"learning_rate": 9.640945794870377e-06,
|
|
"loss": 0.0261,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.1654122901331569,
|
|
"eval_loss": 0.028430579230189323,
|
|
"eval_runtime": 1220.0038,
|
|
"eval_samples_per_second": 4.917,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.16623935158382266,
|
|
"grad_norm": 0.05734001100063324,
|
|
"learning_rate": 9.63583957713665e-06,
|
|
"loss": 0.0277,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.16706641303448846,
|
|
"grad_norm": 0.08106731623411179,
|
|
"learning_rate": 9.630698678511684e-06,
|
|
"loss": 0.0266,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.16789347448515424,
|
|
"grad_norm": 0.056222159415483475,
|
|
"learning_rate": 9.625523137454736e-06,
|
|
"loss": 0.0261,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.16872053593582004,
|
|
"grad_norm": 0.06166260689496994,
|
|
"learning_rate": 9.620312992684223e-06,
|
|
"loss": 0.0265,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.1695475973864858,
|
|
"grad_norm": 0.08784622699022293,
|
|
"learning_rate": 9.615068283177434e-06,
|
|
"loss": 0.0281,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.1703746588371516,
|
|
"grad_norm": 0.0706792026758194,
|
|
"learning_rate": 9.609789048170243e-06,
|
|
"loss": 0.029,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.17120172028781738,
|
|
"grad_norm": 0.05976736173033714,
|
|
"learning_rate": 9.604475327156804e-06,
|
|
"loss": 0.0254,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.17202878173848318,
|
|
"grad_norm": 0.05874831974506378,
|
|
"learning_rate": 9.599127159889266e-06,
|
|
"loss": 0.0279,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.17285584318914896,
|
|
"grad_norm": 0.06354232132434845,
|
|
"learning_rate": 9.593744586377472e-06,
|
|
"loss": 0.0266,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.17368290463981473,
|
|
"grad_norm": 0.06033729389309883,
|
|
"learning_rate": 9.588327646888655e-06,
|
|
"loss": 0.0266,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.17450996609048053,
|
|
"grad_norm": 0.18101929128170013,
|
|
"learning_rate": 9.582876381947145e-06,
|
|
"loss": 0.0266,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.1753370275411463,
|
|
"grad_norm": 0.26323285698890686,
|
|
"learning_rate": 9.577390832334064e-06,
|
|
"loss": 0.0265,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.1761640889918121,
|
|
"grad_norm": 0.05492362007498741,
|
|
"learning_rate": 9.571871039087013e-06,
|
|
"loss": 0.0266,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.17699115044247787,
|
|
"grad_norm": 0.05727216601371765,
|
|
"learning_rate": 9.566317043499773e-06,
|
|
"loss": 0.0263,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.17781821189314367,
|
|
"grad_norm": 0.14531953632831573,
|
|
"learning_rate": 9.560728887122e-06,
|
|
"loss": 0.0286,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.17864527334380945,
|
|
"grad_norm": 0.06639876216650009,
|
|
"learning_rate": 9.5551066117589e-06,
|
|
"loss": 0.0262,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.17947233479447522,
|
|
"grad_norm": 0.06139986589550972,
|
|
"learning_rate": 9.549450259470927e-06,
|
|
"loss": 0.0272,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.18029939624514102,
|
|
"grad_norm": 0.07039148360490799,
|
|
"learning_rate": 9.543759872573469e-06,
|
|
"loss": 0.0282,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.1811264576958068,
|
|
"grad_norm": 0.08487813919782639,
|
|
"learning_rate": 9.538035493636524e-06,
|
|
"loss": 0.0284,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.1819535191464726,
|
|
"grad_norm": 0.07776181399822235,
|
|
"learning_rate": 9.532277165484387e-06,
|
|
"loss": 0.0279,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.18278058059713836,
|
|
"grad_norm": 0.061026498675346375,
|
|
"learning_rate": 9.52648493119533e-06,
|
|
"loss": 0.0256,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.18360764204780416,
|
|
"grad_norm": 0.061437539756298065,
|
|
"learning_rate": 9.520658834101275e-06,
|
|
"loss": 0.027,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.18443470349846994,
|
|
"grad_norm": 0.06019297242164612,
|
|
"learning_rate": 9.514798917787477e-06,
|
|
"loss": 0.0305,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.1852617649491357,
|
|
"grad_norm": 0.08646666258573532,
|
|
"learning_rate": 9.50890522609219e-06,
|
|
"loss": 0.0263,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.1860888263998015,
|
|
"grad_norm": 0.1908756047487259,
|
|
"learning_rate": 9.502977803106346e-06,
|
|
"loss": 0.0259,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.18691588785046728,
|
|
"grad_norm": 0.24025577306747437,
|
|
"learning_rate": 9.497016693173218e-06,
|
|
"loss": 0.0294,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.18774294930113308,
|
|
"grad_norm": 0.07112468034029007,
|
|
"learning_rate": 9.491021940888096e-06,
|
|
"loss": 0.0266,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.18857001075179886,
|
|
"grad_norm": 0.08155805617570877,
|
|
"learning_rate": 9.484993591097952e-06,
|
|
"loss": 0.0258,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.18939707220246466,
|
|
"grad_norm": 0.05596913397312164,
|
|
"learning_rate": 9.478931688901095e-06,
|
|
"loss": 0.0264,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.19022413365313043,
|
|
"grad_norm": 0.059164054691791534,
|
|
"learning_rate": 9.472836279646844e-06,
|
|
"loss": 0.0272,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.1910511951037962,
|
|
"grad_norm": 0.06571198254823685,
|
|
"learning_rate": 9.466707408935189e-06,
|
|
"loss": 0.0272,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.191878256554462,
|
|
"grad_norm": 0.07002273201942444,
|
|
"learning_rate": 9.460545122616442e-06,
|
|
"loss": 0.0275,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.19270531800512777,
|
|
"grad_norm": 0.06005439907312393,
|
|
"learning_rate": 9.4543494667909e-06,
|
|
"loss": 0.028,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.19353237945579357,
|
|
"grad_norm": 0.11123955994844437,
|
|
"learning_rate": 9.4481204878085e-06,
|
|
"loss": 0.0268,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.19435944090645935,
|
|
"grad_norm": 0.0618051253259182,
|
|
"learning_rate": 9.441858232268467e-06,
|
|
"loss": 0.0259,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.19518650235712515,
|
|
"grad_norm": 0.06244316324591637,
|
|
"learning_rate": 9.435562747018976e-06,
|
|
"loss": 0.0262,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.19601356380779092,
|
|
"grad_norm": 0.08520273864269257,
|
|
"learning_rate": 9.429234079156787e-06,
|
|
"loss": 0.0267,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.1968406252584567,
|
|
"grad_norm": 0.06388260424137115,
|
|
"learning_rate": 9.422872276026902e-06,
|
|
"loss": 0.0263,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.1976676867091225,
|
|
"grad_norm": 0.06510653346776962,
|
|
"learning_rate": 9.416477385222213e-06,
|
|
"loss": 0.0281,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.19849474815978826,
|
|
"grad_norm": 0.12499203532934189,
|
|
"learning_rate": 9.41004945458314e-06,
|
|
"loss": 0.0268,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.19932180961045406,
|
|
"grad_norm": 0.06236669421195984,
|
|
"learning_rate": 9.403588532197277e-06,
|
|
"loss": 0.0262,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.20014887106111984,
|
|
"grad_norm": 0.06980706751346588,
|
|
"learning_rate": 9.397094666399025e-06,
|
|
"loss": 0.0264,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.20097593251178564,
|
|
"grad_norm": 0.05483941361308098,
|
|
"learning_rate": 9.390567905769242e-06,
|
|
"loss": 0.025,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.2018029939624514,
|
|
"grad_norm": 0.08971832692623138,
|
|
"learning_rate": 9.384008299134871e-06,
|
|
"loss": 0.0243,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.20263005541311718,
|
|
"grad_norm": 0.07181546092033386,
|
|
"learning_rate": 9.377415895568578e-06,
|
|
"loss": 0.0257,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.20345711686378298,
|
|
"grad_norm": 0.06366802752017975,
|
|
"learning_rate": 9.370790744388381e-06,
|
|
"loss": 0.026,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.20428417831444876,
|
|
"grad_norm": 0.065264031291008,
|
|
"learning_rate": 9.36413289515729e-06,
|
|
"loss": 0.0274,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.20511123976511456,
|
|
"grad_norm": 0.09247761219739914,
|
|
"learning_rate": 9.357442397682924e-06,
|
|
"loss": 0.0251,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.20593830121578033,
|
|
"grad_norm": 0.04937649890780449,
|
|
"learning_rate": 9.350719302017148e-06,
|
|
"loss": 0.0277,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.20676536266644613,
|
|
"grad_norm": 0.06501265615224838,
|
|
"learning_rate": 9.343963658455698e-06,
|
|
"loss": 0.0266,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.2075924241171119,
|
|
"grad_norm": 0.07810965925455093,
|
|
"learning_rate": 9.337175517537796e-06,
|
|
"loss": 0.0302,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.20841948556777767,
|
|
"grad_norm": 0.056350335478782654,
|
|
"learning_rate": 9.330354930045782e-06,
|
|
"loss": 0.0275,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.20924654701844347,
|
|
"grad_norm": 0.070098377764225,
|
|
"learning_rate": 9.323501947004727e-06,
|
|
"loss": 0.0268,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.21007360846910925,
|
|
"grad_norm": 0.07072274386882782,
|
|
"learning_rate": 9.316616619682059e-06,
|
|
"loss": 0.0256,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.21090066991977505,
|
|
"grad_norm": 0.05314943194389343,
|
|
"learning_rate": 9.309698999587174e-06,
|
|
"loss": 0.0256,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.21172773137044082,
|
|
"grad_norm": 0.05929897353053093,
|
|
"learning_rate": 9.302749138471046e-06,
|
|
"loss": 0.0274,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.21255479282110662,
|
|
"grad_norm": 0.07132343202829361,
|
|
"learning_rate": 9.295767088325848e-06,
|
|
"loss": 0.0256,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.2133818542717724,
|
|
"grad_norm": 0.05408504605293274,
|
|
"learning_rate": 9.288752901384563e-06,
|
|
"loss": 0.0323,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.21420891572243816,
|
|
"grad_norm": 0.0529806949198246,
|
|
"learning_rate": 9.281706630120592e-06,
|
|
"loss": 0.0252,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.21503597717310396,
|
|
"grad_norm": 0.09713909775018692,
|
|
"learning_rate": 9.274628327247353e-06,
|
|
"loss": 0.0249,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.21586303862376974,
|
|
"grad_norm": 0.07577594369649887,
|
|
"learning_rate": 9.267518045717897e-06,
|
|
"loss": 0.0283,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.21669010007443554,
|
|
"grad_norm": 0.05945679545402527,
|
|
"learning_rate": 9.260375838724511e-06,
|
|
"loss": 0.0263,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.2175171615251013,
|
|
"grad_norm": 0.06303580105304718,
|
|
"learning_rate": 9.253201759698317e-06,
|
|
"loss": 0.0297,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.2183442229757671,
|
|
"grad_norm": 0.06167830526828766,
|
|
"learning_rate": 9.245995862308867e-06,
|
|
"loss": 0.0275,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.21917128442643288,
|
|
"grad_norm": 0.05566466599702835,
|
|
"learning_rate": 9.238758200463756e-06,
|
|
"loss": 0.0279,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.21999834587709866,
|
|
"grad_norm": 0.06275132298469543,
|
|
"learning_rate": 9.231488828308205e-06,
|
|
"loss": 0.0248,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.22082540732776446,
|
|
"grad_norm": 0.07304584234952927,
|
|
"learning_rate": 9.224187800224661e-06,
|
|
"loss": 0.0273,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.22165246877843023,
|
|
"grad_norm": 0.05730755627155304,
|
|
"learning_rate": 9.216855170832393e-06,
|
|
"loss": 0.0271,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.22247953022909603,
|
|
"grad_norm": 0.05435599759221077,
|
|
"learning_rate": 9.209490994987079e-06,
|
|
"loss": 0.0248,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.2233065916797618,
|
|
"grad_norm": 0.05061393231153488,
|
|
"learning_rate": 9.202095327780394e-06,
|
|
"loss": 0.0258,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.2241336531304276,
|
|
"grad_norm": 0.05590864270925522,
|
|
"learning_rate": 9.194668224539608e-06,
|
|
"loss": 0.0256,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.22496071458109337,
|
|
"grad_norm": 0.04720637574791908,
|
|
"learning_rate": 9.187209740827159e-06,
|
|
"loss": 0.0243,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.22578777603175915,
|
|
"grad_norm": 0.055518608540296555,
|
|
"learning_rate": 9.179719932440245e-06,
|
|
"loss": 0.026,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.22661483748242495,
|
|
"grad_norm": 0.060342345386743546,
|
|
"learning_rate": 9.172198855410408e-06,
|
|
"loss": 0.0254,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.22744189893309072,
|
|
"grad_norm": 0.06279141455888748,
|
|
"learning_rate": 9.164646566003109e-06,
|
|
"loss": 0.0262,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.22826896038375652,
|
|
"grad_norm": 0.050773248076438904,
|
|
"learning_rate": 9.15706312071731e-06,
|
|
"loss": 0.0271,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.2290960218344223,
|
|
"grad_norm": 0.052737098187208176,
|
|
"learning_rate": 9.149448576285055e-06,
|
|
"loss": 0.0259,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.2299230832850881,
|
|
"grad_norm": 0.055731095373630524,
|
|
"learning_rate": 9.141802989671036e-06,
|
|
"loss": 0.0255,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.23075014473575386,
|
|
"grad_norm": 0.05351400747895241,
|
|
"learning_rate": 9.134126418072175e-06,
|
|
"loss": 0.0255,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.23157720618641964,
|
|
"grad_norm": 0.05681459978222847,
|
|
"learning_rate": 9.126418918917197e-06,
|
|
"loss": 0.0268,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.23240426763708544,
|
|
"grad_norm": 0.05028412118554115,
|
|
"learning_rate": 9.118680549866193e-06,
|
|
"loss": 0.0239,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.2332313290877512,
|
|
"grad_norm": 0.05494118854403496,
|
|
"learning_rate": 9.110911368810193e-06,
|
|
"loss": 0.0239,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.234058390538417,
|
|
"grad_norm": 0.04639596492052078,
|
|
"learning_rate": 9.10311143387074e-06,
|
|
"loss": 0.0246,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.23488545198908278,
|
|
"grad_norm": 0.06322944909334183,
|
|
"learning_rate": 9.095280803399437e-06,
|
|
"loss": 0.0245,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.23571251343974858,
|
|
"grad_norm": 0.08805207163095474,
|
|
"learning_rate": 9.08741953597753e-06,
|
|
"loss": 0.0247,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.23653957489041436,
|
|
"grad_norm": 0.058313675224781036,
|
|
"learning_rate": 9.079527690415455e-06,
|
|
"loss": 0.0258,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.23736663634108016,
|
|
"grad_norm": 0.057351235300302505,
|
|
"learning_rate": 9.07160532575241e-06,
|
|
"loss": 0.0254,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.23819369779174593,
|
|
"grad_norm": 0.06278271973133087,
|
|
"learning_rate": 9.063652501255904e-06,
|
|
"loss": 0.0247,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.2390207592424117,
|
|
"grad_norm": 0.05352174490690231,
|
|
"learning_rate": 9.055669276421315e-06,
|
|
"loss": 0.026,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.2398478206930775,
|
|
"grad_norm": 0.05026556923985481,
|
|
"learning_rate": 9.047655710971455e-06,
|
|
"loss": 0.0266,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.24067488214374327,
|
|
"grad_norm": 0.0480768196284771,
|
|
"learning_rate": 9.039611864856105e-06,
|
|
"loss": 0.0247,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.24150194359440907,
|
|
"grad_norm": 0.07413890212774277,
|
|
"learning_rate": 9.031537798251589e-06,
|
|
"loss": 0.0284,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.24232900504507485,
|
|
"grad_norm": 0.07313451170921326,
|
|
"learning_rate": 9.023433571560297e-06,
|
|
"loss": 0.0256,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.24315606649574065,
|
|
"grad_norm": 0.05278393253684044,
|
|
"learning_rate": 9.015299245410258e-06,
|
|
"loss": 0.0249,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.24398312794640642,
|
|
"grad_norm": 0.08677669614553452,
|
|
"learning_rate": 9.007134880654677e-06,
|
|
"loss": 0.026,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.2448101893970722,
|
|
"grad_norm": 0.060126129537820816,
|
|
"learning_rate": 8.998940538371472e-06,
|
|
"loss": 0.0259,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.245637250847738,
|
|
"grad_norm": 0.05079201981425285,
|
|
"learning_rate": 8.99071627986283e-06,
|
|
"loss": 0.0243,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.24646431229840376,
|
|
"grad_norm": 0.053754109889268875,
|
|
"learning_rate": 8.982462166654737e-06,
|
|
"loss": 0.0257,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.24729137374906957,
|
|
"grad_norm": 0.05371469631791115,
|
|
"learning_rate": 8.974178260496529e-06,
|
|
"loss": 0.0253,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.24811843519973534,
|
|
"grad_norm": 0.060160018503665924,
|
|
"learning_rate": 8.965864623360418e-06,
|
|
"loss": 0.0283,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.24811843519973534,
|
|
"eval_loss": 0.026417342945933342,
|
|
"eval_runtime": 1220.3014,
|
|
"eval_samples_per_second": 4.916,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.24894549665040114,
|
|
"grad_norm": 0.06683066487312317,
|
|
"learning_rate": 8.957521317441043e-06,
|
|
"loss": 0.0245,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.2497725581010669,
|
|
"grad_norm": 0.045557327568531036,
|
|
"learning_rate": 8.949148405154986e-06,
|
|
"loss": 0.0251,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.2505996195517327,
|
|
"grad_norm": 0.05416623502969742,
|
|
"learning_rate": 8.940745949140323e-06,
|
|
"loss": 0.0247,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.25142668100239846,
|
|
"grad_norm": 0.17342466115951538,
|
|
"learning_rate": 8.932314012256147e-06,
|
|
"loss": 0.0249,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.25225374245306426,
|
|
"grad_norm": 0.06348035484552383,
|
|
"learning_rate": 8.923852657582092e-06,
|
|
"loss": 0.0258,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.25308080390373006,
|
|
"grad_norm": 0.05559645593166351,
|
|
"learning_rate": 8.915361948417878e-06,
|
|
"loss": 0.0361,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.25390786535439586,
|
|
"grad_norm": 0.050857000052928925,
|
|
"learning_rate": 8.906841948282818e-06,
|
|
"loss": 0.0257,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.2547349268050616,
|
|
"grad_norm": 0.04826486483216286,
|
|
"learning_rate": 8.898292720915354e-06,
|
|
"loss": 0.0257,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.2555619882557274,
|
|
"grad_norm": 0.06656019389629364,
|
|
"learning_rate": 8.889714330272584e-06,
|
|
"loss": 0.0261,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.2563890497063932,
|
|
"grad_norm": 0.06416959315538406,
|
|
"learning_rate": 8.881106840529769e-06,
|
|
"loss": 0.0252,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.25721611115705895,
|
|
"grad_norm": 0.04848102107644081,
|
|
"learning_rate": 8.872470316079866e-06,
|
|
"loss": 0.024,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.25804317260772475,
|
|
"grad_norm": 0.06827887147665024,
|
|
"learning_rate": 8.863804821533043e-06,
|
|
"loss": 0.0236,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.25887023405839055,
|
|
"grad_norm": 0.0632987692952156,
|
|
"learning_rate": 8.855110421716191e-06,
|
|
"loss": 0.0261,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.25969729550905635,
|
|
"grad_norm": 0.05443909019231796,
|
|
"learning_rate": 8.846387181672443e-06,
|
|
"loss": 0.0245,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.2605243569597221,
|
|
"grad_norm": 0.050953421741724014,
|
|
"learning_rate": 8.837635166660689e-06,
|
|
"loss": 0.0258,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.2613514184103879,
|
|
"grad_norm": 0.04987896978855133,
|
|
"learning_rate": 8.828854442155087e-06,
|
|
"loss": 0.0259,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.2621784798610537,
|
|
"grad_norm": 0.05325448885560036,
|
|
"learning_rate": 8.820045073844563e-06,
|
|
"loss": 0.0263,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.26300554131171944,
|
|
"grad_norm": 0.06813682615756989,
|
|
"learning_rate": 8.81120712763234e-06,
|
|
"loss": 0.024,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.26383260276238524,
|
|
"grad_norm": 0.053441476076841354,
|
|
"learning_rate": 8.802340669635423e-06,
|
|
"loss": 0.0255,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.26465966421305104,
|
|
"grad_norm": 0.061251021921634674,
|
|
"learning_rate": 8.793445766184126e-06,
|
|
"loss": 0.0329,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.26548672566371684,
|
|
"grad_norm": 0.06079159677028656,
|
|
"learning_rate": 8.784522483821554e-06,
|
|
"loss": 0.0271,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.2663137871143826,
|
|
"grad_norm": 0.04815410450100899,
|
|
"learning_rate": 8.77557088930312e-06,
|
|
"loss": 0.0256,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.2671408485650484,
|
|
"grad_norm": 0.058222122490406036,
|
|
"learning_rate": 8.766591049596043e-06,
|
|
"loss": 0.0239,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.2679679100157142,
|
|
"grad_norm": 0.06425308436155319,
|
|
"learning_rate": 8.75758303187884e-06,
|
|
"loss": 0.0248,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.26879497146637993,
|
|
"grad_norm": 0.05385325476527214,
|
|
"learning_rate": 8.748546903540838e-06,
|
|
"loss": 0.0249,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.26962203291704573,
|
|
"grad_norm": 0.04803679138422012,
|
|
"learning_rate": 8.739482732181648e-06,
|
|
"loss": 0.0313,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.27044909436771153,
|
|
"grad_norm": 0.05667194724082947,
|
|
"learning_rate": 8.730390585610685e-06,
|
|
"loss": 0.025,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.27127615581837733,
|
|
"grad_norm": 0.04525600001215935,
|
|
"learning_rate": 8.72127053184664e-06,
|
|
"loss": 0.0254,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.2721032172690431,
|
|
"grad_norm": 0.07599420845508575,
|
|
"learning_rate": 8.712122639116975e-06,
|
|
"loss": 0.0243,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.2729302787197089,
|
|
"grad_norm": 0.052151359617710114,
|
|
"learning_rate": 8.70294697585743e-06,
|
|
"loss": 0.0234,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.2737573401703747,
|
|
"grad_norm": 0.05731287971138954,
|
|
"learning_rate": 8.693743610711482e-06,
|
|
"loss": 0.0248,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.2745844016210404,
|
|
"grad_norm": 0.04920828342437744,
|
|
"learning_rate": 8.684512612529857e-06,
|
|
"loss": 0.0245,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.2754114630717062,
|
|
"grad_norm": 0.05730625241994858,
|
|
"learning_rate": 8.67525405037e-06,
|
|
"loss": 0.0264,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.276238524522372,
|
|
"grad_norm": 0.04498128592967987,
|
|
"learning_rate": 8.665967993495568e-06,
|
|
"loss": 0.0244,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.2770655859730378,
|
|
"grad_norm": 0.0674099400639534,
|
|
"learning_rate": 8.656654511375902e-06,
|
|
"loss": 0.0285,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.27789264742370356,
|
|
"grad_norm": 0.06094598397612572,
|
|
"learning_rate": 8.64731367368551e-06,
|
|
"loss": 0.0258,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.27871970887436937,
|
|
"grad_norm": 0.07126502692699432,
|
|
"learning_rate": 8.637945550303557e-06,
|
|
"loss": 0.0279,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.27954677032503517,
|
|
"grad_norm": 0.08413068950176239,
|
|
"learning_rate": 8.628550211313328e-06,
|
|
"loss": 0.0441,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.2803738317757009,
|
|
"grad_norm": 0.04862065240740776,
|
|
"learning_rate": 8.619127727001708e-06,
|
|
"loss": 0.0238,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.2812008932263667,
|
|
"grad_norm": 0.0653972402215004,
|
|
"learning_rate": 8.60967816785866e-06,
|
|
"loss": 0.0245,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.2820279546770325,
|
|
"grad_norm": 0.05237039551138878,
|
|
"learning_rate": 8.60020160457669e-06,
|
|
"loss": 0.0255,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.2828550161276983,
|
|
"grad_norm": 0.06689222902059555,
|
|
"learning_rate": 8.59069810805033e-06,
|
|
"loss": 0.0286,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.28368207757836406,
|
|
"grad_norm": 0.06750566512346268,
|
|
"learning_rate": 8.581167749375596e-06,
|
|
"loss": 0.0373,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.28450913902902986,
|
|
"grad_norm": 0.04513133317232132,
|
|
"learning_rate": 8.571610599849462e-06,
|
|
"loss": 0.0266,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.28533620047969566,
|
|
"grad_norm": 0.05559685453772545,
|
|
"learning_rate": 8.562026730969325e-06,
|
|
"loss": 0.0253,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.2861632619303614,
|
|
"grad_norm": 0.04561685398221016,
|
|
"learning_rate": 8.552416214432469e-06,
|
|
"loss": 0.0259,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.2869903233810272,
|
|
"grad_norm": 0.054727304726839066,
|
|
"learning_rate": 8.542779122135532e-06,
|
|
"loss": 0.0254,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.287817384831693,
|
|
"grad_norm": 0.05550670251250267,
|
|
"learning_rate": 8.533115526173969e-06,
|
|
"loss": 0.025,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.2886444462823588,
|
|
"grad_norm": 0.04571954905986786,
|
|
"learning_rate": 8.523425498841505e-06,
|
|
"loss": 0.0272,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.28947150773302455,
|
|
"grad_norm": 0.07001665234565735,
|
|
"learning_rate": 8.513709112629599e-06,
|
|
"loss": 0.0245,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.29029856918369035,
|
|
"grad_norm": 0.05153432488441467,
|
|
"learning_rate": 8.503966440226908e-06,
|
|
"loss": 0.0424,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.29112563063435615,
|
|
"grad_norm": 0.05176723748445511,
|
|
"learning_rate": 8.494197554518729e-06,
|
|
"loss": 0.0245,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.2919526920850219,
|
|
"grad_norm": 0.07877220213413239,
|
|
"learning_rate": 8.484402528586469e-06,
|
|
"loss": 0.0241,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.2927797535356877,
|
|
"grad_norm": 0.0443316325545311,
|
|
"learning_rate": 8.474581435707085e-06,
|
|
"loss": 0.0245,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.2936068149863535,
|
|
"grad_norm": 0.05324044078588486,
|
|
"learning_rate": 8.464734349352544e-06,
|
|
"loss": 0.024,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.2944338764370193,
|
|
"grad_norm": 0.0497773103415966,
|
|
"learning_rate": 8.454861343189274e-06,
|
|
"loss": 0.0236,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.29526093788768504,
|
|
"grad_norm": 0.04881919547915459,
|
|
"learning_rate": 8.444962491077604e-06,
|
|
"loss": 0.0236,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.29608799933835084,
|
|
"grad_norm": 0.054020971059799194,
|
|
"learning_rate": 8.435037867071225e-06,
|
|
"loss": 0.0264,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.29691506078901664,
|
|
"grad_norm": 0.04821145534515381,
|
|
"learning_rate": 8.425087545416622e-06,
|
|
"loss": 0.0235,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.2977421222396824,
|
|
"grad_norm": 0.04773546755313873,
|
|
"learning_rate": 8.41511160055253e-06,
|
|
"loss": 0.0406,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.2985691836903482,
|
|
"grad_norm": 0.06340964883565903,
|
|
"learning_rate": 8.405110107109365e-06,
|
|
"loss": 0.0252,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.299396245141014,
|
|
"grad_norm": 0.0523238480091095,
|
|
"learning_rate": 8.395083139908684e-06,
|
|
"loss": 0.0245,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.3002233065916798,
|
|
"grad_norm": 0.04797879606485367,
|
|
"learning_rate": 8.385030773962605e-06,
|
|
"loss": 0.0257,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.30105036804234553,
|
|
"grad_norm": 0.05554933100938797,
|
|
"learning_rate": 8.37495308447326e-06,
|
|
"loss": 0.0233,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.30187742949301133,
|
|
"grad_norm": 0.08046616613864899,
|
|
"learning_rate": 8.364850146832218e-06,
|
|
"loss": 0.0237,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.30270449094367713,
|
|
"grad_norm": 0.04799005016684532,
|
|
"learning_rate": 8.354722036619947e-06,
|
|
"loss": 0.0244,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.3035315523943429,
|
|
"grad_norm": 0.05324197933077812,
|
|
"learning_rate": 8.344568829605216e-06,
|
|
"loss": 0.0232,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.3043586138450087,
|
|
"grad_norm": 0.04944256320595741,
|
|
"learning_rate": 8.334390601744556e-06,
|
|
"loss": 0.0255,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.3051856752956745,
|
|
"grad_norm": 0.0510077141225338,
|
|
"learning_rate": 8.324187429181669e-06,
|
|
"loss": 0.0252,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.3060127367463403,
|
|
"grad_norm": 0.045672621577978134,
|
|
"learning_rate": 8.313959388246882e-06,
|
|
"loss": 0.0257,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.306839798197006,
|
|
"grad_norm": 0.04965253919363022,
|
|
"learning_rate": 8.303706555456547e-06,
|
|
"loss": 0.0291,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.3076668596476718,
|
|
"grad_norm": 0.043674346059560776,
|
|
"learning_rate": 8.293429007512503e-06,
|
|
"loss": 0.0253,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.3084939210983376,
|
|
"grad_norm": 0.04634533450007439,
|
|
"learning_rate": 8.283126821301468e-06,
|
|
"loss": 0.0236,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.30932098254900336,
|
|
"grad_norm": 0.06959991902112961,
|
|
"learning_rate": 8.272800073894492e-06,
|
|
"loss": 0.0245,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.31014804399966917,
|
|
"grad_norm": 0.04980204254388809,
|
|
"learning_rate": 8.26244884254636e-06,
|
|
"loss": 0.0237,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.31097510545033497,
|
|
"grad_norm": 0.052351828664541245,
|
|
"learning_rate": 8.252073204695025e-06,
|
|
"loss": 0.0257,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.31180216690100077,
|
|
"grad_norm": 0.04672665148973465,
|
|
"learning_rate": 8.241673237961027e-06,
|
|
"loss": 0.0238,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.3126292283516665,
|
|
"grad_norm": 0.041996221989393234,
|
|
"learning_rate": 8.231249020146913e-06,
|
|
"loss": 0.024,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.3134562898023323,
|
|
"grad_norm": 0.05913085490465164,
|
|
"learning_rate": 8.220800629236647e-06,
|
|
"loss": 0.0244,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.3142833512529981,
|
|
"grad_norm": 0.04715942218899727,
|
|
"learning_rate": 8.21032814339504e-06,
|
|
"loss": 0.0239,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.31511041270366386,
|
|
"grad_norm": 0.04261414706707001,
|
|
"learning_rate": 8.19983164096715e-06,
|
|
"loss": 0.0231,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.31593747415432966,
|
|
"grad_norm": 0.05027526617050171,
|
|
"learning_rate": 8.189311200477713e-06,
|
|
"loss": 0.0245,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.31676453560499546,
|
|
"grad_norm": 0.19037795066833496,
|
|
"learning_rate": 8.17876690063054e-06,
|
|
"loss": 0.0242,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.31759159705566126,
|
|
"grad_norm": 0.09254226088523865,
|
|
"learning_rate": 8.168198820307938e-06,
|
|
"loss": 0.0234,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.318418658506327,
|
|
"grad_norm": 0.04657592624425888,
|
|
"learning_rate": 8.157607038570117e-06,
|
|
"loss": 0.0241,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.3192457199569928,
|
|
"grad_norm": 0.06853280961513519,
|
|
"learning_rate": 8.146991634654595e-06,
|
|
"loss": 0.0261,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.3200727814076586,
|
|
"grad_norm": 0.05595746263861656,
|
|
"learning_rate": 8.136352687975609e-06,
|
|
"loss": 0.0242,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.32089984285832435,
|
|
"grad_norm": 0.04363076388835907,
|
|
"learning_rate": 8.125690278123524e-06,
|
|
"loss": 0.0235,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.32172690430899015,
|
|
"grad_norm": 0.06170443445444107,
|
|
"learning_rate": 8.115004484864231e-06,
|
|
"loss": 0.0233,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.32255396575965595,
|
|
"grad_norm": 0.04467644914984703,
|
|
"learning_rate": 8.104295388138553e-06,
|
|
"loss": 0.0245,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.32338102721032175,
|
|
"grad_norm": 0.06176682561635971,
|
|
"learning_rate": 8.093563068061649e-06,
|
|
"loss": 0.0232,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.3242080886609875,
|
|
"grad_norm": 0.047685880213975906,
|
|
"learning_rate": 8.082807604922409e-06,
|
|
"loss": 0.0248,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.3250351501116533,
|
|
"grad_norm": 0.05187467485666275,
|
|
"learning_rate": 8.072029079182862e-06,
|
|
"loss": 0.0245,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.3258622115623191,
|
|
"grad_norm": 0.04737105965614319,
|
|
"learning_rate": 8.061227571477565e-06,
|
|
"loss": 0.0268,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.32668927301298484,
|
|
"grad_norm": 0.04560704901814461,
|
|
"learning_rate": 8.050403162613007e-06,
|
|
"loss": 0.024,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.32751633446365064,
|
|
"grad_norm": 0.057890139520168304,
|
|
"learning_rate": 8.039555933567e-06,
|
|
"loss": 0.0267,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.32834339591431644,
|
|
"grad_norm": 0.04416472092270851,
|
|
"learning_rate": 8.028685965488074e-06,
|
|
"loss": 0.0241,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.32917045736498224,
|
|
"grad_norm": 0.04871301352977753,
|
|
"learning_rate": 8.017793339694873e-06,
|
|
"loss": 0.0237,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.329997518815648,
|
|
"grad_norm": 0.05144352838397026,
|
|
"learning_rate": 8.00687813767554e-06,
|
|
"loss": 0.0236,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.3308245802663138,
|
|
"grad_norm": 0.06144755333662033,
|
|
"learning_rate": 7.995940441087117e-06,
|
|
"loss": 0.0228,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.3308245802663138,
|
|
"eval_loss": 0.025024140253663063,
|
|
"eval_runtime": 1220.32,
|
|
"eval_samples_per_second": 4.916,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.3316516417169796,
|
|
"grad_norm": 0.07986024022102356,
|
|
"learning_rate": 7.984980331754924e-06,
|
|
"loss": 0.0249,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.33247870316764533,
|
|
"grad_norm": 0.04930829629302025,
|
|
"learning_rate": 7.973997891671953e-06,
|
|
"loss": 0.024,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.33330576461831113,
|
|
"grad_norm": 0.07743251323699951,
|
|
"learning_rate": 7.962993202998257e-06,
|
|
"loss": 0.0234,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.33413282606897693,
|
|
"grad_norm": 0.05702010914683342,
|
|
"learning_rate": 7.951966348060325e-06,
|
|
"loss": 0.025,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.33495988751964273,
|
|
"grad_norm": 0.042675841599702835,
|
|
"learning_rate": 7.940917409350476e-06,
|
|
"loss": 0.0245,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.3357869489703085,
|
|
"grad_norm": 0.04492352157831192,
|
|
"learning_rate": 7.929846469526242e-06,
|
|
"loss": 0.025,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.3366140104209743,
|
|
"grad_norm": 0.07774407416582108,
|
|
"learning_rate": 7.91875361140974e-06,
|
|
"loss": 0.0226,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.3374410718716401,
|
|
"grad_norm": 0.06625732779502869,
|
|
"learning_rate": 7.90763891798706e-06,
|
|
"loss": 0.0235,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.3382681333223059,
|
|
"grad_norm": 0.048172276467084885,
|
|
"learning_rate": 7.896502472407644e-06,
|
|
"loss": 0.0236,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.3390951947729716,
|
|
"grad_norm": 0.05588380619883537,
|
|
"learning_rate": 7.885344357983665e-06,
|
|
"loss": 0.0365,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.3399222562236374,
|
|
"grad_norm": 0.04697740450501442,
|
|
"learning_rate": 7.874164658189398e-06,
|
|
"loss": 0.0261,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.3407493176743032,
|
|
"grad_norm": 0.14661569893360138,
|
|
"learning_rate": 7.8629634566606e-06,
|
|
"loss": 0.0422,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.34157637912496897,
|
|
"grad_norm": 0.050860997289419174,
|
|
"learning_rate": 7.851740837193883e-06,
|
|
"loss": 0.0253,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.34240344057563477,
|
|
"grad_norm": 0.06831306964159012,
|
|
"learning_rate": 7.840496883746089e-06,
|
|
"loss": 0.0236,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.34323050202630057,
|
|
"grad_norm": 0.07154014706611633,
|
|
"learning_rate": 7.829231680433658e-06,
|
|
"loss": 0.0241,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.34405756347696637,
|
|
"grad_norm": 0.060069840401411057,
|
|
"learning_rate": 7.817945311532001e-06,
|
|
"loss": 0.0233,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.3448846249276321,
|
|
"grad_norm": 0.06343766301870346,
|
|
"learning_rate": 7.806637861474873e-06,
|
|
"loss": 0.029,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.3457116863782979,
|
|
"grad_norm": 0.046083442866802216,
|
|
"learning_rate": 7.795309414853735e-06,
|
|
"loss": 0.0233,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.3465387478289637,
|
|
"grad_norm": 0.04395199194550514,
|
|
"learning_rate": 7.783960056417123e-06,
|
|
"loss": 0.024,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.34736580927962946,
|
|
"grad_norm": 0.04960530623793602,
|
|
"learning_rate": 7.77258987107002e-06,
|
|
"loss": 0.0252,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.34819287073029526,
|
|
"grad_norm": 0.053416695445775986,
|
|
"learning_rate": 7.76119894387321e-06,
|
|
"loss": 0.0233,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.34901993218096106,
|
|
"grad_norm": 0.06489969789981842,
|
|
"learning_rate": 7.749787360042651e-06,
|
|
"loss": 0.0225,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.34984699363162686,
|
|
"grad_norm": 0.054353874176740646,
|
|
"learning_rate": 7.738355204948833e-06,
|
|
"loss": 0.025,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.3506740550822926,
|
|
"grad_norm": 0.05458907410502434,
|
|
"learning_rate": 7.726902564116141e-06,
|
|
"loss": 0.0234,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.3515011165329584,
|
|
"grad_norm": 0.04842905327677727,
|
|
"learning_rate": 7.715429523222214e-06,
|
|
"loss": 0.0221,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.3523281779836242,
|
|
"grad_norm": 0.0519806407392025,
|
|
"learning_rate": 7.703936168097306e-06,
|
|
"loss": 0.0239,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.35315523943428995,
|
|
"grad_norm": 0.05236365273594856,
|
|
"learning_rate": 7.692422584723641e-06,
|
|
"loss": 0.0235,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.35398230088495575,
|
|
"grad_norm": 0.04914037883281708,
|
|
"learning_rate": 7.68088885923477e-06,
|
|
"loss": 0.0235,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.35480936233562155,
|
|
"grad_norm": 0.05043815076351166,
|
|
"learning_rate": 7.669335077914932e-06,
|
|
"loss": 0.0241,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.35563642378628735,
|
|
"grad_norm": 0.04599103704094887,
|
|
"learning_rate": 7.657761327198404e-06,
|
|
"loss": 0.0242,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.3564634852369531,
|
|
"grad_norm": 0.04246712476015091,
|
|
"learning_rate": 7.646167693668846e-06,
|
|
"loss": 0.0241,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.3572905466876189,
|
|
"grad_norm": 0.04617106169462204,
|
|
"learning_rate": 7.634554264058676e-06,
|
|
"loss": 0.0235,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.3581176081382847,
|
|
"grad_norm": 0.046657975763082504,
|
|
"learning_rate": 7.6229211252483956e-06,
|
|
"loss": 0.0233,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.35894466958895044,
|
|
"grad_norm": 0.047864075750112534,
|
|
"learning_rate": 7.611268364265958e-06,
|
|
"loss": 0.0241,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.35977173103961624,
|
|
"grad_norm": 0.054371584206819534,
|
|
"learning_rate": 7.599596068286111e-06,
|
|
"loss": 0.0238,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.36059879249028204,
|
|
"grad_norm": 0.04631248489022255,
|
|
"learning_rate": 7.58790432462974e-06,
|
|
"loss": 0.0268,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.36142585394094784,
|
|
"grad_norm": 0.06476343423128128,
|
|
"learning_rate": 7.576193220763221e-06,
|
|
"loss": 0.0246,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.3622529153916136,
|
|
"grad_norm": 0.057965509593486786,
|
|
"learning_rate": 7.564462844297766e-06,
|
|
"loss": 0.0233,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.3630799768422794,
|
|
"grad_norm": 0.05117254704236984,
|
|
"learning_rate": 7.552713282988765e-06,
|
|
"loss": 0.024,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.3639070382929452,
|
|
"grad_norm": 0.0481458455324173,
|
|
"learning_rate": 7.540944624735132e-06,
|
|
"loss": 0.0233,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.36473409974361093,
|
|
"grad_norm": 0.0458373986184597,
|
|
"learning_rate": 7.529156957578641e-06,
|
|
"loss": 0.0228,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.36556116119427673,
|
|
"grad_norm": 0.043816305696964264,
|
|
"learning_rate": 7.517350369703279e-06,
|
|
"loss": 0.0234,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.36638822264494253,
|
|
"grad_norm": 0.050691138952970505,
|
|
"learning_rate": 7.505524949434575e-06,
|
|
"loss": 0.0219,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.36721528409560833,
|
|
"grad_norm": 0.0413176566362381,
|
|
"learning_rate": 7.493680785238948e-06,
|
|
"loss": 0.0231,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.3680423455462741,
|
|
"grad_norm": 0.04249545931816101,
|
|
"learning_rate": 7.481817965723035e-06,
|
|
"loss": 0.0226,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.3688694069969399,
|
|
"grad_norm": 0.05581935495138168,
|
|
"learning_rate": 7.4699365796330395e-06,
|
|
"loss": 0.0265,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.3696964684476057,
|
|
"grad_norm": 0.0569755993783474,
|
|
"learning_rate": 7.458036715854059e-06,
|
|
"loss": 0.0232,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.3705235298982714,
|
|
"grad_norm": 0.05333729833364487,
|
|
"learning_rate": 7.4461184634094256e-06,
|
|
"loss": 0.0242,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.3713505913489372,
|
|
"grad_norm": 0.05248766019940376,
|
|
"learning_rate": 7.434181911460036e-06,
|
|
"loss": 0.0307,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.372177652799603,
|
|
"grad_norm": 0.043839454650878906,
|
|
"learning_rate": 7.4222271493036875e-06,
|
|
"loss": 0.0241,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.3730047142502688,
|
|
"grad_norm": 0.05857829377055168,
|
|
"learning_rate": 7.41025426637441e-06,
|
|
"loss": 0.0223,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.37383177570093457,
|
|
"grad_norm": 0.041583914309740067,
|
|
"learning_rate": 7.398263352241788e-06,
|
|
"loss": 0.0225,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.37465883715160037,
|
|
"grad_norm": 0.043787844479084015,
|
|
"learning_rate": 7.386254496610309e-06,
|
|
"loss": 0.0215,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.37548589860226617,
|
|
"grad_norm": 0.04298454895615578,
|
|
"learning_rate": 7.374227789318673e-06,
|
|
"loss": 0.0229,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.3763129600529319,
|
|
"grad_norm": 0.05074107274413109,
|
|
"learning_rate": 7.362183320339133e-06,
|
|
"loss": 0.023,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.3771400215035977,
|
|
"grad_norm": 0.06284487992525101,
|
|
"learning_rate": 7.350121179776819e-06,
|
|
"loss": 0.0231,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.3779670829542635,
|
|
"grad_norm": 0.053102780133485794,
|
|
"learning_rate": 7.33804145786906e-06,
|
|
"loss": 0.0255,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.3787941444049293,
|
|
"grad_norm": 0.04331573098897934,
|
|
"learning_rate": 7.325944244984711e-06,
|
|
"loss": 0.0228,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.37962120585559506,
|
|
"grad_norm": 0.051730215549468994,
|
|
"learning_rate": 7.31382963162348e-06,
|
|
"loss": 0.0216,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.38044826730626086,
|
|
"grad_norm": 0.03934797644615173,
|
|
"learning_rate": 7.301697708415248e-06,
|
|
"loss": 0.0242,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.38127532875692666,
|
|
"grad_norm": 0.04784635826945305,
|
|
"learning_rate": 7.289548566119391e-06,
|
|
"loss": 0.0221,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.3821023902075924,
|
|
"grad_norm": 0.1260228306055069,
|
|
"learning_rate": 7.277382295624104e-06,
|
|
"loss": 0.0282,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.3829294516582582,
|
|
"grad_norm": 0.06200871989130974,
|
|
"learning_rate": 7.265198987945714e-06,
|
|
"loss": 0.0261,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.383756513108924,
|
|
"grad_norm": 0.061095982789993286,
|
|
"learning_rate": 7.252998734228007e-06,
|
|
"loss": 0.0245,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.3845835745595898,
|
|
"grad_norm": 0.053159236907958984,
|
|
"learning_rate": 7.240781625741545e-06,
|
|
"loss": 0.0233,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.38541063601025555,
|
|
"grad_norm": 0.0482206866145134,
|
|
"learning_rate": 7.228547753882976e-06,
|
|
"loss": 0.0261,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.38623769746092135,
|
|
"grad_norm": 0.05078030377626419,
|
|
"learning_rate": 7.216297210174361e-06,
|
|
"loss": 0.0244,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.38706475891158715,
|
|
"grad_norm": 0.044170767068862915,
|
|
"learning_rate": 7.204030086262478e-06,
|
|
"loss": 0.0238,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.3878918203622529,
|
|
"grad_norm": 0.04695448279380798,
|
|
"learning_rate": 7.191746473918148e-06,
|
|
"loss": 0.0223,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.3887188818129187,
|
|
"grad_norm": 0.052788231521844864,
|
|
"learning_rate": 7.179446465035535e-06,
|
|
"loss": 0.0249,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.3895459432635845,
|
|
"grad_norm": 0.05831609293818474,
|
|
"learning_rate": 7.167130151631475e-06,
|
|
"loss": 0.0244,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.3903730047142503,
|
|
"grad_norm": 0.05152612552046776,
|
|
"learning_rate": 7.154797625844773e-06,
|
|
"loss": 0.0224,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.39120006616491604,
|
|
"grad_norm": 0.047528255730867386,
|
|
"learning_rate": 7.142448979935521e-06,
|
|
"loss": 0.0236,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.39202712761558184,
|
|
"grad_norm": 0.051114026457071304,
|
|
"learning_rate": 7.130084306284406e-06,
|
|
"loss": 0.0235,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.39285418906624764,
|
|
"grad_norm": 0.04298287630081177,
|
|
"learning_rate": 7.11770369739202e-06,
|
|
"loss": 0.0224,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.3936812505169134,
|
|
"grad_norm": 0.05048811435699463,
|
|
"learning_rate": 7.105307245878166e-06,
|
|
"loss": 0.0238,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.3945083119675792,
|
|
"grad_norm": 0.04245224595069885,
|
|
"learning_rate": 7.092895044481165e-06,
|
|
"loss": 0.0235,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.395335373418245,
|
|
"grad_norm": 0.05021793767809868,
|
|
"learning_rate": 7.080467186057168e-06,
|
|
"loss": 0.0228,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.3961624348689108,
|
|
"grad_norm": 0.04611439257860184,
|
|
"learning_rate": 7.068023763579453e-06,
|
|
"loss": 0.0304,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.39698949631957653,
|
|
"grad_norm": 0.050482239574193954,
|
|
"learning_rate": 7.055564870137733e-06,
|
|
"loss": 0.0241,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.39781655777024233,
|
|
"grad_norm": 0.050899263471364975,
|
|
"learning_rate": 7.043090598937463e-06,
|
|
"loss": 0.0246,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.39864361922090813,
|
|
"grad_norm": 0.05052196979522705,
|
|
"learning_rate": 7.030601043299138e-06,
|
|
"loss": 0.0238,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.3994706806715739,
|
|
"grad_norm": 0.04977920651435852,
|
|
"learning_rate": 7.018096296657595e-06,
|
|
"loss": 0.0234,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.4002977421222397,
|
|
"grad_norm": 0.0429433137178421,
|
|
"learning_rate": 7.005576452561314e-06,
|
|
"loss": 0.0249,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.4011248035729055,
|
|
"grad_norm": 0.04633225128054619,
|
|
"learning_rate": 6.993041604671727e-06,
|
|
"loss": 0.0221,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.4019518650235713,
|
|
"grad_norm": 0.044517192989587784,
|
|
"learning_rate": 6.980491846762503e-06,
|
|
"loss": 0.023,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.402778926474237,
|
|
"grad_norm": 0.04668491706252098,
|
|
"learning_rate": 6.967927272718855e-06,
|
|
"loss": 0.023,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.4036059879249028,
|
|
"grad_norm": 0.13357175886631012,
|
|
"learning_rate": 6.955347976536841e-06,
|
|
"loss": 0.0218,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.4044330493755686,
|
|
"grad_norm": 0.04721111059188843,
|
|
"learning_rate": 6.942754052322645e-06,
|
|
"loss": 0.0222,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.40526011082623437,
|
|
"grad_norm": 0.07329542189836502,
|
|
"learning_rate": 6.9301455942918934e-06,
|
|
"loss": 0.0219,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.40608717227690017,
|
|
"grad_norm": 0.04098494350910187,
|
|
"learning_rate": 6.9175226967689395e-06,
|
|
"loss": 0.0224,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.40691423372756597,
|
|
"grad_norm": 0.0693870559334755,
|
|
"learning_rate": 6.904885454186155e-06,
|
|
"loss": 0.0239,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.40774129517823177,
|
|
"grad_norm": 0.04788215458393097,
|
|
"learning_rate": 6.89223396108323e-06,
|
|
"loss": 0.0278,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.4085683566288975,
|
|
"grad_norm": 0.041839174926280975,
|
|
"learning_rate": 6.879568312106462e-06,
|
|
"loss": 0.0215,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.4093954180795633,
|
|
"grad_norm": 0.04695757478475571,
|
|
"learning_rate": 6.866888602008053e-06,
|
|
"loss": 0.0235,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.4102224795302291,
|
|
"grad_norm": 0.05025403946638107,
|
|
"learning_rate": 6.854194925645392e-06,
|
|
"loss": 0.023,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.41104954098089486,
|
|
"grad_norm": 0.05418792739510536,
|
|
"learning_rate": 6.841487377980353e-06,
|
|
"loss": 0.0247,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.41187660243156066,
|
|
"grad_norm": 0.05611952021718025,
|
|
"learning_rate": 6.82876605407858e-06,
|
|
"loss": 0.023,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.41270366388222646,
|
|
"grad_norm": 0.04246920347213745,
|
|
"learning_rate": 6.816031049108777e-06,
|
|
"loss": 0.024,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.41353072533289226,
|
|
"grad_norm": 0.044995930045843124,
|
|
"learning_rate": 6.803282458342e-06,
|
|
"loss": 0.0215,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.41353072533289226,
|
|
"eval_loss": 0.024215074256062508,
|
|
"eval_runtime": 1219.5377,
|
|
"eval_samples_per_second": 4.919,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.414357786783558,
|
|
"grad_norm": 0.05199587345123291,
|
|
"learning_rate": 6.790520377150939e-06,
|
|
"loss": 0.0233,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.4151848482342238,
|
|
"grad_norm": 0.04450158774852753,
|
|
"learning_rate": 6.777744901009204e-06,
|
|
"loss": 0.023,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.4160119096848896,
|
|
"grad_norm": 0.0536041297018528,
|
|
"learning_rate": 6.764956125490616e-06,
|
|
"loss": 0.022,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.41683897113555535,
|
|
"grad_norm": 0.04742833226919174,
|
|
"learning_rate": 6.752154146268491e-06,
|
|
"loss": 0.0267,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.41766603258622115,
|
|
"grad_norm": 0.05334756523370743,
|
|
"learning_rate": 6.739339059114916e-06,
|
|
"loss": 0.0232,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.41849309403688695,
|
|
"grad_norm": 0.0501900352537632,
|
|
"learning_rate": 6.726510959900046e-06,
|
|
"loss": 0.0248,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.41932015548755275,
|
|
"grad_norm": 0.04328269138932228,
|
|
"learning_rate": 6.713669944591375e-06,
|
|
"loss": 0.0229,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.4201472169382185,
|
|
"grad_norm": 0.04845112934708595,
|
|
"learning_rate": 6.700816109253023e-06,
|
|
"loss": 0.0242,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.4209742783888843,
|
|
"grad_norm": 0.051792118698358536,
|
|
"learning_rate": 6.6879495500450184e-06,
|
|
"loss": 0.0224,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.4218013398395501,
|
|
"grad_norm": 0.03820064663887024,
|
|
"learning_rate": 6.675070363222581e-06,
|
|
"loss": 0.0225,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.42262840129021584,
|
|
"grad_norm": 0.04609294980764389,
|
|
"learning_rate": 6.662178645135392e-06,
|
|
"loss": 0.0222,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.42345546274088164,
|
|
"grad_norm": 0.043115533888339996,
|
|
"learning_rate": 6.649274492226882e-06,
|
|
"loss": 0.0229,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.42428252419154744,
|
|
"grad_norm": 0.04883312061429024,
|
|
"learning_rate": 6.636358001033508e-06,
|
|
"loss": 0.0228,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.42510958564221324,
|
|
"grad_norm": 0.062484171241521835,
|
|
"learning_rate": 6.623429268184027e-06,
|
|
"loss": 0.0237,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.425936647092879,
|
|
"grad_norm": 0.0440596267580986,
|
|
"learning_rate": 6.6104883903987815e-06,
|
|
"loss": 0.0264,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.4267637085435448,
|
|
"grad_norm": 0.04892463609576225,
|
|
"learning_rate": 6.5975354644889665e-06,
|
|
"loss": 0.0217,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.4275907699942106,
|
|
"grad_norm": 0.04017140343785286,
|
|
"learning_rate": 6.5845705873559094e-06,
|
|
"loss": 0.0225,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.42841783144487633,
|
|
"grad_norm": 0.04880579188466072,
|
|
"learning_rate": 6.571593855990348e-06,
|
|
"loss": 0.023,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.42924489289554213,
|
|
"grad_norm": 0.06134543567895889,
|
|
"learning_rate": 6.5586053674717e-06,
|
|
"loss": 0.0227,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.43007195434620793,
|
|
"grad_norm": 0.03942278400063515,
|
|
"learning_rate": 6.545605218967341e-06,
|
|
"loss": 0.0222,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.43089901579687373,
|
|
"grad_norm": 0.04633478447794914,
|
|
"learning_rate": 6.5325935077318705e-06,
|
|
"loss": 0.0226,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.4317260772475395,
|
|
"grad_norm": 0.06766749918460846,
|
|
"learning_rate": 6.519570331106395e-06,
|
|
"loss": 0.0226,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.4325531386982053,
|
|
"grad_norm": 0.04740046337246895,
|
|
"learning_rate": 6.506535786517789e-06,
|
|
"loss": 0.0261,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.4333802001488711,
|
|
"grad_norm": 0.05168073996901512,
|
|
"learning_rate": 6.493489971477977e-06,
|
|
"loss": 0.0242,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.4342072615995368,
|
|
"grad_norm": 0.05117257684469223,
|
|
"learning_rate": 6.480432983583194e-06,
|
|
"loss": 0.0276,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.4350343230502026,
|
|
"grad_norm": 0.05560829117894173,
|
|
"learning_rate": 6.467364920513257e-06,
|
|
"loss": 0.0235,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.4358613845008684,
|
|
"grad_norm": 0.04257509857416153,
|
|
"learning_rate": 6.454285880030844e-06,
|
|
"loss": 0.022,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.4366884459515342,
|
|
"grad_norm": 0.047841571271419525,
|
|
"learning_rate": 6.441195959980749e-06,
|
|
"loss": 0.0235,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.43751550740219997,
|
|
"grad_norm": 0.04220358282327652,
|
|
"learning_rate": 6.428095258289162e-06,
|
|
"loss": 0.0227,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.43834256885286577,
|
|
"grad_norm": 0.04904833808541298,
|
|
"learning_rate": 6.414983872962924e-06,
|
|
"loss": 0.023,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.43916963030353157,
|
|
"grad_norm": 0.041855499148368835,
|
|
"learning_rate": 6.401861902088809e-06,
|
|
"loss": 0.0247,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.4399966917541973,
|
|
"grad_norm": 0.046882931143045425,
|
|
"learning_rate": 6.388729443832774e-06,
|
|
"loss": 0.0218,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.4408237532048631,
|
|
"grad_norm": 0.06054188311100006,
|
|
"learning_rate": 6.375586596439237e-06,
|
|
"loss": 0.0239,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.4416508146555289,
|
|
"grad_norm": 0.04277319461107254,
|
|
"learning_rate": 6.362433458230337e-06,
|
|
"loss": 0.0232,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.4424778761061947,
|
|
"grad_norm": 0.050606515258550644,
|
|
"learning_rate": 6.349270127605198e-06,
|
|
"loss": 0.0224,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.44330493755686046,
|
|
"grad_norm": 0.050200313329696655,
|
|
"learning_rate": 6.336096703039196e-06,
|
|
"loss": 0.0225,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.44413199900752626,
|
|
"grad_norm": 0.0431785061955452,
|
|
"learning_rate": 6.322913283083214e-06,
|
|
"loss": 0.0223,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.44495906045819206,
|
|
"grad_norm": 0.04577941447496414,
|
|
"learning_rate": 6.309719966362922e-06,
|
|
"loss": 0.0219,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.4457861219088578,
|
|
"grad_norm": 0.04745447263121605,
|
|
"learning_rate": 6.296516851578016e-06,
|
|
"loss": 0.0239,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.4466131833595236,
|
|
"grad_norm": 0.0505000539124012,
|
|
"learning_rate": 6.283304037501501e-06,
|
|
"loss": 0.0238,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.4474402448101894,
|
|
"grad_norm": 0.0681275799870491,
|
|
"learning_rate": 6.270081622978934e-06,
|
|
"loss": 0.0238,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.4482673062608552,
|
|
"grad_norm": 0.05186863988637924,
|
|
"learning_rate": 6.256849706927703e-06,
|
|
"loss": 0.0225,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.44909436771152095,
|
|
"grad_norm": 0.04716340824961662,
|
|
"learning_rate": 6.2436083883362706e-06,
|
|
"loss": 0.022,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.44992142916218675,
|
|
"grad_norm": 0.042241595685482025,
|
|
"learning_rate": 6.230357766263442e-06,
|
|
"loss": 0.0216,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.45074849061285255,
|
|
"grad_norm": 0.04572228342294693,
|
|
"learning_rate": 6.217097939837623e-06,
|
|
"loss": 0.0219,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.4515755520635183,
|
|
"grad_norm": 0.05299137532711029,
|
|
"learning_rate": 6.203829008256075e-06,
|
|
"loss": 0.0222,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.4524026135141841,
|
|
"grad_norm": 0.04044192656874657,
|
|
"learning_rate": 6.190551070784179e-06,
|
|
"loss": 0.0233,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.4532296749648499,
|
|
"grad_norm": 0.04427442327141762,
|
|
"learning_rate": 6.177264226754685e-06,
|
|
"loss": 0.0239,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.4540567364155157,
|
|
"grad_norm": 0.0423441156744957,
|
|
"learning_rate": 6.163968575566979e-06,
|
|
"loss": 0.0243,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.45488379786618144,
|
|
"grad_norm": 0.052600838243961334,
|
|
"learning_rate": 6.150664216686329e-06,
|
|
"loss": 0.0231,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.45571085931684724,
|
|
"grad_norm": 0.04956282302737236,
|
|
"learning_rate": 6.137351249643147e-06,
|
|
"loss": 0.0238,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.45653792076751304,
|
|
"grad_norm": 0.037822380661964417,
|
|
"learning_rate": 6.124029774032242e-06,
|
|
"loss": 0.0224,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.4573649822181788,
|
|
"grad_norm": 0.04192859306931496,
|
|
"learning_rate": 6.110699889512077e-06,
|
|
"loss": 0.0273,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.4581920436688446,
|
|
"grad_norm": 0.04586039483547211,
|
|
"learning_rate": 6.0973616958040265e-06,
|
|
"loss": 0.0223,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.4590191051195104,
|
|
"grad_norm": 0.049864090979099274,
|
|
"learning_rate": 6.084015292691617e-06,
|
|
"loss": 0.0237,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.4598461665701762,
|
|
"grad_norm": 0.061950068920850754,
|
|
"learning_rate": 6.070660780019797e-06,
|
|
"loss": 0.0228,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.46067322802084193,
|
|
"grad_norm": 0.04114188626408577,
|
|
"learning_rate": 6.057298257694182e-06,
|
|
"loss": 0.0233,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.46150028947150773,
|
|
"grad_norm": 0.048220761120319366,
|
|
"learning_rate": 6.043927825680305e-06,
|
|
"loss": 0.0285,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.46232735092217353,
|
|
"grad_norm": 0.047901052981615067,
|
|
"learning_rate": 6.030549584002876e-06,
|
|
"loss": 0.0247,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.4631544123728393,
|
|
"grad_norm": 0.04301442205905914,
|
|
"learning_rate": 6.017163632745025e-06,
|
|
"loss": 0.0222,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.4639814738235051,
|
|
"grad_norm": 0.059639185667037964,
|
|
"learning_rate": 6.003770072047559e-06,
|
|
"loss": 0.0224,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.4648085352741709,
|
|
"grad_norm": 0.05088592320680618,
|
|
"learning_rate": 5.990369002108215e-06,
|
|
"loss": 0.0255,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.4656355967248367,
|
|
"grad_norm": 0.04898575693368912,
|
|
"learning_rate": 5.976960523180904e-06,
|
|
"loss": 0.0221,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.4664626581755024,
|
|
"grad_norm": 0.04929777607321739,
|
|
"learning_rate": 5.963544735574961e-06,
|
|
"loss": 0.023,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.4672897196261682,
|
|
"grad_norm": 0.04379523918032646,
|
|
"learning_rate": 5.9501217396544034e-06,
|
|
"loss": 0.023,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.468116781076834,
|
|
"grad_norm": 0.049279894679784775,
|
|
"learning_rate": 5.93669163583717e-06,
|
|
"loss": 0.0232,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.46894384252749977,
|
|
"grad_norm": 0.044354990124702454,
|
|
"learning_rate": 5.923254524594376e-06,
|
|
"loss": 0.0229,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.46977090397816557,
|
|
"grad_norm": 0.05658494308590889,
|
|
"learning_rate": 5.9098105064495606e-06,
|
|
"loss": 0.0221,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.47059796542883137,
|
|
"grad_norm": 0.041339486837387085,
|
|
"learning_rate": 5.896359681977928e-06,
|
|
"loss": 0.0226,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.47142502687949717,
|
|
"grad_norm": 0.052800171077251434,
|
|
"learning_rate": 5.8829021518056095e-06,
|
|
"loss": 0.0237,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.4722520883301629,
|
|
"grad_norm": 0.04378625750541687,
|
|
"learning_rate": 5.869438016608893e-06,
|
|
"loss": 0.0241,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.4730791497808287,
|
|
"grad_norm": 0.08634616434574127,
|
|
"learning_rate": 5.855967377113487e-06,
|
|
"loss": 0.0263,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.4739062112314945,
|
|
"grad_norm": 0.0738649070262909,
|
|
"learning_rate": 5.842490334093752e-06,
|
|
"loss": 0.0231,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.4747332726821603,
|
|
"grad_norm": 0.04509838670492172,
|
|
"learning_rate": 5.829006988371959e-06,
|
|
"loss": 0.0231,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.47556033413282606,
|
|
"grad_norm": 0.044409893453121185,
|
|
"learning_rate": 5.815517440817526e-06,
|
|
"loss": 0.0222,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.47638739558349186,
|
|
"grad_norm": 0.04454704746603966,
|
|
"learning_rate": 5.8020217923462696e-06,
|
|
"loss": 0.022,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.47721445703415766,
|
|
"grad_norm": 0.04391258582472801,
|
|
"learning_rate": 5.788520143919647e-06,
|
|
"loss": 0.0223,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.4780415184848234,
|
|
"grad_norm": 0.039742667227983475,
|
|
"learning_rate": 5.775012596543999e-06,
|
|
"loss": 0.0236,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.4788685799354892,
|
|
"grad_norm": 0.04627054184675217,
|
|
"learning_rate": 5.761499251269798e-06,
|
|
"loss": 0.0225,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.479695641386155,
|
|
"grad_norm": 0.03860992565751076,
|
|
"learning_rate": 5.7479802091908945e-06,
|
|
"loss": 0.0268,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.4805227028368208,
|
|
"grad_norm": 0.04734113812446594,
|
|
"learning_rate": 5.734455571443751e-06,
|
|
"loss": 0.0233,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.48134976428748655,
|
|
"grad_norm": 0.07089436799287796,
|
|
"learning_rate": 5.720925439206695e-06,
|
|
"loss": 0.0267,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.48217682573815235,
|
|
"grad_norm": 0.04937206953763962,
|
|
"learning_rate": 5.707389913699157e-06,
|
|
"loss": 0.0225,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.48300388718881815,
|
|
"grad_norm": 0.04481448978185654,
|
|
"learning_rate": 5.693849096180917e-06,
|
|
"loss": 0.0221,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.4838309486394839,
|
|
"grad_norm": 0.051826462149620056,
|
|
"learning_rate": 5.680303087951339e-06,
|
|
"loss": 0.0237,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.4846580100901497,
|
|
"grad_norm": 0.13001324236392975,
|
|
"learning_rate": 5.666751990348627e-06,
|
|
"loss": 0.0223,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.4854850715408155,
|
|
"grad_norm": 0.04917273670434952,
|
|
"learning_rate": 5.653195904749054e-06,
|
|
"loss": 0.0219,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.4863121329914813,
|
|
"grad_norm": 0.04470530524849892,
|
|
"learning_rate": 5.639634932566208e-06,
|
|
"loss": 0.0307,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.48713919444214704,
|
|
"grad_norm": 0.04076725244522095,
|
|
"learning_rate": 5.626069175250236e-06,
|
|
"loss": 0.0223,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.48796625589281284,
|
|
"grad_norm": 0.050211817026138306,
|
|
"learning_rate": 5.61249873428708e-06,
|
|
"loss": 0.0227,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.48879331734347864,
|
|
"grad_norm": 0.03654312714934349,
|
|
"learning_rate": 5.5989237111977255e-06,
|
|
"loss": 0.0216,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.4896203787941444,
|
|
"grad_norm": 0.050298597663640976,
|
|
"learning_rate": 5.58534420753743e-06,
|
|
"loss": 0.0217,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.4904474402448102,
|
|
"grad_norm": 0.04905930534005165,
|
|
"learning_rate": 5.571760324894977e-06,
|
|
"loss": 0.0227,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.491274501695476,
|
|
"grad_norm": 0.045814525336027145,
|
|
"learning_rate": 5.558172164891903e-06,
|
|
"loss": 0.0225,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.4921015631461418,
|
|
"grad_norm": 0.06343957781791687,
|
|
"learning_rate": 5.544579829181751e-06,
|
|
"loss": 0.023,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.49292862459680753,
|
|
"grad_norm": 0.042192984372377396,
|
|
"learning_rate": 5.530983419449296e-06,
|
|
"loss": 0.021,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.49375568604747333,
|
|
"grad_norm": 0.04143495857715607,
|
|
"learning_rate": 5.517383037409794e-06,
|
|
"loss": 0.0253,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.49458274749813913,
|
|
"grad_norm": 0.04273596778512001,
|
|
"learning_rate": 5.503778784808218e-06,
|
|
"loss": 0.0226,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.4954098089488049,
|
|
"grad_norm": 0.047943755984306335,
|
|
"learning_rate": 5.490170763418496e-06,
|
|
"loss": 0.022,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.4962368703994707,
|
|
"grad_norm": 0.045045025646686554,
|
|
"learning_rate": 5.476559075042751e-06,
|
|
"loss": 0.0216,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.4962368703994707,
|
|
"eval_loss": 0.02347772754728794,
|
|
"eval_runtime": 1220.4355,
|
|
"eval_samples_per_second": 4.915,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.4970639318501365,
|
|
"grad_norm": 0.04491131007671356,
|
|
"learning_rate": 5.4629438215105375e-06,
|
|
"loss": 0.0228,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.4978909933008023,
|
|
"grad_norm": 0.053035978227853775,
|
|
"learning_rate": 5.449325104678085e-06,
|
|
"loss": 0.0233,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.498718054751468,
|
|
"grad_norm": 0.04346757382154465,
|
|
"learning_rate": 5.4357030264275256e-06,
|
|
"loss": 0.0218,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.4995451162021338,
|
|
"grad_norm": 0.03982304036617279,
|
|
"learning_rate": 5.422077688666145e-06,
|
|
"loss": 0.0216,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.5003721776527996,
|
|
"grad_norm": 0.0594533309340477,
|
|
"learning_rate": 5.4084491933256086e-06,
|
|
"loss": 0.0228,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.5011992391034654,
|
|
"grad_norm": 0.03943202272057533,
|
|
"learning_rate": 5.394817642361206e-06,
|
|
"loss": 0.0231,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.5020263005541312,
|
|
"grad_norm": 0.03965817019343376,
|
|
"learning_rate": 5.381183137751087e-06,
|
|
"loss": 0.0234,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.5028533620047969,
|
|
"grad_norm": 0.05061696469783783,
|
|
"learning_rate": 5.367545781495495e-06,
|
|
"loss": 0.0252,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.5036804234554627,
|
|
"grad_norm": 0.0856064036488533,
|
|
"learning_rate": 5.353905675616008e-06,
|
|
"loss": 0.0228,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.5045074849061285,
|
|
"grad_norm": 0.05830984562635422,
|
|
"learning_rate": 5.340262922154773e-06,
|
|
"loss": 0.0239,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.5053345463567943,
|
|
"grad_norm": 0.042031850665807724,
|
|
"learning_rate": 5.326617623173747e-06,
|
|
"loss": 0.0218,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.5061616078074601,
|
|
"grad_norm": 0.04255002364516258,
|
|
"learning_rate": 5.312969880753928e-06,
|
|
"loss": 0.0257,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.5069886692581259,
|
|
"grad_norm": 0.046407558023929596,
|
|
"learning_rate": 5.299319796994591e-06,
|
|
"loss": 0.0214,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.5078157307087917,
|
|
"grad_norm": 0.044977955520153046,
|
|
"learning_rate": 5.285667474012529e-06,
|
|
"loss": 0.0243,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.5086427921594574,
|
|
"grad_norm": 0.041169311851263046,
|
|
"learning_rate": 5.272013013941289e-06,
|
|
"loss": 0.0221,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.5094698536101232,
|
|
"grad_norm": 0.04349064826965332,
|
|
"learning_rate": 5.258356518930403e-06,
|
|
"loss": 0.0222,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.510296915060789,
|
|
"grad_norm": 0.051616426557302475,
|
|
"learning_rate": 5.244698091144624e-06,
|
|
"loss": 0.0226,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.5111239765114548,
|
|
"grad_norm": 0.04476653039455414,
|
|
"learning_rate": 5.2310378327631695e-06,
|
|
"loss": 0.0225,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.5119510379621206,
|
|
"grad_norm": 0.04472777247428894,
|
|
"learning_rate": 5.21737584597895e-06,
|
|
"loss": 0.0231,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.5127780994127864,
|
|
"grad_norm": 0.05034750699996948,
|
|
"learning_rate": 5.203712232997801e-06,
|
|
"loss": 0.0215,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.5136051608634522,
|
|
"grad_norm": 0.04265570640563965,
|
|
"learning_rate": 5.190047096037734e-06,
|
|
"loss": 0.0246,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.5144322223141179,
|
|
"grad_norm": 0.0414557047188282,
|
|
"learning_rate": 5.176380537328149e-06,
|
|
"loss": 0.0224,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.5152592837647837,
|
|
"grad_norm": 0.047177575528621674,
|
|
"learning_rate": 5.1627126591090945e-06,
|
|
"loss": 0.0248,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.5160863452154495,
|
|
"grad_norm": 0.03995126485824585,
|
|
"learning_rate": 5.149043563630481e-06,
|
|
"loss": 0.0222,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.5169134066661153,
|
|
"grad_norm": 0.038500089198350906,
|
|
"learning_rate": 5.135373353151333e-06,
|
|
"loss": 0.0226,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.5177404681167811,
|
|
"grad_norm": 0.04477696493268013,
|
|
"learning_rate": 5.1217021299390055e-06,
|
|
"loss": 0.0252,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.5185675295674469,
|
|
"grad_norm": 0.04252477362751961,
|
|
"learning_rate": 5.108029996268442e-06,
|
|
"loss": 0.0208,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.5193945910181127,
|
|
"grad_norm": 0.04710827022790909,
|
|
"learning_rate": 5.09435705442139e-06,
|
|
"loss": 0.0208,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 0.5202216524687784,
|
|
"grad_norm": 0.04434856027364731,
|
|
"learning_rate": 5.080683406685644e-06,
|
|
"loss": 0.0223,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 0.5210487139194442,
|
|
"grad_norm": 0.04365675151348114,
|
|
"learning_rate": 5.067009155354281e-06,
|
|
"loss": 0.0219,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.52187577537011,
|
|
"grad_norm": 0.04527043551206589,
|
|
"learning_rate": 5.053334402724891e-06,
|
|
"loss": 0.0216,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 0.5227028368207758,
|
|
"grad_norm": 0.04446522891521454,
|
|
"learning_rate": 5.039659251098818e-06,
|
|
"loss": 0.0325,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 0.5235298982714416,
|
|
"grad_norm": 0.03923187032341957,
|
|
"learning_rate": 5.025983802780387e-06,
|
|
"loss": 0.0225,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 0.5243569597221074,
|
|
"grad_norm": 0.0494740828871727,
|
|
"learning_rate": 5.012308160076143e-06,
|
|
"loss": 0.0236,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 0.5251840211727732,
|
|
"grad_norm": 0.048305340111255646,
|
|
"learning_rate": 4.998632425294089e-06,
|
|
"loss": 0.0219,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 0.5260110826234389,
|
|
"grad_norm": 0.05675299093127251,
|
|
"learning_rate": 4.984956700742914e-06,
|
|
"loss": 0.023,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 0.5268381440741047,
|
|
"grad_norm": 0.05156668648123741,
|
|
"learning_rate": 4.9712810887312285e-06,
|
|
"loss": 0.021,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 0.5276652055247705,
|
|
"grad_norm": 0.0496770441532135,
|
|
"learning_rate": 4.957605691566806e-06,
|
|
"loss": 0.0226,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 0.5284922669754363,
|
|
"grad_norm": 0.044166844338178635,
|
|
"learning_rate": 4.943930611555807e-06,
|
|
"loss": 0.0285,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 0.5293193284261021,
|
|
"grad_norm": 0.0438714399933815,
|
|
"learning_rate": 4.930255951002023e-06,
|
|
"loss": 0.0235,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.5301463898767679,
|
|
"grad_norm": 0.049872253090143204,
|
|
"learning_rate": 4.91658181220611e-06,
|
|
"loss": 0.0213,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 0.5309734513274337,
|
|
"grad_norm": 0.05873720347881317,
|
|
"learning_rate": 4.902908297464815e-06,
|
|
"loss": 0.0214,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 0.5318005127780994,
|
|
"grad_norm": 0.04734335094690323,
|
|
"learning_rate": 4.8892355090702195e-06,
|
|
"loss": 0.0219,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 0.5326275742287652,
|
|
"grad_norm": 0.04171719029545784,
|
|
"learning_rate": 4.875563549308971e-06,
|
|
"loss": 0.0217,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 0.533454635679431,
|
|
"grad_norm": 0.04020686820149422,
|
|
"learning_rate": 4.861892520461514e-06,
|
|
"loss": 0.0229,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 0.5342816971300968,
|
|
"grad_norm": 0.04311240091919899,
|
|
"learning_rate": 4.848222524801341e-06,
|
|
"loss": 0.0232,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 0.5351087585807626,
|
|
"grad_norm": 0.05833645164966583,
|
|
"learning_rate": 4.834553664594197e-06,
|
|
"loss": 0.022,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 0.5359358200314284,
|
|
"grad_norm": 0.0407719612121582,
|
|
"learning_rate": 4.820886042097349e-06,
|
|
"loss": 0.0233,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 0.5367628814820942,
|
|
"grad_norm": 0.03404640033841133,
|
|
"learning_rate": 4.807219759558794e-06,
|
|
"loss": 0.0222,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.5375899429327599,
|
|
"grad_norm": 0.04761282354593277,
|
|
"learning_rate": 4.7935549192165116e-06,
|
|
"loss": 0.0224,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.5384170043834257,
|
|
"grad_norm": 0.04644225910305977,
|
|
"learning_rate": 4.779891623297688e-06,
|
|
"loss": 0.0231,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 0.5392440658340915,
|
|
"grad_norm": 0.04503655433654785,
|
|
"learning_rate": 4.7662299740179544e-06,
|
|
"loss": 0.0226,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 0.5400711272847573,
|
|
"grad_norm": 0.04182233288884163,
|
|
"learning_rate": 4.752570073580632e-06,
|
|
"loss": 0.0207,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 0.5408981887354231,
|
|
"grad_norm": 0.04567556828260422,
|
|
"learning_rate": 4.738912024175945e-06,
|
|
"loss": 0.0218,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 0.5417252501860889,
|
|
"grad_norm": 0.04384360834956169,
|
|
"learning_rate": 4.725255927980283e-06,
|
|
"loss": 0.0214,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 0.5425523116367547,
|
|
"grad_norm": 0.0403946228325367,
|
|
"learning_rate": 4.711601887155417e-06,
|
|
"loss": 0.0264,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 0.5433793730874203,
|
|
"grad_norm": 0.038516897708177567,
|
|
"learning_rate": 4.6979500038477425e-06,
|
|
"loss": 0.0221,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 0.5442064345380861,
|
|
"grad_norm": 0.0414847806096077,
|
|
"learning_rate": 4.684300380187516e-06,
|
|
"loss": 0.0204,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 0.545033495988752,
|
|
"grad_norm": 0.04257076978683472,
|
|
"learning_rate": 4.670653118288085e-06,
|
|
"loss": 0.0211,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 0.5458605574394177,
|
|
"grad_norm": 0.04257350042462349,
|
|
"learning_rate": 4.657008320245136e-06,
|
|
"loss": 0.0218,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.5466876188900835,
|
|
"grad_norm": 0.04577566310763359,
|
|
"learning_rate": 4.643366088135918e-06,
|
|
"loss": 0.0221,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 0.5475146803407493,
|
|
"grad_norm": 0.11741481721401215,
|
|
"learning_rate": 4.629726524018486e-06,
|
|
"loss": 0.0222,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 0.5483417417914151,
|
|
"grad_norm": 0.04335429146885872,
|
|
"learning_rate": 4.616089729930932e-06,
|
|
"loss": 0.0252,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 0.5491688032420808,
|
|
"grad_norm": 0.04533402994275093,
|
|
"learning_rate": 4.602455807890634e-06,
|
|
"loss": 0.0218,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 0.5499958646927466,
|
|
"grad_norm": 0.042610831558704376,
|
|
"learning_rate": 4.588824859893473e-06,
|
|
"loss": 0.022,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 0.5508229261434124,
|
|
"grad_norm": 0.03981228917837143,
|
|
"learning_rate": 4.57519698791309e-06,
|
|
"loss": 0.0227,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 0.5516499875940782,
|
|
"grad_norm": 0.0377313606441021,
|
|
"learning_rate": 4.561572293900109e-06,
|
|
"loss": 0.0226,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 0.552477049044744,
|
|
"grad_norm": 0.08314741402864456,
|
|
"learning_rate": 4.547950879781382e-06,
|
|
"loss": 0.0229,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 0.5533041104954098,
|
|
"grad_norm": 0.04389451816678047,
|
|
"learning_rate": 4.534332847459225e-06,
|
|
"loss": 0.0212,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 0.5541311719460756,
|
|
"grad_norm": 0.04181825742125511,
|
|
"learning_rate": 4.520718298810649e-06,
|
|
"loss": 0.0203,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.5549582333967413,
|
|
"grad_norm": 0.042209409177303314,
|
|
"learning_rate": 4.507107335686611e-06,
|
|
"loss": 0.0234,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 0.5557852948474071,
|
|
"grad_norm": 0.03632921725511551,
|
|
"learning_rate": 4.49350005991124e-06,
|
|
"loss": 0.0213,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 0.5566123562980729,
|
|
"grad_norm": 0.03909287229180336,
|
|
"learning_rate": 4.47989657328108e-06,
|
|
"loss": 0.0259,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 0.5574394177487387,
|
|
"grad_norm": 0.04961128160357475,
|
|
"learning_rate": 4.466296977564331e-06,
|
|
"loss": 0.0229,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 0.5582664791994045,
|
|
"grad_norm": 0.04496648535132408,
|
|
"learning_rate": 4.452701374500079e-06,
|
|
"loss": 0.0207,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 0.5590935406500703,
|
|
"grad_norm": 0.045161984860897064,
|
|
"learning_rate": 4.43910986579755e-06,
|
|
"loss": 0.0233,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 0.5599206021007361,
|
|
"grad_norm": 0.047101061791181564,
|
|
"learning_rate": 4.42552255313533e-06,
|
|
"loss": 0.0327,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 0.5607476635514018,
|
|
"grad_norm": 0.044754352420568466,
|
|
"learning_rate": 4.411939538160621e-06,
|
|
"loss": 0.0221,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 0.5615747250020676,
|
|
"grad_norm": 0.04385341331362724,
|
|
"learning_rate": 4.398360922488474e-06,
|
|
"loss": 0.0266,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 0.5624017864527334,
|
|
"grad_norm": 0.05165982246398926,
|
|
"learning_rate": 4.384786807701024e-06,
|
|
"loss": 0.0218,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.5632288479033992,
|
|
"grad_norm": 0.03928116336464882,
|
|
"learning_rate": 4.371217295346738e-06,
|
|
"loss": 0.022,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 0.564055909354065,
|
|
"grad_norm": 0.038528576493263245,
|
|
"learning_rate": 4.357652486939649e-06,
|
|
"loss": 0.0218,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 0.5648829708047308,
|
|
"grad_norm": 0.04096828028559685,
|
|
"learning_rate": 4.3440924839586045e-06,
|
|
"loss": 0.0221,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 0.5657100322553966,
|
|
"grad_norm": 0.04172588139772415,
|
|
"learning_rate": 4.3305373878465e-06,
|
|
"loss": 0.0214,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 0.5665370937060623,
|
|
"grad_norm": 0.04250342398881912,
|
|
"learning_rate": 4.316987300009521e-06,
|
|
"loss": 0.0216,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 0.5673641551567281,
|
|
"grad_norm": 0.04389472305774689,
|
|
"learning_rate": 4.303442321816388e-06,
|
|
"loss": 0.0225,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 0.5681912166073939,
|
|
"grad_norm": 0.04604129120707512,
|
|
"learning_rate": 4.2899025545975935e-06,
|
|
"loss": 0.025,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 0.5690182780580597,
|
|
"grad_norm": 0.04432059824466705,
|
|
"learning_rate": 4.276368099644649e-06,
|
|
"loss": 0.0223,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 0.5698453395087255,
|
|
"grad_norm": 0.04254218190908432,
|
|
"learning_rate": 4.262839058209325e-06,
|
|
"loss": 0.0254,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 0.5706724009593913,
|
|
"grad_norm": 0.04665306955575943,
|
|
"learning_rate": 4.249315531502892e-06,
|
|
"loss": 0.0233,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.5714994624100571,
|
|
"grad_norm": 0.06424245983362198,
|
|
"learning_rate": 4.235797620695365e-06,
|
|
"loss": 0.0223,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 0.5723265238607228,
|
|
"grad_norm": 0.04606041684746742,
|
|
"learning_rate": 4.222285426914744e-06,
|
|
"loss": 0.0226,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 0.5731535853113886,
|
|
"grad_norm": 0.055455636233091354,
|
|
"learning_rate": 4.208779051246264e-06,
|
|
"loss": 0.0217,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 0.5739806467620544,
|
|
"grad_norm": 0.05722310021519661,
|
|
"learning_rate": 4.1952785947316335e-06,
|
|
"loss": 0.0287,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 0.5748077082127202,
|
|
"grad_norm": 0.047114696353673935,
|
|
"learning_rate": 4.181784158368274e-06,
|
|
"loss": 0.0213,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 0.575634769663386,
|
|
"grad_norm": 0.041593633592128754,
|
|
"learning_rate": 4.1682958431085784e-06,
|
|
"loss": 0.0226,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 0.5764618311140518,
|
|
"grad_norm": 0.044355396181344986,
|
|
"learning_rate": 4.1548137498591415e-06,
|
|
"loss": 0.0214,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 0.5772888925647176,
|
|
"grad_norm": 0.043452925980091095,
|
|
"learning_rate": 4.141337979480014e-06,
|
|
"loss": 0.022,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 0.5781159540153833,
|
|
"grad_norm": 0.04600623995065689,
|
|
"learning_rate": 4.127868632783943e-06,
|
|
"loss": 0.0219,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 0.5789430154660491,
|
|
"grad_norm": 0.045817919075489044,
|
|
"learning_rate": 4.114405810535619e-06,
|
|
"loss": 0.0228,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.5789430154660491,
|
|
"eval_loss": 0.022890722379088402,
|
|
"eval_runtime": 1220.8476,
|
|
"eval_samples_per_second": 4.914,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.5797700769167149,
|
|
"grad_norm": 0.04360632598400116,
|
|
"learning_rate": 4.100949613450929e-06,
|
|
"loss": 0.0232,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 0.5805971383673807,
|
|
"grad_norm": 0.11677900701761246,
|
|
"learning_rate": 4.087500142196188e-06,
|
|
"loss": 0.0239,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 0.5814241998180465,
|
|
"grad_norm": 0.03949005529284477,
|
|
"learning_rate": 4.074057497387402e-06,
|
|
"loss": 0.0215,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 0.5822512612687123,
|
|
"grad_norm": 0.04393787682056427,
|
|
"learning_rate": 4.060621779589505e-06,
|
|
"loss": 0.0224,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 0.5830783227193781,
|
|
"grad_norm": 0.05478642135858536,
|
|
"learning_rate": 4.047193089315608e-06,
|
|
"loss": 0.0217,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 0.5839053841700438,
|
|
"grad_norm": 0.05870141461491585,
|
|
"learning_rate": 4.033771527026252e-06,
|
|
"loss": 0.0218,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 0.5847324456207096,
|
|
"grad_norm": 0.04158046096563339,
|
|
"learning_rate": 4.020357193128655e-06,
|
|
"loss": 0.021,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 0.5855595070713754,
|
|
"grad_norm": 0.05177818983793259,
|
|
"learning_rate": 4.006950187975951e-06,
|
|
"loss": 0.0202,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.5863865685220412,
|
|
"grad_norm": 0.04415697604417801,
|
|
"learning_rate": 3.993550611866458e-06,
|
|
"loss": 0.0222,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 0.587213629972707,
|
|
"grad_norm": 0.06037106364965439,
|
|
"learning_rate": 3.980158565042908e-06,
|
|
"loss": 0.022,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.5880406914233728,
|
|
"grad_norm": 0.03998905047774315,
|
|
"learning_rate": 3.96677414769171e-06,
|
|
"loss": 0.0227,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 0.5888677528740386,
|
|
"grad_norm": 0.035650502890348434,
|
|
"learning_rate": 3.9533974599422e-06,
|
|
"loss": 0.0218,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 0.5896948143247043,
|
|
"grad_norm": 0.04198850691318512,
|
|
"learning_rate": 3.940028601865881e-06,
|
|
"loss": 0.0229,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 0.5905218757753701,
|
|
"grad_norm": 0.041859325021505356,
|
|
"learning_rate": 3.9266676734756894e-06,
|
|
"loss": 0.0217,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 0.5913489372260359,
|
|
"grad_norm": 0.04040461406111717,
|
|
"learning_rate": 3.913314774725234e-06,
|
|
"loss": 0.0212,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 0.5921759986767017,
|
|
"grad_norm": 0.04604990780353546,
|
|
"learning_rate": 3.899970005508053e-06,
|
|
"loss": 0.022,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 0.5930030601273675,
|
|
"grad_norm": 0.04515118896961212,
|
|
"learning_rate": 3.8866334656568765e-06,
|
|
"loss": 0.022,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 0.5938301215780333,
|
|
"grad_norm": 0.04524078220129013,
|
|
"learning_rate": 3.8733052549428566e-06,
|
|
"loss": 0.0215,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 0.5946571830286991,
|
|
"grad_norm": 0.04891633987426758,
|
|
"learning_rate": 3.859985473074847e-06,
|
|
"loss": 0.0226,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 0.5954842444793648,
|
|
"grad_norm": 0.042289573699235916,
|
|
"learning_rate": 3.846674219698635e-06,
|
|
"loss": 0.0213,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.5963113059300306,
|
|
"grad_norm": 0.04168631508946419,
|
|
"learning_rate": 3.833371594396214e-06,
|
|
"loss": 0.0228,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 0.5971383673806964,
|
|
"grad_norm": 0.04345110431313515,
|
|
"learning_rate": 3.820077696685027e-06,
|
|
"loss": 0.0213,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 0.5979654288313622,
|
|
"grad_norm": 0.04696614667773247,
|
|
"learning_rate": 3.8067926260172234e-06,
|
|
"loss": 0.0226,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 0.598792490282028,
|
|
"grad_norm": 0.041647132486104965,
|
|
"learning_rate": 3.793516481778924e-06,
|
|
"loss": 0.022,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 0.5996195517326938,
|
|
"grad_norm": 0.04166780784726143,
|
|
"learning_rate": 3.780249363289459e-06,
|
|
"loss": 0.0253,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 0.6004466131833596,
|
|
"grad_norm": 0.04384204372763634,
|
|
"learning_rate": 3.766991369800649e-06,
|
|
"loss": 0.0219,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 0.6012736746340253,
|
|
"grad_norm": 0.03765762969851494,
|
|
"learning_rate": 3.7537426004960446e-06,
|
|
"loss": 0.0207,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 0.6021007360846911,
|
|
"grad_norm": 0.04585011675953865,
|
|
"learning_rate": 3.7405031544901884e-06,
|
|
"loss": 0.0209,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 0.6029277975353569,
|
|
"grad_norm": 0.04622683674097061,
|
|
"learning_rate": 3.7272731308278777e-06,
|
|
"loss": 0.0225,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 0.6037548589860227,
|
|
"grad_norm": 0.061212386935949326,
|
|
"learning_rate": 3.714052628483417e-06,
|
|
"loss": 0.0202,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.6045819204366885,
|
|
"grad_norm": 0.0689668282866478,
|
|
"learning_rate": 3.700841746359889e-06,
|
|
"loss": 0.0222,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 0.6054089818873543,
|
|
"grad_norm": 0.041381001472473145,
|
|
"learning_rate": 3.6876405832884016e-06,
|
|
"loss": 0.0214,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 0.6062360433380201,
|
|
"grad_norm": 0.0484529472887516,
|
|
"learning_rate": 3.6744492380273533e-06,
|
|
"loss": 0.0219,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 0.6070631047886857,
|
|
"grad_norm": 0.04558572545647621,
|
|
"learning_rate": 3.661267809261698e-06,
|
|
"loss": 0.0212,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 0.6078901662393515,
|
|
"grad_norm": 0.03745023533701897,
|
|
"learning_rate": 3.648096395602202e-06,
|
|
"loss": 0.0231,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 0.6087172276900173,
|
|
"grad_norm": 0.04229872673749924,
|
|
"learning_rate": 3.6349350955847094e-06,
|
|
"loss": 0.0215,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 0.6095442891406831,
|
|
"grad_norm": 0.06078009679913521,
|
|
"learning_rate": 3.6217840076694066e-06,
|
|
"loss": 0.0233,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 0.610371350591349,
|
|
"grad_norm": 0.04391666501760483,
|
|
"learning_rate": 3.6086432302400754e-06,
|
|
"loss": 0.0218,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 0.6111984120420147,
|
|
"grad_norm": 0.04776912182569504,
|
|
"learning_rate": 3.5955128616033717e-06,
|
|
"loss": 0.0238,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 0.6120254734926805,
|
|
"grad_norm": 0.04561059549450874,
|
|
"learning_rate": 3.582392999988078e-06,
|
|
"loss": 0.0229,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.6128525349433462,
|
|
"grad_norm": 0.043533895164728165,
|
|
"learning_rate": 3.569283743544375e-06,
|
|
"loss": 0.022,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 0.613679596394012,
|
|
"grad_norm": 0.03526020050048828,
|
|
"learning_rate": 3.55618519034311e-06,
|
|
"loss": 0.0214,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 0.6145066578446778,
|
|
"grad_norm": 0.03638261556625366,
|
|
"learning_rate": 3.5430974383750503e-06,
|
|
"loss": 0.0208,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 0.6153337192953436,
|
|
"grad_norm": 0.04244010150432587,
|
|
"learning_rate": 3.530020585550166e-06,
|
|
"loss": 0.0224,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 0.6161607807460094,
|
|
"grad_norm": 0.03991573676466942,
|
|
"learning_rate": 3.5169547296968874e-06,
|
|
"loss": 0.0218,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 0.6169878421966752,
|
|
"grad_norm": 0.03916684165596962,
|
|
"learning_rate": 3.5038999685613752e-06,
|
|
"loss": 0.0212,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 0.617814903647341,
|
|
"grad_norm": 0.037909045815467834,
|
|
"learning_rate": 3.4908563998067945e-06,
|
|
"loss": 0.0222,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 0.6186419650980067,
|
|
"grad_norm": 0.048196956515312195,
|
|
"learning_rate": 3.4778241210125718e-06,
|
|
"loss": 0.021,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 0.6194690265486725,
|
|
"grad_norm": 0.04458421468734741,
|
|
"learning_rate": 3.4648032296736805e-06,
|
|
"loss": 0.0236,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 0.6202960879993383,
|
|
"grad_norm": 0.039592791348695755,
|
|
"learning_rate": 3.4517938231999026e-06,
|
|
"loss": 0.0228,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.6211231494500041,
|
|
"grad_norm": 0.048372309654951096,
|
|
"learning_rate": 3.4387959989150977e-06,
|
|
"loss": 0.0215,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 0.6219502109006699,
|
|
"grad_norm": 0.041564539074897766,
|
|
"learning_rate": 3.425809854056482e-06,
|
|
"loss": 0.0219,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 0.6227772723513357,
|
|
"grad_norm": 0.043838802725076675,
|
|
"learning_rate": 3.4128354857738942e-06,
|
|
"loss": 0.0208,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 0.6236043338020015,
|
|
"grad_norm": 0.04145396873354912,
|
|
"learning_rate": 3.3998729911290775e-06,
|
|
"loss": 0.0212,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 0.6244313952526672,
|
|
"grad_norm": 0.04556450992822647,
|
|
"learning_rate": 3.386922467094944e-06,
|
|
"loss": 0.023,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"epoch": 0.625258456703333,
|
|
"grad_norm": 0.04290676862001419,
|
|
"learning_rate": 3.3739840105548528e-06,
|
|
"loss": 0.021,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 0.6260855181539988,
|
|
"grad_norm": 0.042239073663949966,
|
|
"learning_rate": 3.3610577183018877e-06,
|
|
"loss": 0.0225,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"epoch": 0.6269125796046646,
|
|
"grad_norm": 0.04751691594719887,
|
|
"learning_rate": 3.348143687038128e-06,
|
|
"loss": 0.0215,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 0.6277396410553304,
|
|
"grad_norm": 0.04237852618098259,
|
|
"learning_rate": 3.3352420133739304e-06,
|
|
"loss": 0.0218,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"epoch": 0.6285667025059962,
|
|
"grad_norm": 0.03740919381380081,
|
|
"learning_rate": 3.3223527938272076e-06,
|
|
"loss": 0.0213,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 0.629393763956662,
|
|
"grad_norm": 0.036210279911756516,
|
|
"learning_rate": 3.3094761248226948e-06,
|
|
"loss": 0.0255,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"epoch": 0.6302208254073277,
|
|
"grad_norm": 0.04506264254450798,
|
|
"learning_rate": 3.296612102691241e-06,
|
|
"loss": 0.0224,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 0.6310478868579935,
|
|
"grad_norm": 0.04092979431152344,
|
|
"learning_rate": 3.283760823669082e-06,
|
|
"loss": 0.0206,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"epoch": 0.6318749483086593,
|
|
"grad_norm": 0.04056790471076965,
|
|
"learning_rate": 3.270922383897121e-06,
|
|
"loss": 0.0213,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 0.6327020097593251,
|
|
"grad_norm": 0.03952750191092491,
|
|
"learning_rate": 3.258096879420216e-06,
|
|
"loss": 0.021,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"epoch": 0.6335290712099909,
|
|
"grad_norm": 0.04810957983136177,
|
|
"learning_rate": 3.245284406186446e-06,
|
|
"loss": 0.0226,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"epoch": 0.6343561326606567,
|
|
"grad_norm": 0.038928598165512085,
|
|
"learning_rate": 3.232485060046412e-06,
|
|
"loss": 0.0231,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"epoch": 0.6351831941113225,
|
|
"grad_norm": 0.03903147578239441,
|
|
"learning_rate": 3.2196989367525035e-06,
|
|
"loss": 0.0255,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"epoch": 0.6360102555619882,
|
|
"grad_norm": 0.04532884061336517,
|
|
"learning_rate": 3.2069261319581922e-06,
|
|
"loss": 0.02,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"epoch": 0.636837317012654,
|
|
"grad_norm": 0.0435151644051075,
|
|
"learning_rate": 3.19416674121732e-06,
|
|
"loss": 0.022,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 0.6376643784633198,
|
|
"grad_norm": 0.04332192242145538,
|
|
"learning_rate": 3.1814208599833634e-06,
|
|
"loss": 0.0273,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"epoch": 0.6384914399139856,
|
|
"grad_norm": 0.0369616374373436,
|
|
"learning_rate": 3.168688583608748e-06,
|
|
"loss": 0.0214,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"epoch": 0.6393185013646514,
|
|
"grad_norm": 0.07783352583646774,
|
|
"learning_rate": 3.1559700073441123e-06,
|
|
"loss": 0.0213,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"epoch": 0.6401455628153172,
|
|
"grad_norm": 0.0504750981926918,
|
|
"learning_rate": 3.1432652263376073e-06,
|
|
"loss": 0.0202,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"epoch": 0.640972624265983,
|
|
"grad_norm": 0.0557858943939209,
|
|
"learning_rate": 3.130574335634181e-06,
|
|
"loss": 0.0222,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"epoch": 0.6417996857166487,
|
|
"grad_norm": 0.0438205786049366,
|
|
"learning_rate": 3.117897430174863e-06,
|
|
"loss": 0.0211,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"epoch": 0.6426267471673145,
|
|
"grad_norm": 0.04007831588387489,
|
|
"learning_rate": 3.1052346047960696e-06,
|
|
"loss": 0.0223,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"epoch": 0.6434538086179803,
|
|
"grad_norm": 0.04356636852025986,
|
|
"learning_rate": 3.0925859542288695e-06,
|
|
"loss": 0.021,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"epoch": 0.6442808700686461,
|
|
"grad_norm": 0.044068679213523865,
|
|
"learning_rate": 3.0799515730982987e-06,
|
|
"loss": 0.0239,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"epoch": 0.6451079315193119,
|
|
"grad_norm": 0.058787260204553604,
|
|
"learning_rate": 3.0673315559226426e-06,
|
|
"loss": 0.0223,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 0.6459349929699777,
|
|
"grad_norm": 0.04351416230201721,
|
|
"learning_rate": 3.054725997112724e-06,
|
|
"loss": 0.0227,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"epoch": 0.6467620544206435,
|
|
"grad_norm": 0.0457034632563591,
|
|
"learning_rate": 3.042134990971205e-06,
|
|
"loss": 0.021,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 0.6475891158713092,
|
|
"grad_norm": 0.04021298885345459,
|
|
"learning_rate": 3.0295586316918816e-06,
|
|
"loss": 0.0205,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"epoch": 0.648416177321975,
|
|
"grad_norm": 0.045050378888845444,
|
|
"learning_rate": 3.0169970133589714e-06,
|
|
"loss": 0.0217,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"epoch": 0.6492432387726408,
|
|
"grad_norm": 0.036717429757118225,
|
|
"learning_rate": 3.004450229946418e-06,
|
|
"loss": 0.0218,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"epoch": 0.6500703002233066,
|
|
"grad_norm": 0.05614123493432999,
|
|
"learning_rate": 2.99191837531718e-06,
|
|
"loss": 0.0234,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"epoch": 0.6508973616739724,
|
|
"grad_norm": 0.037934400141239166,
|
|
"learning_rate": 2.9794015432225363e-06,
|
|
"loss": 0.022,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"epoch": 0.6517244231246382,
|
|
"grad_norm": 0.04340437054634094,
|
|
"learning_rate": 2.966899827301386e-06,
|
|
"loss": 0.0286,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"epoch": 0.652551484575304,
|
|
"grad_norm": 0.04128657281398773,
|
|
"learning_rate": 2.9544133210795317e-06,
|
|
"loss": 0.0217,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"epoch": 0.6533785460259697,
|
|
"grad_norm": 0.04219742491841316,
|
|
"learning_rate": 2.9419421179690044e-06,
|
|
"loss": 0.0207,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 0.6542056074766355,
|
|
"grad_norm": 0.04193083569407463,
|
|
"learning_rate": 2.929486311267343e-06,
|
|
"loss": 0.0218,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"epoch": 0.6550326689273013,
|
|
"grad_norm": 0.03400260955095291,
|
|
"learning_rate": 2.9170459941569094e-06,
|
|
"loss": 0.0215,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"epoch": 0.6558597303779671,
|
|
"grad_norm": 0.04170495644211769,
|
|
"learning_rate": 2.904621259704188e-06,
|
|
"loss": 0.0219,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"epoch": 0.6566867918286329,
|
|
"grad_norm": 0.04302512854337692,
|
|
"learning_rate": 2.892212200859086e-06,
|
|
"loss": 0.0244,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"epoch": 0.6575138532792987,
|
|
"grad_norm": 0.043327417224645615,
|
|
"learning_rate": 2.8798189104542436e-06,
|
|
"loss": 0.022,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"epoch": 0.6583409147299645,
|
|
"grad_norm": 0.05167660862207413,
|
|
"learning_rate": 2.8674414812043317e-06,
|
|
"loss": 0.0205,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"epoch": 0.6591679761806302,
|
|
"grad_norm": 0.061974212527275085,
|
|
"learning_rate": 2.855080005705367e-06,
|
|
"loss": 0.0243,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"epoch": 0.659995037631296,
|
|
"grad_norm": 0.04321138933300972,
|
|
"learning_rate": 2.842734576434021e-06,
|
|
"loss": 0.0212,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"epoch": 0.6608220990819618,
|
|
"grad_norm": 0.05327922850847244,
|
|
"learning_rate": 2.8304052857469107e-06,
|
|
"loss": 0.021,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"epoch": 0.6616491605326276,
|
|
"grad_norm": 0.04471385106444359,
|
|
"learning_rate": 2.8180922258799286e-06,
|
|
"loss": 0.0214,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.6616491605326276,
|
|
"eval_loss": 0.022467145696282387,
|
|
"eval_runtime": 1221.4961,
|
|
"eval_samples_per_second": 4.911,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.6624762219832934,
|
|
"grad_norm": 0.045148443430662155,
|
|
"learning_rate": 2.8057954889475415e-06,
|
|
"loss": 0.0216,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"epoch": 0.6633032834339592,
|
|
"grad_norm": 0.04102947190403938,
|
|
"learning_rate": 2.7935151669421033e-06,
|
|
"loss": 0.0208,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"epoch": 0.664130344884625,
|
|
"grad_norm": 0.0464673787355423,
|
|
"learning_rate": 2.7812513517331695e-06,
|
|
"loss": 0.0206,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"epoch": 0.6649574063352907,
|
|
"grad_norm": 0.04477581009268761,
|
|
"learning_rate": 2.7690041350667995e-06,
|
|
"loss": 0.0215,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"epoch": 0.6657844677859565,
|
|
"grad_norm": 0.043795693665742874,
|
|
"learning_rate": 2.7567736085648935e-06,
|
|
"loss": 0.0219,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"epoch": 0.6666115292366223,
|
|
"grad_norm": 0.04666496440768242,
|
|
"learning_rate": 2.7445598637244746e-06,
|
|
"loss": 0.021,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"epoch": 0.6674385906872881,
|
|
"grad_norm": 0.039747051894664764,
|
|
"learning_rate": 2.7323629919170334e-06,
|
|
"loss": 0.0219,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"epoch": 0.6682656521379539,
|
|
"grad_norm": 0.037099067121744156,
|
|
"learning_rate": 2.72018308438783e-06,
|
|
"loss": 0.02,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"epoch": 0.6690927135886197,
|
|
"grad_norm": 0.0401119664311409,
|
|
"learning_rate": 2.7080202322552126e-06,
|
|
"loss": 0.0214,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"epoch": 0.6699197750392855,
|
|
"grad_norm": 0.0409838892519474,
|
|
"learning_rate": 2.6958745265099397e-06,
|
|
"loss": 0.0205,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 0.6707468364899511,
|
|
"grad_norm": 0.035290639847517014,
|
|
"learning_rate": 2.683746058014489e-06,
|
|
"loss": 0.0209,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"epoch": 0.671573897940617,
|
|
"grad_norm": 0.03809922933578491,
|
|
"learning_rate": 2.6716349175023997e-06,
|
|
"loss": 0.022,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"epoch": 0.6724009593912827,
|
|
"grad_norm": 0.044197600334882736,
|
|
"learning_rate": 2.659541195577571e-06,
|
|
"loss": 0.02,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"epoch": 0.6732280208419485,
|
|
"grad_norm": 0.041063982993364334,
|
|
"learning_rate": 2.6474649827135913e-06,
|
|
"loss": 0.0203,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"epoch": 0.6740550822926143,
|
|
"grad_norm": 0.039071984589099884,
|
|
"learning_rate": 2.635406369253066e-06,
|
|
"loss": 0.0216,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"epoch": 0.6748821437432801,
|
|
"grad_norm": 0.038477640599012375,
|
|
"learning_rate": 2.6233654454069397e-06,
|
|
"loss": 0.0217,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"epoch": 0.675709205193946,
|
|
"grad_norm": 0.05265484377741814,
|
|
"learning_rate": 2.6113423012538184e-06,
|
|
"loss": 0.0223,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"epoch": 0.6765362666446118,
|
|
"grad_norm": 0.04026918113231659,
|
|
"learning_rate": 2.5993370267392998e-06,
|
|
"loss": 0.0212,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"epoch": 0.6773633280952774,
|
|
"grad_norm": 0.040949251502752304,
|
|
"learning_rate": 2.5873497116752955e-06,
|
|
"loss": 0.0218,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"epoch": 0.6781903895459432,
|
|
"grad_norm": 0.04553502798080444,
|
|
"learning_rate": 2.575380445739363e-06,
|
|
"loss": 0.0224,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 0.679017450996609,
|
|
"grad_norm": 0.040991537272930145,
|
|
"learning_rate": 2.5634293184740337e-06,
|
|
"loss": 0.0207,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"epoch": 0.6798445124472748,
|
|
"grad_norm": 0.04071825370192528,
|
|
"learning_rate": 2.551496419286143e-06,
|
|
"loss": 0.0215,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"epoch": 0.6806715738979406,
|
|
"grad_norm": 0.04148703068494797,
|
|
"learning_rate": 2.5395818374461626e-06,
|
|
"loss": 0.0215,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"epoch": 0.6814986353486064,
|
|
"grad_norm": 0.04831210896372795,
|
|
"learning_rate": 2.5276856620875267e-06,
|
|
"loss": 0.0204,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"epoch": 0.6823256967992722,
|
|
"grad_norm": 0.05425499007105827,
|
|
"learning_rate": 2.5158079822059726e-06,
|
|
"loss": 0.0214,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"epoch": 0.6831527582499379,
|
|
"grad_norm": 0.042809970676898956,
|
|
"learning_rate": 2.503948886658879e-06,
|
|
"loss": 0.0204,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 0.6839798197006037,
|
|
"grad_norm": 0.039092812687158585,
|
|
"learning_rate": 2.492108464164582e-06,
|
|
"loss": 0.0209,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"epoch": 0.6848068811512695,
|
|
"grad_norm": 0.0440199077129364,
|
|
"learning_rate": 2.4802868033017325e-06,
|
|
"loss": 0.0205,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"epoch": 0.6856339426019353,
|
|
"grad_norm": 0.04389241337776184,
|
|
"learning_rate": 2.4684839925086222e-06,
|
|
"loss": 0.0218,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"epoch": 0.6864610040526011,
|
|
"grad_norm": 0.03971746936440468,
|
|
"learning_rate": 2.4567001200825257e-06,
|
|
"loss": 0.0211,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 0.6872880655032669,
|
|
"grad_norm": 0.03864897042512894,
|
|
"learning_rate": 2.44493527417904e-06,
|
|
"loss": 0.0224,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"epoch": 0.6881151269539327,
|
|
"grad_norm": 0.04412490129470825,
|
|
"learning_rate": 2.4331895428114167e-06,
|
|
"loss": 0.0206,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"epoch": 0.6889421884045984,
|
|
"grad_norm": 0.045335933566093445,
|
|
"learning_rate": 2.4214630138499235e-06,
|
|
"loss": 0.0203,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"epoch": 0.6897692498552642,
|
|
"grad_norm": 0.040548257529735565,
|
|
"learning_rate": 2.4097557750211627e-06,
|
|
"loss": 0.0208,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"epoch": 0.69059631130593,
|
|
"grad_norm": 0.043131329119205475,
|
|
"learning_rate": 2.3980679139074314e-06,
|
|
"loss": 0.021,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"epoch": 0.6914233727565958,
|
|
"grad_norm": 0.039993565529584885,
|
|
"learning_rate": 2.3863995179460612e-06,
|
|
"loss": 0.0222,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"epoch": 0.6922504342072616,
|
|
"grad_norm": 0.037337690591812134,
|
|
"learning_rate": 2.374750674428764e-06,
|
|
"loss": 0.0218,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"epoch": 0.6930774956579274,
|
|
"grad_norm": 0.042838480323553085,
|
|
"learning_rate": 2.3631214705009806e-06,
|
|
"loss": 0.0208,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"epoch": 0.6939045571085932,
|
|
"grad_norm": 0.036257416009902954,
|
|
"learning_rate": 2.3515119931612196e-06,
|
|
"loss": 0.02,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"epoch": 0.6947316185592589,
|
|
"grad_norm": 0.042761024087667465,
|
|
"learning_rate": 2.339922329260426e-06,
|
|
"loss": 0.0223,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 0.6955586800099247,
|
|
"grad_norm": 0.04689721390604973,
|
|
"learning_rate": 2.328352565501314e-06,
|
|
"loss": 0.0235,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"epoch": 0.6963857414605905,
|
|
"grad_norm": 0.04648851230740547,
|
|
"learning_rate": 2.316802788437719e-06,
|
|
"loss": 0.0217,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"epoch": 0.6972128029112563,
|
|
"grad_norm": 0.04260076582431793,
|
|
"learning_rate": 2.3052730844739636e-06,
|
|
"loss": 0.0216,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"epoch": 0.6980398643619221,
|
|
"grad_norm": 0.042848605662584305,
|
|
"learning_rate": 2.293763539864199e-06,
|
|
"loss": 0.0214,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"epoch": 0.6988669258125879,
|
|
"grad_norm": 0.039934489876031876,
|
|
"learning_rate": 2.2822742407117625e-06,
|
|
"loss": 0.0202,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"epoch": 0.6996939872632537,
|
|
"grad_norm": 0.03947708010673523,
|
|
"learning_rate": 2.270805272968537e-06,
|
|
"loss": 0.0207,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"epoch": 0.7005210487139194,
|
|
"grad_norm": 0.03535833582282066,
|
|
"learning_rate": 2.2593567224343037e-06,
|
|
"loss": 0.0225,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"epoch": 0.7013481101645852,
|
|
"grad_norm": 0.04926292970776558,
|
|
"learning_rate": 2.2479286747561037e-06,
|
|
"loss": 0.0221,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"epoch": 0.702175171615251,
|
|
"grad_norm": 0.03978796303272247,
|
|
"learning_rate": 2.2365212154275908e-06,
|
|
"loss": 0.0226,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"epoch": 0.7030022330659168,
|
|
"grad_norm": 0.04777059331536293,
|
|
"learning_rate": 2.2251344297883996e-06,
|
|
"loss": 0.0204,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 0.7038292945165826,
|
|
"grad_norm": 0.04967991262674332,
|
|
"learning_rate": 2.2137684030235095e-06,
|
|
"loss": 0.0203,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"epoch": 0.7046563559672484,
|
|
"grad_norm": 0.04070328548550606,
|
|
"learning_rate": 2.202423220162591e-06,
|
|
"loss": 0.0214,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"epoch": 0.7054834174179142,
|
|
"grad_norm": 0.036942508071660995,
|
|
"learning_rate": 2.191098966079389e-06,
|
|
"loss": 0.0205,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"epoch": 0.7063104788685799,
|
|
"grad_norm": 0.042975060641765594,
|
|
"learning_rate": 2.1797957254910757e-06,
|
|
"loss": 0.0218,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"epoch": 0.7071375403192457,
|
|
"grad_norm": 0.044698718935251236,
|
|
"learning_rate": 2.168513582957622e-06,
|
|
"loss": 0.0225,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"epoch": 0.7079646017699115,
|
|
"grad_norm": 0.0593951940536499,
|
|
"learning_rate": 2.1572526228811645e-06,
|
|
"loss": 0.0205,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"epoch": 0.7087916632205773,
|
|
"grad_norm": 0.042812854051589966,
|
|
"learning_rate": 2.1460129295053666e-06,
|
|
"loss": 0.0215,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"epoch": 0.7096187246712431,
|
|
"grad_norm": 0.05073460936546326,
|
|
"learning_rate": 2.134794586914806e-06,
|
|
"loss": 0.0234,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"epoch": 0.7104457861219089,
|
|
"grad_norm": 0.03609664365649223,
|
|
"learning_rate": 2.123597679034324e-06,
|
|
"loss": 0.02,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"epoch": 0.7112728475725747,
|
|
"grad_norm": 0.040147822350263596,
|
|
"learning_rate": 2.112422289628412e-06,
|
|
"loss": 0.0205,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 0.7120999090232404,
|
|
"grad_norm": 0.039646077901124954,
|
|
"learning_rate": 2.101268502300582e-06,
|
|
"loss": 0.0213,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"epoch": 0.7129269704739062,
|
|
"grad_norm": 0.04966466873884201,
|
|
"learning_rate": 2.090136400492739e-06,
|
|
"loss": 0.0244,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"epoch": 0.713754031924572,
|
|
"grad_norm": 0.04764994978904724,
|
|
"learning_rate": 2.0790260674845563e-06,
|
|
"loss": 0.0202,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"epoch": 0.7145810933752378,
|
|
"grad_norm": 0.04711426794528961,
|
|
"learning_rate": 2.0679375863928576e-06,
|
|
"loss": 0.0214,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"epoch": 0.7154081548259036,
|
|
"grad_norm": 0.04078923165798187,
|
|
"learning_rate": 2.056871040170988e-06,
|
|
"loss": 0.0199,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"epoch": 0.7162352162765694,
|
|
"grad_norm": 0.039174020290374756,
|
|
"learning_rate": 2.0458265116082002e-06,
|
|
"loss": 0.021,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"epoch": 0.7170622777272352,
|
|
"grad_norm": 0.04337885230779648,
|
|
"learning_rate": 2.034804083329027e-06,
|
|
"loss": 0.0208,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"epoch": 0.7178893391779009,
|
|
"grad_norm": 0.04172796383500099,
|
|
"learning_rate": 2.0238038377926715e-06,
|
|
"loss": 0.0218,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"epoch": 0.7187164006285667,
|
|
"grad_norm": 0.043501630425453186,
|
|
"learning_rate": 2.012825857292392e-06,
|
|
"loss": 0.0232,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"epoch": 0.7195434620792325,
|
|
"grad_norm": 0.04335128515958786,
|
|
"learning_rate": 2.00187022395487e-06,
|
|
"loss": 0.0215,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 0.7203705235298983,
|
|
"grad_norm": 0.04587217792868614,
|
|
"learning_rate": 1.9909370197396148e-06,
|
|
"loss": 0.0246,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"epoch": 0.7211975849805641,
|
|
"grad_norm": 0.037936147302389145,
|
|
"learning_rate": 1.9800263264383405e-06,
|
|
"loss": 0.0206,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"epoch": 0.7220246464312299,
|
|
"grad_norm": 0.0373714417219162,
|
|
"learning_rate": 1.969138225674358e-06,
|
|
"loss": 0.0213,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"epoch": 0.7228517078818957,
|
|
"grad_norm": 0.04090265929698944,
|
|
"learning_rate": 1.9582727989019607e-06,
|
|
"loss": 0.021,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"epoch": 0.7236787693325614,
|
|
"grad_norm": 0.033642202615737915,
|
|
"learning_rate": 1.9474301274058125e-06,
|
|
"loss": 0.0198,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"epoch": 0.7245058307832272,
|
|
"grad_norm": 0.0450110137462616,
|
|
"learning_rate": 1.9366102923003578e-06,
|
|
"loss": 0.0202,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"epoch": 0.725332892233893,
|
|
"grad_norm": 0.03714507818222046,
|
|
"learning_rate": 1.9258133745291845e-06,
|
|
"loss": 0.0211,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"epoch": 0.7261599536845588,
|
|
"grad_norm": 0.04287153109908104,
|
|
"learning_rate": 1.9150394548644463e-06,
|
|
"loss": 0.02,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"epoch": 0.7269870151352246,
|
|
"grad_norm": 0.041864458471536636,
|
|
"learning_rate": 1.9042886139062427e-06,
|
|
"loss": 0.0218,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"epoch": 0.7278140765858904,
|
|
"grad_norm": 0.11404255032539368,
|
|
"learning_rate": 1.893560932082023e-06,
|
|
"loss": 0.0224,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 0.7286411380365562,
|
|
"grad_norm": 0.04448498412966728,
|
|
"learning_rate": 1.8828564896459795e-06,
|
|
"loss": 0.0217,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"epoch": 0.7294681994872219,
|
|
"grad_norm": 0.038884907960891724,
|
|
"learning_rate": 1.872175366678451e-06,
|
|
"loss": 0.0206,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"epoch": 0.7302952609378877,
|
|
"grad_norm": 0.041435256600379944,
|
|
"learning_rate": 1.8615176430853231e-06,
|
|
"loss": 0.0211,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"epoch": 0.7311223223885535,
|
|
"grad_norm": 0.04282752797007561,
|
|
"learning_rate": 1.8508833985974306e-06,
|
|
"loss": 0.0209,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"epoch": 0.7319493838392193,
|
|
"grad_norm": 0.043493740260601044,
|
|
"learning_rate": 1.8402727127699537e-06,
|
|
"loss": 0.02,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"epoch": 0.7327764452898851,
|
|
"grad_norm": 0.036238010972738266,
|
|
"learning_rate": 1.8296856649818418e-06,
|
|
"loss": 0.0211,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"epoch": 0.7336035067405509,
|
|
"grad_norm": 0.04608851671218872,
|
|
"learning_rate": 1.8191223344351932e-06,
|
|
"loss": 0.0222,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"epoch": 0.7344305681912167,
|
|
"grad_norm": 0.04390549659729004,
|
|
"learning_rate": 1.8085828001546869e-06,
|
|
"loss": 0.0207,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"epoch": 0.7352576296418823,
|
|
"grad_norm": 0.04080136865377426,
|
|
"learning_rate": 1.798067140986976e-06,
|
|
"loss": 0.0215,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"epoch": 0.7360846910925481,
|
|
"grad_norm": 0.05361476168036461,
|
|
"learning_rate": 1.7875754356001052e-06,
|
|
"loss": 0.0215,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 0.736911752543214,
|
|
"grad_norm": 0.041193023324012756,
|
|
"learning_rate": 1.7771077624829213e-06,
|
|
"loss": 0.0226,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"epoch": 0.7377388139938797,
|
|
"grad_norm": 0.05157098174095154,
|
|
"learning_rate": 1.7666641999444777e-06,
|
|
"loss": 0.0213,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"epoch": 0.7385658754445455,
|
|
"grad_norm": 0.038595810532569885,
|
|
"learning_rate": 1.7562448261134658e-06,
|
|
"loss": 0.0204,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"epoch": 0.7393929368952114,
|
|
"grad_norm": 0.12353651970624924,
|
|
"learning_rate": 1.7458497189376145e-06,
|
|
"loss": 0.0208,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"epoch": 0.7402199983458772,
|
|
"grad_norm": 0.04080955684185028,
|
|
"learning_rate": 1.735478956183112e-06,
|
|
"loss": 0.0203,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"epoch": 0.7410470597965428,
|
|
"grad_norm": 0.0376775749027729,
|
|
"learning_rate": 1.725132615434027e-06,
|
|
"loss": 0.0214,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"epoch": 0.7418741212472086,
|
|
"grad_norm": 0.04121479019522667,
|
|
"learning_rate": 1.7148107740917269e-06,
|
|
"loss": 0.0222,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"epoch": 0.7427011826978744,
|
|
"grad_norm": 0.03592400997877121,
|
|
"learning_rate": 1.7045135093742976e-06,
|
|
"loss": 0.0207,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"epoch": 0.7435282441485402,
|
|
"grad_norm": 0.03217403218150139,
|
|
"learning_rate": 1.6942408983159648e-06,
|
|
"loss": 0.0208,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"epoch": 0.744355305599206,
|
|
"grad_norm": 0.038189876824617386,
|
|
"learning_rate": 1.6839930177665208e-06,
|
|
"loss": 0.0232,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.744355305599206,
|
|
"eval_loss": 0.02212439477443695,
|
|
"eval_runtime": 1220.8964,
|
|
"eval_samples_per_second": 4.914,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7451823670498718,
|
|
"grad_norm": 0.04287660866975784,
|
|
"learning_rate": 1.6737699443907486e-06,
|
|
"loss": 0.0203,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"epoch": 0.7460094285005376,
|
|
"grad_norm": 0.03629004582762718,
|
|
"learning_rate": 1.663571754667847e-06,
|
|
"loss": 0.0209,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"epoch": 0.7468364899512033,
|
|
"grad_norm": 0.042142104357481,
|
|
"learning_rate": 1.6533985248908551e-06,
|
|
"loss": 0.0203,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"epoch": 0.7476635514018691,
|
|
"grad_norm": 0.042162686586380005,
|
|
"learning_rate": 1.6432503311660963e-06,
|
|
"loss": 0.0195,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"epoch": 0.7484906128525349,
|
|
"grad_norm": 0.042325787246227264,
|
|
"learning_rate": 1.6331272494125865e-06,
|
|
"loss": 0.025,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"epoch": 0.7493176743032007,
|
|
"grad_norm": 0.03958788514137268,
|
|
"learning_rate": 1.6230293553614851e-06,
|
|
"loss": 0.0208,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"epoch": 0.7501447357538665,
|
|
"grad_norm": 0.04631664603948593,
|
|
"learning_rate": 1.612956724555519e-06,
|
|
"loss": 0.0222,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"epoch": 0.7509717972045323,
|
|
"grad_norm": 0.03541667386889458,
|
|
"learning_rate": 1.6029094323484207e-06,
|
|
"loss": 0.0188,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"epoch": 0.7517988586551981,
|
|
"grad_norm": 0.04303780198097229,
|
|
"learning_rate": 1.5928875539043649e-06,
|
|
"loss": 0.0218,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"epoch": 0.7526259201058638,
|
|
"grad_norm": 0.045209407806396484,
|
|
"learning_rate": 1.5828911641973981e-06,
|
|
"loss": 0.0216,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 0.7534529815565296,
|
|
"grad_norm": 0.0393962636590004,
|
|
"learning_rate": 1.5729203380108955e-06,
|
|
"loss": 0.0201,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"epoch": 0.7542800430071954,
|
|
"grad_norm": 0.04141068086028099,
|
|
"learning_rate": 1.5629751499369839e-06,
|
|
"loss": 0.0221,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"epoch": 0.7551071044578612,
|
|
"grad_norm": 0.04183319956064224,
|
|
"learning_rate": 1.553055674375989e-06,
|
|
"loss": 0.0207,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"epoch": 0.755934165908527,
|
|
"grad_norm": 0.04659945145249367,
|
|
"learning_rate": 1.5431619855358842e-06,
|
|
"loss": 0.0228,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"epoch": 0.7567612273591928,
|
|
"grad_norm": 0.04036922752857208,
|
|
"learning_rate": 1.5332941574317294e-06,
|
|
"loss": 0.0218,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"epoch": 0.7575882888098586,
|
|
"grad_norm": 0.04024342820048332,
|
|
"learning_rate": 1.5234522638851213e-06,
|
|
"loss": 0.0213,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"epoch": 0.7584153502605243,
|
|
"grad_norm": 0.04086223989725113,
|
|
"learning_rate": 1.5136363785236362e-06,
|
|
"loss": 0.0206,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"epoch": 0.7592424117111901,
|
|
"grad_norm": 0.045924026519060135,
|
|
"learning_rate": 1.503846574780285e-06,
|
|
"loss": 0.0212,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"epoch": 0.7600694731618559,
|
|
"grad_norm": 0.0389275960624218,
|
|
"learning_rate": 1.4940829258929606e-06,
|
|
"loss": 0.0217,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"epoch": 0.7608965346125217,
|
|
"grad_norm": 0.042410727590322495,
|
|
"learning_rate": 1.4843455049038869e-06,
|
|
"loss": 0.0206,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 0.7617235960631875,
|
|
"grad_norm": 0.04143417999148369,
|
|
"learning_rate": 1.4746343846590783e-06,
|
|
"loss": 0.0218,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"epoch": 0.7625506575138533,
|
|
"grad_norm": 0.04118340089917183,
|
|
"learning_rate": 1.4649496378077983e-06,
|
|
"loss": 0.0203,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"epoch": 0.7633777189645191,
|
|
"grad_norm": 0.04239552468061447,
|
|
"learning_rate": 1.455291336801999e-06,
|
|
"loss": 0.0222,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"epoch": 0.7642047804151848,
|
|
"grad_norm": 0.041403092443943024,
|
|
"learning_rate": 1.4456595538957974e-06,
|
|
"loss": 0.0211,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"epoch": 0.7650318418658506,
|
|
"grad_norm": 0.12348439544439316,
|
|
"learning_rate": 1.436054361144925e-06,
|
|
"loss": 0.0215,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"epoch": 0.7658589033165164,
|
|
"grad_norm": 0.04165393486618996,
|
|
"learning_rate": 1.4264758304061938e-06,
|
|
"loss": 0.0202,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"epoch": 0.7666859647671822,
|
|
"grad_norm": 0.044469356536865234,
|
|
"learning_rate": 1.4169240333369543e-06,
|
|
"loss": 0.0207,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"epoch": 0.767513026217848,
|
|
"grad_norm": 0.04392145201563835,
|
|
"learning_rate": 1.4073990413945582e-06,
|
|
"loss": 0.0208,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"epoch": 0.7683400876685138,
|
|
"grad_norm": 0.043122172355651855,
|
|
"learning_rate": 1.3979009258358367e-06,
|
|
"loss": 0.021,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"epoch": 0.7691671491191796,
|
|
"grad_norm": 0.0898752361536026,
|
|
"learning_rate": 1.3884297577165462e-06,
|
|
"loss": 0.0212,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 0.7699942105698453,
|
|
"grad_norm": 0.04254557564854622,
|
|
"learning_rate": 1.378985607890856e-06,
|
|
"loss": 0.0219,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"epoch": 0.7708212720205111,
|
|
"grad_norm": 0.05117588862776756,
|
|
"learning_rate": 1.3695685470108078e-06,
|
|
"loss": 0.0219,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"epoch": 0.7716483334711769,
|
|
"grad_norm": 0.04056469351053238,
|
|
"learning_rate": 1.3601786455257905e-06,
|
|
"loss": 0.0207,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"epoch": 0.7724753949218427,
|
|
"grad_norm": 0.05269391089677811,
|
|
"learning_rate": 1.3508159736820132e-06,
|
|
"loss": 0.0217,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"epoch": 0.7733024563725085,
|
|
"grad_norm": 0.036445554345846176,
|
|
"learning_rate": 1.341480601521974e-06,
|
|
"loss": 0.0211,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"epoch": 0.7741295178231743,
|
|
"grad_norm": 0.04814046248793602,
|
|
"learning_rate": 1.33217259888395e-06,
|
|
"loss": 0.0212,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"epoch": 0.7749565792738401,
|
|
"grad_norm": 0.038837458938360214,
|
|
"learning_rate": 1.3228920354014607e-06,
|
|
"loss": 0.0209,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"epoch": 0.7757836407245058,
|
|
"grad_norm": 0.0410507507622242,
|
|
"learning_rate": 1.31363898050275e-06,
|
|
"loss": 0.0205,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"epoch": 0.7766107021751716,
|
|
"grad_norm": 0.03645321726799011,
|
|
"learning_rate": 1.3044135034102711e-06,
|
|
"loss": 0.0207,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"epoch": 0.7774377636258374,
|
|
"grad_norm": 0.040732916444540024,
|
|
"learning_rate": 1.2952156731401716e-06,
|
|
"loss": 0.0202,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 0.7782648250765032,
|
|
"grad_norm": 0.043882377445697784,
|
|
"learning_rate": 1.2860455585017634e-06,
|
|
"loss": 0.0204,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"epoch": 0.779091886527169,
|
|
"grad_norm": 0.038812581449747086,
|
|
"learning_rate": 1.2769032280970222e-06,
|
|
"loss": 0.0209,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"epoch": 0.7799189479778348,
|
|
"grad_norm": 0.049354683607816696,
|
|
"learning_rate": 1.2677887503200681e-06,
|
|
"loss": 0.0197,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"epoch": 0.7807460094285006,
|
|
"grad_norm": 0.030933791771531105,
|
|
"learning_rate": 1.258702193356654e-06,
|
|
"loss": 0.0223,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"epoch": 0.7815730708791663,
|
|
"grad_norm": 0.03828246891498566,
|
|
"learning_rate": 1.2496436251836563e-06,
|
|
"loss": 0.0231,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"epoch": 0.7824001323298321,
|
|
"grad_norm": 0.04567508026957512,
|
|
"learning_rate": 1.2406131135685656e-06,
|
|
"loss": 0.0217,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"epoch": 0.7832271937804979,
|
|
"grad_norm": 0.04124726355075836,
|
|
"learning_rate": 1.231610726068983e-06,
|
|
"loss": 0.0207,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"epoch": 0.7840542552311637,
|
|
"grad_norm": 0.03909214958548546,
|
|
"learning_rate": 1.2226365300321063e-06,
|
|
"loss": 0.021,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"epoch": 0.7848813166818295,
|
|
"grad_norm": 0.03634734824299812,
|
|
"learning_rate": 1.2136905925942367e-06,
|
|
"loss": 0.0214,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"epoch": 0.7857083781324953,
|
|
"grad_norm": 0.04165159910917282,
|
|
"learning_rate": 1.2047729806802739e-06,
|
|
"loss": 0.0205,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 0.7865354395831611,
|
|
"grad_norm": 0.042302560061216354,
|
|
"learning_rate": 1.195883761003206e-06,
|
|
"loss": 0.0216,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"epoch": 0.7873625010338268,
|
|
"grad_norm": 0.05284997075796127,
|
|
"learning_rate": 1.187023000063623e-06,
|
|
"loss": 0.0205,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"epoch": 0.7881895624844926,
|
|
"grad_norm": 0.037878263741731644,
|
|
"learning_rate": 1.1781907641492129e-06,
|
|
"loss": 0.0224,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"epoch": 0.7890166239351584,
|
|
"grad_norm": 0.04263276234269142,
|
|
"learning_rate": 1.169387119334266e-06,
|
|
"loss": 0.0222,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"epoch": 0.7898436853858242,
|
|
"grad_norm": 0.042502984404563904,
|
|
"learning_rate": 1.1606121314791846e-06,
|
|
"loss": 0.0216,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"epoch": 0.79067074683649,
|
|
"grad_norm": 0.041224028915166855,
|
|
"learning_rate": 1.1518658662299798e-06,
|
|
"loss": 0.0232,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"epoch": 0.7914978082871558,
|
|
"grad_norm": 0.04057363048195839,
|
|
"learning_rate": 1.1431483890177991e-06,
|
|
"loss": 0.0209,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"epoch": 0.7923248697378216,
|
|
"grad_norm": 0.038158901035785675,
|
|
"learning_rate": 1.1344597650584139e-06,
|
|
"loss": 0.0212,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"epoch": 0.7931519311884873,
|
|
"grad_norm": 0.038800351321697235,
|
|
"learning_rate": 1.1258000593517516e-06,
|
|
"loss": 0.0201,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"epoch": 0.7939789926391531,
|
|
"grad_norm": 0.044678255915641785,
|
|
"learning_rate": 1.1171693366813967e-06,
|
|
"loss": 0.0209,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 0.7948060540898189,
|
|
"grad_norm": 0.038671303540468216,
|
|
"learning_rate": 1.1085676616141133e-06,
|
|
"loss": 0.021,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"epoch": 0.7956331155404847,
|
|
"grad_norm": 0.03674842417240143,
|
|
"learning_rate": 1.0999950984993584e-06,
|
|
"loss": 0.0221,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"epoch": 0.7964601769911505,
|
|
"grad_norm": 0.04083636775612831,
|
|
"learning_rate": 1.0914517114687973e-06,
|
|
"loss": 0.0285,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"epoch": 0.7972872384418163,
|
|
"grad_norm": 0.03498871251940727,
|
|
"learning_rate": 1.0829375644358352e-06,
|
|
"loss": 0.0197,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"epoch": 0.7981142998924821,
|
|
"grad_norm": 0.04270506650209427,
|
|
"learning_rate": 1.074452721095129e-06,
|
|
"loss": 0.0199,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"epoch": 0.7989413613431477,
|
|
"grad_norm": 0.04146299883723259,
|
|
"learning_rate": 1.065997244922109e-06,
|
|
"loss": 0.0209,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"epoch": 0.7997684227938135,
|
|
"grad_norm": 0.0386267714202404,
|
|
"learning_rate": 1.057571199172514e-06,
|
|
"loss": 0.0204,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"epoch": 0.8005954842444793,
|
|
"grad_norm": 0.03857827186584473,
|
|
"learning_rate": 1.0491746468819114e-06,
|
|
"loss": 0.0216,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"epoch": 0.8014225456951451,
|
|
"grad_norm": 0.03981781378388405,
|
|
"learning_rate": 1.040807650865226e-06,
|
|
"loss": 0.0207,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"epoch": 0.802249607145811,
|
|
"grad_norm": 0.03532428294420242,
|
|
"learning_rate": 1.0324702737162717e-06,
|
|
"loss": 0.0207,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 0.8030766685964768,
|
|
"grad_norm": 0.059968218207359314,
|
|
"learning_rate": 1.0241625778072823e-06,
|
|
"loss": 0.0216,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"epoch": 0.8039037300471426,
|
|
"grad_norm": 0.042843446135520935,
|
|
"learning_rate": 1.0158846252884464e-06,
|
|
"loss": 0.0196,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"epoch": 0.8047307914978082,
|
|
"grad_norm": 0.04270855337381363,
|
|
"learning_rate": 1.007636478087437e-06,
|
|
"loss": 0.0247,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"epoch": 0.805557852948474,
|
|
"grad_norm": 0.037198904901742935,
|
|
"learning_rate": 9.994181979089563e-07,
|
|
"loss": 0.0249,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"epoch": 0.8063849143991398,
|
|
"grad_norm": 0.04327964037656784,
|
|
"learning_rate": 9.912298462342724e-07,
|
|
"loss": 0.0214,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"epoch": 0.8072119758498056,
|
|
"grad_norm": 0.0518951341509819,
|
|
"learning_rate": 9.8307148432075e-07,
|
|
"loss": 0.0313,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"epoch": 0.8080390373004714,
|
|
"grad_norm": 0.04297772794961929,
|
|
"learning_rate": 9.749431732014047e-07,
|
|
"loss": 0.0201,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"epoch": 0.8088660987511372,
|
|
"grad_norm": 0.04605748876929283,
|
|
"learning_rate": 9.668449736844392e-07,
|
|
"loss": 0.0229,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"epoch": 0.809693160201803,
|
|
"grad_norm": 0.09627640247344971,
|
|
"learning_rate": 9.587769463527908e-07,
|
|
"loss": 0.0231,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"epoch": 0.8105202216524687,
|
|
"grad_norm": 0.05746271833777428,
|
|
"learning_rate": 9.507391515636783e-07,
|
|
"loss": 0.0201,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 0.8113472831031345,
|
|
"grad_norm": 0.038759179413318634,
|
|
"learning_rate": 9.427316494481447e-07,
|
|
"loss": 0.0201,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"epoch": 0.8121743445538003,
|
|
"grad_norm": 0.04053572565317154,
|
|
"learning_rate": 9.347544999106195e-07,
|
|
"loss": 0.0213,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"epoch": 0.8130014060044661,
|
|
"grad_norm": 0.03577113896608353,
|
|
"learning_rate": 9.26807762628461e-07,
|
|
"loss": 0.0212,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"epoch": 0.8138284674551319,
|
|
"grad_norm": 0.04762836545705795,
|
|
"learning_rate": 9.188914970515089e-07,
|
|
"loss": 0.0229,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"epoch": 0.8146555289057977,
|
|
"grad_norm": 0.03575866296887398,
|
|
"learning_rate": 9.110057624016461e-07,
|
|
"loss": 0.0213,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"epoch": 0.8154825903564635,
|
|
"grad_norm": 0.04291502758860588,
|
|
"learning_rate": 9.03150617672352e-07,
|
|
"loss": 0.0211,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"epoch": 0.8163096518071292,
|
|
"grad_norm": 0.05030713975429535,
|
|
"learning_rate": 8.953261216282616e-07,
|
|
"loss": 0.0195,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"epoch": 0.817136713257795,
|
|
"grad_norm": 0.03975163400173187,
|
|
"learning_rate": 8.875323328047258e-07,
|
|
"loss": 0.0199,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"epoch": 0.8179637747084608,
|
|
"grad_norm": 0.049601927399635315,
|
|
"learning_rate": 8.797693095073733e-07,
|
|
"loss": 0.0213,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"epoch": 0.8187908361591266,
|
|
"grad_norm": 0.04565184563398361,
|
|
"learning_rate": 8.72037109811677e-07,
|
|
"loss": 0.0213,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 0.8196178976097924,
|
|
"grad_norm": 0.04753655195236206,
|
|
"learning_rate": 8.643357915625122e-07,
|
|
"loss": 0.0217,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"epoch": 0.8204449590604582,
|
|
"grad_norm": 0.0501379668712616,
|
|
"learning_rate": 8.566654123737322e-07,
|
|
"loss": 0.0215,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"epoch": 0.821272020511124,
|
|
"grad_norm": 0.037206389009952545,
|
|
"learning_rate": 8.490260296277375e-07,
|
|
"loss": 0.0284,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"epoch": 0.8220990819617897,
|
|
"grad_norm": 0.040716107934713364,
|
|
"learning_rate": 8.414177004750357e-07,
|
|
"loss": 0.0219,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"epoch": 0.8229261434124555,
|
|
"grad_norm": 0.05191744118928909,
|
|
"learning_rate": 8.338404818338264e-07,
|
|
"loss": 0.0219,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"epoch": 0.8237532048631213,
|
|
"grad_norm": 0.04306924715638161,
|
|
"learning_rate": 8.262944303895687e-07,
|
|
"loss": 0.0199,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"epoch": 0.8245802663137871,
|
|
"grad_norm": 0.050016891211271286,
|
|
"learning_rate": 8.187796025945588e-07,
|
|
"loss": 0.0207,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"epoch": 0.8254073277644529,
|
|
"grad_norm": 0.036976251751184464,
|
|
"learning_rate": 8.112960546675091e-07,
|
|
"loss": 0.021,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"epoch": 0.8262343892151187,
|
|
"grad_norm": 0.03663003817200661,
|
|
"learning_rate": 8.038438425931216e-07,
|
|
"loss": 0.0204,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"epoch": 0.8270614506657845,
|
|
"grad_norm": 0.0474594421684742,
|
|
"learning_rate": 7.964230221216806e-07,
|
|
"loss": 0.0205,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.8270614506657845,
|
|
"eval_loss": 0.021883510053157806,
|
|
"eval_runtime": 1220.4916,
|
|
"eval_samples_per_second": 4.915,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.8278885121164502,
|
|
"grad_norm": 0.03651763126254082,
|
|
"learning_rate": 7.890336487686218e-07,
|
|
"loss": 0.0205,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"epoch": 0.828715573567116,
|
|
"grad_norm": 0.038936734199523926,
|
|
"learning_rate": 7.816757778141281e-07,
|
|
"loss": 0.0224,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"epoch": 0.8295426350177818,
|
|
"grad_norm": 0.05199515074491501,
|
|
"learning_rate": 7.743494643027094e-07,
|
|
"loss": 0.021,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"epoch": 0.8303696964684476,
|
|
"grad_norm": 0.03698007017374039,
|
|
"learning_rate": 7.670547630427954e-07,
|
|
"loss": 0.0202,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"epoch": 0.8311967579191134,
|
|
"grad_norm": 0.043272070586681366,
|
|
"learning_rate": 7.597917286063233e-07,
|
|
"loss": 0.021,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"epoch": 0.8320238193697792,
|
|
"grad_norm": 0.04484783858060837,
|
|
"learning_rate": 7.525604153283239e-07,
|
|
"loss": 0.0211,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"epoch": 0.832850880820445,
|
|
"grad_norm": 0.03705143555998802,
|
|
"learning_rate": 7.453608773065296e-07,
|
|
"loss": 0.0203,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"epoch": 0.8336779422711107,
|
|
"grad_norm": 0.039405085146427155,
|
|
"learning_rate": 7.381931684009569e-07,
|
|
"loss": 0.0212,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"epoch": 0.8345050037217765,
|
|
"grad_norm": 0.044893745332956314,
|
|
"learning_rate": 7.310573422335044e-07,
|
|
"loss": 0.0203,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"epoch": 0.8353320651724423,
|
|
"grad_norm": 0.03983564302325249,
|
|
"learning_rate": 7.23953452187559e-07,
|
|
"loss": 0.0216,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 0.8361591266231081,
|
|
"grad_norm": 0.044955916702747345,
|
|
"learning_rate": 7.16881551407591e-07,
|
|
"loss": 0.0331,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"epoch": 0.8369861880737739,
|
|
"grad_norm": 0.035963404923677444,
|
|
"learning_rate": 7.098416927987578e-07,
|
|
"loss": 0.0198,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"epoch": 0.8378132495244397,
|
|
"grad_norm": 0.052064284682273865,
|
|
"learning_rate": 7.028339290265068e-07,
|
|
"loss": 0.0219,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"epoch": 0.8386403109751055,
|
|
"grad_norm": 0.04226592183113098,
|
|
"learning_rate": 6.958583125161855e-07,
|
|
"loss": 0.0208,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"epoch": 0.8394673724257712,
|
|
"grad_norm": 0.040528856217861176,
|
|
"learning_rate": 6.889148954526448e-07,
|
|
"loss": 0.0201,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"epoch": 0.840294433876437,
|
|
"grad_norm": 0.04366208612918854,
|
|
"learning_rate": 6.820037297798476e-07,
|
|
"loss": 0.0217,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"epoch": 0.8411214953271028,
|
|
"grad_norm": 0.03569814935326576,
|
|
"learning_rate": 6.75124867200489e-07,
|
|
"loss": 0.0203,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"epoch": 0.8419485567777686,
|
|
"grad_norm": 0.03940269351005554,
|
|
"learning_rate": 6.682783591755998e-07,
|
|
"loss": 0.0277,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"epoch": 0.8427756182284344,
|
|
"grad_norm": 0.04888477176427841,
|
|
"learning_rate": 6.614642569241642e-07,
|
|
"loss": 0.0201,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"epoch": 0.8436026796791002,
|
|
"grad_norm": 0.037041421979665756,
|
|
"learning_rate": 6.546826114227378e-07,
|
|
"loss": 0.0215,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 0.844429741129766,
|
|
"grad_norm": 0.054255735129117966,
|
|
"learning_rate": 6.479334734050713e-07,
|
|
"loss": 0.0204,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"epoch": 0.8452568025804317,
|
|
"grad_norm": 0.04159407690167427,
|
|
"learning_rate": 6.41216893361718e-07,
|
|
"loss": 0.0199,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"epoch": 0.8460838640310975,
|
|
"grad_norm": 0.042949602007865906,
|
|
"learning_rate": 6.345329215396678e-07,
|
|
"loss": 0.0217,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"epoch": 0.8469109254817633,
|
|
"grad_norm": 0.04219160974025726,
|
|
"learning_rate": 6.278816079419675e-07,
|
|
"loss": 0.0214,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"epoch": 0.8477379869324291,
|
|
"grad_norm": 0.0364801287651062,
|
|
"learning_rate": 6.212630023273452e-07,
|
|
"loss": 0.0224,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"epoch": 0.8485650483830949,
|
|
"grad_norm": 0.03598921000957489,
|
|
"learning_rate": 6.146771542098418e-07,
|
|
"loss": 0.0203,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"epoch": 0.8493921098337607,
|
|
"grad_norm": 0.04587122052907944,
|
|
"learning_rate": 6.08124112858432e-07,
|
|
"loss": 0.0203,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"epoch": 0.8502191712844265,
|
|
"grad_norm": 0.05026087537407875,
|
|
"learning_rate": 6.0160392729667e-07,
|
|
"loss": 0.0212,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"epoch": 0.8510462327350922,
|
|
"grad_norm": 0.03648602217435837,
|
|
"learning_rate": 5.951166463023089e-07,
|
|
"loss": 0.0209,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"epoch": 0.851873294185758,
|
|
"grad_norm": 0.03705860301852226,
|
|
"learning_rate": 5.886623184069434e-07,
|
|
"loss": 0.0206,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 0.8527003556364238,
|
|
"grad_norm": 0.03770057111978531,
|
|
"learning_rate": 5.822409918956445e-07,
|
|
"loss": 0.0207,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"epoch": 0.8535274170870896,
|
|
"grad_norm": 0.038956765085458755,
|
|
"learning_rate": 5.758527148065989e-07,
|
|
"loss": 0.0248,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"epoch": 0.8543544785377554,
|
|
"grad_norm": 0.04223814234137535,
|
|
"learning_rate": 5.694975349307503e-07,
|
|
"loss": 0.0211,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"epoch": 0.8551815399884212,
|
|
"grad_norm": 0.04067877680063248,
|
|
"learning_rate": 5.631754998114369e-07,
|
|
"loss": 0.021,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"epoch": 0.856008601439087,
|
|
"grad_norm": 0.03736858442425728,
|
|
"learning_rate": 5.568866567440451e-07,
|
|
"loss": 0.0209,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"epoch": 0.8568356628897527,
|
|
"grad_norm": 0.03979681432247162,
|
|
"learning_rate": 5.506310527756481e-07,
|
|
"loss": 0.0206,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"epoch": 0.8576627243404185,
|
|
"grad_norm": 0.04270453378558159,
|
|
"learning_rate": 5.444087347046534e-07,
|
|
"loss": 0.0222,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"epoch": 0.8584897857910843,
|
|
"grad_norm": 0.03582329303026199,
|
|
"learning_rate": 5.382197490804597e-07,
|
|
"loss": 0.0193,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"epoch": 0.8593168472417501,
|
|
"grad_norm": 0.040274590253829956,
|
|
"learning_rate": 5.32064142203102e-07,
|
|
"loss": 0.0215,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"epoch": 0.8601439086924159,
|
|
"grad_norm": 0.035478100180625916,
|
|
"learning_rate": 5.259419601229076e-07,
|
|
"loss": 0.0193,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 0.8609709701430817,
|
|
"grad_norm": 0.05305038392543793,
|
|
"learning_rate": 5.198532486401536e-07,
|
|
"loss": 0.0208,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"epoch": 0.8617980315937475,
|
|
"grad_norm": 0.03564458340406418,
|
|
"learning_rate": 5.137980533047204e-07,
|
|
"loss": 0.0208,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"epoch": 0.8626250930444131,
|
|
"grad_norm": 0.03643946349620819,
|
|
"learning_rate": 5.077764194157536e-07,
|
|
"loss": 0.0201,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"epoch": 0.863452154495079,
|
|
"grad_norm": 0.03864193707704544,
|
|
"learning_rate": 5.017883920213229e-07,
|
|
"loss": 0.0208,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"epoch": 0.8642792159457447,
|
|
"grad_norm": 0.04269906133413315,
|
|
"learning_rate": 4.95834015918088e-07,
|
|
"loss": 0.0205,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"epoch": 0.8651062773964106,
|
|
"grad_norm": 0.04071857035160065,
|
|
"learning_rate": 4.899133356509639e-07,
|
|
"loss": 0.0218,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"epoch": 0.8659333388470764,
|
|
"grad_norm": 0.05047852545976639,
|
|
"learning_rate": 4.840263955127811e-07,
|
|
"loss": 0.02,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"epoch": 0.8667604002977422,
|
|
"grad_norm": 0.03635663166642189,
|
|
"learning_rate": 4.78173239543962e-07,
|
|
"loss": 0.0206,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"epoch": 0.867587461748408,
|
|
"grad_norm": 0.05464612692594528,
|
|
"learning_rate": 4.72353911532189e-07,
|
|
"loss": 0.0201,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"epoch": 0.8684145231990736,
|
|
"grad_norm": 0.04334511607885361,
|
|
"learning_rate": 4.665684550120736e-07,
|
|
"loss": 0.0213,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 0.8692415846497394,
|
|
"grad_norm": 0.036809250712394714,
|
|
"learning_rate": 4.608169132648371e-07,
|
|
"loss": 0.0205,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"epoch": 0.8700686461004052,
|
|
"grad_norm": 0.04075481742620468,
|
|
"learning_rate": 4.5509932931797727e-07,
|
|
"loss": 0.0202,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"epoch": 0.870895707551071,
|
|
"grad_norm": 0.041940901428461075,
|
|
"learning_rate": 4.4941574594495994e-07,
|
|
"loss": 0.0201,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"epoch": 0.8717227690017368,
|
|
"grad_norm": 0.040375709533691406,
|
|
"learning_rate": 4.437662056648845e-07,
|
|
"loss": 0.0219,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"epoch": 0.8725498304524026,
|
|
"grad_norm": 0.0415097214281559,
|
|
"learning_rate": 4.3815075074217615e-07,
|
|
"loss": 0.0204,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"epoch": 0.8733768919030684,
|
|
"grad_norm": 0.0475936122238636,
|
|
"learning_rate": 4.325694231862665e-07,
|
|
"loss": 0.0217,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"epoch": 0.8742039533537341,
|
|
"grad_norm": 0.04286682605743408,
|
|
"learning_rate": 4.2702226475127675e-07,
|
|
"loss": 0.0214,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"epoch": 0.8750310148043999,
|
|
"grad_norm": 0.043143562972545624,
|
|
"learning_rate": 4.2150931693570986e-07,
|
|
"loss": 0.0209,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"epoch": 0.8758580762550657,
|
|
"grad_norm": 0.03890874236822128,
|
|
"learning_rate": 4.1603062098213685e-07,
|
|
"loss": 0.0207,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"epoch": 0.8766851377057315,
|
|
"grad_norm": 0.03903120383620262,
|
|
"learning_rate": 4.1058621787688934e-07,
|
|
"loss": 0.0378,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 0.8775121991563973,
|
|
"grad_norm": 0.03754309564828873,
|
|
"learning_rate": 4.051761483497541e-07,
|
|
"loss": 0.036,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"epoch": 0.8783392606070631,
|
|
"grad_norm": 0.04438405483961105,
|
|
"learning_rate": 3.998004528736632e-07,
|
|
"loss": 0.0213,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"epoch": 0.8791663220577289,
|
|
"grad_norm": 0.037207264453172684,
|
|
"learning_rate": 3.9445917166439915e-07,
|
|
"loss": 0.0198,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"epoch": 0.8799933835083946,
|
|
"grad_norm": 0.04333435744047165,
|
|
"learning_rate": 3.8915234468029027e-07,
|
|
"loss": 0.0202,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"epoch": 0.8808204449590604,
|
|
"grad_norm": 0.034282222390174866,
|
|
"learning_rate": 3.838800116219082e-07,
|
|
"loss": 0.0205,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"epoch": 0.8816475064097262,
|
|
"grad_norm": 0.04391239210963249,
|
|
"learning_rate": 3.786422119317762e-07,
|
|
"loss": 0.0197,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"epoch": 0.882474567860392,
|
|
"grad_norm": 0.037362392991781235,
|
|
"learning_rate": 3.7343898479407227e-07,
|
|
"loss": 0.0204,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"epoch": 0.8833016293110578,
|
|
"grad_norm": 0.03807242214679718,
|
|
"learning_rate": 3.682703691343353e-07,
|
|
"loss": 0.0209,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"epoch": 0.8841286907617236,
|
|
"grad_norm": 0.0376625694334507,
|
|
"learning_rate": 3.6313640361917535e-07,
|
|
"loss": 0.0203,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"epoch": 0.8849557522123894,
|
|
"grad_norm": 0.039894696325063705,
|
|
"learning_rate": 3.580371266559801e-07,
|
|
"loss": 0.0203,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 0.8857828136630551,
|
|
"grad_norm": 0.03698920086026192,
|
|
"learning_rate": 3.529725763926367e-07,
|
|
"loss": 0.025,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"epoch": 0.8866098751137209,
|
|
"grad_norm": 0.04414455220103264,
|
|
"learning_rate": 3.4794279071723503e-07,
|
|
"loss": 0.0212,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"epoch": 0.8874369365643867,
|
|
"grad_norm": 0.041200559586286545,
|
|
"learning_rate": 3.4294780725779296e-07,
|
|
"loss": 0.022,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"epoch": 0.8882639980150525,
|
|
"grad_norm": 0.03711444512009621,
|
|
"learning_rate": 3.379876633819701e-07,
|
|
"loss": 0.0214,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"epoch": 0.8890910594657183,
|
|
"grad_norm": 0.046689391136169434,
|
|
"learning_rate": 3.3306239619679106e-07,
|
|
"loss": 0.0207,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"epoch": 0.8899181209163841,
|
|
"grad_norm": 0.04057691618800163,
|
|
"learning_rate": 3.281720425483653e-07,
|
|
"loss": 0.0206,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"epoch": 0.8907451823670499,
|
|
"grad_norm": 0.04174448922276497,
|
|
"learning_rate": 3.2331663902161416e-07,
|
|
"loss": 0.0212,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"epoch": 0.8915722438177156,
|
|
"grad_norm": 0.03700239583849907,
|
|
"learning_rate": 3.184962219399945e-07,
|
|
"loss": 0.0206,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"epoch": 0.8923993052683814,
|
|
"grad_norm": 0.03949680179357529,
|
|
"learning_rate": 3.137108273652301e-07,
|
|
"loss": 0.0235,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"epoch": 0.8932263667190472,
|
|
"grad_norm": 0.03353721275925636,
|
|
"learning_rate": 3.0896049109703616e-07,
|
|
"loss": 0.0199,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 0.894053428169713,
|
|
"grad_norm": 0.040740326046943665,
|
|
"learning_rate": 3.0424524867286085e-07,
|
|
"loss": 0.02,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"epoch": 0.8948804896203788,
|
|
"grad_norm": 0.04401690140366554,
|
|
"learning_rate": 2.9956513536760934e-07,
|
|
"loss": 0.0307,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"epoch": 0.8957075510710446,
|
|
"grad_norm": 0.033138833940029144,
|
|
"learning_rate": 2.9492018619338703e-07,
|
|
"loss": 0.0206,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"epoch": 0.8965346125217104,
|
|
"grad_norm": 0.03981183469295502,
|
|
"learning_rate": 2.9031043589923426e-07,
|
|
"loss": 0.0211,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"epoch": 0.8973616739723761,
|
|
"grad_norm": 0.036721404641866684,
|
|
"learning_rate": 2.857359189708669e-07,
|
|
"loss": 0.0215,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"epoch": 0.8981887354230419,
|
|
"grad_norm": 0.049144960939884186,
|
|
"learning_rate": 2.8119666963042025e-07,
|
|
"loss": 0.0218,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"epoch": 0.8990157968737077,
|
|
"grad_norm": 0.0382852703332901,
|
|
"learning_rate": 2.766927218361887e-07,
|
|
"loss": 0.0209,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"epoch": 0.8998428583243735,
|
|
"grad_norm": 0.04550480842590332,
|
|
"learning_rate": 2.722241092823774e-07,
|
|
"loss": 0.0208,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"epoch": 0.9006699197750393,
|
|
"grad_norm": 0.04339880868792534,
|
|
"learning_rate": 2.677908653988465e-07,
|
|
"loss": 0.0211,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"epoch": 0.9014969812257051,
|
|
"grad_norm": 0.036550212651491165,
|
|
"learning_rate": 2.6339302335085914e-07,
|
|
"loss": 0.0197,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 0.9023240426763709,
|
|
"grad_norm": 0.034907784312963486,
|
|
"learning_rate": 2.5903061603883897e-07,
|
|
"loss": 0.0207,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"epoch": 0.9031511041270366,
|
|
"grad_norm": 0.043843794614076614,
|
|
"learning_rate": 2.5470367609812084e-07,
|
|
"loss": 0.0207,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"epoch": 0.9039781655777024,
|
|
"grad_norm": 0.03882720693945885,
|
|
"learning_rate": 2.504122358987049e-07,
|
|
"loss": 0.0206,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"epoch": 0.9048052270283682,
|
|
"grad_norm": 0.04570434242486954,
|
|
"learning_rate": 2.461563275450185e-07,
|
|
"loss": 0.0203,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"epoch": 0.905632288479034,
|
|
"grad_norm": 0.03797876834869385,
|
|
"learning_rate": 2.4193598287567287e-07,
|
|
"loss": 0.0203,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"epoch": 0.9064593499296998,
|
|
"grad_norm": 0.04128405451774597,
|
|
"learning_rate": 2.3775123346322593e-07,
|
|
"loss": 0.0213,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"epoch": 0.9072864113803656,
|
|
"grad_norm": 0.036211997270584106,
|
|
"learning_rate": 2.3360211061394743e-07,
|
|
"loss": 0.0209,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"epoch": 0.9081134728310314,
|
|
"grad_norm": 0.0425226129591465,
|
|
"learning_rate": 2.2948864536757985e-07,
|
|
"loss": 0.0208,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"epoch": 0.9089405342816971,
|
|
"grad_norm": 0.042280830442905426,
|
|
"learning_rate": 2.2541086849711514e-07,
|
|
"loss": 0.021,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"epoch": 0.9097675957323629,
|
|
"grad_norm": 0.03988664597272873,
|
|
"learning_rate": 2.213688105085543e-07,
|
|
"loss": 0.0214,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.9097675957323629,
|
|
"eval_loss": 0.021760277450084686,
|
|
"eval_runtime": 1221.1845,
|
|
"eval_samples_per_second": 4.912,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.9105946571830287,
|
|
"grad_norm": 0.038166627287864685,
|
|
"learning_rate": 2.1736250164068662e-07,
|
|
"loss": 0.0211,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"epoch": 0.9114217186336945,
|
|
"grad_norm": 0.03990177437663078,
|
|
"learning_rate": 2.1339197186486027e-07,
|
|
"loss": 0.0199,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"epoch": 0.9122487800843603,
|
|
"grad_norm": 0.03698251396417618,
|
|
"learning_rate": 2.0945725088475921e-07,
|
|
"loss": 0.0213,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"epoch": 0.9130758415350261,
|
|
"grad_norm": 0.03812938556075096,
|
|
"learning_rate": 2.0555836813618003e-07,
|
|
"loss": 0.0214,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"epoch": 0.9139029029856919,
|
|
"grad_norm": 0.03743589296936989,
|
|
"learning_rate": 2.0169535278680984e-07,
|
|
"loss": 0.0204,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"epoch": 0.9147299644363576,
|
|
"grad_norm": 0.040262360125780106,
|
|
"learning_rate": 1.978682337360155e-07,
|
|
"loss": 0.0205,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"epoch": 0.9155570258870234,
|
|
"grad_norm": 0.03923022374510765,
|
|
"learning_rate": 1.940770396146191e-07,
|
|
"loss": 0.0189,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"epoch": 0.9163840873376892,
|
|
"grad_norm": 0.038444485515356064,
|
|
"learning_rate": 1.903217987846856e-07,
|
|
"loss": 0.0219,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"epoch": 0.917211148788355,
|
|
"grad_norm": 0.04547708109021187,
|
|
"learning_rate": 1.866025393393145e-07,
|
|
"loss": 0.0206,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"epoch": 0.9180382102390208,
|
|
"grad_norm": 0.03776419907808304,
|
|
"learning_rate": 1.8291928910242618e-07,
|
|
"loss": 0.0194,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 0.9188652716896866,
|
|
"grad_norm": 0.037485282868146896,
|
|
"learning_rate": 1.792720756285554e-07,
|
|
"loss": 0.0206,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"epoch": 0.9196923331403524,
|
|
"grad_norm": 0.0491410493850708,
|
|
"learning_rate": 1.7566092620264374e-07,
|
|
"loss": 0.0208,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"epoch": 0.9205193945910181,
|
|
"grad_norm": 0.04069705307483673,
|
|
"learning_rate": 1.720858678398374e-07,
|
|
"loss": 0.0211,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"epoch": 0.9213464560416839,
|
|
"grad_norm": 0.04729039594531059,
|
|
"learning_rate": 1.6854692728528298e-07,
|
|
"loss": 0.0211,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"epoch": 0.9221735174923497,
|
|
"grad_norm": 0.041814010590314865,
|
|
"learning_rate": 1.650441310139278e-07,
|
|
"loss": 0.0201,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"epoch": 0.9230005789430155,
|
|
"grad_norm": 0.04224241524934769,
|
|
"learning_rate": 1.615775052303231e-07,
|
|
"loss": 0.0205,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"epoch": 0.9238276403936813,
|
|
"grad_norm": 0.13403134047985077,
|
|
"learning_rate": 1.5814707586842948e-07,
|
|
"loss": 0.021,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"epoch": 0.9246547018443471,
|
|
"grad_norm": 0.03628065064549446,
|
|
"learning_rate": 1.5475286859141736e-07,
|
|
"loss": 0.0208,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"epoch": 0.9254817632950129,
|
|
"grad_norm": 0.03844155743718147,
|
|
"learning_rate": 1.5139490879147955e-07,
|
|
"loss": 0.0206,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"epoch": 0.9263088247456785,
|
|
"grad_norm": 0.03782174736261368,
|
|
"learning_rate": 1.4807322158964021e-07,
|
|
"loss": 0.0218,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 0.9271358861963443,
|
|
"grad_norm": 0.04245174303650856,
|
|
"learning_rate": 1.4478783183556834e-07,
|
|
"loss": 0.0204,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"epoch": 0.9279629476470102,
|
|
"grad_norm": 0.038376543670892715,
|
|
"learning_rate": 1.4153876410738787e-07,
|
|
"loss": 0.0209,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"epoch": 0.928790009097676,
|
|
"grad_norm": 0.03631012141704559,
|
|
"learning_rate": 1.3832604271149742e-07,
|
|
"loss": 0.0202,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"epoch": 0.9296170705483418,
|
|
"grad_norm": 0.03546414151787758,
|
|
"learning_rate": 1.35149691682388e-07,
|
|
"loss": 0.0211,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"epoch": 0.9304441319990076,
|
|
"grad_norm": 0.03860907629132271,
|
|
"learning_rate": 1.320097347824606e-07,
|
|
"loss": 0.0203,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"epoch": 0.9312711934496734,
|
|
"grad_norm": 0.04019659012556076,
|
|
"learning_rate": 1.2890619550185225e-07,
|
|
"loss": 0.0224,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"epoch": 0.932098254900339,
|
|
"grad_norm": 0.03943018242716789,
|
|
"learning_rate": 1.2583909705825792e-07,
|
|
"loss": 0.0199,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"epoch": 0.9329253163510048,
|
|
"grad_norm": 0.044663459062576294,
|
|
"learning_rate": 1.228084623967568e-07,
|
|
"loss": 0.0218,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"epoch": 0.9337523778016706,
|
|
"grad_norm": 0.041751962155103683,
|
|
"learning_rate": 1.1981431418964185e-07,
|
|
"loss": 0.0241,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"epoch": 0.9345794392523364,
|
|
"grad_norm": 0.03652814030647278,
|
|
"learning_rate": 1.1685667483624763e-07,
|
|
"loss": 0.0202,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 0.9354065007030022,
|
|
"grad_norm": 0.040455400943756104,
|
|
"learning_rate": 1.139355664627878e-07,
|
|
"loss": 0.022,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"epoch": 0.936233562153668,
|
|
"grad_norm": 0.03818695247173309,
|
|
"learning_rate": 1.1105101092218462e-07,
|
|
"loss": 0.0207,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"epoch": 0.9370606236043338,
|
|
"grad_norm": 0.04204050451517105,
|
|
"learning_rate": 1.0820302979390574e-07,
|
|
"loss": 0.0213,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"epoch": 0.9378876850549995,
|
|
"grad_norm": 0.04151742160320282,
|
|
"learning_rate": 1.0539164438380655e-07,
|
|
"loss": 0.0204,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"epoch": 0.9387147465056653,
|
|
"grad_norm": 0.038962677121162415,
|
|
"learning_rate": 1.0261687572396762e-07,
|
|
"loss": 0.0209,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"epoch": 0.9395418079563311,
|
|
"grad_norm": 0.045446451753377914,
|
|
"learning_rate": 9.987874457253799e-08,
|
|
"loss": 0.0207,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"epoch": 0.9403688694069969,
|
|
"grad_norm": 0.038088973611593246,
|
|
"learning_rate": 9.717727141358046e-08,
|
|
"loss": 0.0214,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"epoch": 0.9411959308576627,
|
|
"grad_norm": 0.04547916352748871,
|
|
"learning_rate": 9.45124764569183e-08,
|
|
"loss": 0.0245,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"epoch": 0.9420229923083285,
|
|
"grad_norm": 0.038108475506305695,
|
|
"learning_rate": 9.188437963798314e-08,
|
|
"loss": 0.0262,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"epoch": 0.9428500537589943,
|
|
"grad_norm": 0.041740551590919495,
|
|
"learning_rate": 8.929300061766677e-08,
|
|
"loss": 0.0208,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 0.94367711520966,
|
|
"grad_norm": 0.04090382158756256,
|
|
"learning_rate": 8.673835878217351e-08,
|
|
"loss": 0.0219,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"epoch": 0.9445041766603258,
|
|
"grad_norm": 0.03528539836406708,
|
|
"learning_rate": 8.42204732428764e-08,
|
|
"loss": 0.02,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"epoch": 0.9453312381109916,
|
|
"grad_norm": 0.03898858278989792,
|
|
"learning_rate": 8.173936283617068e-08,
|
|
"loss": 0.0198,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"epoch": 0.9461582995616574,
|
|
"grad_norm": 0.03353780135512352,
|
|
"learning_rate": 7.929504612333827e-08,
|
|
"loss": 0.0204,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"epoch": 0.9469853610123232,
|
|
"grad_norm": 0.03581018000841141,
|
|
"learning_rate": 7.688754139040522e-08,
|
|
"loss": 0.02,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"epoch": 0.947812422462989,
|
|
"grad_norm": 0.0409320667386055,
|
|
"learning_rate": 7.451686664800505e-08,
|
|
"loss": 0.0201,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"epoch": 0.9486394839136548,
|
|
"grad_norm": 0.04594825208187103,
|
|
"learning_rate": 7.218303963124507e-08,
|
|
"loss": 0.0206,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"epoch": 0.9494665453643206,
|
|
"grad_norm": 0.036496005952358246,
|
|
"learning_rate": 6.988607779957357e-08,
|
|
"loss": 0.0216,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"epoch": 0.9502936068149863,
|
|
"grad_norm": 0.043786004185676575,
|
|
"learning_rate": 6.762599833664896e-08,
|
|
"loss": 0.0225,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"epoch": 0.9511206682656521,
|
|
"grad_norm": 0.04733569920063019,
|
|
"learning_rate": 6.540281815021198e-08,
|
|
"loss": 0.0202,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 0.9519477297163179,
|
|
"grad_norm": 0.033522963523864746,
|
|
"learning_rate": 6.321655387195591e-08,
|
|
"loss": 0.0211,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"epoch": 0.9527747911669837,
|
|
"grad_norm": 0.04227704182267189,
|
|
"learning_rate": 6.106722185740821e-08,
|
|
"loss": 0.02,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"epoch": 0.9536018526176495,
|
|
"grad_norm": 0.04079505801200867,
|
|
"learning_rate": 5.8954838185801834e-08,
|
|
"loss": 0.0205,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"epoch": 0.9544289140683153,
|
|
"grad_norm": 0.0362294465303421,
|
|
"learning_rate": 5.6879418659959716e-08,
|
|
"loss": 0.0215,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"epoch": 0.9552559755189811,
|
|
"grad_norm": 0.039718855172395706,
|
|
"learning_rate": 5.4840978806173786e-08,
|
|
"loss": 0.0199,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"epoch": 0.9560830369696468,
|
|
"grad_norm": 0.038249652832746506,
|
|
"learning_rate": 5.283953387408891e-08,
|
|
"loss": 0.02,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"epoch": 0.9569100984203126,
|
|
"grad_norm": 0.04237562417984009,
|
|
"learning_rate": 5.087509883659136e-08,
|
|
"loss": 0.0206,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"epoch": 0.9577371598709784,
|
|
"grad_norm": 0.042977023869752884,
|
|
"learning_rate": 4.8947688389693325e-08,
|
|
"loss": 0.0213,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"epoch": 0.9585642213216442,
|
|
"grad_norm": 0.03681021183729172,
|
|
"learning_rate": 4.705731695242521e-08,
|
|
"loss": 0.0197,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"epoch": 0.95939128277231,
|
|
"grad_norm": 0.05688457190990448,
|
|
"learning_rate": 4.520399866672798e-08,
|
|
"loss": 0.0203,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 0.9602183442229758,
|
|
"grad_norm": 0.05174125358462334,
|
|
"learning_rate": 4.338774739734541e-08,
|
|
"loss": 0.0204,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"epoch": 0.9610454056736416,
|
|
"grad_norm": 0.041839614510536194,
|
|
"learning_rate": 4.160857673172147e-08,
|
|
"loss": 0.021,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"epoch": 0.9618724671243073,
|
|
"grad_norm": 0.04174220189452171,
|
|
"learning_rate": 3.986649997989922e-08,
|
|
"loss": 0.0222,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"epoch": 0.9626995285749731,
|
|
"grad_norm": 0.04051094874739647,
|
|
"learning_rate": 3.816153017442148e-08,
|
|
"loss": 0.0211,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"epoch": 0.9635265900256389,
|
|
"grad_norm": 0.04232405498623848,
|
|
"learning_rate": 3.649368007023202e-08,
|
|
"loss": 0.0206,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"epoch": 0.9643536514763047,
|
|
"grad_norm": 0.03878673538565636,
|
|
"learning_rate": 3.486296214457952e-08,
|
|
"loss": 0.0213,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"epoch": 0.9651807129269705,
|
|
"grad_norm": 0.04240000247955322,
|
|
"learning_rate": 3.326938859692708e-08,
|
|
"loss": 0.0209,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"epoch": 0.9660077743776363,
|
|
"grad_norm": 0.04394629970192909,
|
|
"learning_rate": 3.171297134885842e-08,
|
|
"loss": 0.0215,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"epoch": 0.9668348358283021,
|
|
"grad_norm": 0.03907477483153343,
|
|
"learning_rate": 3.019372204399018e-08,
|
|
"loss": 0.0202,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"epoch": 0.9676618972789678,
|
|
"grad_norm": 0.03773649409413338,
|
|
"learning_rate": 2.8711652047884176e-08,
|
|
"loss": 0.0203,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 0.9684889587296336,
|
|
"grad_norm": 0.041162021458148956,
|
|
"learning_rate": 2.7266772447961387e-08,
|
|
"loss": 0.0204,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"epoch": 0.9693160201802994,
|
|
"grad_norm": 0.03880157694220543,
|
|
"learning_rate": 2.585909405342091e-08,
|
|
"loss": 0.021,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"epoch": 0.9701430816309652,
|
|
"grad_norm": 0.04054776951670647,
|
|
"learning_rate": 2.4488627395157783e-08,
|
|
"loss": 0.0203,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"epoch": 0.970970143081631,
|
|
"grad_norm": 0.03724834322929382,
|
|
"learning_rate": 2.315538272568585e-08,
|
|
"loss": 0.0206,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"epoch": 0.9717972045322968,
|
|
"grad_norm": 0.04481399431824684,
|
|
"learning_rate": 2.1859370019058913e-08,
|
|
"loss": 0.0245,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"epoch": 0.9726242659829626,
|
|
"grad_norm": 0.04241091012954712,
|
|
"learning_rate": 2.0600598970795804e-08,
|
|
"loss": 0.02,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"epoch": 0.9734513274336283,
|
|
"grad_norm": 0.04424299672245979,
|
|
"learning_rate": 1.9379078997810995e-08,
|
|
"loss": 0.0245,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"epoch": 0.9742783888842941,
|
|
"grad_norm": 0.038491178303956985,
|
|
"learning_rate": 1.8194819238341877e-08,
|
|
"loss": 0.0201,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"epoch": 0.9751054503349599,
|
|
"grad_norm": 0.04305344447493553,
|
|
"learning_rate": 1.7047828551880475e-08,
|
|
"loss": 0.0204,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"epoch": 0.9759325117856257,
|
|
"grad_norm": 0.03870062530040741,
|
|
"learning_rate": 1.59381155191074e-08,
|
|
"loss": 0.0204,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 0.9767595732362915,
|
|
"grad_norm": 0.040076758712530136,
|
|
"learning_rate": 1.4865688441828008e-08,
|
|
"loss": 0.0216,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"epoch": 0.9775866346869573,
|
|
"grad_norm": 0.0319267176091671,
|
|
"learning_rate": 1.3830555342909113e-08,
|
|
"loss": 0.0229,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"epoch": 0.9784136961376231,
|
|
"grad_norm": 0.03797907382249832,
|
|
"learning_rate": 1.283272396622126e-08,
|
|
"loss": 0.0194,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"epoch": 0.9792407575882888,
|
|
"grad_norm": 0.03610234335064888,
|
|
"learning_rate": 1.1872201776578219e-08,
|
|
"loss": 0.0213,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"epoch": 0.9800678190389546,
|
|
"grad_norm": 0.048893995583057404,
|
|
"learning_rate": 1.0948995959683683e-08,
|
|
"loss": 0.022,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"epoch": 0.9808948804896204,
|
|
"grad_norm": 0.1131172701716423,
|
|
"learning_rate": 1.0063113422074667e-08,
|
|
"loss": 0.0202,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"epoch": 0.9817219419402862,
|
|
"grad_norm": 0.038660723716020584,
|
|
"learning_rate": 9.21456079107208e-09,
|
|
"loss": 0.0205,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"epoch": 0.982549003390952,
|
|
"grad_norm": 0.03787451982498169,
|
|
"learning_rate": 8.40334441473023e-09,
|
|
"loss": 0.0219,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"epoch": 0.9833760648416178,
|
|
"grad_norm": 0.0439179353415966,
|
|
"learning_rate": 7.629470361789071e-09,
|
|
"loss": 0.0206,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"epoch": 0.9842031262922836,
|
|
"grad_norm": 0.04450133442878723,
|
|
"learning_rate": 6.892944421630354e-09,
|
|
"loss": 0.0236,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 0.9850301877429493,
|
|
"grad_norm": 0.04056097939610481,
|
|
"learning_rate": 6.193772104232665e-09,
|
|
"loss": 0.0218,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"epoch": 0.9858572491936151,
|
|
"grad_norm": 0.04353098198771477,
|
|
"learning_rate": 5.531958640129787e-09,
|
|
"loss": 0.0228,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"epoch": 0.9866843106442809,
|
|
"grad_norm": 0.03741453215479851,
|
|
"learning_rate": 4.90750898037351e-09,
|
|
"loss": 0.0226,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"epoch": 0.9875113720949467,
|
|
"grad_norm": 0.04343261569738388,
|
|
"learning_rate": 4.32042779649533e-09,
|
|
"loss": 0.0197,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"epoch": 0.9883384335456125,
|
|
"grad_norm": 0.04530951753258705,
|
|
"learning_rate": 3.7707194804725846e-09,
|
|
"loss": 0.0196,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"epoch": 0.9891654949962783,
|
|
"grad_norm": 0.05223441496491432,
|
|
"learning_rate": 3.2583881446929256e-09,
|
|
"loss": 0.0206,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"epoch": 0.9899925564469441,
|
|
"grad_norm": 0.03598857298493385,
|
|
"learning_rate": 2.783437621926566e-09,
|
|
"loss": 0.0223,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"epoch": 0.9908196178976098,
|
|
"grad_norm": 0.044676005840301514,
|
|
"learning_rate": 2.345871465296856e-09,
|
|
"loss": 0.0203,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"epoch": 0.9916466793482756,
|
|
"grad_norm": 0.035769447684288025,
|
|
"learning_rate": 1.945692948253086e-09,
|
|
"loss": 0.0254,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"epoch": 0.9924737407989414,
|
|
"grad_norm": 0.050190046429634094,
|
|
"learning_rate": 1.5829050645449484e-09,
|
|
"loss": 0.0241,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 0.9924737407989414,
|
|
"eval_loss": 0.021732060238718987,
|
|
"eval_runtime": 1221.3996,
|
|
"eval_samples_per_second": 4.912,
|
|
"eval_steps_per_second": 0.307,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 0.9933008022496072,
|
|
"grad_norm": 0.038614947348833084,
|
|
"learning_rate": 1.2575105282025545e-09,
|
|
"loss": 0.0251,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"epoch": 0.994127863700273,
|
|
"grad_norm": 0.03808825463056564,
|
|
"learning_rate": 9.695117735147863e-10,
|
|
"loss": 0.0197,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"epoch": 0.9949549251509388,
|
|
"grad_norm": 0.037998467683792114,
|
|
"learning_rate": 7.189109550115314e-10,
|
|
"loss": 0.0211,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"epoch": 0.9957819866016046,
|
|
"grad_norm": 0.03407185524702072,
|
|
"learning_rate": 5.057099474470306e-10,
|
|
"loss": 0.0192,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"epoch": 0.9966090480522702,
|
|
"grad_norm": 0.03972679004073143,
|
|
"learning_rate": 3.299103457854447e-10,
|
|
"loss": 0.0231,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"epoch": 0.997436109502936,
|
|
"grad_norm": 0.0530787818133831,
|
|
"learning_rate": 1.9151346519086233e-10,
|
|
"loss": 0.0203,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"epoch": 0.9982631709536018,
|
|
"grad_norm": 0.03939739987254143,
|
|
"learning_rate": 9.052034101508789e-11,
|
|
"loss": 0.021,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"epoch": 0.9990902324042676,
|
|
"grad_norm": 0.039279334247112274,
|
|
"learning_rate": 2.693172879209005e-11,
|
|
"loss": 0.0211,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"epoch": 0.9999172938549334,
|
|
"grad_norm": 0.04228970408439636,
|
|
"learning_rate": 7.481042302304175e-13,
|
|
"loss": 0.0213,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 12091,
|
|
"total_flos": 2.46882716505537e+20,
|
|
"train_loss": 0.04353406456638118,
|
|
"train_runtime": 186963.3453,
|
|
"train_samples_per_second": 1.035,
|
|
"train_steps_per_second": 0.065
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 12091,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.46882716505537e+20,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|