[
  {"loss": 1.8996, "grad_norm": 5.035277843475342, "learning_rate": 2.278481012658228e-05, "epoch": 0.012690355329949238, "step": 10},
  {"loss": 0.5315, "grad_norm": 1.102072834968567, "learning_rate": 4.810126582278481e-05, "epoch": 0.025380710659898477, "step": 20},
  {"loss": 0.3353, "grad_norm": 0.7798988819122314, "learning_rate": 7.341772151898734e-05, "epoch": 0.03807106598984772, "step": 30},
  {"loss": 0.2226, "grad_norm": 0.8653473854064941, "learning_rate": 9.873417721518988e-05, "epoch": 0.050761421319796954, "step": 40},
  {"loss": 0.164, "grad_norm": 0.7569780349731445, "learning_rate": 0.0001240506329113924, "epoch": 0.06345177664974619, "step": 50},
  {"loss": 0.1394, "grad_norm": 1.0211968421936035, "learning_rate": 0.00014936708860759494, "epoch": 0.07614213197969544, "step": 60},
  {"loss": 0.1201, "grad_norm": 0.5370887517929077, "learning_rate": 0.00017468354430379748, "epoch": 0.08883248730964467, "step": 70},
  {"loss": 0.122, "grad_norm": 0.49917498230934143, "learning_rate": 0.0002, "epoch": 0.10152284263959391, "step": 80},
  {"loss": 0.1217, "grad_norm": 0.4577413499355316, "learning_rate": 0.0001999779803602204, "epoch": 0.11421319796954314, "step": 90},
  {"loss": 0.0965, "grad_norm": 0.48522070050239563, "learning_rate": 0.00019991193113817244, "epoch": 0.12690355329949238, "step": 100},
  {"loss": 0.11, "grad_norm": 0.41902250051498413, "learning_rate": 0.00019980188142145754, "epoch": 0.13959390862944163, "step": 110},
  {"loss": 0.0823, "grad_norm": 0.5561641454696655, "learning_rate": 0.00019964787967517817, "epoch": 0.15228426395939088, "step": 120},
  {"loss": 0.0856, "grad_norm": 0.3316971957683563, "learning_rate": 0.00019944999372059388, "epoch": 0.1649746192893401, "step": 130},
  {"loss": 0.0849, "grad_norm": 0.372153639793396, "learning_rate": 0.00019920831070525342, "epoch": 0.17766497461928935, "step": 140},
  {"loss": 0.0929, "grad_norm": 0.33250877261161804, "learning_rate": 0.00019892293706461555, "epoch": 0.19035532994923857, "step": 150},
  {"eval_loss": 0.08791538327932358, "eval_runtime": 29.62, "eval_samples_per_second": 44.801, "eval_steps_per_second": 11.209, "epoch": 0.19923857868020303, "step": 157},
  {"loss": 0.0824, "grad_norm": 0.4130192995071411, "learning_rate": 0.00019859399847517567, "epoch": 0.20304568527918782, "step": 160},
  {"loss": 0.0902, "grad_norm": 0.3217241168022156, "learning_rate": 0.0001982216397991188, "epoch": 0.21573604060913706, "step": 170},
  {"loss": 0.0766, "grad_norm": 0.4728490710258484, "learning_rate": 0.0001978060250205232, "epoch": 0.22842639593908629, "step": 180},
  {"loss": 0.0844, "grad_norm": 0.5730077028274536, "learning_rate": 0.0001973473371731431, "epoch": 0.24111675126903553, "step": 190},
  {"loss": 0.0841, "grad_norm": 0.5745298862457275, "learning_rate": 0.00019684577825980192, "epoch": 0.25380710659898476, "step": 200},
  {"loss": 0.0797, "grad_norm": 0.3141058683395386, "learning_rate": 0.0001963015691634317, "epoch": 0.26649746192893403, "step": 210},
  {"loss": 0.0822, "grad_norm": 0.3730680048465729, "learning_rate": 0.00019571494954979775, "epoch": 0.27918781725888325, "step": 220},
  {"loss": 0.0677, "grad_norm": 0.3915182650089264, "learning_rate": 0.00019508617776195167, "epoch": 0.2918781725888325, "step": 230},
  {"loss": 0.08, "grad_norm": 0.3052193820476532, "learning_rate": 0.00019441553070645887, "epoch": 0.30456852791878175, "step": 240},
  {"loss": 0.0744, "grad_norm": 0.3673352003097534, "learning_rate": 0.000193703303731451, "epoch": 0.31725888324873097, "step": 250},
  {"loss": 0.0821, "grad_norm": 0.39443644881248474, "learning_rate": 0.00019294981049655668, "epoch": 0.3299492385786802, "step": 260},
  {"loss": 0.073, "grad_norm": 0.44178199768066406, "learning_rate": 0.0001921553828347681, "epoch": 0.3426395939086294, "step": 270},
  {"loss": 0.0784, "grad_norm": 0.4202715754508972, "learning_rate": 0.00019132037060630409, "epoch": 0.3553299492385787, "step": 280},
  {"loss": 0.0646, "grad_norm": 0.23640507459640503, "learning_rate": 0.00019044514154453434, "epoch": 0.3680203045685279, "step": 290},
  {"loss": 0.0785, "grad_norm": 0.4354120194911957, "learning_rate": 0.0001895300810940321, "epoch": 0.38071065989847713, "step": 300},
  {"loss": 0.0656, "grad_norm": 0.2467317283153534, "learning_rate": 0.00018857559224082736, "epoch": 0.3934010152284264, "step": 310},
  {"eval_loss": 0.0728072002530098, "eval_runtime": 19.9827, "eval_samples_per_second": 66.407, "eval_steps_per_second": 16.614, "epoch": 0.39847715736040606, "step": 314},
  {"loss": 0.0738, "grad_norm": 0.2969267666339874, "learning_rate": 0.00018758209533493444, "epoch": 0.40609137055837563, "step": 320},
  {"loss": 0.067, "grad_norm": 0.3527528643608093, "learning_rate": 0.00018655002790523328, "epoch": 0.41878172588832485, "step": 330},
  {"loss": 0.0714, "grad_norm": 0.2732889950275421, "learning_rate": 0.00018547984446678437, "epoch": 0.43147208121827413, "step": 340},
  {"loss": 0.0602, "grad_norm": 0.25770312547683716, "learning_rate": 0.000184372016320664, "epoch": 0.44416243654822335, "step": 350},
  {"loss": 0.0624, "grad_norm": 0.22473905980587006, "learning_rate": 0.00018322703134640654, "epoch": 0.45685279187817257, "step": 360},
  {"loss": 0.0709, "grad_norm": 0.3180300295352936, "learning_rate": 0.00018204539378714561, "epoch": 0.46954314720812185, "step": 370},
  {"loss": 0.0698, "grad_norm": 0.2796868085861206, "learning_rate": 0.00018082762402754936, "epoch": 0.48223350253807107, "step": 380},
  {"loss": 0.0658, "grad_norm": 0.3655967712402344, "learning_rate": 0.0001795742583646466, "epoch": 0.4949238578680203, "step": 390},
  {"loss": 0.0682, "grad_norm": 0.2886195182800293, "learning_rate": 0.0001782858487716455, "epoch": 0.5076142131979695, "step": 400},
  {"loss": 0.071, "grad_norm": 0.27021610736846924, "learning_rate": 0.00017696296265484862, "epoch": 0.5203045685279187, "step": 410},
  {"loss": 0.0636, "grad_norm": 0.28307008743286133, "learning_rate": 0.00017560618260377116, "epoch": 0.5329949238578681, "step": 420},
  {"loss": 0.0546, "grad_norm": 0.28294482827186584, "learning_rate": 0.00017421610613457282, "epoch": 0.5456852791878173, "step": 430},
  {"loss": 0.0612, "grad_norm": 0.2255251258611679, "learning_rate": 0.00017279334542691596, "epoch": 0.5583756345177665, "step": 440},
  {"loss": 0.0629, "grad_norm": 0.22404751181602478, "learning_rate": 0.0001713385270543661, "epoch": 0.5710659898477157, "step": 450},
  {"loss": 0.0596, "grad_norm": 0.2632795572280884, "learning_rate": 0.00016985229170845339, "epoch": 0.583756345177665, "step": 460},
  {"loss": 0.0717, "grad_norm": 0.3002878427505493, "learning_rate": 0.0001683352939165167, "epoch": 0.5964467005076142, "step": 470},
  {"eval_loss": 0.06722872704267502, "eval_runtime": 20.1214, "eval_samples_per_second": 65.95, "eval_steps_per_second": 16.5, "epoch": 0.5977157360406091, "step": 471},
  {"loss": 0.0618, "grad_norm": 0.15326248109340668, "learning_rate": 0.00016678820175345454, "epoch": 0.6091370558375635, "step": 480},
  {"loss": 0.0718, "grad_norm": 0.27122628688812256, "learning_rate": 0.00016521169654750968, "epoch": 0.6218274111675127, "step": 490},
  {"loss": 0.0636, "grad_norm": 0.29509711265563965, "learning_rate": 0.00016360647258021696, "epoch": 0.6345177664974619, "step": 500},
  {"loss": 0.0655, "grad_norm": 0.4090014100074768, "learning_rate": 0.00016197323678064697, "epoch": 0.6472081218274112, "step": 510},
  {"loss": 0.0606, "grad_norm": 0.2687474191188812, "learning_rate": 0.00016031270841407926, "epoch": 0.6598984771573604, "step": 520},
  {"loss": 0.0519, "grad_norm": 0.25125357508659363, "learning_rate": 0.00015862561876524338, "epoch": 0.6725888324873096, "step": 530},
  {"loss": 0.0623, "grad_norm": 0.21579739451408386, "learning_rate": 0.0001569127108162662, "epoch": 0.6852791878172588, "step": 540},
  {"loss": 0.0612, "grad_norm": 0.24012021720409393, "learning_rate": 0.000155174738919468, "epoch": 0.6979695431472082, "step": 550},
  {"loss": 0.0617, "grad_norm": 0.22273781895637512, "learning_rate": 0.00015341246846515096, "epoch": 0.7106598984771574, "step": 560},
  {"loss": 0.0627, "grad_norm": 0.29965269565582275, "learning_rate": 0.0001516266755445271, "epoch": 0.7233502538071066, "step": 570},
  {"loss": 0.0649, "grad_norm": 0.2375640720129013, "learning_rate": 0.00014981814660793314, "epoch": 0.7360406091370558, "step": 580},
  {"loss": 0.0653, "grad_norm": 0.2595769166946411, "learning_rate": 0.0001479876781184833, "epoch": 0.748730964467005, "step": 590},
  {"loss": 0.0634, "grad_norm": 0.28185659646987915, "learning_rate": 0.00014613607620131294, "epoch": 0.7614213197969543, "step": 600},
  {"loss": 0.0601, "grad_norm": 0.20655085146427155, "learning_rate": 0.00014426415628856663, "epoch": 0.7741116751269036, "step": 610},
  {"loss": 0.0632, "grad_norm": 0.4992614686489105, "learning_rate": 0.0001423727427602879, "epoch": 0.7868020304568528, "step": 620},
  {"eval_loss": 0.05841095373034477, "eval_runtime": 20.0018, "eval_samples_per_second": 66.344, "eval_steps_per_second": 16.599, "epoch": 0.7969543147208121, "step": 628},
  {"loss": 0.0522, "grad_norm": 0.2023015171289444, "learning_rate": 0.0001404626685813681, "epoch": 0.799492385786802, "step": 630},
  {"loss": 0.0567, "grad_norm": 0.20891991257667542, "learning_rate": 0.00013853477493471468, "epoch": 0.8121827411167513, "step": 640},
  {"loss": 0.0555, "grad_norm": 0.27132412791252136, "learning_rate": 0.00013658991085080025, "epoch": 0.8248730964467005, "step": 650},
  {"loss": 0.0594, "grad_norm": 0.22256866097450256, "learning_rate": 0.0001346289328337558, "epoch": 0.8375634517766497, "step": 660},
  {"loss": 0.0556, "grad_norm": 0.20859505236148834, "learning_rate": 0.00013265270448417234, "epoch": 0.850253807106599, "step": 670},
  {"loss": 0.0557, "grad_norm": 0.2204328030347824, "learning_rate": 0.00013066209611877746, "epoch": 0.8629441624365483, "step": 680},
  {"loss": 0.059, "grad_norm": 0.2515346109867096, "learning_rate": 0.00012865798438715413, "epoch": 0.8756345177664975, "step": 690},
  {"loss": 0.0546, "grad_norm": 0.3130325376987457, "learning_rate": 0.00012664125188567056, "epoch": 0.8883248730964467, "step": 700},
  {"loss": 0.0475, "grad_norm": 0.2509436011314392, "learning_rate": 0.00012461278676879098, "epoch": 0.9010152284263959, "step": 710},
  {"loss": 0.0561, "grad_norm": 0.23676852881908417, "learning_rate": 0.00012257348235793897, "epoch": 0.9137055837563451, "step": 720},
  {"loss": 0.0536, "grad_norm": 0.20894668996334076, "learning_rate": 0.00012052423674808513, "epoch": 0.9263959390862944, "step": 730},
  {"loss": 0.0517, "grad_norm": 0.18107716739177704, "learning_rate": 0.00011846595241223247, "epoch": 0.9390862944162437, "step": 740},
  {"loss": 0.0623, "grad_norm": 0.3013327717781067, "learning_rate": 0.00011639953580397367, "epoch": 0.9517766497461929, "step": 750},
  {"loss": 0.0579, "grad_norm": 0.19317802786827087, "learning_rate": 0.00011432589695829576, "epoch": 0.9644670050761421, "step": 760},
  {"loss": 0.0559, "grad_norm": 0.26291170716285706, "learning_rate": 0.00011224594909080704, "epoch": 0.9771573604060914, "step": 770},
  {"loss": 0.0537, "grad_norm": 0.28403881192207336, "learning_rate": 0.00011016060819556353, "epoch": 0.9898477157360406, "step": 780},
  {"eval_loss": 0.05360769107937813, "eval_runtime": 20.0465, "eval_samples_per_second": 66.196, "eval_steps_per_second": 16.562, "epoch": 0.9961928934010152, "step": 785},
  {"loss": 0.0502, "grad_norm": 0.1471383273601532, "learning_rate": 0.0001080707926416719, "epoch": 1.00253807106599, "step": 790},
  {"loss": 0.038, "grad_norm": 0.17716127634048462, "learning_rate": 0.00010597742276884614, "epoch": 1.015228426395939, "step": 800},
  {"loss": 0.0351, "grad_norm": 0.2006382942199707, "learning_rate": 0.00010388142048209676, "epoch": 1.0279187817258884, "step": 810},
  {"loss": 0.0375, "grad_norm": 0.2539692521095276, "learning_rate": 0.00010178370884573046, "epoch": 1.0406091370558375, "step": 820},
  {"loss": 0.0422, "grad_norm": 0.2615308165550232, "learning_rate": 9.968521167683905e-05, "epoch": 1.0532994923857868, "step": 830},
  {"loss": 0.0406, "grad_norm": 0.23757147789001465, "learning_rate": 9.758685313845727e-05, "epoch": 1.0659898477157361, "step": 840},
  {"loss": 0.0387, "grad_norm": 0.16979315876960754, "learning_rate": 9.548955733256803e-05, "epoch": 1.0786802030456852, "step": 850},
  {"loss": 0.0352, "grad_norm": 0.1853126734495163, "learning_rate": 9.339424789313445e-05, "epoch": 1.0913705583756346, "step": 860},
  {"loss": 0.0356, "grad_norm": 0.15106192231178284, "learning_rate": 9.13018475793382e-05, "epoch": 1.1040609137055837, "step": 870},
  {"loss": 0.037, "grad_norm": 0.20427311956882477, "learning_rate": 8.921327786920294e-05, "epoch": 1.116751269035533, "step": 880},
  {"loss": 0.0324, "grad_norm": 0.1580514758825302, "learning_rate": 8.712945855378218e-05, "epoch": 1.1294416243654823, "step": 890},
  {"loss": 0.0301, "grad_norm": 0.2191898375749588, "learning_rate": 8.505130733208968e-05, "epoch": 1.1421319796954315, "step": 900},
  {"loss": 0.0355, "grad_norm": 0.16614247858524323, "learning_rate": 8.297973940695163e-05, "epoch": 1.1548223350253808, "step": 910},
  {"loss": 0.0349, "grad_norm": 0.18907427787780762, "learning_rate": 8.091566708195786e-05, "epoch": 1.16751269035533, "step": 920},
  {"loss": 0.0336, "grad_norm": 0.24296258389949799, "learning_rate": 7.885999935968982e-05, "epoch": 1.1802030456852792, "step": 930},
  {"loss": 0.0372, "grad_norm": 0.1817648708820343, "learning_rate": 7.681364154140264e-05, "epoch": 1.1928934010152283, "step": 940},
  {"eval_loss": 0.057017017155885696, "eval_runtime": 19.9628, "eval_samples_per_second": 66.474, "eval_steps_per_second": 16.631, "epoch": 1.1954314720812182, "step": 942},
  {"loss": 0.03, "grad_norm": 0.19095705449581146, "learning_rate": 7.47774948283366e-05, "epoch": 1.2055837563451777, "step": 950},
  {"loss": 0.035, "grad_norm": 0.33682745695114136, "learning_rate": 7.275245592483492e-05, "epoch": 1.218274111675127, "step": 960},
  {"loss": 0.0384, "grad_norm": 0.2646084427833557, "learning_rate": 7.073941664344152e-05, "epoch": 1.2309644670050761, "step": 970},
  {"loss": 0.0287, "grad_norm": 0.1980791836977005, "learning_rate": 6.873926351215312e-05, "epoch": 1.2436548223350254, "step": 980},
  {"loss": 0.0342, "grad_norm": 0.18797655403614044, "learning_rate": 6.67528773839989e-05, "epoch": 1.2563451776649746, "step": 990},
  {"loss": 0.0337, "grad_norm": 0.24009937047958374, "learning_rate": 6.478113304911886e-05, "epoch": 1.2690355329949239, "step": 1000},
  {"loss": 0.0272, "grad_norm": 0.29159170389175415, "learning_rate": 6.282489884951295e-05, "epoch": 1.281725888324873, "step": 1010},
  {"loss": 0.036, "grad_norm": 0.16352516412734985, "learning_rate": 6.0885036296629064e-05, "epoch": 1.2944162436548223, "step": 1020},
  {"loss": 0.0292, "grad_norm": 0.17807820439338684, "learning_rate": 5.896239969195994e-05, "epoch": 1.3071065989847717, "step": 1030},
  {"loss": 0.0332, "grad_norm": 0.2500491738319397, "learning_rate": 5.7057835750814867e-05, "epoch": 1.3197969543147208, "step": 1040},
  {"loss": 0.0294, "grad_norm": 0.2208271473646164, "learning_rate": 5.517218322943224e-05, "epoch": 1.33248730964467, "step": 1050},
  {"loss": 0.0342, "grad_norm": 0.23927471041679382, "learning_rate": 5.3306272555597504e-05, "epoch": 1.3451776649746192, "step": 1060},
  {"loss": 0.0307, "grad_norm": 0.20309758186340332, "learning_rate": 5.1460925462928546e-05, "epoch": 1.3578680203045685, "step": 1070},
  {"loss": 0.0314, "grad_norm": 0.23275193572044373, "learning_rate": 4.96369546289904e-05, "epoch": 1.3705583756345177, "step": 1080},
  {"loss": 0.0333, "grad_norm": 0.2078331708908081, "learning_rate": 4.783516331739769e-05, "epoch": 1.383248730964467, "step": 1090},
  {"eval_loss": 0.05335332825779915, "eval_runtime": 19.9859, "eval_samples_per_second": 66.397, "eval_steps_per_second": 16.612, "epoch": 1.3946700507614214, "step": 1099},
  {"loss": 0.0309, "grad_norm": 0.18032079935073853, "learning_rate": 4.605634502406321e-05, "epoch": 1.3959390862944163, "step": 1100},
  {"loss": 0.0328, "grad_norm": 0.20803005993366241, "learning_rate": 4.430128312774804e-05, "epoch": 1.4086294416243654, "step": 1110},
  {"loss": 0.027, "grad_norm": 0.1680465191602707, "learning_rate": 4.2570750545067076e-05, "epoch": 1.4213197969543148, "step": 1120},
  {"loss": 0.0317, "grad_norm": 0.2528463900089264, "learning_rate": 4.086550939010227e-05, "epoch": 1.434010152284264, "step": 1130},
  {"loss": 0.0313, "grad_norm": 0.19024434685707092, "learning_rate": 3.9186310638773047e-05, "epoch": 1.4467005076142132, "step": 1140},
  {"loss": 0.0287, "grad_norm": 0.20934472978115082, "learning_rate": 3.753389379811185e-05, "epoch": 1.4593908629441623, "step": 1150},
  {"loss": 0.0265, "grad_norm": 0.29412180185317993, "learning_rate": 3.590898658059062e-05, "epoch": 1.4720812182741116, "step": 1160},
  {"loss": 0.0298, "grad_norm": 0.3268195390701294, "learning_rate": 3.4312304583641484e-05, "epoch": 1.484771573604061, "step": 1170},
  {"loss": 0.0251, "grad_norm": 0.17332251369953156, "learning_rate": 3.274455097451269e-05, "epoch": 1.49746192893401, "step": 1180},
  {"loss": 0.0318, "grad_norm": 0.3481772541999817, "learning_rate": 3.1206416180598995e-05, "epoch": 1.5101522842639594, "step": 1190},
  {"loss": 0.0335, "grad_norm": 0.24047453701496124, "learning_rate": 2.9698577585382282e-05, "epoch": 1.5228426395939088, "step": 1200},
  {"loss": 0.0339, "grad_norm": 0.21146714687347412, "learning_rate": 2.8221699230116793e-05, "epoch": 1.5355329949238579, "step": 1210},
  {"loss": 0.0308, "grad_norm": 0.140832781791687, "learning_rate": 2.67764315213902e-05, "epoch": 1.548223350253807, "step": 1220},
  {"loss": 0.026, "grad_norm": 0.1721792370080948, "learning_rate": 2.536341094468906e-05, "epoch": 1.5609137055837563, "step": 1230},
  {"loss": 0.0277, "grad_norm": 0.14980490505695343, "learning_rate": 2.398325978409539e-05, "epoch": 1.5736040609137056, "step": 1240},
  {"loss": 0.028, "grad_norm": 0.18908673524856567, "learning_rate": 2.263658584823717e-05, "epoch": 1.5862944162436547, "step": 1250},
  {"eval_loss": 0.052472274750471115, "eval_runtime": 19.9786, "eval_samples_per_second": 66.421, "eval_steps_per_second": 16.618, "epoch": 1.5939086294416245, "step": 1256},
  {"loss": 0.0272, "grad_norm": 0.12164825201034546, "learning_rate": 2.1323982202613735e-05, "epoch": 1.598984771573604, "step": 1260},
  {"loss": 0.0245, "grad_norm": 0.2658851146697998, "learning_rate": 2.004602690841414e-05, "epoch": 1.6116751269035534, "step": 1270},
  {"loss": 0.0304, "grad_norm": 0.2891974151134491, "learning_rate": 1.8803282767942954e-05, "epoch": 1.6243654822335025, "step": 1280},
  {"loss": 0.0292, "grad_norm": 0.2979351580142975, "learning_rate": 1.7596297076766455e-05, "epoch": 1.6370558375634516, "step": 1290},
  {"loss": 0.0284, "grad_norm": 0.20141719281673431, "learning_rate": 1.6425601382687405e-05, "epoch": 1.649746192893401, "step": 1300},
  {"loss": 0.0254, "grad_norm": 0.1950131356716156, "learning_rate": 1.5291711251655316e-05, "epoch": 1.6624365482233503, "step": 1310},
  {"loss": 0.0282, "grad_norm": 0.21205022931098938, "learning_rate": 1.41951260407149e-05, "epoch": 1.6751269035532994, "step": 1320},
  {"loss": 0.0247, "grad_norm": 0.2470894753932953, "learning_rate": 1.3136328678092746e-05, "epoch": 1.6878172588832487, "step": 1330},
  {"loss": 0.0257, "grad_norm": 0.26378998160362244, "learning_rate": 1.2115785450519434e-05, "epoch": 1.700507614213198, "step": 1340},
  {"loss": 0.0282, "grad_norm": 0.12680888175964355, "learning_rate": 1.1133945797879908e-05, "epoch": 1.7131979695431472, "step": 1350},
  {"loss": 0.0251, "grad_norm": 0.19744935631752014, "learning_rate": 1.019124211528365e-05, "epoch": 1.7258883248730963, "step": 1360},
  {"loss": 0.0327, "grad_norm": 0.18419434130191803, "learning_rate": 9.288089562640844e-06, "epoch": 1.7385786802030458, "step": 1370},
  {"loss": 0.0282, "grad_norm": 0.19115136563777924, "learning_rate": 8.42488588182897e-06, "epoch": 1.751269035532995, "step": 1380},
  {"loss": 0.0245, "grad_norm": 0.17252641916275024, "learning_rate": 7.602011221530236e-06, "epoch": 1.763959390862944, "step": 1390},
  {"loss": 0.029, "grad_norm": 0.22253695130348206, "learning_rate": 6.819827969816661e-06, "epoch": 1.7766497461928934, "step": 1400},
  {"loss": 0.0269, "grad_norm": 0.21938475966453552, "learning_rate": 6.078680594557163e-06, "epoch": 1.7893401015228427, "step": 1410},
  {"eval_loss": 0.05091211572289467, "eval_runtime": 20.0145, "eval_samples_per_second": 66.302, "eval_steps_per_second": 16.588, "epoch": 1.7931472081218274, "step": 1413},
  {"loss": 0.0305, "grad_norm": 0.2024271935224533, "learning_rate": 5.378895491716285e-06, "epoch": 1.8020304568527918, "step": 1420},
  {"loss": 0.029, "grad_norm": 0.22723488509655, "learning_rate": 4.720780841611738e-06, "epoch": 1.8147208121827412, "step": 1430},
  {"loss": 0.0266, "grad_norm": 0.2747625410556793, "learning_rate": 4.104626473194151e-06, "epoch": 1.8274111675126905, "step": 1440},
  {"loss": 0.0262, "grad_norm": 0.18593831360340118, "learning_rate": 3.5307037364083253e-06, "epoch": 1.8401015228426396, "step": 1450},
  {"loss": 0.0291, "grad_norm": 0.2651998996734619, "learning_rate": 2.9992653826927508e-06, "epoch": 1.8527918781725887, "step": 1460},
  {"loss": 0.026, "grad_norm": 0.19439752399921417, "learning_rate": 2.510545453669744e-06, "epoch": 1.865482233502538, "step": 1470},
  {"loss": 0.03, "grad_norm": 0.17483021318912506, "learning_rate": 2.06475917807506e-06, "epoch": 1.8781725888324874, "step": 1480},
  {"loss": 0.029, "grad_norm": 0.22444817423820496, "learning_rate": 1.662102876972882e-06, "epoch": 1.8908629441624365, "step": 1490},
  {"loss": 0.0243, "grad_norm": 0.17885605990886688, "learning_rate": 1.3027538772973026e-06, "epoch": 1.9035532994923858, "step": 1500},
  {"loss": 0.0272, "grad_norm": 0.19312232732772827, "learning_rate": 9.868704337588797e-07, "epoch": 1.9162436548223352, "step": 1510},
  {"loss": 0.0254, "grad_norm": 0.1709776520729065, "learning_rate": 7.145916591504098e-07, "epoch": 1.9289340101522843, "step": 1520},
  {"loss": 0.0252, "grad_norm": 0.18656505644321442, "learning_rate": 4.860374630826004e-07, "epoch": 1.9416243654822334, "step": 1530},
  {"loss": 0.0267, "grad_norm": 0.11956395953893661, "learning_rate": 3.0130849917681114e-07, "epoch": 1.9543147208121827, "step": 1540},
  {"loss": 0.0305, "grad_norm": 0.25038954615592957, "learning_rate": 1.604861207378794e-07, "epoch": 1.967005076142132, "step": 1550},
  {"loss": 0.025, "grad_norm": 0.19318363070487976, "learning_rate": 6.363234492674507e-08, "epoch": 1.9796954314720812, "step": 1560},
  {"loss": 0.0276, "grad_norm": 0.26012641191482544, "learning_rate": 1.0789825448476177e-08, "epoch": 1.9923857868020305, "step": 1570},
  {"eval_loss": 0.0504293330013752, "eval_runtime": 19.8337, "eval_samples_per_second": 66.906, "eval_steps_per_second": 16.739, "epoch": 1.9923857868020305, "step": 1570},
  {"train_runtime": 2681.8444, "train_samples_per_second": 18.795, "train_steps_per_second": 0.588, "total_flos": 6.800278675429786e+17, "train_loss": 0.06838441643920647, "epoch": 2.0, "step": 1576}
]