[
  {
    "loss": 1.5751,
    "grad_norm": 5.8206610679626465,
    "learning_rate": 4.556962025316456e-05,
    "epoch": 0.012690355329949238,
    "step": 10
  },
  {
    "loss": 0.6817,
    "grad_norm": 2.4909415245056152,
    "learning_rate": 9.620253164556962e-05,
    "epoch": 0.025380710659898477,
    "step": 20
  },
  {
    "loss": 0.5317,
    "grad_norm": 2.0207533836364746,
    "learning_rate": 0.0001468354430379747,
    "epoch": 0.03807106598984772,
    "step": 30
  },
  {
    "loss": 0.4042,
    "grad_norm": 2.2393276691436768,
    "learning_rate": 0.00019746835443037975,
    "epoch": 0.050761421319796954,
    "step": 40
  },
  {
    "loss": 0.3339,
    "grad_norm": 1.3759368658065796,
    "learning_rate": 0.0002481012658227848,
    "epoch": 0.06345177664974619,
    "step": 50
  },
  {
    "loss": 0.2921,
    "grad_norm": 1.4795056581497192,
    "learning_rate": 0.0002987341772151899,
    "epoch": 0.07614213197969544,
    "step": 60
  },
  {
    "loss": 0.2594,
    "grad_norm": 1.5126503705978394,
    "learning_rate": 0.00034936708860759495,
    "epoch": 0.08883248730964467,
    "step": 70
  },
  {
    "loss": 0.2418,
    "grad_norm": 1.2799267768859863,
    "learning_rate": 0.0004,
    "epoch": 0.10152284263959391,
    "step": 80
  },
  {
    "loss": 0.2121,
    "grad_norm": 1.081551194190979,
    "learning_rate": 0.0003999559607204408,
    "epoch": 0.11421319796954314,
    "step": 90
  },
  {
    "loss": 0.1958,
    "grad_norm": 1.5507794618606567,
    "learning_rate": 0.0003998238622763449,
    "epoch": 0.12690355329949238,
    "step": 100
  },
  {
    "loss": 0.21,
    "grad_norm": 0.9705930352210999,
    "learning_rate": 0.0003996037628429151,
    "epoch": 0.13959390862944163,
    "step": 110
  },
  {
    "loss": 0.173,
    "grad_norm": 0.9663600325584412,
    "learning_rate": 0.00039929575935035633,
    "epoch": 0.15228426395939088,
    "step": 120
  },
  {
    "loss": 0.1662,
    "grad_norm": 1.1074914932250977,
    "learning_rate": 0.00039889998744118777,
    "epoch": 0.1649746192893401,
    "step": 130
  },
  {
    "loss": 0.1733,
    "grad_norm": 0.9759091734886169,
    "learning_rate": 0.00039841662141050683,
    "epoch": 0.17766497461928935,
    "step": 140
  },
  {
    "loss": 0.1646,
    "grad_norm": 0.8136119842529297,
    "learning_rate": 0.0003978458741292311,
    "epoch": 0.19035532994923857,
    "step": 150
  },
  {
    "eval_loss": 0.15786977112293243,
    "eval_runtime": 24.4689,
    "eval_samples_per_second": 54.232,
    "eval_steps_per_second": 13.568,
    "epoch": 0.19923857868020303,
    "step": 157
  },
  {
    "loss": 0.1468,
    "grad_norm": 0.8433499336242676,
    "learning_rate": 0.00039718799695035134,
    "epoch": 0.20304568527918782,
    "step": 160
  },
  {
    "loss": 0.1321,
    "grad_norm": 0.5848754644393921,
    "learning_rate": 0.0003964432795982376,
    "epoch": 0.21573604060913706,
    "step": 170
  },
  {
    "loss": 0.1427,
    "grad_norm": 0.7149267792701721,
    "learning_rate": 0.0003956120500410464,
    "epoch": 0.22842639593908629,
    "step": 180
  },
  {
    "loss": 0.1277,
    "grad_norm": 0.6142985224723816,
    "learning_rate": 0.0003946946743462862,
    "epoch": 0.24111675126903553,
    "step": 190
  },
  {
    "loss": 0.1355,
    "grad_norm": 0.5795847773551941,
    "learning_rate": 0.00039369155651960383,
    "epoch": 0.25380710659898476,
    "step": 200
  },
  {
    "loss": 0.1229,
    "grad_norm": 0.6075690984725952,
    "learning_rate": 0.0003926031383268634,
    "epoch": 0.26649746192893403,
    "step": 210
  },
  {
    "loss": 0.1307,
    "grad_norm": 0.5007538199424744,
    "learning_rate": 0.0003914298990995955,
    "epoch": 0.27918781725888325,
    "step": 220
  },
  {
    "loss": 0.1179,
    "grad_norm": 0.7290500402450562,
    "learning_rate": 0.00039017235552390333,
    "epoch": 0.2918781725888325,
    "step": 230
  },
  {
    "loss": 0.1097,
    "grad_norm": 0.5982555747032166,
    "learning_rate": 0.00038883106141291774,
    "epoch": 0.30456852791878175,
    "step": 240
  },
  {
    "loss": 0.1304,
    "grad_norm": 0.5894684195518494,
    "learning_rate": 0.000387406607462902,
    "epoch": 0.31725888324873097,
    "step": 250
  },
  {
    "loss": 0.1253,
    "grad_norm": 0.4741029441356659,
    "learning_rate": 0.00038589962099311336,
    "epoch": 0.3299492385786802,
    "step": 260
  },
  {
    "loss": 0.1252,
    "grad_norm": 0.5968972444534302,
    "learning_rate": 0.0003843107656695362,
    "epoch": 0.3426395939086294,
    "step": 270
  },
  {
    "loss": 0.1179,
    "grad_norm": 0.6765711307525635,
    "learning_rate": 0.00038264074121260817,
    "epoch": 0.3553299492385787,
    "step": 280
  },
  {
    "loss": 0.1245,
    "grad_norm": 0.531008243560791,
    "learning_rate": 0.0003808902830890687,
    "epoch": 0.3680203045685279,
    "step": 290
  },
  {
    "loss": 0.1166,
    "grad_norm": 0.6060107350349426,
    "learning_rate": 0.0003790601621880642,
    "epoch": 0.38071065989847713,
    "step": 300
  },
  {
    "loss": 0.1004,
    "grad_norm": 0.6947969198226929,
    "learning_rate": 0.0003771511844816547,
    "epoch": 0.3934010152284264,
    "step": 310
  },
  {
    "eval_loss": 0.1160828173160553,
    "eval_runtime": 9.5412,
    "eval_samples_per_second": 139.081,
    "eval_steps_per_second": 34.796,
    "epoch": 0.39847715736040606,
    "step": 314
  },
  {
    "loss": 0.1162,
    "grad_norm": 0.6814019083976746,
    "learning_rate": 0.0003751641906698689,
    "epoch": 0.40609137055837563,
    "step": 320
  },
  {
    "loss": 0.1209,
    "grad_norm": 0.6832170486450195,
    "learning_rate": 0.00037310005581046656,
    "epoch": 0.41878172588832485,
    "step": 330
  },
  {
    "loss": 0.1132,
    "grad_norm": 0.5093628764152527,
    "learning_rate": 0.00037095968893356875,
    "epoch": 0.43147208121827413,
    "step": 340
  },
  {
    "loss": 0.1072,
    "grad_norm": 0.6062504053115845,
    "learning_rate": 0.000368744032641328,
    "epoch": 0.44416243654822335,
    "step": 350
  },
  {
    "loss": 0.1056,
    "grad_norm": 0.4274779260158539,
    "learning_rate": 0.00036645406269281307,
    "epoch": 0.45685279187817257,
    "step": 360
  },
  {
    "loss": 0.1178,
    "grad_norm": 0.469860315322876,
    "learning_rate": 0.00036409078757429123,
    "epoch": 0.46954314720812185,
    "step": 370
  },
  {
    "loss": 0.1157,
    "grad_norm": 0.5052468776702881,
    "learning_rate": 0.0003616552480550987,
    "epoch": 0.48223350253807107,
    "step": 380
  },
  {
    "loss": 0.1052,
    "grad_norm": 0.6406036019325256,
    "learning_rate": 0.0003591485167292932,
    "epoch": 0.4949238578680203,
    "step": 390
  },
  {
    "loss": 0.1099,
    "grad_norm": 0.4725392162799835,
    "learning_rate": 0.000356571697543291,
    "epoch": 0.5076142131979695,
    "step": 400
  },
  {
    "loss": 0.0995,
    "grad_norm": 0.566696286201477,
    "learning_rate": 0.00035392592530969724,
    "epoch": 0.5203045685279187,
    "step": 410
  },
  {
    "loss": 0.1005,
    "grad_norm": 0.5996329188346863,
    "learning_rate": 0.0003512123652075423,
    "epoch": 0.5329949238578681,
    "step": 420
  },
  {
    "loss": 0.0991,
    "grad_norm": 0.529081404209137,
    "learning_rate": 0.00034843221226914565,
    "epoch": 0.5456852791878173,
    "step": 430
  },
  {
    "loss": 0.0992,
    "grad_norm": 0.49170249700546265,
    "learning_rate": 0.0003455866908538319,
    "epoch": 0.5583756345177665,
    "step": 440
  },
  {
    "loss": 0.1054,
    "grad_norm": 0.4307686388492584,
    "learning_rate": 0.0003426770541087322,
    "epoch": 0.5710659898477157,
    "step": 450
  },
  {
    "loss": 0.0999,
    "grad_norm": 0.42143514752388,
    "learning_rate": 0.00033970458341690677,
    "epoch": 0.583756345177665,
    "step": 460
  },
  {
    "loss": 0.091,
    "grad_norm": 0.49249398708343506,
    "learning_rate": 0.0003366705878330334,
    "epoch": 0.5964467005076142,
    "step": 470
  },
  {
    "eval_loss": 0.09394794702529907,
    "eval_runtime": 9.5472,
    "eval_samples_per_second": 138.993,
    "eval_steps_per_second": 34.774,
    "epoch": 0.5977157360406091,
    "step": 471
  },
  {
    "loss": 0.0934,
    "grad_norm": 0.5115765333175659,
    "learning_rate": 0.00033357640350690907,
    "epoch": 0.6091370558375635,
    "step": 480
  },
  {
    "loss": 0.0869,
    "grad_norm": 0.43772196769714355,
    "learning_rate": 0.00033042339309501936,
    "epoch": 0.6218274111675127,
    "step": 490
  },
  {
    "loss": 0.0887,
    "grad_norm": 0.4491313397884369,
    "learning_rate": 0.0003272129451604339,
    "epoch": 0.6345177664974619,
    "step": 500
  },
  {
    "loss": 0.0912,
    "grad_norm": 0.3995817303657532,
    "learning_rate": 0.00032394647356129394,
    "epoch": 0.6472081218274112,
    "step": 510
  },
  {
    "loss": 0.0923,
    "grad_norm": 0.3744134306907654,
    "learning_rate": 0.0003206254168281585,
    "epoch": 0.6598984771573604,
    "step": 520
  },
  {
    "loss": 0.1008,
    "grad_norm": 0.47935950756073,
    "learning_rate": 0.00031725123753048676,
    "epoch": 0.6725888324873096,
    "step": 530
  },
  {
    "loss": 0.0976,
    "grad_norm": 0.5493370294570923,
    "learning_rate": 0.0003138254216325324,
    "epoch": 0.6852791878172588,
    "step": 540
  },
  {
    "loss": 0.0972,
    "grad_norm": 0.5088523030281067,
    "learning_rate": 0.000310349477838936,
    "epoch": 0.6979695431472082,
    "step": 550
  },
  {
    "loss": 0.0974,
    "grad_norm": 0.4719524681568146,
    "learning_rate": 0.0003068249369303019,
    "epoch": 0.7106598984771574,
    "step": 560
  },
  {
    "loss": 0.0871,
    "grad_norm": 0.5160706043243408,
    "learning_rate": 0.0003032533510890542,
    "epoch": 0.7233502538071066,
    "step": 570
  },
  {
    "loss": 0.0883,
    "grad_norm": 0.42795512080192566,
    "learning_rate": 0.0002996362932158663,
    "epoch": 0.7360406091370558,
    "step": 580
  },
  {
    "loss": 0.0914,
    "grad_norm": 0.5042217969894409,
    "learning_rate": 0.0002959753562369666,
    "epoch": 0.748730964467005,
    "step": 590
  },
  {
    "loss": 0.0858,
    "grad_norm": 0.41944852471351624,
    "learning_rate": 0.0002922721524026259,
    "epoch": 0.7614213197969543,
    "step": 600
  },
  {
    "loss": 0.0784,
    "grad_norm": 0.3173629343509674,
    "learning_rate": 0.00028852831257713326,
    "epoch": 0.7741116751269036,
    "step": 610
  },
  {
    "loss": 0.0899,
    "grad_norm": 0.5276319980621338,
    "learning_rate": 0.0002847454855205758,
    "epoch": 0.7868020304568528,
    "step": 620
  },
  {
    "eval_loss": 0.09238167852163315,
    "eval_runtime": 9.5377,
    "eval_samples_per_second": 139.132,
    "eval_steps_per_second": 34.809,
    "epoch": 0.7969543147208121,
    "step": 628
  },
  {
    "loss": 0.0868,
    "grad_norm": 0.5028842091560364,
    "learning_rate": 0.0002809253371627362,
    "epoch": 0.799492385786802,
    "step": 630
  },
  {
    "loss": 0.0834,
    "grad_norm": 0.40975555777549744,
    "learning_rate": 0.00027706954986942935,
    "epoch": 0.8121827411167513,
    "step": 640
  },
  {
    "loss": 0.0927,
    "grad_norm": 0.5146291851997375,
    "learning_rate": 0.0002731798217016005,
    "epoch": 0.8248730964467005,
    "step": 650
  },
  {
    "loss": 0.0813,
    "grad_norm": 0.3960341811180115,
    "learning_rate": 0.0002692578656675116,
    "epoch": 0.8375634517766497,
    "step": 660
  },
  {
    "loss": 0.0934,
    "grad_norm": 0.4789310693740845,
    "learning_rate": 0.00026530540896834467,
    "epoch": 0.850253807106599,
    "step": 670
  },
  {
    "loss": 0.0917,
    "grad_norm": 0.43943315744400024,
    "learning_rate": 0.00026132419223755493,
    "epoch": 0.8629441624365483,
    "step": 680
  },
  {
    "loss": 0.0862,
    "grad_norm": 0.5376535654067993,
    "learning_rate": 0.00025731596877430826,
    "epoch": 0.8756345177664975,
    "step": 690
  },
  {
    "loss": 0.088,
    "grad_norm": 0.452738493680954,
    "learning_rate": 0.0002532825037713411,
    "epoch": 0.8883248730964467,
    "step": 700
  },
  {
    "loss": 0.0819,
    "grad_norm": 0.4121873080730438,
    "learning_rate": 0.00024922557353758196,
    "epoch": 0.9010152284263959,
    "step": 710
  },
  {
    "loss": 0.0917,
    "grad_norm": 0.39808085560798645,
    "learning_rate": 0.00024514696471587794,
    "epoch": 0.9137055837563451,
    "step": 720
  },
  {
    "loss": 0.0802,
    "grad_norm": 0.43925586342811584,
    "learning_rate": 0.00024104847349617025,
    "epoch": 0.9263959390862944,
    "step": 730
  },
  {
    "loss": 0.0867,
    "grad_norm": 0.42966383695602417,
    "learning_rate": 0.00023693190482446493,
    "epoch": 0.9390862944162437,
    "step": 740
  },
  {
    "loss": 0.081,
    "grad_norm": 0.4360509514808655,
    "learning_rate": 0.00023279907160794733,
    "epoch": 0.9517766497461929,
    "step": 750
  },
  {
    "loss": 0.0943,
    "grad_norm": 0.4277612268924713,
    "learning_rate": 0.00022865179391659153,
    "epoch": 0.9644670050761421,
    "step": 760
  },
  {
    "loss": 0.0847,
    "grad_norm": 0.44837474822998047,
    "learning_rate": 0.00022449189818161407,
    "epoch": 0.9771573604060914,
    "step": 770
  },
  {
    "loss": 0.0857,
    "grad_norm": 0.4456328749656677,
    "learning_rate": 0.00022032121639112707,
    "epoch": 0.9898477157360406,
    "step": 780
  },
  {
    "eval_loss": 0.08206839114427567,
    "eval_runtime": 9.5084,
    "eval_samples_per_second": 139.561,
    "eval_steps_per_second": 34.917,
    "epoch": 0.9961928934010152,
    "step": 785
  },
  {
    "loss": 0.0885,
    "grad_norm": 0.3194786608219147,
    "learning_rate": 0.0002161415852833438,
    "epoch": 1.00253807106599,
    "step": 790
  },
  {
    "loss": 0.0613,
    "grad_norm": 0.3748820722103119,
    "learning_rate": 0.00021195484553769228,
    "epoch": 1.015228426395939,
    "step": 800
  },
  {
    "loss": 0.0608,
    "grad_norm": 0.4187680780887604,
    "learning_rate": 0.00020776284096419353,
    "epoch": 1.0279187817258884,
    "step": 810
  },
  {
    "loss": 0.0511,
    "grad_norm": 0.29912981390953064,
    "learning_rate": 0.0002035674176914609,
    "epoch": 1.0406091370558375,
    "step": 820
  },
  {
    "loss": 0.052,
    "grad_norm": 0.538901150226593,
    "learning_rate": 0.0001993704233536781,
    "epoch": 1.0532994923857868,
    "step": 830
  },
  {
    "loss": 0.0583,
    "grad_norm": 0.33813655376434326,
    "learning_rate": 0.00019517370627691454,
    "epoch": 1.0659898477157361,
    "step": 840
  },
  {
    "loss": 0.064,
    "grad_norm": 0.43808960914611816,
    "learning_rate": 0.00019097911466513606,
    "epoch": 1.0786802030456852,
    "step": 850
  },
  {
    "loss": 0.0644,
    "grad_norm": 0.34947964549064636,
    "learning_rate": 0.0001867884957862689,
    "epoch": 1.0913705583756346,
    "step": 860
  },
  {
    "loss": 0.0541,
    "grad_norm": 0.3377130627632141,
    "learning_rate": 0.0001826036951586764,
    "epoch": 1.1040609137055837,
    "step": 870
  },
  {
    "loss": 0.0585,
    "grad_norm": 0.5515534281730652,
    "learning_rate": 0.00017842655573840587,
    "epoch": 1.116751269035533,
    "step": 880
  },
  {
    "loss": 0.0499,
    "grad_norm": 0.37032851576805115,
    "learning_rate": 0.00017425891710756437,
    "epoch": 1.1294416243654823,
    "step": 890
  },
  {
    "loss": 0.0537,
    "grad_norm": 0.439820796251297,
    "learning_rate": 0.00017010261466417936,
    "epoch": 1.1421319796954315,
    "step": 900
  },
  {
    "loss": 0.053,
    "grad_norm": 0.47381657361984253,
    "learning_rate": 0.00016595947881390327,
    "epoch": 1.1548223350253808,
    "step": 910
  },
  {
    "loss": 0.0576,
    "grad_norm": 0.33624354004859924,
    "learning_rate": 0.00016183133416391573,
    "epoch": 1.16751269035533,
    "step": 920
  },
  {
    "loss": 0.0544,
    "grad_norm": 0.4324477016925812,
    "learning_rate": 0.00015771999871937964,
    "epoch": 1.1802030456852792,
    "step": 930
  },
  {
    "loss": 0.0562,
    "grad_norm": 0.48575344681739807,
    "learning_rate": 0.00015362728308280528,
    "epoch": 1.1928934010152283,
    "step": 940
  },
  {
    "eval_loss": 0.07984930276870728,
    "eval_runtime": 9.6222,
    "eval_samples_per_second": 137.91,
    "eval_steps_per_second": 34.504,
    "epoch": 1.1954314720812182,
    "step": 942
  },
  {
    "loss": 0.0556,
    "grad_norm": 0.3980204463005066,
    "learning_rate": 0.0001495549896566732,
    "epoch": 1.2055837563451777,
    "step": 950
  },
  {
    "loss": 0.0525,
    "grad_norm": 0.40248578786849976,
    "learning_rate": 0.00014550491184966985,
    "epoch": 1.218274111675127,
    "step": 960
  },
  {
    "loss": 0.0601,
    "grad_norm": 0.4088118076324463,
    "learning_rate": 0.00014147883328688305,
    "epoch": 1.2309644670050761,
    "step": 970
  },
  {
    "loss": 0.0561,
    "grad_norm": 0.49921339750289917,
    "learning_rate": 0.00013747852702430624,
    "epoch": 1.2436548223350254,
    "step": 980
  },
  {
    "loss": 0.0584,
    "grad_norm": 0.404526948928833,
    "learning_rate": 0.0001335057547679978,
    "epoch": 1.2563451776649746,
    "step": 990
  },
  {
    "loss": 0.0575,
    "grad_norm": 0.4947160482406616,
    "learning_rate": 0.00012956226609823771,
    "epoch": 1.2690355329949239,
    "step": 1000
  },
  {
    "loss": 0.0539,
    "grad_norm": 0.40267854928970337,
    "learning_rate": 0.0001256497976990259,
    "epoch": 1.281725888324873,
    "step": 1010
  },
  {
    "loss": 0.0571,
    "grad_norm": 0.4298776686191559,
    "learning_rate": 0.00012177007259325813,
    "epoch": 1.2944162436548223,
    "step": 1020
  },
  {
    "loss": 0.0472,
    "grad_norm": 0.4440214931964874,
    "learning_rate": 0.00011792479938391988,
    "epoch": 1.3071065989847717,
    "step": 1030
  },
  {
    "loss": 0.0557,
    "grad_norm": 0.4760426878929138,
    "learning_rate": 0.00011411567150162973,
    "epoch": 1.3197969543147208,
    "step": 1040
  },
  {
    "loss": 0.0525,
    "grad_norm": 0.38695594668388367,
    "learning_rate": 0.00011034436645886447,
    "epoch": 1.33248730964467,
    "step": 1050
  },
  {
    "loss": 0.0521,
    "grad_norm": 0.4604351222515106,
    "learning_rate": 0.00010661254511119501,
    "epoch": 1.3451776649746192,
    "step": 1060
  },
  {
    "loss": 0.0514,
    "grad_norm": 0.4119158089160919,
    "learning_rate": 0.00010292185092585709,
    "epoch": 1.3578680203045685,
    "step": 1070
  },
  {
    "loss": 0.0551,
    "grad_norm": 0.48942986130714417,
    "learning_rate": 9.92739092579808e-05,
    "epoch": 1.3705583756345177,
    "step": 1080
  },
  {
    "loss": 0.0507,
    "grad_norm": 0.34124505519866943,
    "learning_rate": 9.567032663479538e-05,
    "epoch": 1.383248730964467,
    "step": 1090
  },
  {
    "eval_loss": 0.07261822372674942,
    "eval_runtime": 9.4618,
    "eval_samples_per_second": 140.248,
    "eval_steps_per_second": 35.089,
    "epoch": 1.3946700507614214,
    "step": 1099
  },
  {
    "loss": 0.0521,
    "grad_norm": 0.37472787499427795,
    "learning_rate": 9.211269004812642e-05,
    "epoch": 1.3959390862944163,
    "step": 1100
  },
  {
    "loss": 0.0521,
    "grad_norm": 0.41362205147743225,
    "learning_rate": 8.860256625549608e-05,
    "epoch": 1.4086294416243654,
    "step": 1110
  },
  {
    "loss": 0.0502,
    "grad_norm": 0.41950875520706177,
    "learning_rate": 8.514150109013415e-05,
    "epoch": 1.4213197969543148,
    "step": 1120
  },
  {
    "loss": 0.0496,
    "grad_norm": 0.42962411046028137,
    "learning_rate": 8.173101878020454e-05,
    "epoch": 1.434010152284264,
    "step": 1130
  },
  {
    "loss": 0.0531,
    "grad_norm": 0.44318121671676636,
    "learning_rate": 7.837262127754609e-05,
    "epoch": 1.4467005076142132,
    "step": 1140
  },
  {
    "loss": 0.0521,
    "grad_norm": 0.4739588797092438,
    "learning_rate": 7.50677875962237e-05,
    "epoch": 1.4593908629441623,
    "step": 1150
  },
  {
    "loss": 0.0496,
    "grad_norm": 0.47502049803733826,
    "learning_rate": 7.181797316118124e-05,
    "epoch": 1.4720812182741116,
    "step": 1160
  },
  {
    "loss": 0.0489,
    "grad_norm": 0.41291266679763794,
    "learning_rate": 6.862460916728297e-05,
    "epoch": 1.484771573604061,
    "step": 1170
  },
  {
    "loss": 0.0477,
    "grad_norm": 0.3678615689277649,
    "learning_rate": 6.548910194902538e-05,
    "epoch": 1.49746192893401,
    "step": 1180
  },
  {
    "loss": 0.0539,
    "grad_norm": 0.4368578791618347,
    "learning_rate": 6.241283236119799e-05,
    "epoch": 1.5101522842639594,
    "step": 1190
  },
  {
    "loss": 0.0471,
    "grad_norm": 0.3706371784210205,
    "learning_rate": 5.9397155170764564e-05,
    "epoch": 1.5228426395939088,
    "step": 1200
  },
  {
    "loss": 0.0546,
    "grad_norm": 0.48822730779647827,
    "learning_rate": 5.644339846023359e-05,
    "epoch": 1.5355329949238579,
    "step": 1210
  },
  {
    "loss": 0.0458,
    "grad_norm": 0.3171629011631012,
    "learning_rate": 5.35528630427804e-05,
    "epoch": 1.548223350253807,
    "step": 1220
  },
  {
    "loss": 0.0524,
    "grad_norm": 0.5258194208145142,
    "learning_rate": 5.072682188937812e-05,
    "epoch": 1.5609137055837563,
    "step": 1230
  },
  {
    "loss": 0.0439,
    "grad_norm": 0.40187448263168335,
    "learning_rate": 4.796651956819078e-05,
    "epoch": 1.5736040609137056,
    "step": 1240
  },
  {
    "loss": 0.0488,
    "grad_norm": 0.5225071310997009,
    "learning_rate": 4.527317169647434e-05,
    "epoch": 1.5862944162436547,
    "step": 1250
  },
  {
    "eval_loss": 0.06903357803821564,
    "eval_runtime": 9.4791,
    "eval_samples_per_second": 139.992,
    "eval_steps_per_second": 35.024,
    "epoch": 1.5939086294416245,
    "step": 1256
  },
  {
    "loss": 0.0469,
    "grad_norm": 0.37204620242118835,
    "learning_rate": 4.264796440522747e-05,
    "epoch": 1.598984771573604,
    "step": 1260
  },
  {
    "loss": 0.045,
    "grad_norm": 0.40722259879112244,
    "learning_rate": 4.009205381682828e-05,
    "epoch": 1.6116751269035534,
    "step": 1270
  },
  {
    "loss": 0.05,
    "grad_norm": 0.4333907663822174,
    "learning_rate": 3.760656553588591e-05,
    "epoch": 1.6243654822335025,
    "step": 1280
  },
  {
    "loss": 0.0408,
    "grad_norm": 0.3389703631401062,
    "learning_rate": 3.519259415353291e-05,
    "epoch": 1.6370558375634516,
    "step": 1290
  },
  {
    "loss": 0.0478,
    "grad_norm": 0.43472781777381897,
    "learning_rate": 3.285120276537481e-05,
    "epoch": 1.649746192893401,
    "step": 1300
  },
  {
    "loss": 0.0449,
    "grad_norm": 0.35716742277145386,
    "learning_rate": 3.058342250331063e-05,
    "epoch": 1.6624365482233503,
    "step": 1310
  },
  {
    "loss": 0.0455,
    "grad_norm": 0.419575959444046,
    "learning_rate": 2.83902520814298e-05,
    "epoch": 1.6751269035532994,
    "step": 1320
  },
  {
    "loss": 0.044,
    "grad_norm": 0.29724448919296265,
    "learning_rate": 2.627265735618549e-05,
    "epoch": 1.6878172588832487,
    "step": 1330
  },
  {
    "loss": 0.0469,
    "grad_norm": 0.4269305169582367,
    "learning_rate": 2.4231570901038868e-05,
    "epoch": 1.700507614213198,
    "step": 1340
  },
  {
    "loss": 0.0459,
    "grad_norm": 0.5313772559165955,
    "learning_rate": 2.2267891595759816e-05,
    "epoch": 1.7131979695431472,
    "step": 1350
  },
  {
    "loss": 0.0418,
    "grad_norm": 0.4228779077529907,
    "learning_rate": 2.03824842305673e-05,
    "epoch": 1.7258883248730963,
    "step": 1360
  },
  {
    "loss": 0.0476,
    "grad_norm": 0.42005908489227295,
    "learning_rate": 1.8576179125281688e-05,
    "epoch": 1.7385786802030458,
    "step": 1370
  },
  {
    "loss": 0.0477,
    "grad_norm": 0.40062543749809265,
    "learning_rate": 1.684977176365794e-05,
    "epoch": 1.751269035532995,
    "step": 1380
  },
  {
    "loss": 0.0475,
    "grad_norm": 0.4797188639640808,
    "learning_rate": 1.5204022443060472e-05,
    "epoch": 1.763959390862944,
    "step": 1390
  },
  {
    "loss": 0.0468,
    "grad_norm": 0.391634076833725,
    "learning_rate": 1.3639655939633323e-05,
    "epoch": 1.7766497461928934,
    "step": 1400
  },
  {
    "loss": 0.0489,
    "grad_norm": 0.42081767320632935,
    "learning_rate": 1.2157361189114325e-05,
    "epoch": 1.7893401015228427,
    "step": 1410
  },
  {
    "eval_loss": 0.0661860853433609,
    "eval_runtime": 9.5602,
    "eval_samples_per_second": 138.805,
    "eval_steps_per_second": 34.727,
    "epoch": 1.7931472081218274,
    "step": 1413
  },
  {
    "loss": 0.0454,
    "grad_norm": 0.46962904930114746,
    "learning_rate": 1.075779098343257e-05,
    "epoch": 1.8020304568527918,
    "step": 1420
  },
  {
    "loss": 0.0441,
    "grad_norm": 0.4605846405029297,
    "learning_rate": 9.441561683223476e-06,
    "epoch": 1.8147208121827412,
    "step": 1430
  },
  {
    "loss": 0.0414,
    "grad_norm": 0.41056081652641296,
    "learning_rate": 8.209252946388302e-06,
    "epoch": 1.8274111675126905,
    "step": 1440
  },
  {
    "loss": 0.0376,
    "grad_norm": 0.4158293902873993,
    "learning_rate": 7.0614074728166506e-06,
    "epoch": 1.8401015228426396,
    "step": 1450
  },
  {
    "loss": 0.0466,
    "grad_norm": 0.5301814675331116,
    "learning_rate": 5.9985307653855016e-06,
    "epoch": 1.8527918781725887,
    "step": 1460
  },
  {
    "loss": 0.0458,
    "grad_norm": 0.44248396158218384,
    "learning_rate": 5.021090907339488e-06,
    "epoch": 1.865482233502538,
    "step": 1470
  },
  {
    "loss": 0.0454,
    "grad_norm": 0.32068517804145813,
    "learning_rate": 4.12951835615012e-06,
    "epoch": 1.8781725888324874,
    "step": 1480
  },
  {
    "loss": 0.0453,
    "grad_norm": 0.5575969815254211,
    "learning_rate": 3.324205753945764e-06,
    "epoch": 1.8908629441624365,
    "step": 1490
  },
  {
    "loss": 0.0404,
    "grad_norm": 0.3342937231063843,
    "learning_rate": 2.605507754594605e-06,
    "epoch": 1.9035532994923858,
    "step": 1500
  },
  {
    "loss": 0.0452,
    "grad_norm": 0.516743004322052,
    "learning_rate": 1.9737408675177594e-06,
    "epoch": 1.9162436548223352,
    "step": 1510
  },
  {
    "loss": 0.0438,
    "grad_norm": 0.4447910487651825,
    "learning_rate": 1.4291833183008196e-06,
    "epoch": 1.9289340101522843,
    "step": 1520
  },
  {
    "loss": 0.0523,
    "grad_norm": 0.46259304881095886,
    "learning_rate": 9.720749261652007e-07,
    "epoch": 1.9416243654822334,
    "step": 1530
  },
  {
    "loss": 0.0463,
    "grad_norm": 0.4858075678348541,
    "learning_rate": 6.026169983536223e-07,
    "epoch": 1.9543147208121827,
    "step": 1540
  },
  {
    "loss": 0.0486,
    "grad_norm": 0.43060243129730225,
    "learning_rate": 3.209722414757588e-07,
    "epoch": 1.967005076142132,
    "step": 1550
  },
  {
    "loss": 0.0416,
    "grad_norm": 0.4320192337036133,
    "learning_rate": 1.2726468985349015e-07,
    "epoch": 1.9796954314720812,
    "step": 1560
  },
  {
    "loss": 0.0368,
    "grad_norm": 0.38553470373153687,
    "learning_rate": 2.1579650896952354e-08,
    "epoch": 1.9923857868020305,
    "step": 1570
  },
  {
    "eval_loss": 0.06596987694501877,
    "eval_runtime": 9.5315,
    "eval_samples_per_second": 139.222,
    "eval_steps_per_second": 34.832,
    "epoch": 1.9923857868020305,
    "step": 1570
  },
  {
    "train_runtime": 366.7473,
    "train_samples_per_second": 137.435,
    "train_steps_per_second": 4.297,
    "total_flos": 9515820530880000.0,
    "train_loss": 0.10226353834652659,
    "epoch": 2.0,
    "step": 1576
  }
]