Files
gemma-3-270m-it-SuperGPQA-C…/train/log.json
ModelHub XC 9923cf48f1 初始化项目,由ModelHub XC社区提供模型
Model: kth8/gemma-3-270m-it-SuperGPQA-Classifier
Source: Original Platform
2026-04-28 00:34:06 +08:00

1190 lines
27 KiB
JSON

[
{
"loss": 1.5751,
"grad_norm": 5.8206610679626465,
"learning_rate": 4.556962025316456e-05,
"epoch": 0.012690355329949238,
"step": 10
},
{
"loss": 0.6817,
"grad_norm": 2.4909415245056152,
"learning_rate": 9.620253164556962e-05,
"epoch": 0.025380710659898477,
"step": 20
},
{
"loss": 0.5317,
"grad_norm": 2.0207533836364746,
"learning_rate": 0.0001468354430379747,
"epoch": 0.03807106598984772,
"step": 30
},
{
"loss": 0.4042,
"grad_norm": 2.2393276691436768,
"learning_rate": 0.00019746835443037975,
"epoch": 0.050761421319796954,
"step": 40
},
{
"loss": 0.3339,
"grad_norm": 1.3759368658065796,
"learning_rate": 0.0002481012658227848,
"epoch": 0.06345177664974619,
"step": 50
},
{
"loss": 0.2921,
"grad_norm": 1.4795056581497192,
"learning_rate": 0.0002987341772151899,
"epoch": 0.07614213197969544,
"step": 60
},
{
"loss": 0.2594,
"grad_norm": 1.5126503705978394,
"learning_rate": 0.00034936708860759495,
"epoch": 0.08883248730964467,
"step": 70
},
{
"loss": 0.2418,
"grad_norm": 1.2799267768859863,
"learning_rate": 0.0004,
"epoch": 0.10152284263959391,
"step": 80
},
{
"loss": 0.2121,
"grad_norm": 1.081551194190979,
"learning_rate": 0.0003999559607204408,
"epoch": 0.11421319796954314,
"step": 90
},
{
"loss": 0.1958,
"grad_norm": 1.5507794618606567,
"learning_rate": 0.0003998238622763449,
"epoch": 0.12690355329949238,
"step": 100
},
{
"loss": 0.21,
"grad_norm": 0.9705930352210999,
"learning_rate": 0.0003996037628429151,
"epoch": 0.13959390862944163,
"step": 110
},
{
"loss": 0.173,
"grad_norm": 0.9663600325584412,
"learning_rate": 0.00039929575935035633,
"epoch": 0.15228426395939088,
"step": 120
},
{
"loss": 0.1662,
"grad_norm": 1.1074914932250977,
"learning_rate": 0.00039889998744118777,
"epoch": 0.1649746192893401,
"step": 130
},
{
"loss": 0.1733,
"grad_norm": 0.9759091734886169,
"learning_rate": 0.00039841662141050683,
"epoch": 0.17766497461928935,
"step": 140
},
{
"loss": 0.1646,
"grad_norm": 0.8136119842529297,
"learning_rate": 0.0003978458741292311,
"epoch": 0.19035532994923857,
"step": 150
},
{
"eval_loss": 0.15786977112293243,
"eval_runtime": 24.4689,
"eval_samples_per_second": 54.232,
"eval_steps_per_second": 13.568,
"epoch": 0.19923857868020303,
"step": 157
},
{
"loss": 0.1468,
"grad_norm": 0.8433499336242676,
"learning_rate": 0.00039718799695035134,
"epoch": 0.20304568527918782,
"step": 160
},
{
"loss": 0.1321,
"grad_norm": 0.5848754644393921,
"learning_rate": 0.0003964432795982376,
"epoch": 0.21573604060913706,
"step": 170
},
{
"loss": 0.1427,
"grad_norm": 0.7149267792701721,
"learning_rate": 0.0003956120500410464,
"epoch": 0.22842639593908629,
"step": 180
},
{
"loss": 0.1277,
"grad_norm": 0.6142985224723816,
"learning_rate": 0.0003946946743462862,
"epoch": 0.24111675126903553,
"step": 190
},
{
"loss": 0.1355,
"grad_norm": 0.5795847773551941,
"learning_rate": 0.00039369155651960383,
"epoch": 0.25380710659898476,
"step": 200
},
{
"loss": 0.1229,
"grad_norm": 0.6075690984725952,
"learning_rate": 0.0003926031383268634,
"epoch": 0.26649746192893403,
"step": 210
},
{
"loss": 0.1307,
"grad_norm": 0.5007538199424744,
"learning_rate": 0.0003914298990995955,
"epoch": 0.27918781725888325,
"step": 220
},
{
"loss": 0.1179,
"grad_norm": 0.7290500402450562,
"learning_rate": 0.00039017235552390333,
"epoch": 0.2918781725888325,
"step": 230
},
{
"loss": 0.1097,
"grad_norm": 0.5982555747032166,
"learning_rate": 0.00038883106141291774,
"epoch": 0.30456852791878175,
"step": 240
},
{
"loss": 0.1304,
"grad_norm": 0.5894684195518494,
"learning_rate": 0.000387406607462902,
"epoch": 0.31725888324873097,
"step": 250
},
{
"loss": 0.1253,
"grad_norm": 0.4741029441356659,
"learning_rate": 0.00038589962099311336,
"epoch": 0.3299492385786802,
"step": 260
},
{
"loss": 0.1252,
"grad_norm": 0.5968972444534302,
"learning_rate": 0.0003843107656695362,
"epoch": 0.3426395939086294,
"step": 270
},
{
"loss": 0.1179,
"grad_norm": 0.6765711307525635,
"learning_rate": 0.00038264074121260817,
"epoch": 0.3553299492385787,
"step": 280
},
{
"loss": 0.1245,
"grad_norm": 0.531008243560791,
"learning_rate": 0.0003808902830890687,
"epoch": 0.3680203045685279,
"step": 290
},
{
"loss": 0.1166,
"grad_norm": 0.6060107350349426,
"learning_rate": 0.0003790601621880642,
"epoch": 0.38071065989847713,
"step": 300
},
{
"loss": 0.1004,
"grad_norm": 0.6947969198226929,
"learning_rate": 0.0003771511844816547,
"epoch": 0.3934010152284264,
"step": 310
},
{
"eval_loss": 0.1160828173160553,
"eval_runtime": 9.5412,
"eval_samples_per_second": 139.081,
"eval_steps_per_second": 34.796,
"epoch": 0.39847715736040606,
"step": 314
},
{
"loss": 0.1162,
"grad_norm": 0.6814019083976746,
"learning_rate": 0.0003751641906698689,
"epoch": 0.40609137055837563,
"step": 320
},
{
"loss": 0.1209,
"grad_norm": 0.6832170486450195,
"learning_rate": 0.00037310005581046656,
"epoch": 0.41878172588832485,
"step": 330
},
{
"loss": 0.1132,
"grad_norm": 0.5093628764152527,
"learning_rate": 0.00037095968893356875,
"epoch": 0.43147208121827413,
"step": 340
},
{
"loss": 0.1072,
"grad_norm": 0.6062504053115845,
"learning_rate": 0.000368744032641328,
"epoch": 0.44416243654822335,
"step": 350
},
{
"loss": 0.1056,
"grad_norm": 0.4274779260158539,
"learning_rate": 0.00036645406269281307,
"epoch": 0.45685279187817257,
"step": 360
},
{
"loss": 0.1178,
"grad_norm": 0.469860315322876,
"learning_rate": 0.00036409078757429123,
"epoch": 0.46954314720812185,
"step": 370
},
{
"loss": 0.1157,
"grad_norm": 0.5052468776702881,
"learning_rate": 0.0003616552480550987,
"epoch": 0.48223350253807107,
"step": 380
},
{
"loss": 0.1052,
"grad_norm": 0.6406036019325256,
"learning_rate": 0.0003591485167292932,
"epoch": 0.4949238578680203,
"step": 390
},
{
"loss": 0.1099,
"grad_norm": 0.4725392162799835,
"learning_rate": 0.000356571697543291,
"epoch": 0.5076142131979695,
"step": 400
},
{
"loss": 0.0995,
"grad_norm": 0.566696286201477,
"learning_rate": 0.00035392592530969724,
"epoch": 0.5203045685279187,
"step": 410
},
{
"loss": 0.1005,
"grad_norm": 0.5996329188346863,
"learning_rate": 0.0003512123652075423,
"epoch": 0.5329949238578681,
"step": 420
},
{
"loss": 0.0991,
"grad_norm": 0.529081404209137,
"learning_rate": 0.00034843221226914565,
"epoch": 0.5456852791878173,
"step": 430
},
{
"loss": 0.0992,
"grad_norm": 0.49170249700546265,
"learning_rate": 0.0003455866908538319,
"epoch": 0.5583756345177665,
"step": 440
},
{
"loss": 0.1054,
"grad_norm": 0.4307686388492584,
"learning_rate": 0.0003426770541087322,
"epoch": 0.5710659898477157,
"step": 450
},
{
"loss": 0.0999,
"grad_norm": 0.42143514752388,
"learning_rate": 0.00033970458341690677,
"epoch": 0.583756345177665,
"step": 460
},
{
"loss": 0.091,
"grad_norm": 0.49249398708343506,
"learning_rate": 0.0003366705878330334,
"epoch": 0.5964467005076142,
"step": 470
},
{
"eval_loss": 0.09394794702529907,
"eval_runtime": 9.5472,
"eval_samples_per_second": 138.993,
"eval_steps_per_second": 34.774,
"epoch": 0.5977157360406091,
"step": 471
},
{
"loss": 0.0934,
"grad_norm": 0.5115765333175659,
"learning_rate": 0.00033357640350690907,
"epoch": 0.6091370558375635,
"step": 480
},
{
"loss": 0.0869,
"grad_norm": 0.43772196769714355,
"learning_rate": 0.00033042339309501936,
"epoch": 0.6218274111675127,
"step": 490
},
{
"loss": 0.0887,
"grad_norm": 0.4491313397884369,
"learning_rate": 0.0003272129451604339,
"epoch": 0.6345177664974619,
"step": 500
},
{
"loss": 0.0912,
"grad_norm": 0.3995817303657532,
"learning_rate": 0.00032394647356129394,
"epoch": 0.6472081218274112,
"step": 510
},
{
"loss": 0.0923,
"grad_norm": 0.3744134306907654,
"learning_rate": 0.0003206254168281585,
"epoch": 0.6598984771573604,
"step": 520
},
{
"loss": 0.1008,
"grad_norm": 0.47935950756073,
"learning_rate": 0.00031725123753048676,
"epoch": 0.6725888324873096,
"step": 530
},
{
"loss": 0.0976,
"grad_norm": 0.5493370294570923,
"learning_rate": 0.0003138254216325324,
"epoch": 0.6852791878172588,
"step": 540
},
{
"loss": 0.0972,
"grad_norm": 0.5088523030281067,
"learning_rate": 0.000310349477838936,
"epoch": 0.6979695431472082,
"step": 550
},
{
"loss": 0.0974,
"grad_norm": 0.4719524681568146,
"learning_rate": 0.0003068249369303019,
"epoch": 0.7106598984771574,
"step": 560
},
{
"loss": 0.0871,
"grad_norm": 0.5160706043243408,
"learning_rate": 0.0003032533510890542,
"epoch": 0.7233502538071066,
"step": 570
},
{
"loss": 0.0883,
"grad_norm": 0.42795512080192566,
"learning_rate": 0.0002996362932158663,
"epoch": 0.7360406091370558,
"step": 580
},
{
"loss": 0.0914,
"grad_norm": 0.5042217969894409,
"learning_rate": 0.0002959753562369666,
"epoch": 0.748730964467005,
"step": 590
},
{
"loss": 0.0858,
"grad_norm": 0.41944852471351624,
"learning_rate": 0.0002922721524026259,
"epoch": 0.7614213197969543,
"step": 600
},
{
"loss": 0.0784,
"grad_norm": 0.3173629343509674,
"learning_rate": 0.00028852831257713326,
"epoch": 0.7741116751269036,
"step": 610
},
{
"loss": 0.0899,
"grad_norm": 0.5276319980621338,
"learning_rate": 0.0002847454855205758,
"epoch": 0.7868020304568528,
"step": 620
},
{
"eval_loss": 0.09238167852163315,
"eval_runtime": 9.5377,
"eval_samples_per_second": 139.132,
"eval_steps_per_second": 34.809,
"epoch": 0.7969543147208121,
"step": 628
},
{
"loss": 0.0868,
"grad_norm": 0.5028842091560364,
"learning_rate": 0.0002809253371627362,
"epoch": 0.799492385786802,
"step": 630
},
{
"loss": 0.0834,
"grad_norm": 0.40975555777549744,
"learning_rate": 0.00027706954986942935,
"epoch": 0.8121827411167513,
"step": 640
},
{
"loss": 0.0927,
"grad_norm": 0.5146291851997375,
"learning_rate": 0.0002731798217016005,
"epoch": 0.8248730964467005,
"step": 650
},
{
"loss": 0.0813,
"grad_norm": 0.3960341811180115,
"learning_rate": 0.0002692578656675116,
"epoch": 0.8375634517766497,
"step": 660
},
{
"loss": 0.0934,
"grad_norm": 0.4789310693740845,
"learning_rate": 0.00026530540896834467,
"epoch": 0.850253807106599,
"step": 670
},
{
"loss": 0.0917,
"grad_norm": 0.43943315744400024,
"learning_rate": 0.00026132419223755493,
"epoch": 0.8629441624365483,
"step": 680
},
{
"loss": 0.0862,
"grad_norm": 0.5376535654067993,
"learning_rate": 0.00025731596877430826,
"epoch": 0.8756345177664975,
"step": 690
},
{
"loss": 0.088,
"grad_norm": 0.452738493680954,
"learning_rate": 0.0002532825037713411,
"epoch": 0.8883248730964467,
"step": 700
},
{
"loss": 0.0819,
"grad_norm": 0.4121873080730438,
"learning_rate": 0.00024922557353758196,
"epoch": 0.9010152284263959,
"step": 710
},
{
"loss": 0.0917,
"grad_norm": 0.39808085560798645,
"learning_rate": 0.00024514696471587794,
"epoch": 0.9137055837563451,
"step": 720
},
{
"loss": 0.0802,
"grad_norm": 0.43925586342811584,
"learning_rate": 0.00024104847349617025,
"epoch": 0.9263959390862944,
"step": 730
},
{
"loss": 0.0867,
"grad_norm": 0.42966383695602417,
"learning_rate": 0.00023693190482446493,
"epoch": 0.9390862944162437,
"step": 740
},
{
"loss": 0.081,
"grad_norm": 0.4360509514808655,
"learning_rate": 0.00023279907160794733,
"epoch": 0.9517766497461929,
"step": 750
},
{
"loss": 0.0943,
"grad_norm": 0.4277612268924713,
"learning_rate": 0.00022865179391659153,
"epoch": 0.9644670050761421,
"step": 760
},
{
"loss": 0.0847,
"grad_norm": 0.44837474822998047,
"learning_rate": 0.00022449189818161407,
"epoch": 0.9771573604060914,
"step": 770
},
{
"loss": 0.0857,
"grad_norm": 0.4456328749656677,
"learning_rate": 0.00022032121639112707,
"epoch": 0.9898477157360406,
"step": 780
},
{
"eval_loss": 0.08206839114427567,
"eval_runtime": 9.5084,
"eval_samples_per_second": 139.561,
"eval_steps_per_second": 34.917,
"epoch": 0.9961928934010152,
"step": 785
},
{
"loss": 0.0885,
"grad_norm": 0.3194786608219147,
"learning_rate": 0.0002161415852833438,
"epoch": 1.00253807106599,
"step": 790
},
{
"loss": 0.0613,
"grad_norm": 0.3748820722103119,
"learning_rate": 0.00021195484553769228,
"epoch": 1.015228426395939,
"step": 800
},
{
"loss": 0.0608,
"grad_norm": 0.4187680780887604,
"learning_rate": 0.00020776284096419353,
"epoch": 1.0279187817258884,
"step": 810
},
{
"loss": 0.0511,
"grad_norm": 0.29912981390953064,
"learning_rate": 0.0002035674176914609,
"epoch": 1.0406091370558375,
"step": 820
},
{
"loss": 0.052,
"grad_norm": 0.538901150226593,
"learning_rate": 0.0001993704233536781,
"epoch": 1.0532994923857868,
"step": 830
},
{
"loss": 0.0583,
"grad_norm": 0.33813655376434326,
"learning_rate": 0.00019517370627691454,
"epoch": 1.0659898477157361,
"step": 840
},
{
"loss": 0.064,
"grad_norm": 0.43808960914611816,
"learning_rate": 0.00019097911466513606,
"epoch": 1.0786802030456852,
"step": 850
},
{
"loss": 0.0644,
"grad_norm": 0.34947964549064636,
"learning_rate": 0.0001867884957862689,
"epoch": 1.0913705583756346,
"step": 860
},
{
"loss": 0.0541,
"grad_norm": 0.3377130627632141,
"learning_rate": 0.0001826036951586764,
"epoch": 1.1040609137055837,
"step": 870
},
{
"loss": 0.0585,
"grad_norm": 0.5515534281730652,
"learning_rate": 0.00017842655573840587,
"epoch": 1.116751269035533,
"step": 880
},
{
"loss": 0.0499,
"grad_norm": 0.37032851576805115,
"learning_rate": 0.00017425891710756437,
"epoch": 1.1294416243654823,
"step": 890
},
{
"loss": 0.0537,
"grad_norm": 0.439820796251297,
"learning_rate": 0.00017010261466417936,
"epoch": 1.1421319796954315,
"step": 900
},
{
"loss": 0.053,
"grad_norm": 0.47381657361984253,
"learning_rate": 0.00016595947881390327,
"epoch": 1.1548223350253808,
"step": 910
},
{
"loss": 0.0576,
"grad_norm": 0.33624354004859924,
"learning_rate": 0.00016183133416391573,
"epoch": 1.16751269035533,
"step": 920
},
{
"loss": 0.0544,
"grad_norm": 0.4324477016925812,
"learning_rate": 0.00015771999871937964,
"epoch": 1.1802030456852792,
"step": 930
},
{
"loss": 0.0562,
"grad_norm": 0.48575344681739807,
"learning_rate": 0.00015362728308280528,
"epoch": 1.1928934010152283,
"step": 940
},
{
"eval_loss": 0.07984930276870728,
"eval_runtime": 9.6222,
"eval_samples_per_second": 137.91,
"eval_steps_per_second": 34.504,
"epoch": 1.1954314720812182,
"step": 942
},
{
"loss": 0.0556,
"grad_norm": 0.3980204463005066,
"learning_rate": 0.0001495549896566732,
"epoch": 1.2055837563451777,
"step": 950
},
{
"loss": 0.0525,
"grad_norm": 0.40248578786849976,
"learning_rate": 0.00014550491184966985,
"epoch": 1.218274111675127,
"step": 960
},
{
"loss": 0.0601,
"grad_norm": 0.4088118076324463,
"learning_rate": 0.00014147883328688305,
"epoch": 1.2309644670050761,
"step": 970
},
{
"loss": 0.0561,
"grad_norm": 0.49921339750289917,
"learning_rate": 0.00013747852702430624,
"epoch": 1.2436548223350254,
"step": 980
},
{
"loss": 0.0584,
"grad_norm": 0.404526948928833,
"learning_rate": 0.0001335057547679978,
"epoch": 1.2563451776649746,
"step": 990
},
{
"loss": 0.0575,
"grad_norm": 0.4947160482406616,
"learning_rate": 0.00012956226609823771,
"epoch": 1.2690355329949239,
"step": 1000
},
{
"loss": 0.0539,
"grad_norm": 0.40267854928970337,
"learning_rate": 0.0001256497976990259,
"epoch": 1.281725888324873,
"step": 1010
},
{
"loss": 0.0571,
"grad_norm": 0.4298776686191559,
"learning_rate": 0.00012177007259325813,
"epoch": 1.2944162436548223,
"step": 1020
},
{
"loss": 0.0472,
"grad_norm": 0.4440214931964874,
"learning_rate": 0.00011792479938391988,
"epoch": 1.3071065989847717,
"step": 1030
},
{
"loss": 0.0557,
"grad_norm": 0.4760426878929138,
"learning_rate": 0.00011411567150162973,
"epoch": 1.3197969543147208,
"step": 1040
},
{
"loss": 0.0525,
"grad_norm": 0.38695594668388367,
"learning_rate": 0.00011034436645886447,
"epoch": 1.33248730964467,
"step": 1050
},
{
"loss": 0.0521,
"grad_norm": 0.4604351222515106,
"learning_rate": 0.00010661254511119501,
"epoch": 1.3451776649746192,
"step": 1060
},
{
"loss": 0.0514,
"grad_norm": 0.4119158089160919,
"learning_rate": 0.00010292185092585709,
"epoch": 1.3578680203045685,
"step": 1070
},
{
"loss": 0.0551,
"grad_norm": 0.48942986130714417,
"learning_rate": 9.92739092579808e-05,
"epoch": 1.3705583756345177,
"step": 1080
},
{
"loss": 0.0507,
"grad_norm": 0.34124505519866943,
"learning_rate": 9.567032663479538e-05,
"epoch": 1.383248730964467,
"step": 1090
},
{
"eval_loss": 0.07261822372674942,
"eval_runtime": 9.4618,
"eval_samples_per_second": 140.248,
"eval_steps_per_second": 35.089,
"epoch": 1.3946700507614214,
"step": 1099
},
{
"loss": 0.0521,
"grad_norm": 0.37472787499427795,
"learning_rate": 9.211269004812642e-05,
"epoch": 1.3959390862944163,
"step": 1100
},
{
"loss": 0.0521,
"grad_norm": 0.41362205147743225,
"learning_rate": 8.860256625549608e-05,
"epoch": 1.4086294416243654,
"step": 1110
},
{
"loss": 0.0502,
"grad_norm": 0.41950875520706177,
"learning_rate": 8.514150109013415e-05,
"epoch": 1.4213197969543148,
"step": 1120
},
{
"loss": 0.0496,
"grad_norm": 0.42962411046028137,
"learning_rate": 8.173101878020454e-05,
"epoch": 1.434010152284264,
"step": 1130
},
{
"loss": 0.0531,
"grad_norm": 0.44318121671676636,
"learning_rate": 7.837262127754609e-05,
"epoch": 1.4467005076142132,
"step": 1140
},
{
"loss": 0.0521,
"grad_norm": 0.4739588797092438,
"learning_rate": 7.50677875962237e-05,
"epoch": 1.4593908629441623,
"step": 1150
},
{
"loss": 0.0496,
"grad_norm": 0.47502049803733826,
"learning_rate": 7.181797316118124e-05,
"epoch": 1.4720812182741116,
"step": 1160
},
{
"loss": 0.0489,
"grad_norm": 0.41291266679763794,
"learning_rate": 6.862460916728297e-05,
"epoch": 1.484771573604061,
"step": 1170
},
{
"loss": 0.0477,
"grad_norm": 0.3678615689277649,
"learning_rate": 6.548910194902538e-05,
"epoch": 1.49746192893401,
"step": 1180
},
{
"loss": 0.0539,
"grad_norm": 0.4368578791618347,
"learning_rate": 6.241283236119799e-05,
"epoch": 1.5101522842639594,
"step": 1190
},
{
"loss": 0.0471,
"grad_norm": 0.3706371784210205,
"learning_rate": 5.9397155170764564e-05,
"epoch": 1.5228426395939088,
"step": 1200
},
{
"loss": 0.0546,
"grad_norm": 0.48822730779647827,
"learning_rate": 5.644339846023359e-05,
"epoch": 1.5355329949238579,
"step": 1210
},
{
"loss": 0.0458,
"grad_norm": 0.3171629011631012,
"learning_rate": 5.35528630427804e-05,
"epoch": 1.548223350253807,
"step": 1220
},
{
"loss": 0.0524,
"grad_norm": 0.5258194208145142,
"learning_rate": 5.072682188937812e-05,
"epoch": 1.5609137055837563,
"step": 1230
},
{
"loss": 0.0439,
"grad_norm": 0.40187448263168335,
"learning_rate": 4.796651956819078e-05,
"epoch": 1.5736040609137056,
"step": 1240
},
{
"loss": 0.0488,
"grad_norm": 0.5225071310997009,
"learning_rate": 4.527317169647434e-05,
"epoch": 1.5862944162436547,
"step": 1250
},
{
"eval_loss": 0.06903357803821564,
"eval_runtime": 9.4791,
"eval_samples_per_second": 139.992,
"eval_steps_per_second": 35.024,
"epoch": 1.5939086294416245,
"step": 1256
},
{
"loss": 0.0469,
"grad_norm": 0.37204620242118835,
"learning_rate": 4.264796440522747e-05,
"epoch": 1.598984771573604,
"step": 1260
},
{
"loss": 0.045,
"grad_norm": 0.40722259879112244,
"learning_rate": 4.009205381682828e-05,
"epoch": 1.6116751269035534,
"step": 1270
},
{
"loss": 0.05,
"grad_norm": 0.4333907663822174,
"learning_rate": 3.760656553588591e-05,
"epoch": 1.6243654822335025,
"step": 1280
},
{
"loss": 0.0408,
"grad_norm": 0.3389703631401062,
"learning_rate": 3.519259415353291e-05,
"epoch": 1.6370558375634516,
"step": 1290
},
{
"loss": 0.0478,
"grad_norm": 0.43472781777381897,
"learning_rate": 3.285120276537481e-05,
"epoch": 1.649746192893401,
"step": 1300
},
{
"loss": 0.0449,
"grad_norm": 0.35716742277145386,
"learning_rate": 3.058342250331063e-05,
"epoch": 1.6624365482233503,
"step": 1310
},
{
"loss": 0.0455,
"grad_norm": 0.419575959444046,
"learning_rate": 2.83902520814298e-05,
"epoch": 1.6751269035532994,
"step": 1320
},
{
"loss": 0.044,
"grad_norm": 0.29724448919296265,
"learning_rate": 2.627265735618549e-05,
"epoch": 1.6878172588832487,
"step": 1330
},
{
"loss": 0.0469,
"grad_norm": 0.4269305169582367,
"learning_rate": 2.4231570901038868e-05,
"epoch": 1.700507614213198,
"step": 1340
},
{
"loss": 0.0459,
"grad_norm": 0.5313772559165955,
"learning_rate": 2.2267891595759816e-05,
"epoch": 1.7131979695431472,
"step": 1350
},
{
"loss": 0.0418,
"grad_norm": 0.4228779077529907,
"learning_rate": 2.03824842305673e-05,
"epoch": 1.7258883248730963,
"step": 1360
},
{
"loss": 0.0476,
"grad_norm": 0.42005908489227295,
"learning_rate": 1.8576179125281688e-05,
"epoch": 1.7385786802030458,
"step": 1370
},
{
"loss": 0.0477,
"grad_norm": 0.40062543749809265,
"learning_rate": 1.684977176365794e-05,
"epoch": 1.751269035532995,
"step": 1380
},
{
"loss": 0.0475,
"grad_norm": 0.4797188639640808,
"learning_rate": 1.5204022443060472e-05,
"epoch": 1.763959390862944,
"step": 1390
},
{
"loss": 0.0468,
"grad_norm": 0.391634076833725,
"learning_rate": 1.3639655939633323e-05,
"epoch": 1.7766497461928934,
"step": 1400
},
{
"loss": 0.0489,
"grad_norm": 0.42081767320632935,
"learning_rate": 1.2157361189114325e-05,
"epoch": 1.7893401015228427,
"step": 1410
},
{
"eval_loss": 0.0661860853433609,
"eval_runtime": 9.5602,
"eval_samples_per_second": 138.805,
"eval_steps_per_second": 34.727,
"epoch": 1.7931472081218274,
"step": 1413
},
{
"loss": 0.0454,
"grad_norm": 0.46962904930114746,
"learning_rate": 1.075779098343257e-05,
"epoch": 1.8020304568527918,
"step": 1420
},
{
"loss": 0.0441,
"grad_norm": 0.4605846405029297,
"learning_rate": 9.441561683223476e-06,
"epoch": 1.8147208121827412,
"step": 1430
},
{
"loss": 0.0414,
"grad_norm": 0.41056081652641296,
"learning_rate": 8.209252946388302e-06,
"epoch": 1.8274111675126905,
"step": 1440
},
{
"loss": 0.0376,
"grad_norm": 0.4158293902873993,
"learning_rate": 7.0614074728166506e-06,
"epoch": 1.8401015228426396,
"step": 1450
},
{
"loss": 0.0466,
"grad_norm": 0.5301814675331116,
"learning_rate": 5.9985307653855016e-06,
"epoch": 1.8527918781725887,
"step": 1460
},
{
"loss": 0.0458,
"grad_norm": 0.44248396158218384,
"learning_rate": 5.021090907339488e-06,
"epoch": 1.865482233502538,
"step": 1470
},
{
"loss": 0.0454,
"grad_norm": 0.32068517804145813,
"learning_rate": 4.12951835615012e-06,
"epoch": 1.8781725888324874,
"step": 1480
},
{
"loss": 0.0453,
"grad_norm": 0.5575969815254211,
"learning_rate": 3.324205753945764e-06,
"epoch": 1.8908629441624365,
"step": 1490
},
{
"loss": 0.0404,
"grad_norm": 0.3342937231063843,
"learning_rate": 2.605507754594605e-06,
"epoch": 1.9035532994923858,
"step": 1500
},
{
"loss": 0.0452,
"grad_norm": 0.516743004322052,
"learning_rate": 1.9737408675177594e-06,
"epoch": 1.9162436548223352,
"step": 1510
},
{
"loss": 0.0438,
"grad_norm": 0.4447910487651825,
"learning_rate": 1.4291833183008196e-06,
"epoch": 1.9289340101522843,
"step": 1520
},
{
"loss": 0.0523,
"grad_norm": 0.46259304881095886,
"learning_rate": 9.720749261652007e-07,
"epoch": 1.9416243654822334,
"step": 1530
},
{
"loss": 0.0463,
"grad_norm": 0.4858075678348541,
"learning_rate": 6.026169983536223e-07,
"epoch": 1.9543147208121827,
"step": 1540
},
{
"loss": 0.0486,
"grad_norm": 0.43060243129730225,
"learning_rate": 3.209722414757588e-07,
"epoch": 1.967005076142132,
"step": 1550
},
{
"loss": 0.0416,
"grad_norm": 0.4320192337036133,
"learning_rate": 1.2726468985349015e-07,
"epoch": 1.9796954314720812,
"step": 1560
},
{
"loss": 0.0368,
"grad_norm": 0.38553470373153687,
"learning_rate": 2.1579650896952354e-08,
"epoch": 1.9923857868020305,
"step": 1570
},
{
"eval_loss": 0.06596987694501877,
"eval_runtime": 9.5315,
"eval_samples_per_second": 139.222,
"eval_steps_per_second": 34.832,
"epoch": 1.9923857868020305,
"step": 1570
},
{
"train_runtime": 366.7473,
"train_samples_per_second": 137.435,
"train_steps_per_second": 4.297,
"total_flos": 9515820530880000.0,
"train_loss": 0.10226353834652659,
"epoch": 2.0,
"step": 1576
}
]