Files
Llama-3.3-8B-Instruct-SuperGPQA-Classifier/train/log.json
ModelHub XC 0fd06ce448 初始化项目,由ModelHub XC社区提供模型
Model: kth8/Llama-3.3-8B-Instruct-SuperGPQA-Classifier
Source: Original Platform
2026-04-18 14:08:36 +08:00

1190 lines
27 KiB
JSON

[
{
"loss": 1.8996,
"grad_norm": 5.035277843475342,
"learning_rate": 2.278481012658228e-05,
"epoch": 0.012690355329949238,
"step": 10
},
{
"loss": 0.5315,
"grad_norm": 1.102072834968567,
"learning_rate": 4.810126582278481e-05,
"epoch": 0.025380710659898477,
"step": 20
},
{
"loss": 0.3353,
"grad_norm": 0.7798988819122314,
"learning_rate": 7.341772151898734e-05,
"epoch": 0.03807106598984772,
"step": 30
},
{
"loss": 0.2226,
"grad_norm": 0.8653473854064941,
"learning_rate": 9.873417721518988e-05,
"epoch": 0.050761421319796954,
"step": 40
},
{
"loss": 0.164,
"grad_norm": 0.7569780349731445,
"learning_rate": 0.0001240506329113924,
"epoch": 0.06345177664974619,
"step": 50
},
{
"loss": 0.1394,
"grad_norm": 1.0211968421936035,
"learning_rate": 0.00014936708860759494,
"epoch": 0.07614213197969544,
"step": 60
},
{
"loss": 0.1201,
"grad_norm": 0.5370887517929077,
"learning_rate": 0.00017468354430379748,
"epoch": 0.08883248730964467,
"step": 70
},
{
"loss": 0.122,
"grad_norm": 0.49917498230934143,
"learning_rate": 0.0002,
"epoch": 0.10152284263959391,
"step": 80
},
{
"loss": 0.1217,
"grad_norm": 0.4577413499355316,
"learning_rate": 0.0001999779803602204,
"epoch": 0.11421319796954314,
"step": 90
},
{
"loss": 0.0965,
"grad_norm": 0.48522070050239563,
"learning_rate": 0.00019991193113817244,
"epoch": 0.12690355329949238,
"step": 100
},
{
"loss": 0.11,
"grad_norm": 0.41902250051498413,
"learning_rate": 0.00019980188142145754,
"epoch": 0.13959390862944163,
"step": 110
},
{
"loss": 0.0823,
"grad_norm": 0.5561641454696655,
"learning_rate": 0.00019964787967517817,
"epoch": 0.15228426395939088,
"step": 120
},
{
"loss": 0.0856,
"grad_norm": 0.3316971957683563,
"learning_rate": 0.00019944999372059388,
"epoch": 0.1649746192893401,
"step": 130
},
{
"loss": 0.0849,
"grad_norm": 0.372153639793396,
"learning_rate": 0.00019920831070525342,
"epoch": 0.17766497461928935,
"step": 140
},
{
"loss": 0.0929,
"grad_norm": 0.33250877261161804,
"learning_rate": 0.00019892293706461555,
"epoch": 0.19035532994923857,
"step": 150
},
{
"eval_loss": 0.08791538327932358,
"eval_runtime": 29.62,
"eval_samples_per_second": 44.801,
"eval_steps_per_second": 11.209,
"epoch": 0.19923857868020303,
"step": 157
},
{
"loss": 0.0824,
"grad_norm": 0.4130192995071411,
"learning_rate": 0.00019859399847517567,
"epoch": 0.20304568527918782,
"step": 160
},
{
"loss": 0.0902,
"grad_norm": 0.3217241168022156,
"learning_rate": 0.0001982216397991188,
"epoch": 0.21573604060913706,
"step": 170
},
{
"loss": 0.0766,
"grad_norm": 0.4728490710258484,
"learning_rate": 0.0001978060250205232,
"epoch": 0.22842639593908629,
"step": 180
},
{
"loss": 0.0844,
"grad_norm": 0.5730077028274536,
"learning_rate": 0.0001973473371731431,
"epoch": 0.24111675126903553,
"step": 190
},
{
"loss": 0.0841,
"grad_norm": 0.5745298862457275,
"learning_rate": 0.00019684577825980192,
"epoch": 0.25380710659898476,
"step": 200
},
{
"loss": 0.0797,
"grad_norm": 0.3141058683395386,
"learning_rate": 0.0001963015691634317,
"epoch": 0.26649746192893403,
"step": 210
},
{
"loss": 0.0822,
"grad_norm": 0.3730680048465729,
"learning_rate": 0.00019571494954979775,
"epoch": 0.27918781725888325,
"step": 220
},
{
"loss": 0.0677,
"grad_norm": 0.3915182650089264,
"learning_rate": 0.00019508617776195167,
"epoch": 0.2918781725888325,
"step": 230
},
{
"loss": 0.08,
"grad_norm": 0.3052193820476532,
"learning_rate": 0.00019441553070645887,
"epoch": 0.30456852791878175,
"step": 240
},
{
"loss": 0.0744,
"grad_norm": 0.3673352003097534,
"learning_rate": 0.000193703303731451,
"epoch": 0.31725888324873097,
"step": 250
},
{
"loss": 0.0821,
"grad_norm": 0.39443644881248474,
"learning_rate": 0.00019294981049655668,
"epoch": 0.3299492385786802,
"step": 260
},
{
"loss": 0.073,
"grad_norm": 0.44178199768066406,
"learning_rate": 0.0001921553828347681,
"epoch": 0.3426395939086294,
"step": 270
},
{
"loss": 0.0784,
"grad_norm": 0.4202715754508972,
"learning_rate": 0.00019132037060630409,
"epoch": 0.3553299492385787,
"step": 280
},
{
"loss": 0.0646,
"grad_norm": 0.23640507459640503,
"learning_rate": 0.00019044514154453434,
"epoch": 0.3680203045685279,
"step": 290
},
{
"loss": 0.0785,
"grad_norm": 0.4354120194911957,
"learning_rate": 0.0001895300810940321,
"epoch": 0.38071065989847713,
"step": 300
},
{
"loss": 0.0656,
"grad_norm": 0.2467317283153534,
"learning_rate": 0.00018857559224082736,
"epoch": 0.3934010152284264,
"step": 310
},
{
"eval_loss": 0.0728072002530098,
"eval_runtime": 19.9827,
"eval_samples_per_second": 66.407,
"eval_steps_per_second": 16.614,
"epoch": 0.39847715736040606,
"step": 314
},
{
"loss": 0.0738,
"grad_norm": 0.2969267666339874,
"learning_rate": 0.00018758209533493444,
"epoch": 0.40609137055837563,
"step": 320
},
{
"loss": 0.067,
"grad_norm": 0.3527528643608093,
"learning_rate": 0.00018655002790523328,
"epoch": 0.41878172588832485,
"step": 330
},
{
"loss": 0.0714,
"grad_norm": 0.2732889950275421,
"learning_rate": 0.00018547984446678437,
"epoch": 0.43147208121827413,
"step": 340
},
{
"loss": 0.0602,
"grad_norm": 0.25770312547683716,
"learning_rate": 0.000184372016320664,
"epoch": 0.44416243654822335,
"step": 350
},
{
"loss": 0.0624,
"grad_norm": 0.22473905980587006,
"learning_rate": 0.00018322703134640654,
"epoch": 0.45685279187817257,
"step": 360
},
{
"loss": 0.0709,
"grad_norm": 0.3180300295352936,
"learning_rate": 0.00018204539378714561,
"epoch": 0.46954314720812185,
"step": 370
},
{
"loss": 0.0698,
"grad_norm": 0.2796868085861206,
"learning_rate": 0.00018082762402754936,
"epoch": 0.48223350253807107,
"step": 380
},
{
"loss": 0.0658,
"grad_norm": 0.3655967712402344,
"learning_rate": 0.0001795742583646466,
"epoch": 0.4949238578680203,
"step": 390
},
{
"loss": 0.0682,
"grad_norm": 0.2886195182800293,
"learning_rate": 0.0001782858487716455,
"epoch": 0.5076142131979695,
"step": 400
},
{
"loss": 0.071,
"grad_norm": 0.27021610736846924,
"learning_rate": 0.00017696296265484862,
"epoch": 0.5203045685279187,
"step": 410
},
{
"loss": 0.0636,
"grad_norm": 0.28307008743286133,
"learning_rate": 0.00017560618260377116,
"epoch": 0.5329949238578681,
"step": 420
},
{
"loss": 0.0546,
"grad_norm": 0.28294482827186584,
"learning_rate": 0.00017421610613457282,
"epoch": 0.5456852791878173,
"step": 430
},
{
"loss": 0.0612,
"grad_norm": 0.2255251258611679,
"learning_rate": 0.00017279334542691596,
"epoch": 0.5583756345177665,
"step": 440
},
{
"loss": 0.0629,
"grad_norm": 0.22404751181602478,
"learning_rate": 0.0001713385270543661,
"epoch": 0.5710659898477157,
"step": 450
},
{
"loss": 0.0596,
"grad_norm": 0.2632795572280884,
"learning_rate": 0.00016985229170845339,
"epoch": 0.583756345177665,
"step": 460
},
{
"loss": 0.0717,
"grad_norm": 0.3002878427505493,
"learning_rate": 0.0001683352939165167,
"epoch": 0.5964467005076142,
"step": 470
},
{
"eval_loss": 0.06722872704267502,
"eval_runtime": 20.1214,
"eval_samples_per_second": 65.95,
"eval_steps_per_second": 16.5,
"epoch": 0.5977157360406091,
"step": 471
},
{
"loss": 0.0618,
"grad_norm": 0.15326248109340668,
"learning_rate": 0.00016678820175345454,
"epoch": 0.6091370558375635,
"step": 480
},
{
"loss": 0.0718,
"grad_norm": 0.27122628688812256,
"learning_rate": 0.00016521169654750968,
"epoch": 0.6218274111675127,
"step": 490
},
{
"loss": 0.0636,
"grad_norm": 0.29509711265563965,
"learning_rate": 0.00016360647258021696,
"epoch": 0.6345177664974619,
"step": 500
},
{
"loss": 0.0655,
"grad_norm": 0.4090014100074768,
"learning_rate": 0.00016197323678064697,
"epoch": 0.6472081218274112,
"step": 510
},
{
"loss": 0.0606,
"grad_norm": 0.2687474191188812,
"learning_rate": 0.00016031270841407926,
"epoch": 0.6598984771573604,
"step": 520
},
{
"loss": 0.0519,
"grad_norm": 0.25125357508659363,
"learning_rate": 0.00015862561876524338,
"epoch": 0.6725888324873096,
"step": 530
},
{
"loss": 0.0623,
"grad_norm": 0.21579739451408386,
"learning_rate": 0.0001569127108162662,
"epoch": 0.6852791878172588,
"step": 540
},
{
"loss": 0.0612,
"grad_norm": 0.24012021720409393,
"learning_rate": 0.000155174738919468,
"epoch": 0.6979695431472082,
"step": 550
},
{
"loss": 0.0617,
"grad_norm": 0.22273781895637512,
"learning_rate": 0.00015341246846515096,
"epoch": 0.7106598984771574,
"step": 560
},
{
"loss": 0.0627,
"grad_norm": 0.29965269565582275,
"learning_rate": 0.0001516266755445271,
"epoch": 0.7233502538071066,
"step": 570
},
{
"loss": 0.0649,
"grad_norm": 0.2375640720129013,
"learning_rate": 0.00014981814660793314,
"epoch": 0.7360406091370558,
"step": 580
},
{
"loss": 0.0653,
"grad_norm": 0.2595769166946411,
"learning_rate": 0.0001479876781184833,
"epoch": 0.748730964467005,
"step": 590
},
{
"loss": 0.0634,
"grad_norm": 0.28185659646987915,
"learning_rate": 0.00014613607620131294,
"epoch": 0.7614213197969543,
"step": 600
},
{
"loss": 0.0601,
"grad_norm": 0.20655085146427155,
"learning_rate": 0.00014426415628856663,
"epoch": 0.7741116751269036,
"step": 610
},
{
"loss": 0.0632,
"grad_norm": 0.4992614686489105,
"learning_rate": 0.0001423727427602879,
"epoch": 0.7868020304568528,
"step": 620
},
{
"eval_loss": 0.05841095373034477,
"eval_runtime": 20.0018,
"eval_samples_per_second": 66.344,
"eval_steps_per_second": 16.599,
"epoch": 0.7969543147208121,
"step": 628
},
{
"loss": 0.0522,
"grad_norm": 0.2023015171289444,
"learning_rate": 0.0001404626685813681,
"epoch": 0.799492385786802,
"step": 630
},
{
"loss": 0.0567,
"grad_norm": 0.20891991257667542,
"learning_rate": 0.00013853477493471468,
"epoch": 0.8121827411167513,
"step": 640
},
{
"loss": 0.0555,
"grad_norm": 0.27132412791252136,
"learning_rate": 0.00013658991085080025,
"epoch": 0.8248730964467005,
"step": 650
},
{
"loss": 0.0594,
"grad_norm": 0.22256866097450256,
"learning_rate": 0.0001346289328337558,
"epoch": 0.8375634517766497,
"step": 660
},
{
"loss": 0.0556,
"grad_norm": 0.20859505236148834,
"learning_rate": 0.00013265270448417234,
"epoch": 0.850253807106599,
"step": 670
},
{
"loss": 0.0557,
"grad_norm": 0.2204328030347824,
"learning_rate": 0.00013066209611877746,
"epoch": 0.8629441624365483,
"step": 680
},
{
"loss": 0.059,
"grad_norm": 0.2515346109867096,
"learning_rate": 0.00012865798438715413,
"epoch": 0.8756345177664975,
"step": 690
},
{
"loss": 0.0546,
"grad_norm": 0.3130325376987457,
"learning_rate": 0.00012664125188567056,
"epoch": 0.8883248730964467,
"step": 700
},
{
"loss": 0.0475,
"grad_norm": 0.2509436011314392,
"learning_rate": 0.00012461278676879098,
"epoch": 0.9010152284263959,
"step": 710
},
{
"loss": 0.0561,
"grad_norm": 0.23676852881908417,
"learning_rate": 0.00012257348235793897,
"epoch": 0.9137055837563451,
"step": 720
},
{
"loss": 0.0536,
"grad_norm": 0.20894668996334076,
"learning_rate": 0.00012052423674808513,
"epoch": 0.9263959390862944,
"step": 730
},
{
"loss": 0.0517,
"grad_norm": 0.18107716739177704,
"learning_rate": 0.00011846595241223247,
"epoch": 0.9390862944162437,
"step": 740
},
{
"loss": 0.0623,
"grad_norm": 0.3013327717781067,
"learning_rate": 0.00011639953580397367,
"epoch": 0.9517766497461929,
"step": 750
},
{
"loss": 0.0579,
"grad_norm": 0.19317802786827087,
"learning_rate": 0.00011432589695829576,
"epoch": 0.9644670050761421,
"step": 760
},
{
"loss": 0.0559,
"grad_norm": 0.26291170716285706,
"learning_rate": 0.00011224594909080704,
"epoch": 0.9771573604060914,
"step": 770
},
{
"loss": 0.0537,
"grad_norm": 0.28403881192207336,
"learning_rate": 0.00011016060819556353,
"epoch": 0.9898477157360406,
"step": 780
},
{
"eval_loss": 0.05360769107937813,
"eval_runtime": 20.0465,
"eval_samples_per_second": 66.196,
"eval_steps_per_second": 16.562,
"epoch": 0.9961928934010152,
"step": 785
},
{
"loss": 0.0502,
"grad_norm": 0.1471383273601532,
"learning_rate": 0.0001080707926416719,
"epoch": 1.00253807106599,
"step": 790
},
{
"loss": 0.038,
"grad_norm": 0.17716127634048462,
"learning_rate": 0.00010597742276884614,
"epoch": 1.015228426395939,
"step": 800
},
{
"loss": 0.0351,
"grad_norm": 0.2006382942199707,
"learning_rate": 0.00010388142048209676,
"epoch": 1.0279187817258884,
"step": 810
},
{
"loss": 0.0375,
"grad_norm": 0.2539692521095276,
"learning_rate": 0.00010178370884573046,
"epoch": 1.0406091370558375,
"step": 820
},
{
"loss": 0.0422,
"grad_norm": 0.2615308165550232,
"learning_rate": 9.968521167683905e-05,
"epoch": 1.0532994923857868,
"step": 830
},
{
"loss": 0.0406,
"grad_norm": 0.23757147789001465,
"learning_rate": 9.758685313845727e-05,
"epoch": 1.0659898477157361,
"step": 840
},
{
"loss": 0.0387,
"grad_norm": 0.16979315876960754,
"learning_rate": 9.548955733256803e-05,
"epoch": 1.0786802030456852,
"step": 850
},
{
"loss": 0.0352,
"grad_norm": 0.1853126734495163,
"learning_rate": 9.339424789313445e-05,
"epoch": 1.0913705583756346,
"step": 860
},
{
"loss": 0.0356,
"grad_norm": 0.15106192231178284,
"learning_rate": 9.13018475793382e-05,
"epoch": 1.1040609137055837,
"step": 870
},
{
"loss": 0.037,
"grad_norm": 0.20427311956882477,
"learning_rate": 8.921327786920294e-05,
"epoch": 1.116751269035533,
"step": 880
},
{
"loss": 0.0324,
"grad_norm": 0.1580514758825302,
"learning_rate": 8.712945855378218e-05,
"epoch": 1.1294416243654823,
"step": 890
},
{
"loss": 0.0301,
"grad_norm": 0.2191898375749588,
"learning_rate": 8.505130733208968e-05,
"epoch": 1.1421319796954315,
"step": 900
},
{
"loss": 0.0355,
"grad_norm": 0.16614247858524323,
"learning_rate": 8.297973940695163e-05,
"epoch": 1.1548223350253808,
"step": 910
},
{
"loss": 0.0349,
"grad_norm": 0.18907427787780762,
"learning_rate": 8.091566708195786e-05,
"epoch": 1.16751269035533,
"step": 920
},
{
"loss": 0.0336,
"grad_norm": 0.24296258389949799,
"learning_rate": 7.885999935968982e-05,
"epoch": 1.1802030456852792,
"step": 930
},
{
"loss": 0.0372,
"grad_norm": 0.1817648708820343,
"learning_rate": 7.681364154140264e-05,
"epoch": 1.1928934010152283,
"step": 940
},
{
"eval_loss": 0.057017017155885696,
"eval_runtime": 19.9628,
"eval_samples_per_second": 66.474,
"eval_steps_per_second": 16.631,
"epoch": 1.1954314720812182,
"step": 942
},
{
"loss": 0.03,
"grad_norm": 0.19095705449581146,
"learning_rate": 7.47774948283366e-05,
"epoch": 1.2055837563451777,
"step": 950
},
{
"loss": 0.035,
"grad_norm": 0.33682745695114136,
"learning_rate": 7.275245592483492e-05,
"epoch": 1.218274111675127,
"step": 960
},
{
"loss": 0.0384,
"grad_norm": 0.2646084427833557,
"learning_rate": 7.073941664344152e-05,
"epoch": 1.2309644670050761,
"step": 970
},
{
"loss": 0.0287,
"grad_norm": 0.1980791836977005,
"learning_rate": 6.873926351215312e-05,
"epoch": 1.2436548223350254,
"step": 980
},
{
"loss": 0.0342,
"grad_norm": 0.18797655403614044,
"learning_rate": 6.67528773839989e-05,
"epoch": 1.2563451776649746,
"step": 990
},
{
"loss": 0.0337,
"grad_norm": 0.24009937047958374,
"learning_rate": 6.478113304911886e-05,
"epoch": 1.2690355329949239,
"step": 1000
},
{
"loss": 0.0272,
"grad_norm": 0.29159170389175415,
"learning_rate": 6.282489884951295e-05,
"epoch": 1.281725888324873,
"step": 1010
},
{
"loss": 0.036,
"grad_norm": 0.16352516412734985,
"learning_rate": 6.0885036296629064e-05,
"epoch": 1.2944162436548223,
"step": 1020
},
{
"loss": 0.0292,
"grad_norm": 0.17807820439338684,
"learning_rate": 5.896239969195994e-05,
"epoch": 1.3071065989847717,
"step": 1030
},
{
"loss": 0.0332,
"grad_norm": 0.2500491738319397,
"learning_rate": 5.7057835750814867e-05,
"epoch": 1.3197969543147208,
"step": 1040
},
{
"loss": 0.0294,
"grad_norm": 0.2208271473646164,
"learning_rate": 5.517218322943224e-05,
"epoch": 1.33248730964467,
"step": 1050
},
{
"loss": 0.0342,
"grad_norm": 0.23927471041679382,
"learning_rate": 5.3306272555597504e-05,
"epoch": 1.3451776649746192,
"step": 1060
},
{
"loss": 0.0307,
"grad_norm": 0.20309758186340332,
"learning_rate": 5.1460925462928546e-05,
"epoch": 1.3578680203045685,
"step": 1070
},
{
"loss": 0.0314,
"grad_norm": 0.23275193572044373,
"learning_rate": 4.96369546289904e-05,
"epoch": 1.3705583756345177,
"step": 1080
},
{
"loss": 0.0333,
"grad_norm": 0.2078331708908081,
"learning_rate": 4.783516331739769e-05,
"epoch": 1.383248730964467,
"step": 1090
},
{
"eval_loss": 0.05335332825779915,
"eval_runtime": 19.9859,
"eval_samples_per_second": 66.397,
"eval_steps_per_second": 16.612,
"epoch": 1.3946700507614214,
"step": 1099
},
{
"loss": 0.0309,
"grad_norm": 0.18032079935073853,
"learning_rate": 4.605634502406321e-05,
"epoch": 1.3959390862944163,
"step": 1100
},
{
"loss": 0.0328,
"grad_norm": 0.20803005993366241,
"learning_rate": 4.430128312774804e-05,
"epoch": 1.4086294416243654,
"step": 1110
},
{
"loss": 0.027,
"grad_norm": 0.1680465191602707,
"learning_rate": 4.2570750545067076e-05,
"epoch": 1.4213197969543148,
"step": 1120
},
{
"loss": 0.0317,
"grad_norm": 0.2528463900089264,
"learning_rate": 4.086550939010227e-05,
"epoch": 1.434010152284264,
"step": 1130
},
{
"loss": 0.0313,
"grad_norm": 0.19024434685707092,
"learning_rate": 3.9186310638773047e-05,
"epoch": 1.4467005076142132,
"step": 1140
},
{
"loss": 0.0287,
"grad_norm": 0.20934472978115082,
"learning_rate": 3.753389379811185e-05,
"epoch": 1.4593908629441623,
"step": 1150
},
{
"loss": 0.0265,
"grad_norm": 0.29412180185317993,
"learning_rate": 3.590898658059062e-05,
"epoch": 1.4720812182741116,
"step": 1160
},
{
"loss": 0.0298,
"grad_norm": 0.3268195390701294,
"learning_rate": 3.4312304583641484e-05,
"epoch": 1.484771573604061,
"step": 1170
},
{
"loss": 0.0251,
"grad_norm": 0.17332251369953156,
"learning_rate": 3.274455097451269e-05,
"epoch": 1.49746192893401,
"step": 1180
},
{
"loss": 0.0318,
"grad_norm": 0.3481772541999817,
"learning_rate": 3.1206416180598995e-05,
"epoch": 1.5101522842639594,
"step": 1190
},
{
"loss": 0.0335,
"grad_norm": 0.24047453701496124,
"learning_rate": 2.9698577585382282e-05,
"epoch": 1.5228426395939088,
"step": 1200
},
{
"loss": 0.0339,
"grad_norm": 0.21146714687347412,
"learning_rate": 2.8221699230116793e-05,
"epoch": 1.5355329949238579,
"step": 1210
},
{
"loss": 0.0308,
"grad_norm": 0.140832781791687,
"learning_rate": 2.67764315213902e-05,
"epoch": 1.548223350253807,
"step": 1220
},
{
"loss": 0.026,
"grad_norm": 0.1721792370080948,
"learning_rate": 2.536341094468906e-05,
"epoch": 1.5609137055837563,
"step": 1230
},
{
"loss": 0.0277,
"grad_norm": 0.14980490505695343,
"learning_rate": 2.398325978409539e-05,
"epoch": 1.5736040609137056,
"step": 1240
},
{
"loss": 0.028,
"grad_norm": 0.18908673524856567,
"learning_rate": 2.263658584823717e-05,
"epoch": 1.5862944162436547,
"step": 1250
},
{
"eval_loss": 0.052472274750471115,
"eval_runtime": 19.9786,
"eval_samples_per_second": 66.421,
"eval_steps_per_second": 16.618,
"epoch": 1.5939086294416245,
"step": 1256
},
{
"loss": 0.0272,
"grad_norm": 0.12164825201034546,
"learning_rate": 2.1323982202613735e-05,
"epoch": 1.598984771573604,
"step": 1260
},
{
"loss": 0.0245,
"grad_norm": 0.2658851146697998,
"learning_rate": 2.004602690841414e-05,
"epoch": 1.6116751269035534,
"step": 1270
},
{
"loss": 0.0304,
"grad_norm": 0.2891974151134491,
"learning_rate": 1.8803282767942954e-05,
"epoch": 1.6243654822335025,
"step": 1280
},
{
"loss": 0.0292,
"grad_norm": 0.2979351580142975,
"learning_rate": 1.7596297076766455e-05,
"epoch": 1.6370558375634516,
"step": 1290
},
{
"loss": 0.0284,
"grad_norm": 0.20141719281673431,
"learning_rate": 1.6425601382687405e-05,
"epoch": 1.649746192893401,
"step": 1300
},
{
"loss": 0.0254,
"grad_norm": 0.1950131356716156,
"learning_rate": 1.5291711251655316e-05,
"epoch": 1.6624365482233503,
"step": 1310
},
{
"loss": 0.0282,
"grad_norm": 0.21205022931098938,
"learning_rate": 1.41951260407149e-05,
"epoch": 1.6751269035532994,
"step": 1320
},
{
"loss": 0.0247,
"grad_norm": 0.2470894753932953,
"learning_rate": 1.3136328678092746e-05,
"epoch": 1.6878172588832487,
"step": 1330
},
{
"loss": 0.0257,
"grad_norm": 0.26378998160362244,
"learning_rate": 1.2115785450519434e-05,
"epoch": 1.700507614213198,
"step": 1340
},
{
"loss": 0.0282,
"grad_norm": 0.12680888175964355,
"learning_rate": 1.1133945797879908e-05,
"epoch": 1.7131979695431472,
"step": 1350
},
{
"loss": 0.0251,
"grad_norm": 0.19744935631752014,
"learning_rate": 1.019124211528365e-05,
"epoch": 1.7258883248730963,
"step": 1360
},
{
"loss": 0.0327,
"grad_norm": 0.18419434130191803,
"learning_rate": 9.288089562640844e-06,
"epoch": 1.7385786802030458,
"step": 1370
},
{
"loss": 0.0282,
"grad_norm": 0.19115136563777924,
"learning_rate": 8.42488588182897e-06,
"epoch": 1.751269035532995,
"step": 1380
},
{
"loss": 0.0245,
"grad_norm": 0.17252641916275024,
"learning_rate": 7.602011221530236e-06,
"epoch": 1.763959390862944,
"step": 1390
},
{
"loss": 0.029,
"grad_norm": 0.22253695130348206,
"learning_rate": 6.819827969816661e-06,
"epoch": 1.7766497461928934,
"step": 1400
},
{
"loss": 0.0269,
"grad_norm": 0.21938475966453552,
"learning_rate": 6.078680594557163e-06,
"epoch": 1.7893401015228427,
"step": 1410
},
{
"eval_loss": 0.05091211572289467,
"eval_runtime": 20.0145,
"eval_samples_per_second": 66.302,
"eval_steps_per_second": 16.588,
"epoch": 1.7931472081218274,
"step": 1413
},
{
"loss": 0.0305,
"grad_norm": 0.2024271935224533,
"learning_rate": 5.378895491716285e-06,
"epoch": 1.8020304568527918,
"step": 1420
},
{
"loss": 0.029,
"grad_norm": 0.22723488509655,
"learning_rate": 4.720780841611738e-06,
"epoch": 1.8147208121827412,
"step": 1430
},
{
"loss": 0.0266,
"grad_norm": 0.2747625410556793,
"learning_rate": 4.104626473194151e-06,
"epoch": 1.8274111675126905,
"step": 1440
},
{
"loss": 0.0262,
"grad_norm": 0.18593831360340118,
"learning_rate": 3.5307037364083253e-06,
"epoch": 1.8401015228426396,
"step": 1450
},
{
"loss": 0.0291,
"grad_norm": 0.2651998996734619,
"learning_rate": 2.9992653826927508e-06,
"epoch": 1.8527918781725887,
"step": 1460
},
{
"loss": 0.026,
"grad_norm": 0.19439752399921417,
"learning_rate": 2.510545453669744e-06,
"epoch": 1.865482233502538,
"step": 1470
},
{
"loss": 0.03,
"grad_norm": 0.17483021318912506,
"learning_rate": 2.06475917807506e-06,
"epoch": 1.8781725888324874,
"step": 1480
},
{
"loss": 0.029,
"grad_norm": 0.22444817423820496,
"learning_rate": 1.662102876972882e-06,
"epoch": 1.8908629441624365,
"step": 1490
},
{
"loss": 0.0243,
"grad_norm": 0.17885605990886688,
"learning_rate": 1.3027538772973026e-06,
"epoch": 1.9035532994923858,
"step": 1500
},
{
"loss": 0.0272,
"grad_norm": 0.19312232732772827,
"learning_rate": 9.868704337588797e-07,
"epoch": 1.9162436548223352,
"step": 1510
},
{
"loss": 0.0254,
"grad_norm": 0.1709776520729065,
"learning_rate": 7.145916591504098e-07,
"epoch": 1.9289340101522843,
"step": 1520
},
{
"loss": 0.0252,
"grad_norm": 0.18656505644321442,
"learning_rate": 4.860374630826004e-07,
"epoch": 1.9416243654822334,
"step": 1530
},
{
"loss": 0.0267,
"grad_norm": 0.11956395953893661,
"learning_rate": 3.0130849917681114e-07,
"epoch": 1.9543147208121827,
"step": 1540
},
{
"loss": 0.0305,
"grad_norm": 0.25038954615592957,
"learning_rate": 1.604861207378794e-07,
"epoch": 1.967005076142132,
"step": 1550
},
{
"loss": 0.025,
"grad_norm": 0.19318363070487976,
"learning_rate": 6.363234492674507e-08,
"epoch": 1.9796954314720812,
"step": 1560
},
{
"loss": 0.0276,
"grad_norm": 0.26012641191482544,
"learning_rate": 1.0789825448476177e-08,
"epoch": 1.9923857868020305,
"step": 1570
},
{
"eval_loss": 0.0504293330013752,
"eval_runtime": 19.8337,
"eval_samples_per_second": 66.906,
"eval_steps_per_second": 16.739,
"epoch": 1.9923857868020305,
"step": 1570
},
{
"train_runtime": 2681.8444,
"train_samples_per_second": 18.795,
"train_steps_per_second": 0.588,
"total_flos": 6.800278675429786e+17,
"train_loss": 0.06838441643920647,
"epoch": 2.0,
"step": 1576
}
]