Files
QWEN3-4B-CPT/trainer_state.json
ModelHub XC 67c081bc2e 初始化项目,由ModelHub XC社区提供模型
Model: alwaysgood/QWEN3-4B-CPT
Source: Original Platform
2026-05-01 18:43:25 +08:00

1097 lines
27 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006771626883358727,
"grad_norm": 1.5234375,
"learning_rate": 6.081081081081082e-07,
"loss": 1.8358331680297852,
"step": 10
},
{
"epoch": 0.013543253766717453,
"grad_norm": 1.5078125,
"learning_rate": 1.2837837837837838e-06,
"loss": 1.840726089477539,
"step": 20
},
{
"epoch": 0.02031488065007618,
"grad_norm": 1.0859375,
"learning_rate": 1.9594594594594595e-06,
"loss": 1.8267410278320313,
"step": 30
},
{
"epoch": 0.027086507533434907,
"grad_norm": 1.1640625,
"learning_rate": 2.6351351351351353e-06,
"loss": 1.8383310317993165,
"step": 40
},
{
"epoch": 0.03385813441679363,
"grad_norm": 1.0859375,
"learning_rate": 3.310810810810811e-06,
"loss": 1.8384885787963867,
"step": 50
},
{
"epoch": 0.04062976130015236,
"grad_norm": 1.03125,
"learning_rate": 3.986486486486487e-06,
"loss": 1.8087802886962892,
"step": 60
},
{
"epoch": 0.04740138818351109,
"grad_norm": 1.015625,
"learning_rate": 4.6621621621621625e-06,
"loss": 1.8259227752685547,
"step": 70
},
{
"epoch": 0.05417301506686981,
"grad_norm": 1.046875,
"learning_rate": 5.337837837837838e-06,
"loss": 1.8241001129150392,
"step": 80
},
{
"epoch": 0.06094464195022854,
"grad_norm": 0.96484375,
"learning_rate": 6.013513513513514e-06,
"loss": 1.82220516204834,
"step": 90
},
{
"epoch": 0.06771626883358726,
"grad_norm": 0.953125,
"learning_rate": 6.689189189189191e-06,
"loss": 1.7921783447265625,
"step": 100
},
{
"epoch": 0.074487895716946,
"grad_norm": 0.9296875,
"learning_rate": 7.3648648648648655e-06,
"loss": 1.797548484802246,
"step": 110
},
{
"epoch": 0.08125952260030472,
"grad_norm": 0.89453125,
"learning_rate": 8.040540540540541e-06,
"loss": 1.7889528274536133,
"step": 120
},
{
"epoch": 0.08803114948366345,
"grad_norm": 0.90234375,
"learning_rate": 8.716216216216217e-06,
"loss": 1.7663179397583009,
"step": 130
},
{
"epoch": 0.09480277636702218,
"grad_norm": 0.89453125,
"learning_rate": 9.391891891891893e-06,
"loss": 1.7635225296020507,
"step": 140
},
{
"epoch": 0.1015744032503809,
"grad_norm": 0.91015625,
"learning_rate": 9.999986030219255e-06,
"loss": 1.7774492263793946,
"step": 150
},
{
"epoch": 0.10834603013373963,
"grad_norm": 0.91796875,
"learning_rate": 9.998309750982693e-06,
"loss": 1.7622718811035156,
"step": 160
},
{
"epoch": 0.11511765701709836,
"grad_norm": 0.890625,
"learning_rate": 9.993840588849743e-06,
"loss": 1.7750001907348634,
"step": 170
},
{
"epoch": 0.12188928390045708,
"grad_norm": 0.890625,
"learning_rate": 9.986581041033881e-06,
"loss": 1.767216110229492,
"step": 180
},
{
"epoch": 0.1286609107838158,
"grad_norm": 0.921875,
"learning_rate": 9.976535163919757e-06,
"loss": 1.7609657287597655,
"step": 190
},
{
"epoch": 0.13543253766717453,
"grad_norm": 0.87109375,
"learning_rate": 9.96370857079661e-06,
"loss": 1.7535722732543946,
"step": 200
},
{
"epoch": 0.14220416455053325,
"grad_norm": 0.86328125,
"learning_rate": 9.948108428721782e-06,
"loss": 1.7395360946655274,
"step": 210
},
{
"epoch": 0.148975791433892,
"grad_norm": 0.88671875,
"learning_rate": 9.92974345451598e-06,
"loss": 1.7465991973876953,
"step": 220
},
{
"epoch": 0.15574741831725072,
"grad_norm": 0.87890625,
"learning_rate": 9.908623909892651e-06,
"loss": 1.7506902694702149,
"step": 230
},
{
"epoch": 0.16251904520060945,
"grad_norm": 0.8984375,
"learning_rate": 9.884761595724068e-06,
"loss": 1.7368896484375,
"step": 240
},
{
"epoch": 0.16929067208396817,
"grad_norm": 0.8671875,
"learning_rate": 9.858169845447417e-06,
"loss": 1.7515613555908203,
"step": 250
},
{
"epoch": 0.1760622989673269,
"grad_norm": 0.85546875,
"learning_rate": 9.828863517614533e-06,
"loss": 1.7509956359863281,
"step": 260
},
{
"epoch": 0.1828339258506856,
"grad_norm": 0.9140625,
"learning_rate": 9.796858987589462e-06,
"loss": 1.753628921508789,
"step": 270
},
{
"epoch": 0.18960555273404436,
"grad_norm": 0.85546875,
"learning_rate": 9.762174138398456e-06,
"loss": 1.7379936218261718,
"step": 280
},
{
"epoch": 0.19637717961740309,
"grad_norm": 0.88671875,
"learning_rate": 9.724828350737574e-06,
"loss": 1.7442964553833007,
"step": 290
},
{
"epoch": 0.2031488065007618,
"grad_norm": 0.87109375,
"learning_rate": 9.684842492143399e-06,
"loss": 1.7366142272949219,
"step": 300
},
{
"epoch": 0.20992043338412053,
"grad_norm": 0.84765625,
"learning_rate": 9.642238905333e-06,
"loss": 1.7396051406860351,
"step": 310
},
{
"epoch": 0.21669206026747925,
"grad_norm": 0.87109375,
"learning_rate": 9.597041395719573e-06,
"loss": 1.732611083984375,
"step": 320
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.8828125,
"learning_rate": 9.549275218110818e-06,
"loss": 1.7453182220458985,
"step": 330
},
{
"epoch": 0.23023531403419673,
"grad_norm": 0.875,
"learning_rate": 9.498967062597403e-06,
"loss": 1.7297761917114258,
"step": 340
},
{
"epoch": 0.23700694091755545,
"grad_norm": 0.875,
"learning_rate": 9.446145039639486e-06,
"loss": 1.728118324279785,
"step": 350
},
{
"epoch": 0.24377856780091417,
"grad_norm": 0.890625,
"learning_rate": 9.390838664359539e-06,
"loss": 1.7387624740600587,
"step": 360
},
{
"epoch": 0.2505501946842729,
"grad_norm": 0.85546875,
"learning_rate": 9.333078840050331e-06,
"loss": 1.7364713668823242,
"step": 370
},
{
"epoch": 0.2573218215676316,
"grad_norm": 0.8828125,
"learning_rate": 9.27289784090723e-06,
"loss": 1.7236080169677734,
"step": 380
},
{
"epoch": 0.26409344845099036,
"grad_norm": 0.890625,
"learning_rate": 9.210329293994495e-06,
"loss": 1.7224924087524414,
"step": 390
},
{
"epoch": 0.27086507533434906,
"grad_norm": 0.8671875,
"learning_rate": 9.145408160455642e-06,
"loss": 1.7099193572998046,
"step": 400
},
{
"epoch": 0.2776367022177078,
"grad_norm": 0.8515625,
"learning_rate": 9.078170715978353e-06,
"loss": 1.737176513671875,
"step": 410
},
{
"epoch": 0.2844083291010665,
"grad_norm": 0.9140625,
"learning_rate": 9.008654530524883e-06,
"loss": 1.73763427734375,
"step": 420
},
{
"epoch": 0.29117995598442525,
"grad_norm": 0.85546875,
"learning_rate": 8.936898447339257e-06,
"loss": 1.7290821075439453,
"step": 430
},
{
"epoch": 0.297951582867784,
"grad_norm": 0.8984375,
"learning_rate": 8.86294256124301e-06,
"loss": 1.7403568267822265,
"step": 440
},
{
"epoch": 0.3047232097511427,
"grad_norm": 0.859375,
"learning_rate": 8.786828196231584e-06,
"loss": 1.7217792510986327,
"step": 450
},
{
"epoch": 0.31149483663450145,
"grad_norm": 0.87109375,
"learning_rate": 8.708597882383908e-06,
"loss": 1.7103708267211915,
"step": 460
},
{
"epoch": 0.31826646351786014,
"grad_norm": 0.91796875,
"learning_rate": 8.62829533209805e-06,
"loss": 1.7208784103393555,
"step": 470
},
{
"epoch": 0.3250380904012189,
"grad_norm": 0.859375,
"learning_rate": 8.545965415666254e-06,
"loss": 1.7223230361938477,
"step": 480
},
{
"epoch": 0.33180971728457764,
"grad_norm": 0.8671875,
"learning_rate": 8.46165413620295e-06,
"loss": 1.719701385498047,
"step": 490
},
{
"epoch": 0.33858134416793634,
"grad_norm": 0.85546875,
"learning_rate": 8.375408603939827e-06,
"loss": 1.721092987060547,
"step": 500
},
{
"epoch": 0.33858134416793634,
"eval_loss": 1.7143864631652832,
"eval_runtime": 177.179,
"eval_samples_per_second": 5.401,
"eval_steps_per_second": 0.677,
"step": 500
},
{
"epoch": 0.3453529710512951,
"grad_norm": 0.859375,
"learning_rate": 8.287277009902237e-06,
"loss": 1.7325265884399415,
"step": 510
},
{
"epoch": 0.3521245979346538,
"grad_norm": 0.83984375,
"learning_rate": 8.197308598981731e-06,
"loss": 1.7298921585083007,
"step": 520
},
{
"epoch": 0.35889622481801253,
"grad_norm": 0.8828125,
"learning_rate": 8.105553642419708e-06,
"loss": 1.6982412338256836,
"step": 530
},
{
"epoch": 0.3656678517013712,
"grad_norm": 0.91015625,
"learning_rate": 8.012063409717578e-06,
"loss": 1.7173789978027343,
"step": 540
},
{
"epoch": 0.37243947858473,
"grad_norm": 0.875,
"learning_rate": 7.916890139989147e-06,
"loss": 1.724541473388672,
"step": 550
},
{
"epoch": 0.3792111054680887,
"grad_norm": 0.859375,
"learning_rate": 7.820087012771184e-06,
"loss": 1.701674461364746,
"step": 560
},
{
"epoch": 0.3859827323514474,
"grad_norm": 0.85546875,
"learning_rate": 7.721708118308556e-06,
"loss": 1.7177881240844726,
"step": 570
},
{
"epoch": 0.39275435923480617,
"grad_norm": 0.87890625,
"learning_rate": 7.621808427330447e-06,
"loss": 1.6985021591186524,
"step": 580
},
{
"epoch": 0.39952598611816487,
"grad_norm": 0.87109375,
"learning_rate": 7.5204437603346224e-06,
"loss": 1.709127426147461,
"step": 590
},
{
"epoch": 0.4062976130015236,
"grad_norm": 0.88671875,
"learning_rate": 7.417670756396863e-06,
"loss": 1.7201419830322267,
"step": 600
},
{
"epoch": 0.41306923988488237,
"grad_norm": 0.8984375,
"learning_rate": 7.313546841522998e-06,
"loss": 1.7153247833251952,
"step": 610
},
{
"epoch": 0.41984086676824106,
"grad_norm": 0.875,
"learning_rate": 7.2081301965612435e-06,
"loss": 1.707881546020508,
"step": 620
},
{
"epoch": 0.4266124936515998,
"grad_norm": 0.87109375,
"learning_rate": 7.10147972469275e-06,
"loss": 1.7271339416503906,
"step": 630
},
{
"epoch": 0.4333841205349585,
"grad_norm": 1.3515625,
"learning_rate": 6.993655018518541e-06,
"loss": 1.7222976684570312,
"step": 640
},
{
"epoch": 0.44015574741831726,
"grad_norm": 0.85546875,
"learning_rate": 6.884716326761218e-06,
"loss": 1.7006675720214843,
"step": 650
},
{
"epoch": 0.44692737430167595,
"grad_norm": 0.87109375,
"learning_rate": 6.774724520600069e-06,
"loss": 1.6978439331054687,
"step": 660
},
{
"epoch": 0.4536990011850347,
"grad_norm": 0.87890625,
"learning_rate": 6.663741059658337e-06,
"loss": 1.7124168395996093,
"step": 670
},
{
"epoch": 0.46047062806839345,
"grad_norm": 0.87890625,
"learning_rate": 6.551827957661722e-06,
"loss": 1.7023361206054688,
"step": 680
},
{
"epoch": 0.46724225495175215,
"grad_norm": 0.86328125,
"learning_rate": 6.439047747787242e-06,
"loss": 1.700748825073242,
"step": 690
},
{
"epoch": 0.4740138818351109,
"grad_norm": 0.85546875,
"learning_rate": 6.325463447721852e-06,
"loss": 1.6977190017700194,
"step": 700
},
{
"epoch": 0.4807855087184696,
"grad_norm": 0.8984375,
"learning_rate": 6.211138524450347e-06,
"loss": 1.7250362396240235,
"step": 710
},
{
"epoch": 0.48755713560182834,
"grad_norm": 0.90234375,
"learning_rate": 6.096136858792193e-06,
"loss": 1.7249008178710938,
"step": 720
},
{
"epoch": 0.4943287624851871,
"grad_norm": 0.8671875,
"learning_rate": 5.980522709707132e-06,
"loss": 1.7153186798095703,
"step": 730
},
{
"epoch": 0.5011003893685458,
"grad_norm": 0.8828125,
"learning_rate": 5.864360678389497e-06,
"loss": 1.6841873168945312,
"step": 740
},
{
"epoch": 0.5078720162519045,
"grad_norm": 0.8515625,
"learning_rate": 5.747715672171295e-06,
"loss": 1.7151117324829102,
"step": 750
},
{
"epoch": 0.5146436431352632,
"grad_norm": 0.95703125,
"learning_rate": 5.630652868254229e-06,
"loss": 1.704267692565918,
"step": 760
},
{
"epoch": 0.521415270018622,
"grad_norm": 0.88671875,
"learning_rate": 5.51323767729093e-06,
"loss": 1.7240329742431642,
"step": 770
},
{
"epoch": 0.5281868969019807,
"grad_norm": 0.87890625,
"learning_rate": 5.395535706835744e-06,
"loss": 1.7058921813964845,
"step": 780
},
{
"epoch": 0.5349585237853395,
"grad_norm": 0.8828125,
"learning_rate": 5.27761272468549e-06,
"loss": 1.6999113082885742,
"step": 790
},
{
"epoch": 0.5417301506686981,
"grad_norm": 0.9140625,
"learning_rate": 5.159534622130695e-06,
"loss": 1.7173538208007812,
"step": 800
},
{
"epoch": 0.5485017775520569,
"grad_norm": 0.85546875,
"learning_rate": 5.04136737713781e-06,
"loss": 1.706464958190918,
"step": 810
},
{
"epoch": 0.5552734044354156,
"grad_norm": 0.84765625,
"learning_rate": 4.923177017483002e-06,
"loss": 1.7123580932617188,
"step": 820
},
{
"epoch": 0.5620450313187744,
"grad_norm": 0.84765625,
"learning_rate": 4.805029583858115e-06,
"loss": 1.7076505661010741,
"step": 830
},
{
"epoch": 0.568816658202133,
"grad_norm": 0.87109375,
"learning_rate": 4.686991092969408e-06,
"loss": 1.7007432937622071,
"step": 840
},
{
"epoch": 0.5755882850854918,
"grad_norm": 0.83984375,
"learning_rate": 4.569127500649701e-06,
"loss": 1.7156892776489259,
"step": 850
},
{
"epoch": 0.5823599119688505,
"grad_norm": 0.85546875,
"learning_rate": 4.4515046650045316e-06,
"loss": 1.6989547729492187,
"step": 860
},
{
"epoch": 0.5891315388522093,
"grad_norm": 0.859375,
"learning_rate": 4.334188309612923e-06,
"loss": 1.701683235168457,
"step": 870
},
{
"epoch": 0.595903165735568,
"grad_norm": 0.875,
"learning_rate": 4.217243986803315e-06,
"loss": 1.7004409790039063,
"step": 880
},
{
"epoch": 0.6026747926189266,
"grad_norm": 0.88671875,
"learning_rate": 4.100737041025188e-06,
"loss": 1.727794075012207,
"step": 890
},
{
"epoch": 0.6094464195022854,
"grad_norm": 0.89453125,
"learning_rate": 3.984732572336837e-06,
"loss": 1.6976716995239258,
"step": 900
},
{
"epoch": 0.6162180463856441,
"grad_norm": 0.89453125,
"learning_rate": 3.869295400029714e-06,
"loss": 1.6927717208862305,
"step": 910
},
{
"epoch": 0.6229896732690029,
"grad_norm": 0.84375,
"learning_rate": 3.754490026409637e-06,
"loss": 1.6997186660766601,
"step": 920
},
{
"epoch": 0.6297613001523616,
"grad_norm": 0.93359375,
"learning_rate": 3.6403806007551373e-06,
"loss": 1.7196897506713866,
"step": 930
},
{
"epoch": 0.6365329270357203,
"grad_norm": 0.83203125,
"learning_rate": 3.527030883473055e-06,
"loss": 1.7054462432861328,
"step": 940
},
{
"epoch": 0.643304553919079,
"grad_norm": 0.890625,
"learning_rate": 3.414504210471421e-06,
"loss": 1.7200759887695312,
"step": 950
},
{
"epoch": 0.6500761808024378,
"grad_norm": 0.890625,
"learning_rate": 3.302863457769544e-06,
"loss": 1.6951274871826172,
"step": 960
},
{
"epoch": 0.6568478076857965,
"grad_norm": 0.90625,
"learning_rate": 3.192171006365061e-06,
"loss": 1.7151849746704102,
"step": 970
},
{
"epoch": 0.6636194345691553,
"grad_norm": 0.8984375,
"learning_rate": 3.0824887073775877e-06,
"loss": 1.713322067260742,
"step": 980
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.83984375,
"learning_rate": 2.973877847488451e-06,
"loss": 1.7172536849975586,
"step": 990
},
{
"epoch": 0.6771626883358727,
"grad_norm": 0.859375,
"learning_rate": 2.8663991146958064e-06,
"loss": 1.7149576187133788,
"step": 1000
},
{
"epoch": 0.6771626883358727,
"eval_loss": 1.7007688283920288,
"eval_runtime": 165.432,
"eval_samples_per_second": 5.785,
"eval_steps_per_second": 0.725,
"step": 1000
},
{
"epoch": 0.6839343152192314,
"grad_norm": 0.90625,
"learning_rate": 2.7601125644042777e-06,
"loss": 1.714142417907715,
"step": 1010
},
{
"epoch": 0.6907059421025902,
"grad_norm": 0.859375,
"learning_rate": 2.6550775858680793e-06,
"loss": 1.7104360580444335,
"step": 1020
},
{
"epoch": 0.6974775689859489,
"grad_norm": 0.90234375,
"learning_rate": 2.551352869006338e-06,
"loss": 1.7032684326171874,
"step": 1030
},
{
"epoch": 0.7042491958693076,
"grad_norm": 0.86328125,
"learning_rate": 2.4489963716092096e-06,
"loss": 1.701323890686035,
"step": 1040
},
{
"epoch": 0.7110208227526663,
"grad_norm": 0.890625,
"learning_rate": 2.348065286953048e-06,
"loss": 1.7169862747192384,
"step": 1050
},
{
"epoch": 0.7177924496360251,
"grad_norm": 0.87890625,
"learning_rate": 2.2486160118427958e-06,
"loss": 1.701096534729004,
"step": 1060
},
{
"epoch": 0.7245640765193838,
"grad_norm": 0.88671875,
"learning_rate": 2.1507041150993813e-06,
"loss": 1.700172233581543,
"step": 1070
},
{
"epoch": 0.7313357034027425,
"grad_norm": 0.859375,
"learning_rate": 2.054384306509794e-06,
"loss": 1.7045093536376954,
"step": 1080
},
{
"epoch": 0.7381073302861012,
"grad_norm": 0.859375,
"learning_rate": 1.9597104062571337e-06,
"loss": 1.7091920852661133,
"step": 1090
},
{
"epoch": 0.74487895716946,
"grad_norm": 0.86328125,
"learning_rate": 1.8667353148477547e-06,
"loss": 1.7001871109008788,
"step": 1100
},
{
"epoch": 0.7516505840528187,
"grad_norm": 0.85546875,
"learning_rate": 1.7755109835522938e-06,
"loss": 1.7016315460205078,
"step": 1110
},
{
"epoch": 0.7584222109361775,
"grad_norm": 0.87890625,
"learning_rate": 1.6860883853770848e-06,
"loss": 1.7196449279785155,
"step": 1120
},
{
"epoch": 0.7651938378195361,
"grad_norm": 0.89453125,
"learning_rate": 1.5985174865822146e-06,
"loss": 1.701955223083496,
"step": 1130
},
{
"epoch": 0.7719654647028948,
"grad_norm": 0.85546875,
"learning_rate": 1.5128472187620886e-06,
"loss": 1.703407096862793,
"step": 1140
},
{
"epoch": 0.7787370915862536,
"grad_norm": 0.875,
"learning_rate": 1.4291254515041592e-06,
"loss": 1.7057323455810547,
"step": 1150
},
{
"epoch": 0.7855087184696123,
"grad_norm": 0.8828125,
"learning_rate": 1.3473989656410413e-06,
"loss": 1.6963571548461913,
"step": 1160
},
{
"epoch": 0.7922803453529711,
"grad_norm": 0.8671875,
"learning_rate": 1.2677134271110082e-06,
"loss": 1.7136796951293944,
"step": 1170
},
{
"epoch": 0.7990519722363297,
"grad_norm": 0.89453125,
"learning_rate": 1.1901133614414352e-06,
"loss": 1.7095062255859375,
"step": 1180
},
{
"epoch": 0.8058235991196885,
"grad_norm": 0.875,
"learning_rate": 1.114642128869473e-06,
"loss": 1.7052017211914063,
"step": 1190
},
{
"epoch": 0.8125952260030472,
"grad_norm": 0.8984375,
"learning_rate": 1.0413419001138525e-06,
"loss": 1.7166055679321288,
"step": 1200
},
{
"epoch": 0.819366852886406,
"grad_norm": 0.87890625,
"learning_rate": 9.702536328113305e-07,
"loss": 1.7042055130004883,
"step": 1210
},
{
"epoch": 0.8261384797697647,
"grad_norm": 0.8671875,
"learning_rate": 9.014170486309875e-07,
"loss": 1.6885286331176759,
"step": 1220
},
{
"epoch": 0.8329101066531234,
"grad_norm": 0.84375,
"learning_rate": 8.348706110791238e-07,
"loss": 1.7065910339355468,
"step": 1230
},
{
"epoch": 0.8396817335364821,
"grad_norm": 0.87109375,
"learning_rate": 7.706515040071854e-07,
"loss": 1.6999498367309571,
"step": 1240
},
{
"epoch": 0.8464533604198409,
"grad_norm": 0.8828125,
"learning_rate": 7.08795610834706e-07,
"loss": 1.7021600723266601,
"step": 1250
},
{
"epoch": 0.8532249873031996,
"grad_norm": 0.87890625,
"learning_rate": 6.493374944988984e-07,
"loss": 1.722920799255371,
"step": 1260
},
{
"epoch": 0.8599966141865584,
"grad_norm": 0.8671875,
"learning_rate": 5.923103781420708e-07,
"loss": 1.7148597717285157,
"step": 1270
},
{
"epoch": 0.866768241069917,
"grad_norm": 0.890625,
"learning_rate": 5.377461265476868e-07,
"loss": 1.7151250839233398,
"step": 1280
},
{
"epoch": 0.8735398679532758,
"grad_norm": 0.8671875,
"learning_rate": 4.856752283354277e-07,
"loss": 1.7023918151855468,
"step": 1290
},
{
"epoch": 0.8803114948366345,
"grad_norm": 0.8671875,
"learning_rate": 4.3612677892519496e-07,
"loss": 1.7045417785644532,
"step": 1300
},
{
"epoch": 0.8870831217199933,
"grad_norm": 0.86328125,
"learning_rate": 3.891284642796045e-07,
"loss": 1.7008039474487304,
"step": 1310
},
{
"epoch": 0.8938547486033519,
"grad_norm": 0.8671875,
"learning_rate": 3.447065454340198e-07,
"loss": 1.7126380920410156,
"step": 1320
},
{
"epoch": 0.9006263754867107,
"grad_norm": 0.88671875,
"learning_rate": 3.028858438227966e-07,
"loss": 1.7127569198608399,
"step": 1330
},
{
"epoch": 0.9073980023700694,
"grad_norm": 0.86328125,
"learning_rate": 2.636897274099187e-07,
"loss": 1.7151193618774414,
"step": 1340
},
{
"epoch": 0.9141696292534282,
"grad_norm": 0.8515625,
"learning_rate": 2.2714009763178945e-07,
"loss": 1.704157829284668,
"step": 1350
},
{
"epoch": 0.9209412561367869,
"grad_norm": 0.87890625,
"learning_rate": 1.932573771594648e-07,
"loss": 1.7036989212036133,
"step": 1360
},
{
"epoch": 0.9277128830201455,
"grad_norm": 0.8671875,
"learning_rate": 1.6206049848716765e-07,
"loss": 1.7044996261596679,
"step": 1370
},
{
"epoch": 0.9344845099035043,
"grad_norm": 1.109375,
"learning_rate": 1.3356689335346728e-07,
"loss": 1.7029462814331056,
"step": 1380
},
{
"epoch": 0.941256136786863,
"grad_norm": 0.91015625,
"learning_rate": 1.0779248300102352e-07,
"loss": 1.7133670806884767,
"step": 1390
},
{
"epoch": 0.9480277636702218,
"grad_norm": 0.859375,
"learning_rate": 8.475166928034684e-08,
"loss": 1.6992549896240234,
"step": 1400
},
{
"epoch": 0.9547993905535805,
"grad_norm": 0.85546875,
"learning_rate": 6.445732660254056e-08,
"loss": 1.7066579818725587,
"step": 1410
},
{
"epoch": 0.9615710174369392,
"grad_norm": 0.9140625,
"learning_rate": 4.692079474552691e-08,
"loss": 1.6963106155395509,
"step": 1420
},
{
"epoch": 0.9683426443202979,
"grad_norm": 0.8515625,
"learning_rate": 3.2151872517767194e-08,
"loss": 1.7118385314941407,
"step": 1430
},
{
"epoch": 0.9751142712036567,
"grad_norm": 0.84375,
"learning_rate": 2.0158812283030403e-08,
"loss": 1.6870197296142577,
"step": 1440
},
{
"epoch": 0.9818858980870154,
"grad_norm": 0.87109375,
"learning_rate": 1.094831534925289e-08,
"loss": 1.7051671981811523,
"step": 1450
},
{
"epoch": 0.9886575249703742,
"grad_norm": 0.86328125,
"learning_rate": 4.5255282240802554e-09,
"loss": 1.7082006454467773,
"step": 1460
},
{
"epoch": 0.9954291518537328,
"grad_norm": 0.8828125,
"learning_rate": 8.940397391787869e-10,
"loss": 1.707107162475586,
"step": 1470
},
{
"epoch": 1.0,
"eval_loss": 1.7002202272415161,
"eval_runtime": 169.1979,
"eval_samples_per_second": 5.656,
"eval_steps_per_second": 0.709,
"step": 1477
},
{
"epoch": 1.0,
"step": 1477,
"total_flos": 2.103177196962902e+18,
"train_loss": 1.7256613558986822,
"train_runtime": 29239.084,
"train_samples_per_second": 1.616,
"train_steps_per_second": 0.051
}
],
"logging_steps": 10,
"max_steps": 1477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.103177196962902e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}