Files
anwgpt4.1-chat/checkpoint-5000/trainer_state.json
ModelHub XC d874da34ff 初始化项目,由ModelHub XC社区提供模型
Model: anwgpt/anwgpt4.1-chat
Source: Original Platform
2026-06-05 17:52:26 +08:00

735 lines
16 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.2,
"grad_norm": 2.8039450645446777,
"learning_rate": 6.125e-05,
"loss": 8.1001,
"step": 50
},
{
"epoch": 0.4,
"grad_norm": 1.782693862915039,
"learning_rate": 0.00012375,
"loss": 6.0373,
"step": 100
},
{
"epoch": 0.6,
"grad_norm": 1.6799567937850952,
"learning_rate": 0.00018625,
"loss": 5.4579,
"step": 150
},
{
"epoch": 0.8,
"grad_norm": 1.5503798723220825,
"learning_rate": 0.00024875,
"loss": 5.237,
"step": 200
},
{
"epoch": 1.0,
"grad_norm": 1.5336804389953613,
"learning_rate": 0.00024788793103448277,
"loss": 5.0786,
"step": 250
},
{
"epoch": 1.2,
"grad_norm": 1.5419166088104248,
"learning_rate": 0.0002457327586206897,
"loss": 4.6644,
"step": 300
},
{
"epoch": 1.4,
"grad_norm": 1.5083167552947998,
"learning_rate": 0.00024357758620689656,
"loss": 4.5686,
"step": 350
},
{
"epoch": 1.6,
"grad_norm": 1.5312546491622925,
"learning_rate": 0.00024142241379310344,
"loss": 4.4955,
"step": 400
},
{
"epoch": 1.8,
"grad_norm": 1.4562275409698486,
"learning_rate": 0.00023926724137931035,
"loss": 4.4404,
"step": 450
},
{
"epoch": 2.0,
"grad_norm": 1.547753930091858,
"learning_rate": 0.00023711206896551723,
"loss": 4.379,
"step": 500
},
{
"epoch": 2.2,
"grad_norm": 1.611249566078186,
"learning_rate": 0.00023495689655172414,
"loss": 4.01,
"step": 550
},
{
"epoch": 2.4,
"grad_norm": 1.383882999420166,
"learning_rate": 0.00023280172413793105,
"loss": 3.9787,
"step": 600
},
{
"epoch": 2.6,
"grad_norm": 1.521315574645996,
"learning_rate": 0.00023064655172413793,
"loss": 3.9931,
"step": 650
},
{
"epoch": 2.8,
"grad_norm": 1.4775588512420654,
"learning_rate": 0.00022849137931034484,
"loss": 3.915,
"step": 700
},
{
"epoch": 3.0,
"grad_norm": 1.5033624172210693,
"learning_rate": 0.00022633620689655173,
"loss": 3.9011,
"step": 750
},
{
"epoch": 3.2,
"grad_norm": 1.6199653148651123,
"learning_rate": 0.0002241810344827586,
"loss": 3.5909,
"step": 800
},
{
"epoch": 3.4,
"grad_norm": 1.4687559604644775,
"learning_rate": 0.00022202586206896552,
"loss": 3.6409,
"step": 850
},
{
"epoch": 3.6,
"grad_norm": 1.4177491664886475,
"learning_rate": 0.00021987068965517243,
"loss": 3.5883,
"step": 900
},
{
"epoch": 3.8,
"grad_norm": 1.395131230354309,
"learning_rate": 0.0002177155172413793,
"loss": 3.5468,
"step": 950
},
{
"epoch": 4.0,
"grad_norm": 1.4466007947921753,
"learning_rate": 0.00021556034482758622,
"loss": 3.5746,
"step": 1000
},
{
"epoch": 4.2,
"grad_norm": 1.3139097690582275,
"learning_rate": 0.00021340517241379313,
"loss": 3.3073,
"step": 1050
},
{
"epoch": 4.4,
"grad_norm": 1.5377048254013062,
"learning_rate": 0.00021124999999999998,
"loss": 3.3136,
"step": 1100
},
{
"epoch": 4.6,
"grad_norm": 1.4794890880584717,
"learning_rate": 0.0002090948275862069,
"loss": 3.287,
"step": 1150
},
{
"epoch": 4.8,
"grad_norm": 1.458489179611206,
"learning_rate": 0.0002069396551724138,
"loss": 3.3283,
"step": 1200
},
{
"epoch": 5.0,
"grad_norm": 1.5431376695632935,
"learning_rate": 0.0002047844827586207,
"loss": 3.2917,
"step": 1250
},
{
"epoch": 5.2,
"grad_norm": 1.27516770362854,
"learning_rate": 0.0002026293103448276,
"loss": 3.0597,
"step": 1300
},
{
"epoch": 5.4,
"grad_norm": 1.572311282157898,
"learning_rate": 0.0002004741379310345,
"loss": 3.0982,
"step": 1350
},
{
"epoch": 5.6,
"grad_norm": 1.4160852432250977,
"learning_rate": 0.0001983189655172414,
"loss": 3.0492,
"step": 1400
},
{
"epoch": 5.8,
"grad_norm": 1.5755418539047241,
"learning_rate": 0.0001961637931034483,
"loss": 3.0756,
"step": 1450
},
{
"epoch": 6.0,
"grad_norm": 1.5663151741027832,
"learning_rate": 0.00019400862068965515,
"loss": 3.1045,
"step": 1500
},
{
"epoch": 6.2,
"grad_norm": 1.4873141050338745,
"learning_rate": 0.00019185344827586206,
"loss": 2.857,
"step": 1550
},
{
"epoch": 6.4,
"grad_norm": 1.4372650384902954,
"learning_rate": 0.00018969827586206897,
"loss": 2.8887,
"step": 1600
},
{
"epoch": 6.6,
"grad_norm": 1.6041676998138428,
"learning_rate": 0.00018754310344827585,
"loss": 2.8847,
"step": 1650
},
{
"epoch": 6.8,
"grad_norm": 1.597738265991211,
"learning_rate": 0.00018538793103448276,
"loss": 2.9167,
"step": 1700
},
{
"epoch": 7.0,
"grad_norm": 1.4618918895721436,
"learning_rate": 0.00018323275862068967,
"loss": 2.9025,
"step": 1750
},
{
"epoch": 7.2,
"grad_norm": 1.4208807945251465,
"learning_rate": 0.00018107758620689656,
"loss": 2.6906,
"step": 1800
},
{
"epoch": 7.4,
"grad_norm": 1.3772602081298828,
"learning_rate": 0.00017892241379310344,
"loss": 2.7045,
"step": 1850
},
{
"epoch": 7.6,
"grad_norm": 1.3876913785934448,
"learning_rate": 0.00017676724137931035,
"loss": 2.7185,
"step": 1900
},
{
"epoch": 7.8,
"grad_norm": 1.6142256259918213,
"learning_rate": 0.00017461206896551723,
"loss": 2.7453,
"step": 1950
},
{
"epoch": 8.0,
"grad_norm": 1.7862074375152588,
"learning_rate": 0.00017245689655172414,
"loss": 2.7847,
"step": 2000
},
{
"epoch": 8.2,
"grad_norm": 1.520495057106018,
"learning_rate": 0.00017030172413793105,
"loss": 2.5499,
"step": 2050
},
{
"epoch": 8.4,
"grad_norm": 1.5806176662445068,
"learning_rate": 0.00016814655172413793,
"loss": 2.5625,
"step": 2100
},
{
"epoch": 8.6,
"grad_norm": 1.4896948337554932,
"learning_rate": 0.00016599137931034484,
"loss": 2.576,
"step": 2150
},
{
"epoch": 8.8,
"grad_norm": 1.5307574272155762,
"learning_rate": 0.00016383620689655172,
"loss": 2.6138,
"step": 2200
},
{
"epoch": 9.0,
"grad_norm": 1.5115731954574585,
"learning_rate": 0.0001616810344827586,
"loss": 2.6294,
"step": 2250
},
{
"epoch": 9.2,
"grad_norm": 1.5139647722244263,
"learning_rate": 0.00015952586206896552,
"loss": 2.4292,
"step": 2300
},
{
"epoch": 9.4,
"grad_norm": 1.4001915454864502,
"learning_rate": 0.00015737068965517243,
"loss": 2.444,
"step": 2350
},
{
"epoch": 9.6,
"grad_norm": 1.3104708194732666,
"learning_rate": 0.0001552155172413793,
"loss": 2.4509,
"step": 2400
},
{
"epoch": 9.8,
"grad_norm": 1.6207849979400635,
"learning_rate": 0.00015306034482758622,
"loss": 2.4975,
"step": 2450
},
{
"epoch": 10.0,
"grad_norm": 1.5676051378250122,
"learning_rate": 0.00015090517241379313,
"loss": 2.4849,
"step": 2500
},
{
"epoch": 10.2,
"grad_norm": 1.6440703868865967,
"learning_rate": 0.00014874999999999998,
"loss": 2.2924,
"step": 2550
},
{
"epoch": 10.4,
"grad_norm": 1.434339165687561,
"learning_rate": 0.0001465948275862069,
"loss": 2.3264,
"step": 2600
},
{
"epoch": 10.6,
"grad_norm": 1.5489099025726318,
"learning_rate": 0.0001444396551724138,
"loss": 2.3671,
"step": 2650
},
{
"epoch": 10.8,
"grad_norm": 1.3774558305740356,
"learning_rate": 0.00014228448275862069,
"loss": 2.3834,
"step": 2700
},
{
"epoch": 11.0,
"grad_norm": 1.4112049341201782,
"learning_rate": 0.0001401293103448276,
"loss": 2.386,
"step": 2750
},
{
"epoch": 11.2,
"grad_norm": 1.637389898300171,
"learning_rate": 0.0001379741379310345,
"loss": 2.1944,
"step": 2800
},
{
"epoch": 11.4,
"grad_norm": 1.717990756034851,
"learning_rate": 0.0001358189655172414,
"loss": 2.2413,
"step": 2850
},
{
"epoch": 11.6,
"grad_norm": 1.5509297847747803,
"learning_rate": 0.0001336637931034483,
"loss": 2.2519,
"step": 2900
},
{
"epoch": 11.8,
"grad_norm": 1.3987082242965698,
"learning_rate": 0.00013150862068965515,
"loss": 2.2691,
"step": 2950
},
{
"epoch": 12.0,
"grad_norm": 1.4241447448730469,
"learning_rate": 0.00012935344827586206,
"loss": 2.2907,
"step": 3000
},
{
"epoch": 12.2,
"grad_norm": 1.4504032135009766,
"learning_rate": 0.00012719827586206897,
"loss": 2.1044,
"step": 3050
},
{
"epoch": 12.4,
"grad_norm": 1.6953134536743164,
"learning_rate": 0.00012504310344827585,
"loss": 2.1447,
"step": 3100
},
{
"epoch": 12.6,
"grad_norm": 1.6625018119812012,
"learning_rate": 0.00012288793103448276,
"loss": 2.1829,
"step": 3150
},
{
"epoch": 12.8,
"grad_norm": 1.5335619449615479,
"learning_rate": 0.00012073275862068966,
"loss": 2.2124,
"step": 3200
},
{
"epoch": 13.0,
"grad_norm": 1.4856654405593872,
"learning_rate": 0.00011857758620689656,
"loss": 2.1592,
"step": 3250
},
{
"epoch": 13.2,
"grad_norm": 1.517542839050293,
"learning_rate": 0.00011642241379310345,
"loss": 2.0258,
"step": 3300
},
{
"epoch": 13.4,
"grad_norm": 1.494611382484436,
"learning_rate": 0.00011426724137931035,
"loss": 2.0496,
"step": 3350
},
{
"epoch": 13.6,
"grad_norm": 1.5751359462738037,
"learning_rate": 0.00011211206896551724,
"loss": 2.0851,
"step": 3400
},
{
"epoch": 13.8,
"grad_norm": 1.5572459697723389,
"learning_rate": 0.00010995689655172414,
"loss": 2.1255,
"step": 3450
},
{
"epoch": 14.0,
"grad_norm": 1.5155609846115112,
"learning_rate": 0.00010780172413793104,
"loss": 2.0991,
"step": 3500
},
{
"epoch": 14.2,
"grad_norm": 1.48418128490448,
"learning_rate": 0.00010564655172413793,
"loss": 1.9705,
"step": 3550
},
{
"epoch": 14.4,
"grad_norm": 1.5270135402679443,
"learning_rate": 0.00010349137931034483,
"loss": 2.0004,
"step": 3600
},
{
"epoch": 14.6,
"grad_norm": 1.5730317831039429,
"learning_rate": 0.00010133620689655172,
"loss": 2.0317,
"step": 3650
},
{
"epoch": 14.8,
"grad_norm": 1.5356438159942627,
"learning_rate": 9.918103448275863e-05,
"loss": 1.9999,
"step": 3700
},
{
"epoch": 15.0,
"grad_norm": 1.6630584001541138,
"learning_rate": 9.702586206896552e-05,
"loss": 2.0269,
"step": 3750
},
{
"epoch": 15.2,
"grad_norm": 1.5010318756103516,
"learning_rate": 9.487068965517241e-05,
"loss": 1.9149,
"step": 3800
},
{
"epoch": 15.4,
"grad_norm": 1.5633668899536133,
"learning_rate": 9.271551724137932e-05,
"loss": 1.9338,
"step": 3850
},
{
"epoch": 15.6,
"grad_norm": 1.59940767288208,
"learning_rate": 9.056034482758622e-05,
"loss": 1.946,
"step": 3900
},
{
"epoch": 15.8,
"grad_norm": 1.5850439071655273,
"learning_rate": 8.84051724137931e-05,
"loss": 1.9543,
"step": 3950
},
{
"epoch": 16.0,
"grad_norm": 1.5602020025253296,
"learning_rate": 8.625e-05,
"loss": 1.9461,
"step": 4000
},
{
"epoch": 16.2,
"grad_norm": 1.679661750793457,
"learning_rate": 8.40948275862069e-05,
"loss": 1.8593,
"step": 4050
},
{
"epoch": 16.4,
"grad_norm": 1.5591189861297607,
"learning_rate": 8.193965517241379e-05,
"loss": 1.8601,
"step": 4100
},
{
"epoch": 16.6,
"grad_norm": 1.426254153251648,
"learning_rate": 7.978448275862068e-05,
"loss": 1.9044,
"step": 4150
},
{
"epoch": 16.8,
"grad_norm": 1.553831696510315,
"learning_rate": 7.76293103448276e-05,
"loss": 1.8799,
"step": 4200
},
{
"epoch": 17.0,
"grad_norm": 1.7395073175430298,
"learning_rate": 7.547413793103449e-05,
"loss": 1.9057,
"step": 4250
},
{
"epoch": 17.2,
"grad_norm": 1.6055529117584229,
"learning_rate": 7.331896551724137e-05,
"loss": 1.7787,
"step": 4300
},
{
"epoch": 17.4,
"grad_norm": 1.6556615829467773,
"learning_rate": 7.116379310344828e-05,
"loss": 1.7999,
"step": 4350
},
{
"epoch": 17.6,
"grad_norm": 1.632834553718567,
"learning_rate": 6.900862068965518e-05,
"loss": 1.8312,
"step": 4400
},
{
"epoch": 17.8,
"grad_norm": 1.749241828918457,
"learning_rate": 6.685344827586206e-05,
"loss": 1.8596,
"step": 4450
},
{
"epoch": 18.0,
"grad_norm": 1.4276556968688965,
"learning_rate": 6.469827586206897e-05,
"loss": 1.8669,
"step": 4500
},
{
"epoch": 18.2,
"grad_norm": 1.562339425086975,
"learning_rate": 6.254310344827587e-05,
"loss": 1.755,
"step": 4550
},
{
"epoch": 18.4,
"grad_norm": 1.4620212316513062,
"learning_rate": 6.0387931034482755e-05,
"loss": 1.7945,
"step": 4600
},
{
"epoch": 18.6,
"grad_norm": 1.601141095161438,
"learning_rate": 5.823275862068966e-05,
"loss": 1.7674,
"step": 4650
},
{
"epoch": 18.8,
"grad_norm": 1.6034783124923706,
"learning_rate": 5.6077586206896554e-05,
"loss": 1.7728,
"step": 4700
},
{
"epoch": 19.0,
"grad_norm": 1.545716643333435,
"learning_rate": 5.392241379310345e-05,
"loss": 1.8167,
"step": 4750
},
{
"epoch": 19.2,
"grad_norm": 1.442853569984436,
"learning_rate": 5.1767241379310346e-05,
"loss": 1.7204,
"step": 4800
},
{
"epoch": 19.4,
"grad_norm": 1.524326205253601,
"learning_rate": 4.961206896551725e-05,
"loss": 1.7265,
"step": 4850
},
{
"epoch": 19.6,
"grad_norm": 1.6314244270324707,
"learning_rate": 4.745689655172414e-05,
"loss": 1.7385,
"step": 4900
},
{
"epoch": 19.8,
"grad_norm": 1.5261213779449463,
"learning_rate": 4.5301724137931034e-05,
"loss": 1.7385,
"step": 4950
},
{
"epoch": 20.0,
"grad_norm": 1.6839487552642822,
"learning_rate": 4.314655172413793e-05,
"loss": 1.7663,
"step": 5000
}
],
"logging_steps": 50,
"max_steps": 6000,
"num_input_tokens_seen": 0,
"num_train_epochs": 24,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5233484759040000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}