Files
Summary-0.1/last-checkpoint/trainer_state.json
ModelHub XC a5967341f6 初始化项目,由ModelHub XC社区提供模型
Model: MahmoudIbrahim/Summary-0.1
Source: Original Platform
2026-05-27 15:12:24 +08:00

515 lines
13 KiB
JSON

{
"best_global_step": 334,
"best_metric": 1.941367506980896,
"best_model_checkpoint": "./Summary-0.1/checkpoint-334",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 501,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.059880239520958084,
"grad_norm": 38.75,
"learning_rate": 4.5e-06,
"loss": 2.7202,
"mean_token_accuracy": 0.45009988248348237,
"num_tokens": 29469.0,
"step": 10
},
{
"epoch": 0.11976047904191617,
"grad_norm": 9.25,
"learning_rate": 9.5e-06,
"loss": 2.2626,
"mean_token_accuracy": 0.5219378590583801,
"num_tokens": 58579.0,
"step": 20
},
{
"epoch": 0.17964071856287425,
"grad_norm": 6.375,
"learning_rate": 1.45e-05,
"loss": 2.2181,
"mean_token_accuracy": 0.518680065870285,
"num_tokens": 88193.0,
"step": 30
},
{
"epoch": 0.23952095808383234,
"grad_norm": 5.9375,
"learning_rate": 1.9500000000000003e-05,
"loss": 2.0667,
"mean_token_accuracy": 0.5430938720703125,
"num_tokens": 116933.0,
"step": 40
},
{
"epoch": 0.2994011976047904,
"grad_norm": 5.65625,
"learning_rate": 2.45e-05,
"loss": 2.1906,
"mean_token_accuracy": 0.5175440430641174,
"num_tokens": 146513.0,
"step": 50
},
{
"epoch": 0.3592814371257485,
"grad_norm": 5.9375,
"learning_rate": 2.95e-05,
"loss": 2.092,
"mean_token_accuracy": 0.5363078862428665,
"num_tokens": 175698.0,
"step": 60
},
{
"epoch": 0.41916167664670656,
"grad_norm": 4.9375,
"learning_rate": 3.45e-05,
"loss": 1.9358,
"mean_token_accuracy": 0.568104338645935,
"num_tokens": 202607.0,
"step": 70
},
{
"epoch": 0.47904191616766467,
"grad_norm": 5.1875,
"learning_rate": 3.9500000000000005e-05,
"loss": 2.0201,
"mean_token_accuracy": 0.550568813085556,
"num_tokens": 231199.0,
"step": 80
},
{
"epoch": 0.5389221556886228,
"grad_norm": 5.03125,
"learning_rate": 4.4500000000000004e-05,
"loss": 1.9076,
"mean_token_accuracy": 0.5696590662002563,
"num_tokens": 260064.0,
"step": 90
},
{
"epoch": 0.5988023952095808,
"grad_norm": 5.03125,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.9292,
"mean_token_accuracy": 0.5686947405338287,
"num_tokens": 288635.0,
"step": 100
},
{
"epoch": 0.6586826347305389,
"grad_norm": 5.40625,
"learning_rate": 4.8076923076923084e-05,
"loss": 2.0754,
"mean_token_accuracy": 0.5396111845970154,
"num_tokens": 315994.0,
"step": 110
},
{
"epoch": 0.718562874251497,
"grad_norm": 4.78125,
"learning_rate": 4.594017094017094e-05,
"loss": 1.942,
"mean_token_accuracy": 0.567721825838089,
"num_tokens": 345852.0,
"step": 120
},
{
"epoch": 0.7784431137724551,
"grad_norm": 4.84375,
"learning_rate": 4.3803418803418805e-05,
"loss": 2.0083,
"mean_token_accuracy": 0.5502024054527282,
"num_tokens": 375046.0,
"step": 130
},
{
"epoch": 0.8383233532934131,
"grad_norm": 4.59375,
"learning_rate": 4.166666666666667e-05,
"loss": 1.9348,
"mean_token_accuracy": 0.563525739312172,
"num_tokens": 403950.0,
"step": 140
},
{
"epoch": 0.8982035928143712,
"grad_norm": 4.90625,
"learning_rate": 3.952991452991453e-05,
"loss": 1.8796,
"mean_token_accuracy": 0.5778123795986175,
"num_tokens": 433871.0,
"step": 150
},
{
"epoch": 0.9580838323353293,
"grad_norm": 4.375,
"learning_rate": 3.739316239316239e-05,
"loss": 2.0021,
"mean_token_accuracy": 0.5490101099014282,
"num_tokens": 463543.0,
"step": 160
},
{
"epoch": 1.0,
"eval_loss": 1.9482988119125366,
"eval_mean_token_accuracy": 0.5604007748457102,
"eval_num_tokens": 482617.0,
"eval_runtime": 39.6692,
"eval_samples_per_second": 2.521,
"eval_steps_per_second": 0.328,
"step": 167
},
{
"epoch": 1.0179640718562875,
"grad_norm": 4.53125,
"learning_rate": 3.525641025641026e-05,
"loss": 1.8939,
"mean_token_accuracy": 0.5751068115234375,
"num_tokens": 491782.0,
"step": 170
},
{
"epoch": 1.0778443113772456,
"grad_norm": 4.25,
"learning_rate": 3.311965811965812e-05,
"loss": 1.572,
"mean_token_accuracy": 0.6416267931461335,
"num_tokens": 521599.0,
"step": 180
},
{
"epoch": 1.1377245508982037,
"grad_norm": 4.71875,
"learning_rate": 3.098290598290599e-05,
"loss": 1.5429,
"mean_token_accuracy": 0.6442601144313812,
"num_tokens": 550968.0,
"step": 190
},
{
"epoch": 1.1976047904191618,
"grad_norm": 4.03125,
"learning_rate": 2.8846153846153845e-05,
"loss": 1.4855,
"mean_token_accuracy": 0.6554409444332123,
"num_tokens": 579850.0,
"step": 200
},
{
"epoch": 1.2574850299401197,
"grad_norm": 4.53125,
"learning_rate": 2.670940170940171e-05,
"loss": 1.5925,
"mean_token_accuracy": 0.6354905068874359,
"num_tokens": 607566.0,
"step": 210
},
{
"epoch": 1.3173652694610778,
"grad_norm": 4.34375,
"learning_rate": 2.4572649572649573e-05,
"loss": 1.6961,
"mean_token_accuracy": 0.611818504333496,
"num_tokens": 636366.0,
"step": 220
},
{
"epoch": 1.377245508982036,
"grad_norm": 4.28125,
"learning_rate": 2.2435897435897437e-05,
"loss": 1.6871,
"mean_token_accuracy": 0.6123433768749237,
"num_tokens": 665501.0,
"step": 230
},
{
"epoch": 1.437125748502994,
"grad_norm": 5.34375,
"learning_rate": 2.02991452991453e-05,
"loss": 1.6756,
"mean_token_accuracy": 0.6171969532966614,
"num_tokens": 692397.0,
"step": 240
},
{
"epoch": 1.4970059880239521,
"grad_norm": 4.40625,
"learning_rate": 1.8162393162393162e-05,
"loss": 1.6237,
"mean_token_accuracy": 0.6245103716850281,
"num_tokens": 720330.0,
"step": 250
},
{
"epoch": 1.55688622754491,
"grad_norm": 3.9375,
"learning_rate": 1.602564102564103e-05,
"loss": 1.6596,
"mean_token_accuracy": 0.6197108209133149,
"num_tokens": 747976.0,
"step": 260
},
{
"epoch": 1.6167664670658684,
"grad_norm": 4.3125,
"learning_rate": 1.388888888888889e-05,
"loss": 1.6428,
"mean_token_accuracy": 0.6236989557743072,
"num_tokens": 776610.0,
"step": 270
},
{
"epoch": 1.6766467065868262,
"grad_norm": 4.125,
"learning_rate": 1.1752136752136752e-05,
"loss": 1.6659,
"mean_token_accuracy": 0.6144291937351227,
"num_tokens": 806986.0,
"step": 280
},
{
"epoch": 1.7365269461077846,
"grad_norm": 4.5625,
"learning_rate": 9.615384615384616e-06,
"loss": 1.6745,
"mean_token_accuracy": 0.6144052445888519,
"num_tokens": 835571.0,
"step": 290
},
{
"epoch": 1.7964071856287425,
"grad_norm": 4.625,
"learning_rate": 7.478632478632479e-06,
"loss": 1.6576,
"mean_token_accuracy": 0.6180627286434174,
"num_tokens": 865294.0,
"step": 300
},
{
"epoch": 1.8562874251497006,
"grad_norm": 4.25,
"learning_rate": 5.341880341880342e-06,
"loss": 1.6627,
"mean_token_accuracy": 0.6169491648674011,
"num_tokens": 894249.0,
"step": 310
},
{
"epoch": 1.9161676646706587,
"grad_norm": 4.96875,
"learning_rate": 3.205128205128205e-06,
"loss": 1.5248,
"mean_token_accuracy": 0.6464354753494262,
"num_tokens": 924046.0,
"step": 320
},
{
"epoch": 1.9760479041916168,
"grad_norm": 4.09375,
"learning_rate": 1.0683760683760685e-06,
"loss": 1.6545,
"mean_token_accuracy": 0.6218094885349273,
"num_tokens": 954354.0,
"step": 330
},
{
"epoch": 2.0,
"eval_loss": 1.941367506980896,
"eval_mean_token_accuracy": 0.564078491467696,
"eval_num_tokens": 965234.0,
"eval_runtime": 39.4675,
"eval_samples_per_second": 2.534,
"eval_steps_per_second": 0.329,
"step": 334
},
{
"epoch": 2.035928143712575,
"grad_norm": 4.375,
"learning_rate": 2.0199501246882794e-05,
"loss": 1.3888,
"mean_token_accuracy": 0.6779806514581045,
"num_tokens": 16794.0,
"step": 340
},
{
"epoch": 2.095808383233533,
"grad_norm": 4.625,
"learning_rate": 1.8952618453865337e-05,
"loss": 1.5284,
"mean_token_accuracy": 0.6481667637825013,
"num_tokens": 44874.0,
"step": 350
},
{
"epoch": 2.155688622754491,
"grad_norm": 5.90625,
"learning_rate": 1.770573566084788e-05,
"loss": 1.5412,
"mean_token_accuracy": 0.6448413729667664,
"num_tokens": 71727.0,
"step": 360
},
{
"epoch": 2.215568862275449,
"grad_norm": 4.3125,
"learning_rate": 1.6458852867830423e-05,
"loss": 1.5195,
"mean_token_accuracy": 0.6477404713630677,
"num_tokens": 101889.0,
"step": 370
},
{
"epoch": 2.2754491017964074,
"grad_norm": 4.0,
"learning_rate": 1.5211970074812968e-05,
"loss": 1.4846,
"mean_token_accuracy": 0.6572466909885406,
"num_tokens": 132140.0,
"step": 380
},
{
"epoch": 2.3353293413173652,
"grad_norm": 4.3125,
"learning_rate": 1.396508728179551e-05,
"loss": 1.604,
"mean_token_accuracy": 0.6328544735908508,
"num_tokens": 161063.0,
"step": 390
},
{
"epoch": 2.3952095808383236,
"grad_norm": 4.71875,
"learning_rate": 1.2718204488778054e-05,
"loss": 1.5462,
"mean_token_accuracy": 0.6424768209457398,
"num_tokens": 189798.0,
"step": 400
},
{
"epoch": 2.4550898203592815,
"grad_norm": 4.34375,
"learning_rate": 1.1471321695760599e-05,
"loss": 1.5468,
"mean_token_accuracy": 0.6386782228946686,
"num_tokens": 219194.0,
"step": 410
},
{
"epoch": 2.5149700598802394,
"grad_norm": 4.65625,
"learning_rate": 1.0224438902743143e-05,
"loss": 1.5713,
"mean_token_accuracy": 0.6371028661727905,
"num_tokens": 249445.0,
"step": 420
},
{
"epoch": 2.5748502994011977,
"grad_norm": 3.875,
"learning_rate": 8.977556109725686e-06,
"loss": 1.4073,
"mean_token_accuracy": 0.6741897523403168,
"num_tokens": 277096.0,
"step": 430
},
{
"epoch": 2.6347305389221556,
"grad_norm": 5.15625,
"learning_rate": 7.73067331670823e-06,
"loss": 1.5873,
"mean_token_accuracy": 0.6345809698104858,
"num_tokens": 306656.0,
"step": 440
},
{
"epoch": 2.694610778443114,
"grad_norm": 4.125,
"learning_rate": 6.483790523690773e-06,
"loss": 1.5398,
"mean_token_accuracy": 0.6446067214012146,
"num_tokens": 334989.0,
"step": 450
},
{
"epoch": 2.754491017964072,
"grad_norm": 4.78125,
"learning_rate": 5.236907730673317e-06,
"loss": 1.4324,
"mean_token_accuracy": 0.6648351371288299,
"num_tokens": 363393.0,
"step": 460
},
{
"epoch": 2.81437125748503,
"grad_norm": 4.375,
"learning_rate": 3.99002493765586e-06,
"loss": 1.4939,
"mean_token_accuracy": 0.6552098572254181,
"num_tokens": 392445.0,
"step": 470
},
{
"epoch": 2.874251497005988,
"grad_norm": 4.625,
"learning_rate": 2.743142144638404e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.6401039361953735,
"num_tokens": 421549.0,
"step": 480
},
{
"epoch": 2.934131736526946,
"grad_norm": 4.9375,
"learning_rate": 1.4962593516209476e-06,
"loss": 1.5018,
"mean_token_accuracy": 0.6498535394668579,
"num_tokens": 450516.0,
"step": 490
},
{
"epoch": 2.9940119760479043,
"grad_norm": 4.8125,
"learning_rate": 2.4937655860349126e-07,
"loss": 1.502,
"mean_token_accuracy": 0.651291674375534,
"num_tokens": 480685.0,
"step": 500
},
{
"epoch": 3.0,
"eval_loss": 1.9642236232757568,
"eval_mean_token_accuracy": 0.5623479668910687,
"eval_num_tokens": 482617.0,
"eval_runtime": 39.682,
"eval_samples_per_second": 2.52,
"eval_steps_per_second": 0.328,
"step": 501
}
],
"logging_steps": 10,
"max_steps": 501,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2647290262044672.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}