{ "best_metric": 1.06145525, "best_model_checkpoint": "/data/coding/ms-swift/output/v7-20250220-132503/checkpoint-108", "epoch": 1.9829351535836177, "eval_steps": 50, "global_step": 108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01820250284414107, "grad_norm": 6.1875, "learning_rate": 1.6666666666666667e-06, "loss": 1.5638728141784668, "memory(GiB)": 29.73, "step": 1, "token_acc": 0.6939769707705934, "train_speed(iter/s)": 0.129066 }, { "epoch": 0.09101251422070535, "grad_norm": 4.96875, "learning_rate": 8.333333333333334e-06, "loss": 1.518846035003662, "memory(GiB)": 41.66, "step": 5, "token_acc": 0.7012233049968899, "train_speed(iter/s)": 0.226379 }, { "epoch": 0.1820250284414107, "grad_norm": 3.515625, "learning_rate": 9.96210254835968e-06, "loss": 1.3327471733093261, "memory(GiB)": 41.66, "step": 10, "token_acc": 0.7161527878935017, "train_speed(iter/s)": 0.240583 }, { "epoch": 0.27303754266211605, "grad_norm": 2.296875, "learning_rate": 9.809128215864096e-06, "loss": 1.1619236946105957, "memory(GiB)": 45.97, "step": 15, "token_acc": 0.7329187688216199, "train_speed(iter/s)": 0.245364 }, { "epoch": 0.3640500568828214, "grad_norm": 2.296875, "learning_rate": 9.542326359097619e-06, "loss": 1.1952880859375, "memory(GiB)": 45.97, "step": 20, "token_acc": 0.7218057637847742, "train_speed(iter/s)": 0.252196 }, { "epoch": 0.4550625711035267, "grad_norm": 2.328125, "learning_rate": 9.168011926105598e-06, "loss": 1.111426544189453, "memory(GiB)": 45.97, "step": 25, "token_acc": 0.7425333872925941, "train_speed(iter/s)": 0.251882 }, { "epoch": 0.5460750853242321, "grad_norm": 2.578125, "learning_rate": 8.695044586103297e-06, "loss": 1.1071309089660644, "memory(GiB)": 45.97, "step": 30, "token_acc": 0.7414093361083974, "train_speed(iter/s)": 0.254082 }, { "epoch": 0.6370875995449374, "grad_norm": 2.078125, "learning_rate": 8.134619029470535e-06, "loss": 1.0020055770874023, "memory(GiB)": 45.97, "step": 35, "token_acc": 0.7621102932675633, "train_speed(iter/s)": 0.251894 }, { "epoch": 0.7281001137656428, "grad_norm": 2.359375, "learning_rate": 7.500000000000001e-06, "loss": 1.0203259468078614, "memory(GiB)": 50.29, "step": 40, "token_acc": 0.7580885395117914, "train_speed(iter/s)": 0.252412 }, { "epoch": 0.8191126279863481, "grad_norm": 2.21875, "learning_rate": 6.806208330935766e-06, "loss": 0.9908183097839356, "memory(GiB)": 50.29, "step": 45, "token_acc": 0.7667093258473352, "train_speed(iter/s)": 0.252266 }, { "epoch": 0.9101251422070534, "grad_norm": 2.5, "learning_rate": 6.0696654160324875e-06, "loss": 1.045759677886963, "memory(GiB)": 50.29, "step": 50, "token_acc": 0.7470934799685781, "train_speed(iter/s)": 0.251372 }, { "epoch": 0.9101251422070534, "eval_loss": 1.073840618133545, "eval_runtime": 0.6301, "eval_samples_per_second": 68.243, "eval_steps_per_second": 14.283, "eval_token_acc": 0.7637732857709076, "step": 50 }, { "epoch": 1.018202502844141, "grad_norm": 3.59375, "learning_rate": 5.3078045306697154e-06, "loss": 1.223165225982666, "memory(GiB)": 57.52, "step": 55, "token_acc": 0.7527071344595012, "train_speed(iter/s)": 0.246897 }, { "epoch": 1.1092150170648465, "grad_norm": 2.328125, "learning_rate": 4.53865820268349e-06, "loss": 1.0227657318115235, "memory(GiB)": 57.52, "step": 60, "token_acc": 0.755286734276229, "train_speed(iter/s)": 0.248832 }, { "epoch": 1.2002275312855517, "grad_norm": 2.03125, "learning_rate": 3.7804313994581143e-06, "loss": 0.9702803611755371, "memory(GiB)": 57.52, "step": 65, "token_acc": 0.7658987281017519, "train_speed(iter/s)": 0.248641 }, { "epoch": 1.2912400455062572, "grad_norm": 2.3125, "learning_rate": 3.0510706335366034e-06, "loss": 1.0110454559326172, "memory(GiB)": 57.52, "step": 70, "token_acc": 0.7643304928863696, "train_speed(iter/s)": 0.248241 }, { "epoch": 1.3822525597269624, "grad_norm": 2.390625, "learning_rate": 2.3678391856132203e-06, "loss": 0.9339286804199218, "memory(GiB)": 57.52, "step": 75, "token_acc": 0.7785570747468379, "train_speed(iter/s)": 0.247963 }, { "epoch": 1.4732650739476678, "grad_norm": 1.953125, "learning_rate": 1.746908498978791e-06, "loss": 0.9071330070495606, "memory(GiB)": 57.52, "step": 80, "token_acc": 0.7775242441528808, "train_speed(iter/s)": 0.248416 }, { "epoch": 1.5642775881683733, "grad_norm": 2.203125, "learning_rate": 1.202975416726464e-06, "loss": 1.0261162757873534, "memory(GiB)": 57.52, "step": 85, "token_acc": 0.7509206426287888, "train_speed(iter/s)": 0.248819 }, { "epoch": 1.6552901023890785, "grad_norm": 2.078125, "learning_rate": 7.489143213519301e-07, "loss": 0.995113468170166, "memory(GiB)": 57.52, "step": 90, "token_acc": 0.7631283572516636, "train_speed(iter/s)": 0.24959 }, { "epoch": 1.7463026166097837, "grad_norm": 2.1875, "learning_rate": 3.9547241027523164e-07, "loss": 0.9445444107055664, "memory(GiB)": 57.52, "step": 95, "token_acc": 0.7700910688608404, "train_speed(iter/s)": 0.249979 }, { "epoch": 1.8373151308304891, "grad_norm": 2.03125, "learning_rate": 1.510153198249531e-07, "loss": 0.9724701881408692, "memory(GiB)": 57.52, "step": 100, "token_acc": 0.7649354027573051, "train_speed(iter/s)": 0.250388 }, { "epoch": 1.8373151308304891, "eval_loss": 1.0617759227752686, "eval_runtime": 0.6342, "eval_samples_per_second": 67.805, "eval_steps_per_second": 14.192, "eval_token_acc": 0.7669441141498217, "step": 100 }, { "epoch": 1.9283276450511946, "grad_norm": 2.1875, "learning_rate": 2.1329118524827662e-08, "loss": 0.9729574203491211, "memory(GiB)": 57.52, "step": 105, "token_acc": 0.7660818713450293, "train_speed(iter/s)": 0.250386 }, { "epoch": 1.9829351535836177, "eval_loss": 1.061455249786377, "eval_runtime": 0.6272, "eval_samples_per_second": 68.563, "eval_steps_per_second": 14.35, "eval_token_acc": 0.7653586999603647, "step": 108 } ], "logging_steps": 5, "max_steps": 108, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.19180283271168e+16, "train_batch_size": 5, "trial_name": null, "trial_params": null }