{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 152.6875, "epoch": 0.5714285714285714, "grad_norm": 9.704741898985958e-09, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 7.741498485207558, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.0381250008940697, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, "rewards/question_recreation_reward_func": 0.9377485089353286, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.171875, "step": 2 }, { "completion_length": 166.125, "epoch": 1.0, "grad_norm": 0.00010551138257142156, "kl": 6.533205315906176e-07, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 8.665034532546997, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.248999993006388, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.6666666666666667, "rewards/question_recreation_reward_func": 0.9993677288293839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 4 }, { "completion_length": 152.0625, "epoch": 1.5714285714285714, "grad_norm": 1.6149582862854004, "kl": 0.004159657626122026, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 7.852498576045036, "reward_std": 0.022097086533904076, "rewards/concensus_correctness_reward_func": 2.164750002324581, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.125, "rewards/question_recreation_reward_func": 0.9377485089353286, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.171875, "step": 6 }, { "completion_length": 173.0, "epoch": 2.0, "grad_norm": 1.253675103187561, "kl": 0.009927107773364696, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 8.412867744763693, "reward_std": 0.3535533547401428, "rewards/concensus_correctness_reward_func": 2.0801666577657065, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.5833333333333333, "rewards/question_recreation_reward_func": 0.9993677288293839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 8 }, { "completion_length": 166.59375, "epoch": 2.571428571428571, "grad_norm": 0.003849708242341876, "kl": 0.010342858462536242, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 7.742899313569069, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.1649999991059303, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.0, "rewards/question_recreation_reward_func": 0.9372743055573665, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.171875, "step": 10 }, { "completion_length": 160.79166666666666, "epoch": 3.0, "grad_norm": 1.604172945022583, "kl": 0.009363556828854295, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 8.413166761398315, "reward_std": 0.3535534143447876, "rewards/concensus_correctness_reward_func": 2.0798333287239075, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.5833333333333333, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 12 }, { "completion_length": 156.125, "epoch": 3.571428571428571, "grad_norm": 1.433937668800354, "kl": 0.013254275647341274, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 8.117686077952385, "reward_std": 0.176865161061869, "rewards/concensus_correctness_reward_func": 2.1643749997019768, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.375, "rewards/question_recreation_reward_func": 0.9377485089353286, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.171812504529953, "step": 14 }, { "completion_length": 172.5, "epoch": 4.0, "grad_norm": 1.5800690650939941, "kl": 0.007285505174271141, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 7.917034427324931, "reward_std": 0.34836794932683307, "rewards/concensus_correctness_reward_func": 2.000999997059504, "rewards/consensus_reward_func": 1.9166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.25, "rewards/question_recreation_reward_func": 0.9993677288293839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 16 }, { "completion_length": 150.96875, "epoch": 4.571428571428571, "grad_norm": 0.009897509589791298, "kl": 0.007831997059838613, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 8.117623582482338, "reward_std": 0.1767767146229744, "rewards/concensus_correctness_reward_func": 2.1642499938607216, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.375, "rewards/question_recreation_reward_func": 0.9377485089353286, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.171875, "step": 18 }, { "completion_length": 177.08333333333334, "epoch": 5.0, "grad_norm": 1.1512504816055298, "kl": 0.00556948969218259, "learning_rate": 3.4096741493194193e-09, "loss": 0.0, "reward": 8.246867775917053, "reward_std": 0.11785115798314412, "rewards/concensus_correctness_reward_func": 2.0808333357175193, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 1.4166666666666667, "rewards/question_recreation_reward_func": 0.9993677288293839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 5.9759746136645034e-06, "train_runtime": 2064.6435, "train_samples_per_second": 0.155, "train_steps_per_second": 0.01 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }