{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 277.4375, "epoch": 0.5714285714285714, "grad_norm": 6.789405822753906, "kl": 0.0, "learning_rate": 5e-07, "loss": -0.0, "reward": 2.3237390760332346, "reward_std": 1.5290137782576494, "rewards/concensus_correctness_reward_func": 0.8025624714791775, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5013640912948176, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.6916875028982759, "step": 2 }, { "completion_length": 240.41666666666666, "epoch": 1.0, "grad_norm": 4.878081798553467, "kl": 0.002382916432300893, "learning_rate": 4.864543104251586e-07, "loss": 0.0, "reward": 2.5553423265616098, "reward_std": 0.5683276208971316, "rewards/concensus_correctness_reward_func": 0.42125000059604645, "rewards/consensus_reward_func": 0.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6468839564671119, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 1.0705416450897853, "step": 4 }, { "completion_length": 320.875, "epoch": 1.5714285714285714, "grad_norm": 81.54737091064453, "kl": 0.0035806876549031585, "learning_rate": 4.472851273490984e-07, "loss": 0.0, "reward": 2.4654137454926968, "reward_std": 1.087094948044978, "rewards/concensus_correctness_reward_func": 0.2971874997019768, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7451949981041253, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.9230312146246433, "step": 6 }, { "completion_length": 322.5416666666667, "epoch": 2.0, "grad_norm": 4.378849506378174, "kl": 0.006137622442717354, "learning_rate": 3.867370395306068e-07, "loss": 0.0, "reward": 2.417641818523407, "reward_std": 0.9718923264493545, "rewards/concensus_correctness_reward_func": 0.27650000154972076, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.7511000943680605, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.8067083209753036, "step": 8 }, { "completion_length": 263.78125, "epoch": 2.571428571428571, "grad_norm": 7.5848822593688965, "kl": 0.010713139752624556, "learning_rate": 3.1137137178519977e-07, "loss": 0.0, "reward": 2.334370069205761, "reward_std": 1.0359732687065843, "rewards/concensus_correctness_reward_func": 0.2455000001937151, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6358388164080679, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 1.0155312195420265, "step": 10 }, { "completion_length": 265.6666666666667, "epoch": 3.0, "grad_norm": 6.6377458572387695, "kl": 0.0317639890126884, "learning_rate": 2.2935516363191693e-07, "loss": 0.0, "reward": 2.7291881144046783, "reward_std": 0.9668942355783656, "rewards/concensus_correctness_reward_func": 0.49399999777475995, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.6956047763427099, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.9562499796350797, "step": 12 }, { "completion_length": 258.9375, "epoch": 3.571428571428571, "grad_norm": 5.181027889251709, "kl": 0.01114592287922278, "learning_rate": 1.4957614383675767e-07, "loss": 0.0, "reward": 2.9897976852953434, "reward_std": 1.4940979477250949, "rewards/concensus_correctness_reward_func": 0.7932500019669533, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.7159852180629969, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 1.1055624894797802, "step": 14 }, { "completion_length": 292.5, "epoch": 4.0, "grad_norm": 5.158603191375732, "kl": 0.03570538493416583, "learning_rate": 8.067960709356478e-08, "loss": 0.0, "reward": 2.0371210873126984, "reward_std": 0.7878820024585972, "rewards/concensus_correctness_reward_func": 0.23308333257834116, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6077877643207709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.9462499924314519, "step": 16 }, { "completion_length": 276.5625, "epoch": 4.571428571428571, "grad_norm": 5.967617988586426, "kl": 0.018310003797523677, "learning_rate": 3.013156219837776e-08, "loss": 0.0, "reward": 2.2351491451263428, "reward_std": 0.8348404020653106, "rewards/concensus_correctness_reward_func": 0.1536250002682209, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.7462741751223803, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.9602499902248383, "step": 18 }, { "completion_length": 427.9583333333333, "epoch": 5.0, "grad_norm": 4.099704265594482, "kl": 356.1126844395476, "learning_rate": 3.4096741493194193e-09, "loss": 0.2671, "reward": 1.3956897395352523, "reward_std": 1.659088636127611, "rewards/concensus_correctness_reward_func": 0.18975000083446503, "rewards/consensus_reward_func": 0.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6243564016185701, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2482500026623408, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 0.026718258014625462, "train_runtime": 202.5588, "train_samples_per_second": 1.58, "train_steps_per_second": 0.099 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }