{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 424.46875, "epoch": 0.1, "grad_norm": 4.621828079223633, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": -0.0, "reward": 1.155371225439012, "reward_std": 1.157450430560857, "rewards/concensus_correctness_reward_func": 0.263374999165535, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.2525274936342612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.20196874719113111, "step": 2 }, { "completion_length": 423.6875, "epoch": 0.2, "grad_norm": 7.296565532684326, "kl": 0.0013047958818788175, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 0.8222260437905788, "reward_std": 0.636947439139476, "rewards/concensus_correctness_reward_func": 0.05025000125169754, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.315944813657552, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3310312391258776, "step": 4 }, { "completion_length": 550.5625, "epoch": 0.3, "grad_norm": 10.931665420532227, "kl": 0.001646797751163831, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 0.8547968231141567, "reward_std": 1.5359444813802838, "rewards/concensus_correctness_reward_func": 0.23118750005960464, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.3315468009095639, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.08893750491552055, "step": 6 }, { "completion_length": 527.5625, "epoch": 0.4, "grad_norm": 4.5462517738342285, "kl": 0.0033753589050320443, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 1.172942945966497, "reward_std": 1.0442042673239484, "rewards/concensus_correctness_reward_func": 0.28293750062584877, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.39450543955899775, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.24549999507144094, "step": 8 }, { "completion_length": 596.84375, "epoch": 0.5, "grad_norm": 5.691103458404541, "kl": 0.002918783080531284, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 1.119660965166986, "reward_std": 1.2069853162392974, "rewards/concensus_correctness_reward_func": 0.1850624978542328, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.3388797022635117, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2832187442108989, "step": 10 }, { "completion_length": 592.96875, "epoch": 0.6, "grad_norm": 4.646827220916748, "kl": 0.0031351612269645557, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 0.9737958908081055, "reward_std": 0.9545698054134846, "rewards/concensus_correctness_reward_func": 0.2938749995082617, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.40088965790346265, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21653124503791332, "step": 12 }, { "completion_length": 575.0, "epoch": 0.7, "grad_norm": 5.13407564163208, "kl": 0.004790089908055961, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 2.0287008844316006, "reward_std": 1.4835871619870886, "rewards/concensus_correctness_reward_func": 0.545687498524785, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.5870445817708969, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3334687491878867, "step": 14 }, { "completion_length": 541.90625, "epoch": 0.8, "grad_norm": 4.688849449157715, "kl": 0.004843814662308432, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 1.3065089867450297, "reward_std": 0.554467682595714, "rewards/concensus_correctness_reward_func": 0.1198749989271164, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.4176964627113193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.3314374880865216, "step": 16 }, { "completion_length": 787.59375, "epoch": 0.9, "grad_norm": 2.6661102771759033, "kl": 0.0034712271644821158, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 1.0092257296200842, "reward_std": 1.405159011046635, "rewards/concensus_correctness_reward_func": 0.20868750102818012, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.315475704104756, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11006249859929085, "step": 18 }, { "completion_length": 573.4375, "epoch": 1.0, "grad_norm": 3.8552772998809814, "kl": 0.004824902229302097, "learning_rate": 0.0, "loss": 0.0, "reward": 2.2213319819420576, "reward_std": 2.8470122441649437, "rewards/concensus_correctness_reward_func": 1.1250000074505806, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.5113319638185203, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.05375000648200512, "step": 20 }, { "epoch": 1.0, "step": 20, "total_flos": 0.0, "train_loss": 3.0882656574249268e-06, "train_runtime": 382.8125, "train_samples_per_second": 0.836, "train_steps_per_second": 0.052 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }