{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 232.21875, "epoch": 0.5714285714285714, "grad_norm": 44.04344940185547, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 2.9121293793432415, "reward_std": 0.6280596783617511, "rewards/concensus_correctness_reward_func": 0.5593750020489097, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6154730841517448, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.6435312470421195, "step": 2 }, { "completion_length": 178.70833333333334, "epoch": 1.0, "grad_norm": 146.21438598632812, "kl": 8.268213170580566, "learning_rate": 4.864543104251586e-07, "loss": 0.0062, "reward": 5.387973050276439, "reward_std": 0.6165593440334002, "rewards/concensus_correctness_reward_func": 1.5517499844233196, "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.8114312589168549, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 1.108125001192093, "step": 4 }, { "completion_length": 150.28125, "epoch": 1.5714285714285714, "grad_norm": 19.16786003112793, "kl": 2.884129573008977, "learning_rate": 4.472851273490984e-07, "loss": 0.0029, "reward": 5.311543390154839, "reward_std": 0.7719176085665822, "rewards/concensus_correctness_reward_func": 1.4498750008642673, "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.71429343521595, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.163000002503395, "step": 6 }, { "completion_length": 184.33333333333334, "epoch": 2.0, "grad_norm": 18.322921752929688, "kl": 1.2294259141975392, "learning_rate": 3.867370395306068e-07, "loss": 0.0009, "reward": 4.496874541044235, "reward_std": 0.8491996126249433, "rewards/concensus_correctness_reward_func": 1.015833326925834, "rewards/consensus_reward_func": 1.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6787495116392771, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 1.0522916664679844, "step": 8 }, { "completion_length": 154.625, "epoch": 2.571428571428571, "grad_norm": 17.83030128479004, "kl": 8.403996711946093, "learning_rate": 3.1137137178519977e-07, "loss": 0.0084, "reward": 4.695109188556671, "reward_std": 0.9860417204909027, "rewards/concensus_correctness_reward_func": 1.0651875026524067, "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7041404494084418, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 1.17578125, "step": 10 }, { "completion_length": 134.54166666666666, "epoch": 3.0, "grad_norm": 17.385499954223633, "kl": 28912.023177654482, "learning_rate": 2.2935516363191693e-07, "loss": 21.684, "reward": 5.548654953638713, "reward_std": 0.5412371944015225, "rewards/concensus_correctness_reward_func": 1.4962499924004078, "rewards/consensus_reward_func": 1.5833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.8024049550294876, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3125, "rewards/xmlcount_reward_func": 1.1875, "step": 12 }, { "completion_length": 140.71875, "epoch": 3.571428571428571, "grad_norm": 1344.9619140625, "kl": 6139.031279045157, "learning_rate": 1.4957614383675767e-07, "loss": 6.139, "reward": 5.497947037220001, "reward_std": 0.4656379229563754, "rewards/concensus_correctness_reward_func": 1.4404374985024333, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8114158157259226, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.19921875, "step": 14 }, { "completion_length": 160.54166666666666, "epoch": 4.0, "grad_norm": 12.484821319580078, "kl": 8.58236723113805, "learning_rate": 8.067960709356478e-08, "loss": 0.0064, "reward": 4.575502196947734, "reward_std": 1.0911046511416014, "rewards/concensus_correctness_reward_func": 0.9780833274126053, "rewards/consensus_reward_func": 1.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7998771816492081, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.2916666666666667, "rewards/xmlcount_reward_func": 1.1725416680177052, "step": 16 }, { "completion_length": 138.53125, "epoch": 4.571428571428571, "grad_norm": 572.0084228515625, "kl": 28.125015974976122, "learning_rate": 3.013156219837776e-08, "loss": 0.0281, "reward": 5.397134527564049, "reward_std": 0.664117572363466, "rewards/concensus_correctness_reward_func": 1.403562496881932, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.7240407671779394, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.34375, "rewards/xmlcount_reward_func": 1.17578125, "step": 18 }, { "completion_length": 182.375, "epoch": 5.0, "grad_norm": 16.89112091064453, "kl": 497.25228943574865, "learning_rate": 3.4096741493194193e-09, "loss": 0.3729, "reward": 4.701884349187215, "reward_std": 0.9557839000287155, "rewards/concensus_correctness_reward_func": 1.163333331545194, "rewards/consensus_reward_func": 1.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7833427041769028, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.3541666666666667, "rewards/xmlcount_reward_func": 1.1510416666666667, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 2.8248958706099074, "train_runtime": 118.0721, "train_samples_per_second": 2.71, "train_steps_per_second": 0.169 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }