{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 241.375, "epoch": 0.5714285714285714, "grad_norm": 18.91314125061035, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 4.494268886744976, "reward_std": 0.659155557746999, "rewards/concensus_correctness_reward_func": 1.9506250014528632, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7150813667103648, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.8285625139251351, "step": 2 }, { "completion_length": 170.20833333333334, "epoch": 1.0, "grad_norm": 21.518081665039062, "kl": 1.1660978463478386, "learning_rate": 4.864543104251586e-07, "loss": 0.0009, "reward": 5.892316182454427, "reward_std": 0.7085997781444652, "rewards/concensus_correctness_reward_func": 1.924833317597707, "rewards/consensus_reward_func": 1.4166666666666667, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9082745363314947, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.1875, "rewards/xmlcount_reward_func": 1.2050416767597198, "step": 4 }, { "completion_length": 168.78125, "epoch": 1.5714285714285714, "grad_norm": 28654.76953125, "kl": 1190.526932534296, "learning_rate": 4.472851273490984e-07, "loss": 1.1905, "reward": 5.072421103715897, "reward_std": 1.378951239719754, "rewards/concensus_correctness_reward_func": 1.4344375059008598, "rewards/consensus_reward_func": 1.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.8482648283243179, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 1.0553437564522028, "step": 6 }, { "completion_length": 203.79166666666666, "epoch": 2.0, "grad_norm": 91.56695556640625, "kl": 14.810836016510924, "learning_rate": 3.867370395306068e-07, "loss": 0.0111, "reward": 4.737937867641449, "reward_std": 0.6826455221356204, "rewards/concensus_correctness_reward_func": 1.1815833238263924, "rewards/consensus_reward_func": 1.3333333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9630212336778641, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 1.1350000252326329, "step": 8 }, { "completion_length": 170.1875, "epoch": 2.571428571428571, "grad_norm": 50.522186279296875, "kl": 16.655639334581792, "learning_rate": 3.1137137178519977e-07, "loss": 0.0167, "reward": 5.022570252418518, "reward_std": 2.068594033829868, "rewards/concensus_correctness_reward_func": 1.6066250018775463, "rewards/consensus_reward_func": 1.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8850077828392386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.078125, "rewards/xmlcount_reward_func": 1.0778124928474426, "step": 10 }, { "completion_length": 133.54166666666666, "epoch": 3.0, "grad_norm": 597.9561767578125, "kl": 47.79180937260389, "learning_rate": 2.2935516363191693e-07, "loss": 0.0358, "reward": 5.702193597952525, "reward_std": 0.3240309691561076, "rewards/concensus_correctness_reward_func": 1.697833323230346, "rewards/consensus_reward_func": 1.5833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.16666666666666666, "rewards/question_recreation_reward_func": 0.9767769972483317, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.10416666666666667, "rewards/xmlcount_reward_func": 1.1734166741371155, "step": 12 }, { "completion_length": 170.53125, "epoch": 3.571428571428571, "grad_norm": 31.776348114013672, "kl": 1.4045644905418158, "learning_rate": 1.4957614383675767e-07, "loss": 0.0014, "reward": 5.180432453751564, "reward_std": 0.39799850789131597, "rewards/concensus_correctness_reward_func": 1.376062492839992, "rewards/consensus_reward_func": 1.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.988057479262352, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.109375, "rewards/xmlcount_reward_func": 1.1444375179708004, "step": 14 }, { "completion_length": 180.91666666666666, "epoch": 4.0, "grad_norm": 27.417720794677734, "kl": 28.36734463647008, "learning_rate": 8.067960709356478e-08, "loss": 0.0213, "reward": 4.723206460475922, "reward_std": 0.8197268722578883, "rewards/concensus_correctness_reward_func": 1.172333334883054, "rewards/consensus_reward_func": 1.0833333333333333, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3333333333333333, "rewards/question_recreation_reward_func": 0.9045815169811249, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.08333333333333333, "rewards/xmlcount_reward_func": 1.1462916831175487, "step": 16 }, { "completion_length": 198.0, "epoch": 4.571428571428571, "grad_norm": 37.20878219604492, "kl": 37.27795463940129, "learning_rate": 3.013156219837776e-08, "loss": 0.0373, "reward": 4.845614403486252, "reward_std": 0.5414760503917933, "rewards/concensus_correctness_reward_func": 1.4188124909996986, "rewards/consensus_reward_func": 1.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9487081132829189, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.09375, "rewards/xmlcount_reward_func": 1.0718437694013119, "step": 18 }, { "completion_length": 181.08333333333334, "epoch": 5.0, "grad_norm": 37.08019256591797, "kl": 2.6332666520029306, "learning_rate": 3.4096741493194193e-09, "loss": 0.002, "reward": 6.089526116847992, "reward_std": 2.58219914448758, "rewards/concensus_correctness_reward_func": 2.7640000035365424, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.08333333333333333, "rewards/question_recreation_reward_func": 0.9599844515323639, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.14583333333333334, "rewards/xmlcount_reward_func": 1.1363750000794728, "step": 20 }, { "epoch": 5.0, "step": 20, "total_flos": 0.0, "train_loss": 0.13169335020356812, "train_runtime": 89.9305, "train_samples_per_second": 3.558, "train_steps_per_second": 0.222 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }