{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5633802816901409, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 14.0, "completions/mean_terminated_length": 14.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.028169014084507043, "grad_norm": 32.06034851074219, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0176, "num_tokens": 568.0, "reward": 0.020783159881830215, "reward_std": 0.012284406460821629, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.020783159881830215, "rewards/question_recreation_reward_func/std": 0.027877027168869972, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 29.5, "completions/mean_terminated_length": 29.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.056338028169014086, "grad_norm": 6.299616813659668, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.0791, "num_tokens": 1198.0, "reward": 0.02762589044868946, "reward_std": 0.0009784403955563903, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.02762589231133461, "rewards/question_recreation_reward_func/std": 0.0051710638217628, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.08450704225352113, "grad_norm": 3.1016180515289307, "kl": 8.319292101077735e-05, "learning_rate": 9.931806517013612e-07, "loss": 0.3379, "num_tokens": 2182.0, "reward": 0.03331246227025986, "reward_std": 0.0052673472091555595, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.03331245854496956, "rewards/question_recreation_reward_func/std": 0.009135501459240913, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.11267605633802817, "grad_norm": 29.29948616027832, "kl": 0.05142414569854736, "learning_rate": 9.729086208503173e-07, "loss": -0.1736, "num_tokens": 2746.0, "reward": 0.14608332514762878, "reward_std": 0.007730965036898851, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.14608332514762878, "rewards/question_recreation_reward_func/std": 0.009648437611758709, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.14084507042253522, "grad_norm": 40.53683853149414, "kl": 0.011120198392518432, "learning_rate": 9.397368756032444e-07, "loss": 0.0004, "num_tokens": 3312.0, "reward": 0.08978645503520966, "reward_std": 0.013956054113805294, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.08978645503520966, "rewards/question_recreation_reward_func/std": 0.08247601985931396, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.16901408450704225, "grad_norm": 0.06918946653604507, "kl": 0.0004225224256515503, "learning_rate": 8.945702546981968e-07, "loss": 0.0, "num_tokens": 3856.0, "reward": 0.19858156144618988, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.19858156144618988, "rewards/question_recreation_reward_func/std": 0.0, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 8.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19718309859154928, "grad_norm": 41.10600280761719, "kl": 0.06056380271911621, "learning_rate": 8.386407858128706e-07, "loss": -0.0184, "num_tokens": 4401.0, "reward": 0.03791220486164093, "reward_std": 0.047372184693813324, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.006662202067673206, "rewards/question_recreation_reward_func/std": 0.004442098084837198, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.03125, "rewards/xmlcount_reward_func/std": 0.0625, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 21.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 11.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.22535211267605634, "grad_norm": 84.13429260253906, "kl": 0.9762128964066505, "learning_rate": 7.734740790612136e-07, "loss": 0.1866, "num_tokens": 4958.0, "reward": 0.019636042416095734, "reward_std": 0.0071492972783744335, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.019636042416095734, "rewards/question_recreation_reward_func/std": 0.00888961460441351, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 37.75, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2535211267605634, "grad_norm": 4.597474098205566, "kl": 0.008542275987565517, "learning_rate": 7.008477123264847e-07, "loss": -0.2221, "num_tokens": 5621.0, "reward": 0.08780995011329651, "reward_std": 0.08147402852773666, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.025309953838586807, "rewards/question_recreation_reward_func/std": 0.00799198541790247, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0625, "rewards/xmlcount_reward_func/std": 0.125, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.28169014084507044, "grad_norm": 0.32836952805519104, "kl": 0.010951906442642212, "learning_rate": 6.227427435703995e-07, "loss": 0.0004, "num_tokens": 6165.0, "reward": 0.02834007889032364, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.02834007889032364, "rewards/question_recreation_reward_func/std": 0.014024702832102776, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 12.5, "completions/mean_terminated_length": 12.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.30985915492957744, "grad_norm": 14.266270637512207, "kl": 0.016900939866900444, "learning_rate": 5.412896727361662e-07, "loss": -0.1861, "num_tokens": 6727.0, "reward": 0.14206652343273163, "reward_std": 0.02492138370871544, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.14206652343273163, "rewards/question_recreation_reward_func/std": 0.04201439768075943, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 15.333333015441895, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3380281690140845, "grad_norm": 1.9125043153762817, "kl": 0.014537290277075954, "learning_rate": 4.5871032726383385e-07, "loss": -0.3405, "num_tokens": 7733.0, "reward": 0.5619847178459167, "reward_std": 0.6531730890274048, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.5, "rewards/final_correctness_reward_func/std": 1.0, "rewards/question_recreation_reward_func/mean": 0.030734695494174957, "rewards/question_recreation_reward_func/std": 0.019535422325134277, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.03125, "rewards/xmlcount_reward_func/std": 0.0625, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 27.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 17.5, "completions/mean_terminated_length": 17.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.36619718309859156, "grad_norm": 0.005306577309966087, "kl": 5.5371059715980664e-05, "learning_rate": 3.772572564296004e-07, "loss": 0.0, "num_tokens": 8315.0, "reward": 0.01907399296760559, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.01907399296760559, "rewards/question_recreation_reward_func/std": 0.003249133238568902, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.39436619718309857, "grad_norm": 0.021956074982881546, "kl": 6.914883852005005e-05, "learning_rate": 2.9915228767351535e-07, "loss": 0.0, "num_tokens": 8859.0, "reward": 0.014652014710009098, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.014652014710009098, "rewards/question_recreation_reward_func/std": 0.0, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4225352112676056, "grad_norm": 0.48415833711624146, "kl": 0.0022591277956962585, "learning_rate": 2.2652592093878665e-07, "loss": 0.0001, "num_tokens": 9403.0, "reward": 0.1094527393579483, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.1094527393579483, "rewards/question_recreation_reward_func/std": 0.0, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 19.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 13.5, "completions/mean_terminated_length": 13.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4507042253521127, "grad_norm": 0.04596748575568199, "kl": 0.0005076723755337298, "learning_rate": 1.6135921418712955e-07, "loss": 0.0, "num_tokens": 9969.0, "reward": 0.018024075776338577, "reward_std": 0.0, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.018024075776338577, "rewards/question_recreation_reward_func/std": 0.009574447758495808, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 14.333333015441895, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4788732394366197, "grad_norm": 14.307421684265137, "kl": 0.3401991333812475, "learning_rate": 1.0542974530180327e-07, "loss": 0.5429, "num_tokens": 10972.0, "reward": 0.0627230554819107, "reward_std": 0.0639922246336937, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.031473059207201004, "rewards/question_recreation_reward_func/std": 0.02106659673154354, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.03125, "rewards/xmlcount_reward_func/std": 0.0625, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 25.0, "completions/mean_terminated_length": 25.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5070422535211268, "grad_norm": 8.994085311889648, "kl": 0.03856681424076669, "learning_rate": 6.026312439675551e-08, "loss": -0.1128, "num_tokens": 11584.0, "reward": 0.10017985105514526, "reward_std": 0.0002543535956647247, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.10017985105514526, "rewards/question_recreation_reward_func/std": 0.00035971030592918396, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 34.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5352112676056338, "grad_norm": 7.788077354431152, "kl": 0.012560278875753284, "learning_rate": 2.7091379149682682e-08, "loss": 0.4072, "num_tokens": 12646.0, "reward": 0.012618010863661766, "reward_std": 0.005928314756602049, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.012618010863661766, "rewards/question_recreation_reward_func/std": 0.006028006784617901, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 448.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 93.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.5633802816901409, "grad_norm": 7.968515872955322, "kl": 0.023016713559627533, "learning_rate": 6.819348298638839e-09, "loss": 0.244, "num_tokens": 14240.0, "reward": 0.11969352513551712, "reward_std": 0.05395982041954994, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.11969352513551712, "rewards/question_recreation_reward_func/std": 0.1334262639284134, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.0, "rewards/xmlcount_reward_func/std": 0.0, "step": 20 }, { "epoch": 0.5633802816901409, "step": 20, "total_flos": 0.0, "train_loss": 0.030225585971913917, "train_runtime": 15476.4298, "train_samples_per_second": 0.005, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 20, "num_input_tokens_seen": 14240, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }