{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7692307692307693, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 141.3333282470703, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.038461538461538464, "grad_norm": 9.404427528381348, "kl": 0.0, "learning_rate": 0.0, "loss": 0.2213, "num_tokens": 1384.0, "reward": 0.27538180351257324, "reward_std": 0.5752406120300293, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.3473817706108093, "rewards/question_recreation_reward_func/std": 0.03370636701583862, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": -0.07199999690055847, "rewards/xmlcount_reward_func/std": 0.9370037317276001, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.07692307692307693, "grad_norm": 13.685848236083984, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.1909, "num_tokens": 2391.0, "reward": 0.2730637192726135, "reward_std": 0.18166151642799377, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.023063700646162033, "rewards/question_recreation_reward_func/std": 0.009388059377670288, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.25, "rewards/xmlcount_reward_func/std": 0.20412415266036987, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.11538461538461539, "grad_norm": 8.721884727478027, "kl": 0.0005104035590193234, "learning_rate": 9.931806517013612e-07, "loss": -0.1137, "num_tokens": 3583.0, "reward": 0.5263556241989136, "reward_std": 0.10577338933944702, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.08885563910007477, "rewards/question_recreation_reward_func/std": 0.022719671949744225, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.4375, "rewards/xmlcount_reward_func/std": 0.125, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 89.75, "completions/mean_terminated_length": 89.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.15384615384615385, "grad_norm": 18.96868324279785, "kl": 0.0063872426908346824, "learning_rate": 9.729086208503173e-07, "loss": 0.2846, "num_tokens": 4454.0, "reward": 0.8816813230514526, "reward_std": 0.8639886379241943, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.5, "rewards/final_correctness_reward_func/std": 1.0, "rewards/question_recreation_reward_func/mean": 0.13168135285377502, "rewards/question_recreation_reward_func/std": 0.041856855154037476, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.25, "rewards/xmlcount_reward_func/std": 0.20412415266036987, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 97.66666412353516, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.19230769230769232, "grad_norm": 11.014723777770996, "kl": 0.0015677963965572417, "learning_rate": 9.397368756032444e-07, "loss": 0.4359, "num_tokens": 5707.0, "reward": 0.01884479820728302, "reward_std": 0.5485635995864868, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.03259478509426117, "rewards/question_recreation_reward_func/std": 0.005219103768467903, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": -0.013750001788139343, "rewards/xmlcount_reward_func/std": 0.6304149031639099, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 112.75, "completions/mean_terminated_length": 112.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.23076923076923078, "grad_norm": 16.488595962524414, "kl": 0.013569895178079605, "learning_rate": 8.945702546981968e-07, "loss": 0.161, "num_tokens": 6670.0, "reward": 0.5298745632171631, "reward_std": 0.11173999309539795, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.03187461569905281, "rewards/question_recreation_reward_func/std": 0.011758098378777504, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.49799996614456177, "rewards/xmlcount_reward_func/std": 0.21535088121891022, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 81.66666412353516, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.2692307692307692, "grad_norm": 17.747295379638672, "kl": 0.05565625987946987, "learning_rate": 8.386407858128706e-07, "loss": 0.2248, "num_tokens": 7875.0, "reward": 0.4869118928909302, "reward_std": 0.11581836640834808, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.04941185563802719, "rewards/question_recreation_reward_func/std": 0.04165700823068619, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.4375, "rewards/xmlcount_reward_func/std": 0.125, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.3076923076923077, "grad_norm": 12.994662284851074, "kl": 0.030461058020591736, "learning_rate": 7.734740790612136e-07, "loss": -0.0224, "num_tokens": 8846.0, "reward": 0.5597025752067566, "reward_std": 0.1636485457420349, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.2159525454044342, "rewards/question_recreation_reward_func/std": 0.05881807208061218, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.34375, "rewards/xmlcount_reward_func/std": 0.1875, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 63.66666793823242, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.34615384615384615, "grad_norm": 16.384445190429688, "kl": 0.02970386017113924, "learning_rate": 7.008477123264847e-07, "loss": 0.2749, "num_tokens": 9997.0, "reward": 0.43550199270248413, "reward_std": 0.09783899784088135, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.12300197780132294, "rewards/question_recreation_reward_func/std": 0.0868106335401535, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.3125, "rewards/xmlcount_reward_func/std": 0.23935678601264954, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.38461538461538464, "grad_norm": 19.250350952148438, "kl": 0.039656256791204214, "learning_rate": 6.227427435703995e-07, "loss": -0.1137, "num_tokens": 10697.0, "reward": 0.6099977493286133, "reward_std": 0.021727532148361206, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.23549777269363403, "rewards/question_recreation_reward_func/std": 0.02440648153424263, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.37450000643730164, "rewards/xmlcount_reward_func/std": 0.1437625288963318, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4230769230769231, "grad_norm": 18.542329788208008, "kl": 0.020436149090528488, "learning_rate": 5.412896727361662e-07, "loss": -0.0652, "num_tokens": 11415.0, "reward": 0.6023240089416504, "reward_std": 0.06527406722307205, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.0183239858597517, "rewards/question_recreation_reward_func/std": 0.008217486552894115, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.5839999914169312, "rewards/xmlcount_reward_func/std": 0.11785867810249329, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 71.25, "completions/mean_terminated_length": 71.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.46153846153846156, "grad_norm": 14.851263046264648, "kl": 0.04291308671236038, "learning_rate": 4.5871032726383385e-07, "loss": -0.1437, "num_tokens": 12212.0, "reward": 0.5428086519241333, "reward_std": 0.00478853564709425, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.04280860722064972, "rewards/question_recreation_reward_func/std": 0.005272616166621447, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.5, "rewards/xmlcount_reward_func/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 55.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.5, "grad_norm": 12.394680976867676, "kl": 0.022660875925794244, "learning_rate": 3.772572564296004e-07, "loss": 0.341, "num_tokens": 13337.0, "reward": 0.3977251648902893, "reward_std": 0.18016040325164795, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.022725149989128113, "rewards/question_recreation_reward_func/std": 0.01266756746917963, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.375, "rewards/xmlcount_reward_func/std": 0.25, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 92.0, "completions/mean_terminated_length": 92.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5384615384615384, "grad_norm": 26.36524200439453, "kl": 0.0786033570766449, "learning_rate": 2.9915228767351535e-07, "loss": 0.0012, "num_tokens": 14217.0, "reward": 0.8583254814147949, "reward_std": 0.6752742528915405, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.5, "rewards/final_correctness_reward_func/std": 1.0, "rewards/question_recreation_reward_func/mean": 0.13957545161247253, "rewards/question_recreation_reward_func/std": 0.03446739539504051, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.21875, "rewards/xmlcount_reward_func/std": 0.2576940953731537, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5769230769230769, "grad_norm": 9.72214126586914, "kl": 0.059507149271667004, "learning_rate": 2.2652592093878665e-07, "loss": -0.0895, "num_tokens": 15486.0, "reward": 0.6740732789039612, "reward_std": 0.2836502194404602, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.29907330870628357, "rewards/question_recreation_reward_func/std": 0.14153823256492615, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.375, "rewards/xmlcount_reward_func/std": 0.25, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.6153846153846154, "grad_norm": 12.116292953491211, "kl": 0.025627892464399338, "learning_rate": 1.6135921418712955e-07, "loss": 0.2017, "num_tokens": 16710.0, "reward": 0.37010395526885986, "reward_std": 0.09250971674919128, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.05760394036769867, "rewards/question_recreation_reward_func/std": 0.03707271069288254, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.3125, "rewards/xmlcount_reward_func/std": 0.23935678601264954, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.6538461538461539, "grad_norm": 11.819451332092285, "kl": 0.04005518322810531, "learning_rate": 1.0542974530180327e-07, "loss": -0.1202, "num_tokens": 17592.0, "reward": 0.5551368594169617, "reward_std": 0.021570861339569092, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.05513682961463928, "rewards/question_recreation_reward_func/std": 0.03840121626853943, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.5, "rewards/xmlcount_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.6923076923076923, "grad_norm": 12.008862495422363, "kl": 0.03604450775310397, "learning_rate": 6.026312439675551e-08, "loss": -0.1226, "num_tokens": 18764.0, "reward": 0.46445250511169434, "reward_std": 0.07633315026760101, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.026952533051371574, "rewards/question_recreation_reward_func/std": 0.02285909652709961, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.4375, "rewards/xmlcount_reward_func/std": 0.125, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 67.25, "completions/mean_terminated_length": 67.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.7307692307692307, "grad_norm": 15.652201652526855, "kl": 0.011676187859848142, "learning_rate": 2.7091379149682682e-08, "loss": 0.1494, "num_tokens": 19545.0, "reward": 0.5532028675079346, "reward_std": 0.015393441542983055, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.05320284515619278, "rewards/question_recreation_reward_func/std": 0.026126131415367126, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.5, "rewards/xmlcount_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 448.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 36.33333206176758, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.7692307692307693, "grad_norm": 12.616737365722656, "kl": 0.01751775573939085, "learning_rate": 6.819348298638839e-09, "loss": -0.3033, "num_tokens": 20614.0, "reward": 0.372824102640152, "reward_std": 0.048663586378097534, "rewards/concensus_correctness_reward_func/mean": 0.0, "rewards/concensus_correctness_reward_func/std": 0.0, "rewards/consensus_reward_func/mean": 0.0, "rewards/consensus_reward_func/std": 0.0, "rewards/cumulative_reward_2/mean": 0.0, "rewards/cumulative_reward_2/std": 0.0, "rewards/final_correctness_reward_func/mean": 0.0, "rewards/final_correctness_reward_func/std": 0.0, "rewards/question_recreation_reward_func/mean": 0.02907411754131317, "rewards/question_recreation_reward_func/std": 0.014660523273050785, "rewards/soft_format_reward_func/mean": 0.0, "rewards/soft_format_reward_func/std": 0.0, "rewards/strict_format_reward_func/mean": 0.0, "rewards/strict_format_reward_func/std": 0.0, "rewards/xmlcount_reward_func/mean": 0.34375, "rewards/xmlcount_reward_func/std": 0.1875, "step": 20 }, { "epoch": 0.7692307692307693, "step": 20, "total_flos": 0.0, "train_loss": 0.0696285206824541, "train_runtime": 6061.8527, "train_samples_per_second": 0.013, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 20, "num_input_tokens_seen": 20614, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }