{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.714285714285714, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 300.15625, "epoch": 0.05714285714285714, "grad_norm": 17.426481246948242, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "reward": 3.95498262392357, "reward_std": 0.699341673636809, "rewards/concensus_correctness_reward_func": 1.2692499943077564, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.7033888604491949, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.5604687528684735, "step": 2 }, { "completion_length": 121.5, "epoch": 0.11428571428571428, "grad_norm": 13.969701766967773, "kl": 0.01049548169839909, "learning_rate": 5e-07, "loss": 0.0, "reward": 6.702177166938782, "reward_std": 0.05493173561990261, "rewards/concensus_correctness_reward_func": 1.9659999907016754, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9393021687865257, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.21875, "step": 4 }, { "completion_length": 125.96875, "epoch": 0.17142857142857143, "grad_norm": 0.03055919148027897, "kl": 0.01876932127197506, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": 7.562659442424774, "reward_std": 0.1466759592294693, "rewards/concensus_correctness_reward_func": 2.416374996304512, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.9744094498455524, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 6 }, { "completion_length": 139.78125, "epoch": 0.22857142857142856, "grad_norm": 10.420638084411621, "kl": 0.46951429128239397, "learning_rate": 9.99934441832816e-07, "loss": 0.0005, "reward": 7.0254947245121, "reward_std": 0.20259349179104902, "rewards/concensus_correctness_reward_func": 2.1086249873042107, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9793697707355022, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 8 }, { "completion_length": 118.625, "epoch": 0.2857142857142857, "grad_norm": 0.7034396529197693, "kl": 0.5558207163994666, "learning_rate": 9.994100796397953e-07, "loss": 0.0006, "reward": 6.637874931097031, "reward_std": 0.04419417306780815, "rewards/concensus_correctness_reward_func": 1.919124998152256, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 10 }, { "completion_length": 128.15625, "epoch": 0.34285714285714286, "grad_norm": 0.3636714816093445, "kl": 0.360770606843289, "learning_rate": 9.983619052372847e-07, "loss": 0.0004, "reward": 7.086013972759247, "reward_std": 0.1177134495228529, "rewards/concensus_correctness_reward_func": 2.16924998909235, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9619827643036842, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.220406249165535, "step": 12 }, { "completion_length": 124.78125, "epoch": 0.4, "grad_norm": 0.39200520515441895, "kl": 0.1974896949250251, "learning_rate": 9.967910180154888e-07, "loss": 0.0002, "reward": 7.418499946594238, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.2934999763965607, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 14 }, { "completion_length": 123.96875, "epoch": 0.45714285714285713, "grad_norm": 0.08412964642047882, "kl": 0.0694609482306987, "learning_rate": 9.946990656181779e-07, "loss": 0.0001, "reward": 6.889249950647354, "reward_std": 0.04419417306780815, "rewards/concensus_correctness_reward_func": 2.0454999804496765, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 16 }, { "completion_length": 132.78125, "epoch": 0.5142857142857142, "grad_norm": 13294.4853515625, "kl": 1929.7287216553232, "learning_rate": 9.92088242214537e-07, "loss": 1.9297, "reward": 6.682024419307709, "reward_std": 0.29592345282435417, "rewards/concensus_correctness_reward_func": 1.975374985486269, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9722743965685368, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.21875, "step": 18 }, { "completion_length": 125.90625, "epoch": 0.5714285714285714, "grad_norm": 0.12487129867076874, "kl": 0.062417162815108895, "learning_rate": 9.889612861977853e-07, "loss": 0.0001, "reward": 7.668124943971634, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.418124996125698, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 20 }, { "completion_length": 127.65625, "epoch": 0.6285714285714286, "grad_norm": 19.497220993041992, "kl": 4.291850817389786, "learning_rate": 9.853214773129795e-07, "loss": 0.0043, "reward": 6.502693980932236, "reward_std": 0.052721514366567135, "rewards/concensus_correctness_reward_func": 1.9194999784231186, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9738190732896328, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 22 }, { "completion_length": 116.53125, "epoch": 0.6857142857142857, "grad_norm": 14.217891693115234, "kl": 0.40878968324977905, "learning_rate": 9.81172633217015e-07, "loss": 0.0004, "reward": 6.780536770820618, "reward_std": 0.14366747711028438, "rewards/concensus_correctness_reward_func": 1.8821249827742577, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.996068000793457, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 24 }, { "completion_length": 122.8125, "epoch": 0.7428571428571429, "grad_norm": 21.322853088378906, "kl": 2.3591567099792883, "learning_rate": 9.765191054744304e-07, "loss": 0.0024, "reward": 6.85475081205368, "reward_std": 0.02316407673060894, "rewards/concensus_correctness_reward_func": 2.064124978147447, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.970313299447298, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 26 }, { "completion_length": 119.15625, "epoch": 0.8, "grad_norm": 0.48698848485946655, "kl": 0.04026831721421331, "learning_rate": 9.713657749932171e-07, "loss": 0.0, "reward": 7.164504557847977, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.1684999763965607, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9960045665502548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 28 }, { "completion_length": 119.875, "epoch": 0.8571428571428571, "grad_norm": 50.563270568847656, "kl": 0.13291181785461958, "learning_rate": 9.657180469054212e-07, "loss": 0.0001, "reward": 7.1191529631614685, "reward_std": 0.0648374930024147, "rewards/concensus_correctness_reward_func": 2.1649999916553497, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9893092103302479, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 30 }, { "completion_length": 120.5625, "epoch": 0.9142857142857143, "grad_norm": 2.136101007461548, "kl": 0.061591801641043276, "learning_rate": 9.59581844897906e-07, "loss": 0.0001, "reward": 6.7411342561244965, "reward_std": 0.005238220939645544, "rewards/concensus_correctness_reward_func": 1.874749980866909, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9913843646645546, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 32 }, { "completion_length": 124.40625, "epoch": 0.9714285714285714, "grad_norm": 1.9825115203857422, "kl": 0.30299730040133, "learning_rate": 9.529636049992233e-07, "loss": 0.0003, "reward": 6.669965773820877, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.920499987900257, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.999465811997652, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 34 }, { "completion_length": 129.1875, "epoch": 1.0285714285714285, "grad_norm": 0.25820305943489075, "kl": 0.10265114883077331, "learning_rate": 9.458702688291071e-07, "loss": 0.0001, "reward": 6.62774994969368, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.8777499794960022, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 36 }, { "completion_length": 123.28125, "epoch": 1.0857142857142856, "grad_norm": 1.8184473514556885, "kl": 0.3953003305650782, "learning_rate": 9.383092763176738e-07, "loss": 0.0004, "reward": 7.328000098466873, "reward_std": 0.022097086533904076, "rewards/concensus_correctness_reward_func": 2.2186249792575836, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 38 }, { "completion_length": 135.28125, "epoch": 1.1428571428571428, "grad_norm": 160.05328369140625, "kl": 35.685647501493804, "learning_rate": 9.302885579019626e-07, "loss": 0.0357, "reward": 6.5664326548576355, "reward_std": 0.14593601133674383, "rewards/concensus_correctness_reward_func": 1.9196249842643738, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9788388833403587, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.21484375, "step": 40 }, { "completion_length": 131.625, "epoch": 1.2, "grad_norm": 39835.796875, "kl": 3820.177204770036, "learning_rate": 9.218165262080022e-07, "loss": 3.8202, "reward": 7.211937814950943, "reward_std": 0.0015021136496216059, "rewards/concensus_correctness_reward_func": 2.2130000069737434, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9989378377795219, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 42 }, { "completion_length": 127.25, "epoch": 1.2571428571428571, "grad_norm": 0.04504234343767166, "kl": 0.05154561816016212, "learning_rate": 9.129020672271281e-07, "loss": 0.0001, "reward": 6.667250007390976, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.9172499775886536, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 44 }, { "completion_length": 118.40625, "epoch": 1.3142857142857143, "grad_norm": 23.088890075683594, "kl": 2.3278085422934964, "learning_rate": 9.035545309958046e-07, "loss": 0.0024, "reward": 6.5357484221458435, "reward_std": 0.6359185018609423, "rewards/concensus_correctness_reward_func": 1.8128749802708626, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.9924046844244003, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 46 }, { "completion_length": 116.53125, "epoch": 1.3714285714285714, "grad_norm": 34.027565002441406, "kl": 3.6760852289153263, "learning_rate": 8.937837217887272e-07, "loss": 0.0037, "reward": 7.155503273010254, "reward_std": 0.005231709423242137, "rewards/concensus_correctness_reward_func": 2.1656249910593033, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9898783266544342, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 48 }, { "completion_length": 133.34375, "epoch": 1.4285714285714286, "grad_norm": 225.2295379638672, "kl": 1160.2006183795165, "learning_rate": 8.83599887835493e-07, "loss": 1.1602, "reward": 6.594452530145645, "reward_std": 0.10580936633050442, "rewards/concensus_correctness_reward_func": 1.9199999868869781, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9596088528633118, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.24609375, "step": 50 }, { "completion_length": 116.0625, "epoch": 1.4857142857142858, "grad_norm": 0.057164259254932404, "kl": 0.04885981511324644, "learning_rate": 8.73013710571623e-07, "loss": 0.0, "reward": 6.660125017166138, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.9101249799132347, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 52 }, { "completion_length": 115.84375, "epoch": 1.5428571428571427, "grad_norm": 809270.9375, "kl": 13683351.59927877, "learning_rate": 8.620362934352108e-07, "loss": 13683.3525, "reward": 7.599349647760391, "reward_std": 0.0883883461356163, "rewards/concensus_correctness_reward_func": 2.4121249988675117, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.9997246712446213, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 54 }, { "completion_length": 137.53125, "epoch": 1.6, "grad_norm": 27.686599731445312, "kl": 1.2052045605960302, "learning_rate": 8.506791502209496e-07, "loss": 0.0012, "reward": 6.557819962501526, "reward_std": 0.09846582496538758, "rewards/concensus_correctness_reward_func": 1.9089999869465828, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9378825686872005, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2421875, "step": 56 }, { "completion_length": 119.625, "epoch": 1.657142857142857, "grad_norm": 0.09275330603122711, "kl": 0.4611619641073048, "learning_rate": 8.389541930037516e-07, "loss": 0.0005, "reward": 7.025811791419983, "reward_std": 0.20320578664541245, "rewards/concensus_correctness_reward_func": 2.109499979764223, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9944368153810501, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 58 }, { "completion_length": 125.25, "epoch": 1.7142857142857144, "grad_norm": 16.619831085205078, "kl": 0.32900134613737464, "learning_rate": 8.268737196446263e-07, "loss": 0.0003, "reward": 7.025674343109131, "reward_std": 0.04453253000974655, "rewards/concensus_correctness_reward_func": 2.0638749888166785, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9852368384599686, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 60 }, { "completion_length": 122.96875, "epoch": 1.7714285714285714, "grad_norm": 0.030277982354164124, "kl": 0.046715385746210814, "learning_rate": 8.144504008919222e-07, "loss": 0.0, "reward": 6.91824996471405, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.0432499945163727, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 62 }, { "completion_length": 119.65625, "epoch": 1.8285714285714287, "grad_norm": 0.02598767727613449, "kl": 0.7929197051562369, "learning_rate": 8.016972670914623e-07, "loss": 0.0008, "reward": 7.665361404418945, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.4198749884963036, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9954864755272865, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 64 }, { "completion_length": 130.28125, "epoch": 1.8857142857142857, "grad_norm": 15.967117309570312, "kl": 1.8949127969099209, "learning_rate": 7.886276945195097e-07, "loss": 0.0019, "reward": 6.652400374412537, "reward_std": 0.023298587650060654, "rewards/concensus_correctness_reward_func": 1.918874979019165, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9835254140198231, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 66 }, { "completion_length": 129.34375, "epoch": 1.9428571428571428, "grad_norm": 97.32298278808594, "kl": 34.17437303136103, "learning_rate": 7.752553913529018e-07, "loss": 0.0342, "reward": 7.380637466907501, "reward_std": 0.013560200110077858, "rewards/concensus_correctness_reward_func": 2.2669999822974205, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9886375181376934, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 68 }, { "completion_length": 120.09375, "epoch": 2.0, "grad_norm": 6060796.5, "kl": 790862.3630039439, "learning_rate": 7.61594383291065e-07, "loss": 790.8625, "reward": 7.042518824338913, "reward_std": 0.0015643856022506952, "rewards/concensus_correctness_reward_func": 2.1686249747872353, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9988938048481941, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 70 }, { "completion_length": 130.0, "epoch": 2.057142857142857, "grad_norm": 5.242808818817139, "kl": 0.1041660463088192, "learning_rate": 7.476589988449938e-07, "loss": 0.0001, "reward": 6.87304162979126, "reward_std": 0.06393423862755299, "rewards/concensus_correctness_reward_func": 2.043249987065792, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9938541650772095, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2421875, "step": 72 }, { "completion_length": 128.21875, "epoch": 2.1142857142857143, "grad_norm": 0.26571622490882874, "kl": 0.12202630296815187, "learning_rate": 7.334638543086203e-07, "loss": 0.0001, "reward": 7.147529572248459, "reward_std": 0.029656609520316124, "rewards/concensus_correctness_reward_func": 2.1684999763965607, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9985608533024788, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 74 }, { "completion_length": 124.21875, "epoch": 2.1714285714285713, "grad_norm": 1.5463571548461914, "kl": 8.507067229540553, "learning_rate": 7.190238384283412e-07, "loss": 0.0085, "reward": 7.2041696310043335, "reward_std": 0.06748650316148996, "rewards/concensus_correctness_reward_func": 2.128124989569187, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9744821637868881, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 76 }, { "completion_length": 121.34375, "epoch": 2.2285714285714286, "grad_norm": 0.041979704052209854, "kl": 0.032122639124281704, "learning_rate": 7.043540967867781e-07, "loss": 0.0, "reward": 6.918124943971634, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.043124981224537, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 78 }, { "completion_length": 129.90625, "epoch": 2.2857142857142856, "grad_norm": 0.04424755647778511, "kl": 50.4671565569588, "learning_rate": 6.894700159171534e-07, "loss": 0.0505, "reward": 7.512331336736679, "reward_std": 0.042842648923397064, "rewards/concensus_correctness_reward_func": 2.4197499975562096, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9675813280045986, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 80 }, { "completion_length": 122.9375, "epoch": 2.342857142857143, "grad_norm": 0.009761347435414791, "kl": 0.03142669008229859, "learning_rate": 6.743872071649411e-07, "loss": 0.0, "reward": 6.914698511362076, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.0433749929070473, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9963235296308994, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 82 }, { "completion_length": 112.25, "epoch": 2.4, "grad_norm": 0.03034272789955139, "kl": 0.044959868711885065, "learning_rate": 6.59121490313722e-07, "loss": 0.0, "reward": 6.688625007867813, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.9386249845847487, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 84 }, { "completion_length": 119.59375, "epoch": 2.4571428571428573, "grad_norm": 13.426407814025879, "kl": 1.5327934579690918, "learning_rate": 6.436888769924141e-07, "loss": 0.0015, "reward": 7.0252565741539, "reward_std": 0.08249054872430861, "rewards/concensus_correctness_reward_func": 2.0841249749064445, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9762878231704235, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 86 }, { "completion_length": 127.625, "epoch": 2.5142857142857142, "grad_norm": 3.1005895137786865, "kl": 0.7444930193014443, "learning_rate": 6.281055538812861e-07, "loss": 0.0007, "reward": 6.580017119646072, "reward_std": 0.06936378590762615, "rewards/concensus_correctness_reward_func": 1.7627499774098396, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9930483475327492, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 88 }, { "completion_length": 132.625, "epoch": 2.571428571428571, "grad_norm": 2.983717679977417, "kl": 9.172500962711638, "learning_rate": 6.123878657343647e-07, "loss": 0.0092, "reward": 6.720099925994873, "reward_std": 0.280580037884647, "rewards/concensus_correctness_reward_func": 1.92112497985363, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9864749535918236, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 90 }, { "completion_length": 117.65625, "epoch": 2.6285714285714286, "grad_norm": 24.54834747314453, "kl": 2860627968.304006, "learning_rate": 5.96552298236044e-07, "loss": 2860628.0, "reward": 7.669991672039032, "reward_std": 0.002275592749356292, "rewards/concensus_correctness_reward_func": 2.421749994158745, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9982417523860931, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 92 }, { "completion_length": 114.40625, "epoch": 2.685714285714286, "grad_norm": 8755.4248046875, "kl": 1104.330926183262, "learning_rate": 5.806154607098799e-07, "loss": 1.1043, "reward": 6.758044958114624, "reward_std": 0.14052551635541022, "rewards/concensus_correctness_reward_func": 2.0443749874830246, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9128886782564223, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.22265625, "step": 94 }, { "completion_length": 121.03125, "epoch": 2.742857142857143, "grad_norm": 50945.7890625, "kl": 814.7917639114894, "learning_rate": 5.645940686977032e-07, "loss": 0.8148, "reward": 6.610010415315628, "reward_std": 0.08342374488711357, "rewards/concensus_correctness_reward_func": 1.9189999848604202, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.968354269862175, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23828125, "step": 96 }, { "completion_length": 125.0, "epoch": 2.8, "grad_norm": 47.311058044433594, "kl": 14.8274484872818, "learning_rate": 5.485049264273241e-07, "loss": 0.0148, "reward": 7.000439822673798, "reward_std": 0.09996174834668636, "rewards/concensus_correctness_reward_func": 2.168874979019165, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.921408599242568, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.20703125, "step": 98 }, { "completion_length": 128.9375, "epoch": 2.857142857142857, "grad_norm": 18.47180938720703, "kl": 0.5244096268434078, "learning_rate": 5.323649091872178e-07, "loss": 0.0005, "reward": 7.01810696721077, "reward_std": 0.02690199576318264, "rewards/concensus_correctness_reward_func": 2.166124999523163, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.976981982588768, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 100 }, { "completion_length": 133.21875, "epoch": 2.914285714285714, "grad_norm": 91.73603057861328, "kl": 6.441082598932553, "learning_rate": 5.16190945626678e-07, "loss": 0.0064, "reward": 6.685827881097794, "reward_std": 0.11496807099319994, "rewards/concensus_correctness_reward_func": 2.0406249836087227, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9225466102361679, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 1.17578125, "step": 102 }, { "completion_length": 116.25, "epoch": 2.9714285714285715, "grad_norm": 0.03504316136240959, "kl": 2.1308261496014893, "learning_rate": 5e-07, "loss": 0.0021, "reward": 7.162474691867828, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.162749983370304, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9997246712446213, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 104 }, { "completion_length": 132.4375, "epoch": 3.0285714285714285, "grad_norm": 8.74528980255127, "kl": 4.539779104059562, "learning_rate": 4.838090543733221e-07, "loss": 0.0045, "reward": 6.900434046983719, "reward_std": 0.3393835090100765, "rewards/concensus_correctness_reward_func": 2.0893749855458736, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.9673090130090714, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 106 }, { "completion_length": 128.25, "epoch": 3.085714285714286, "grad_norm": 14.53085708618164, "kl": 2.8838787593413144, "learning_rate": 4.676350908127821e-07, "loss": 0.0029, "reward": 6.811203122138977, "reward_std": 0.011762974609155208, "rewards/concensus_correctness_reward_func": 2.0439999774098396, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9390781422844157, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.234375, "step": 108 }, { "completion_length": 151.625, "epoch": 3.142857142857143, "grad_norm": 0.030175382271409035, "kl": 0.15155013505136594, "learning_rate": 4.5149507357267597e-07, "loss": 0.0002, "reward": 6.892297387123108, "reward_std": 0.028420399874448776, "rewards/concensus_correctness_reward_func": 2.0434999987483025, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9737974181771278, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 110 }, { "completion_length": 151.1875, "epoch": 3.2, "grad_norm": 18.525300979614258, "kl": 3.5956702362163924, "learning_rate": 4.354059313022969e-07, "loss": 0.0036, "reward": 6.804599553346634, "reward_std": 0.15393569320440292, "rewards/concensus_correctness_reward_func": 2.0421249717473984, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9421620704233646, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 112 }, { "completion_length": 130.84375, "epoch": 3.257142857142857, "grad_norm": 17.65426254272461, "kl": 6.075715421116911, "learning_rate": 4.193845392901201e-07, "loss": 0.0061, "reward": 7.065226078033447, "reward_std": 0.11806740239262581, "rewards/concensus_correctness_reward_func": 2.1596249863505363, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9602885581552982, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 114 }, { "completion_length": 127.53125, "epoch": 3.314285714285714, "grad_norm": 0.08821436017751694, "kl": 133.908757203375, "learning_rate": 4.0344770176395606e-07, "loss": 0.1339, "reward": 6.4563820362091064, "reward_std": 0.009752611629664898, "rewards/concensus_correctness_reward_func": 1.9166249781847, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9772570580244064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.21875, "step": 116 }, { "completion_length": 126.4375, "epoch": 3.3714285714285714, "grad_norm": 5.672122955322266, "kl": 0.2845922822598368, "learning_rate": 3.8761213426563543e-07, "loss": 0.0003, "reward": 6.744957059621811, "reward_std": 0.23483475990360603, "rewards/concensus_correctness_reward_func": 1.9836249835789204, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9878945499658585, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2421875, "step": 118 }, { "completion_length": 122.46875, "epoch": 3.4285714285714284, "grad_norm": 11.123212814331055, "kl": 2.7103491379966727, "learning_rate": 3.718944461187138e-07, "loss": 0.0027, "reward": 7.969624936580658, "reward_std": 0.13258254528045654, "rewards/concensus_correctness_reward_func": 2.5633749878033996, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.6875, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 120 }, { "completion_length": 118.375, "epoch": 3.4857142857142858, "grad_norm": 1.7454348802566528, "kl": 0.32422800606582314, "learning_rate": 3.563111230075859e-07, "loss": 0.0003, "reward": 6.972374975681305, "reward_std": 0.04419417306780815, "rewards/concensus_correctness_reward_func": 2.003624975681305, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 122 }, { "completion_length": 111.71875, "epoch": 3.5428571428571427, "grad_norm": 0.019805356860160828, "kl": 9161774.07055326, "learning_rate": 3.408785096862782e-07, "loss": 9161.7734, "reward": 7.169749945402145, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.1697499975562096, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 124 }, { "completion_length": 126.78125, "epoch": 3.6, "grad_norm": 5.305757522583008, "kl": 0.40615585184423253, "learning_rate": 3.2561279283505884e-07, "loss": 0.0004, "reward": 7.333453685045242, "reward_std": 0.22105737775564194, "rewards/concensus_correctness_reward_func": 2.2409999817609787, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9987036660313606, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.234375, "step": 126 }, { "completion_length": 120.75, "epoch": 3.657142857142857, "grad_norm": 3.495209217071533, "kl": 0.09194360801484436, "learning_rate": 3.105299840828466e-07, "loss": 0.0001, "reward": 6.911025762557983, "reward_std": 0.00014028578880243003, "rewards/concensus_correctness_reward_func": 2.036124996840954, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.999900795519352, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 128 }, { "completion_length": 128.96875, "epoch": 3.7142857142857144, "grad_norm": 0.07500150054693222, "kl": 0.16381528400233947, "learning_rate": 2.95645903213222e-07, "loss": 0.0002, "reward": 6.640499949455261, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.8904999792575836, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 130 }, { "completion_length": 119.8125, "epoch": 3.7714285714285714, "grad_norm": 0.019938524812459946, "kl": 1.173585046082735, "learning_rate": 2.8097616157165885e-07, "loss": 0.0012, "reward": 7.227047026157379, "reward_std": 0.09221191157121211, "rewards/concensus_correctness_reward_func": 2.294374980032444, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9834532365202904, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 132 }, { "completion_length": 123.40625, "epoch": 3.8285714285714287, "grad_norm": 0.01626521907746792, "kl": 0.05231146275764331, "learning_rate": 2.665361456913797e-07, "loss": 0.0001, "reward": 6.836877226829529, "reward_std": 0.002825208706781268, "rewards/concensus_correctness_reward_func": 1.963874988257885, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9980022832751274, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 134 }, { "completion_length": 118.125, "epoch": 3.8857142857142857, "grad_norm": 0.039282504469156265, "kl": 0.046510634711012244, "learning_rate": 2.523410011550064e-07, "loss": 0.0, "reward": 6.843999952077866, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 1.9689999967813492, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 136 }, { "completion_length": 120.96875, "epoch": 3.942857142857143, "grad_norm": 252.50936889648438, "kl": 87.47745934262639, "learning_rate": 2.3840561670893495e-07, "loss": 0.0875, "reward": 6.647455930709839, "reward_std": 0.030291149392724037, "rewards/concensus_correctness_reward_func": 1.918874979019165, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.994205929338932, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 138 }, { "completion_length": 130.15625, "epoch": 4.0, "grad_norm": 19.294851303100586, "kl": 0.41531820816453546, "learning_rate": 2.247446086470982e-07, "loss": 0.0004, "reward": 6.778987288475037, "reward_std": 0.02229178324341774, "rewards/concensus_correctness_reward_func": 2.0447499975562096, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9998623356223106, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 140 }, { "completion_length": 120.90625, "epoch": 4.057142857142857, "grad_norm": 0.03657761588692665, "kl": 0.06334134587086737, "learning_rate": 2.113723054804904e-07, "loss": 0.0001, "reward": 6.919625014066696, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.044624984264374, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 142 }, { "completion_length": 118.46875, "epoch": 4.114285714285714, "grad_norm": 80.59430694580078, "kl": 18.28185724944342, "learning_rate": 1.9830273290853766e-07, "loss": 0.0183, "reward": 7.364194869995117, "reward_std": 0.07015768438577652, "rewards/concensus_correctness_reward_func": 2.2896249666810036, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.98472610861063, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 144 }, { "completion_length": 120.25, "epoch": 4.171428571428572, "grad_norm": 0.026064695790410042, "kl": 0.45868788298685104, "learning_rate": 1.8554959910807772e-07, "loss": 0.0005, "reward": 6.647747337818146, "reward_std": 0.02520996890962124, "rewards/concensus_correctness_reward_func": 1.9192499816417694, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9784973822534084, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 146 }, { "completion_length": 125.9375, "epoch": 4.228571428571429, "grad_norm": 3.1762712001800537, "kl": 6.2984363965224475, "learning_rate": 1.7312628035537386e-07, "loss": 0.0063, "reward": 7.706711947917938, "reward_std": 0.0663449972635135, "rewards/concensus_correctness_reward_func": 2.3786249980330467, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.968712005764246, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 148 }, { "completion_length": 128.21875, "epoch": 4.285714285714286, "grad_norm": 0.71825110912323, "kl": 0.12271159124793485, "learning_rate": 1.6104580699624837e-07, "loss": 0.0001, "reward": 6.778389781713486, "reward_std": 0.022097086533904076, "rewards/concensus_correctness_reward_func": 2.0452499836683273, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9987648203969002, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 150 }, { "completion_length": 115.53125, "epoch": 4.3428571428571425, "grad_norm": 434.4410705566406, "kl": 51.27761673851637, "learning_rate": 1.493208497790504e-07, "loss": 0.0513, "reward": 6.786718785762787, "reward_std": 0.027621358633041382, "rewards/concensus_correctness_reward_func": 1.9312499752268195, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 152 }, { "completion_length": 127.96875, "epoch": 4.4, "grad_norm": 1.3492827415466309, "kl": 6.197932193404995, "learning_rate": 1.3796370656478934e-07, "loss": 0.0062, "reward": 6.995270878076553, "reward_std": 0.18507131934165955, "rewards/concensus_correctness_reward_func": 2.110624987632036, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9627709090709686, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 154 }, { "completion_length": 120.875, "epoch": 4.457142857142857, "grad_norm": 8.5289888381958, "kl": 120.79435319965705, "learning_rate": 1.2698628942837697e-07, "loss": 0.1208, "reward": 7.774048000574112, "reward_std": 0.0011493656784296036, "rewards/concensus_correctness_reward_func": 2.5381249859929085, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.985922958701849, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 156 }, { "completion_length": 130.1875, "epoch": 4.514285714285714, "grad_norm": 33.96376037597656, "kl": 2.418273651972413, "learning_rate": 1.1640011216450691e-07, "loss": 0.0024, "reward": 6.574308484792709, "reward_std": 0.1323229782283306, "rewards/concensus_correctness_reward_func": 1.9178749769926071, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9611210152506828, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 158 }, { "completion_length": 125.4375, "epoch": 4.571428571428571, "grad_norm": 2.4072132110595703, "kl": 0.4212528702628333, "learning_rate": 1.0621627821127288e-07, "loss": 0.0004, "reward": 7.118884056806564, "reward_std": 0.07193531304073986, "rewards/concensus_correctness_reward_func": 2.169749990105629, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9999153092503548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 160 }, { "completion_length": 126.46875, "epoch": 4.628571428571428, "grad_norm": 0.01093831192702055, "kl": 1.1463339874171652, "learning_rate": 9.644546900419531e-08, "loss": 0.0011, "reward": 6.793500006198883, "reward_std": 0.17306438088417053, "rewards/concensus_correctness_reward_func": 1.9809999950230122, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 162 }, { "completion_length": 129.53125, "epoch": 4.685714285714286, "grad_norm": 2244.8427734375, "kl": 333.1012824997306, "learning_rate": 8.70979327728718e-08, "loss": 0.3331, "reward": 6.814886599779129, "reward_std": 0.025744116224814206, "rewards/concensus_correctness_reward_func": 2.0433749854564667, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9511991124600172, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 164 }, { "completion_length": 129.15625, "epoch": 4.742857142857143, "grad_norm": 17.648225784301758, "kl": 2.1229274289216846, "learning_rate": 7.81834737919978e-08, "loss": 0.0021, "reward": 7.373527824878693, "reward_std": 0.051579396531451494, "rewards/concensus_correctness_reward_func": 2.2849999964237213, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9791528545320034, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 166 }, { "completion_length": 121.84375, "epoch": 4.8, "grad_norm": 17.635286331176758, "kl": 0.524610364343971, "learning_rate": 6.971144209803736e-08, "loss": 0.0005, "reward": 6.5660789012908936, "reward_std": 0.13991801149677485, "rewards/concensus_correctness_reward_func": 1.9189999923110008, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9634852148592472, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.21484375, "step": 168 }, { "completion_length": 132.78125, "epoch": 4.857142857142857, "grad_norm": 0.019757816568017006, "kl": 0.8419713787152432, "learning_rate": 6.16907236823262e-08, "loss": 0.0008, "reward": 6.322874933481216, "reward_std": 0.19533824920654297, "rewards/concensus_correctness_reward_func": 1.7759999670088291, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.25, "step": 170 }, { "completion_length": 134.03125, "epoch": 4.914285714285715, "grad_norm": 241.08810424804688, "kl": 24.628016122151166, "learning_rate": 5.412973117089287e-08, "loss": 0.0246, "reward": 6.505185455083847, "reward_std": 0.031045368872582912, "rewards/concensus_correctness_reward_func": 1.7921249866485596, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9083729842677712, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2109375, "step": 172 }, { "completion_length": 127.25, "epoch": 4.9714285714285715, "grad_norm": 21.49008560180664, "kl": 9.178197413566522, "learning_rate": 4.703639500077655e-08, "loss": 0.0092, "reward": 7.088774770498276, "reward_std": 0.07385746456566267, "rewards/concensus_correctness_reward_func": 2.1410000026226044, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9985561221837997, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.23046875, "step": 174 }, { "completion_length": 128.28125, "epoch": 5.0285714285714285, "grad_norm": 4.017098426818848, "kl": 0.2039044737466611, "learning_rate": 4.041815510209395e-08, "loss": 0.0002, "reward": 7.131375730037689, "reward_std": 0.28885310888290405, "rewards/concensus_correctness_reward_func": 2.13349998742342, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9978756718337536, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 176 }, { "completion_length": 127.75, "epoch": 5.085714285714285, "grad_norm": 4.335840702056885, "kl": 0.6839758672285825, "learning_rate": 3.4281953094578875e-08, "loss": 0.0007, "reward": 7.12287500500679, "reward_std": 0.06629125960171223, "rewards/concensus_correctness_reward_func": 2.1697499826550484, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.234375, "step": 178 }, { "completion_length": 160.46875, "epoch": 5.142857142857143, "grad_norm": 720.6825561523438, "kl": 144.5732550881803, "learning_rate": 2.8634225006782864e-08, "loss": 0.1446, "reward": 6.793639570474625, "reward_std": 0.17524122472968884, "rewards/concensus_correctness_reward_func": 2.043374978005886, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9455770738422871, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2109375, "step": 180 }, { "completion_length": 129.84375, "epoch": 5.2, "grad_norm": 5.253899097442627, "kl": 1.940689492301317, "learning_rate": 2.348089452556956e-08, "loss": 0.0019, "reward": 6.638930112123489, "reward_std": 0.035586774349212646, "rewards/concensus_correctness_reward_func": 2.042999990284443, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9396800906397402, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.203125, "step": 182 }, { "completion_length": 124.84375, "epoch": 5.257142857142857, "grad_norm": 0.04068833962082863, "kl": 0.04214784048963338, "learning_rate": 1.882736678298491e-08, "loss": 0.0, "reward": 7.167124956846237, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.1671249717473984, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 184 }, { "completion_length": 131.25, "epoch": 5.314285714285714, "grad_norm": 6659.9755859375, "kl": 727.840228227491, "learning_rate": 1.4678522687020412e-08, "loss": 0.7278, "reward": 6.89025542140007, "reward_std": 0.024933231994509697, "rewards/concensus_correctness_reward_func": 2.044499985873699, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9941929578781128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 186 }, { "completion_length": 121.40625, "epoch": 5.371428571428572, "grad_norm": 26.308658599853516, "kl": 0.40540319198044017, "learning_rate": 1.1038713802214717e-08, "loss": 0.0004, "reward": 7.417273789644241, "reward_std": 0.0028156833723187447, "rewards/concensus_correctness_reward_func": 2.2954999804496765, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.996773824095726, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 188 }, { "completion_length": 120.0625, "epoch": 5.428571428571429, "grad_norm": 0.20397737622261047, "kl": 0.1569939429173246, "learning_rate": 7.91175778546288e-09, "loss": 0.0002, "reward": 7.27496874332428, "reward_std": 0.027621358633041382, "rewards/concensus_correctness_reward_func": 2.294499985873699, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 190 }, { "completion_length": 126.28125, "epoch": 5.485714285714286, "grad_norm": 0.02739715203642845, "kl": 0.04604100895812735, "learning_rate": 5.3009343818219975e-09, "loss": 0.0, "reward": 6.741377204656601, "reward_std": 0.002825208706781268, "rewards/concensus_correctness_reward_func": 1.8683749809861183, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9980022832751274, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 192 }, { "completion_length": 125.25, "epoch": 5.542857142857143, "grad_norm": 0.047832246869802475, "kl": 0.05312615679576993, "learning_rate": 3.2089819845111944e-09, "loss": 0.0001, "reward": 6.788500040769577, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.0384999737143517, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 194 }, { "completion_length": 130.9375, "epoch": 5.6, "grad_norm": 14.318893432617188, "kl": 1.7824806793942116, "learning_rate": 1.638094762715314e-09, "loss": 0.0018, "reward": 7.678025424480438, "reward_std": 0.0315791592001915, "rewards/concensus_correctness_reward_func": 2.328750006854534, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.974275503307581, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 196 }, { "completion_length": 122.1875, "epoch": 5.6571428571428575, "grad_norm": 24.025115966796875, "kl": 0.2889690902666189, "learning_rate": 5.899203602046654e-10, "loss": 0.0003, "reward": 6.572003036737442, "reward_std": 0.03199221845716238, "rewards/concensus_correctness_reward_func": 1.8446249887347221, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9969093427062035, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 198 }, { "completion_length": 145.59375, "epoch": 5.714285714285714, "grad_norm": 21.43385124206543, "kl": 9.787296281545423, "learning_rate": 6.555816718389895e-11, "loss": 0.0098, "reward": 7.064323574304581, "reward_std": 0.1496257558465004, "rewards/concensus_correctness_reward_func": 2.1701249852776527, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9606048539280891, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.21484375, "step": 200 }, { "epoch": 5.714285714285714, "step": 200, "total_flos": 0.0, "train_loss": 28842.747129663287, "train_runtime": 1262.9855, "train_samples_per_second": 2.534, "train_steps_per_second": 0.158 } ], "logging_steps": 2, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }