Files
Qwen2.5-0.5B-Instruct-Gensy…/trainer_state.json

1944 lines
72 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.714285714285714,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 300.15625,
"epoch": 0.05714285714285714,
"grad_norm": 17.426481246948242,
"kl": 0.0,
"learning_rate": 1.6666666666666665e-07,
"loss": -0.0,
"reward": 3.95498262392357,
"reward_std": 0.699341673636809,
"rewards/concensus_correctness_reward_func": 1.2692499943077564,
"rewards/consensus_reward_func": 0.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.7033888604491949,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.234375,
"rewards/xmlcount_reward_func": 0.5604687528684735,
"step": 2
},
{
"completion_length": 121.5,
"epoch": 0.11428571428571428,
"grad_norm": 13.969701766967773,
"kl": 0.01049548169839909,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 6.702177166938782,
"reward_std": 0.05493173561990261,
"rewards/concensus_correctness_reward_func": 1.9659999907016754,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9393021687865257,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.21875,
"step": 4
},
{
"completion_length": 125.96875,
"epoch": 0.17142857142857143,
"grad_norm": 0.03055919148027897,
"kl": 0.01876932127197506,
"learning_rate": 8.333333333333333e-07,
"loss": 0.0,
"reward": 7.562659442424774,
"reward_std": 0.1466759592294693,
"rewards/concensus_correctness_reward_func": 2.416374996304512,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.9744094498455524,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 6
},
{
"completion_length": 139.78125,
"epoch": 0.22857142857142856,
"grad_norm": 10.420638084411621,
"kl": 0.46951429128239397,
"learning_rate": 9.99934441832816e-07,
"loss": 0.0005,
"reward": 7.0254947245121,
"reward_std": 0.20259349179104902,
"rewards/concensus_correctness_reward_func": 2.1086249873042107,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9793697707355022,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 8
},
{
"completion_length": 118.625,
"epoch": 0.2857142857142857,
"grad_norm": 0.7034396529197693,
"kl": 0.5558207163994666,
"learning_rate": 9.994100796397953e-07,
"loss": 0.0006,
"reward": 6.637874931097031,
"reward_std": 0.04419417306780815,
"rewards/concensus_correctness_reward_func": 1.919124998152256,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 10
},
{
"completion_length": 128.15625,
"epoch": 0.34285714285714286,
"grad_norm": 0.3636714816093445,
"kl": 0.360770606843289,
"learning_rate": 9.983619052372847e-07,
"loss": 0.0004,
"reward": 7.086013972759247,
"reward_std": 0.1177134495228529,
"rewards/concensus_correctness_reward_func": 2.16924998909235,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9619827643036842,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.220406249165535,
"step": 12
},
{
"completion_length": 124.78125,
"epoch": 0.4,
"grad_norm": 0.39200520515441895,
"kl": 0.1974896949250251,
"learning_rate": 9.967910180154888e-07,
"loss": 0.0002,
"reward": 7.418499946594238,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.2934999763965607,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 14
},
{
"completion_length": 123.96875,
"epoch": 0.45714285714285713,
"grad_norm": 0.08412964642047882,
"kl": 0.0694609482306987,
"learning_rate": 9.946990656181779e-07,
"loss": 0.0001,
"reward": 6.889249950647354,
"reward_std": 0.04419417306780815,
"rewards/concensus_correctness_reward_func": 2.0454999804496765,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 16
},
{
"completion_length": 132.78125,
"epoch": 0.5142857142857142,
"grad_norm": 13294.4853515625,
"kl": 1929.7287216553232,
"learning_rate": 9.92088242214537e-07,
"loss": 1.9297,
"reward": 6.682024419307709,
"reward_std": 0.29592345282435417,
"rewards/concensus_correctness_reward_func": 1.975374985486269,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9722743965685368,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.21875,
"step": 18
},
{
"completion_length": 125.90625,
"epoch": 0.5714285714285714,
"grad_norm": 0.12487129867076874,
"kl": 0.062417162815108895,
"learning_rate": 9.889612861977853e-07,
"loss": 0.0001,
"reward": 7.668124943971634,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.418124996125698,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 20
},
{
"completion_length": 127.65625,
"epoch": 0.6285714285714286,
"grad_norm": 19.497220993041992,
"kl": 4.291850817389786,
"learning_rate": 9.853214773129795e-07,
"loss": 0.0043,
"reward": 6.502693980932236,
"reward_std": 0.052721514366567135,
"rewards/concensus_correctness_reward_func": 1.9194999784231186,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9738190732896328,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 22
},
{
"completion_length": 116.53125,
"epoch": 0.6857142857142857,
"grad_norm": 14.217891693115234,
"kl": 0.40878968324977905,
"learning_rate": 9.81172633217015e-07,
"loss": 0.0004,
"reward": 6.780536770820618,
"reward_std": 0.14366747711028438,
"rewards/concensus_correctness_reward_func": 1.8821249827742577,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.996068000793457,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 24
},
{
"completion_length": 122.8125,
"epoch": 0.7428571428571429,
"grad_norm": 21.322853088378906,
"kl": 2.3591567099792883,
"learning_rate": 9.765191054744304e-07,
"loss": 0.0024,
"reward": 6.85475081205368,
"reward_std": 0.02316407673060894,
"rewards/concensus_correctness_reward_func": 2.064124978147447,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.970313299447298,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 26
},
{
"completion_length": 119.15625,
"epoch": 0.8,
"grad_norm": 0.48698848485946655,
"kl": 0.04026831721421331,
"learning_rate": 9.713657749932171e-07,
"loss": 0.0,
"reward": 7.164504557847977,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.1684999763965607,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9960045665502548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 28
},
{
"completion_length": 119.875,
"epoch": 0.8571428571428571,
"grad_norm": 50.563270568847656,
"kl": 0.13291181785461958,
"learning_rate": 9.657180469054212e-07,
"loss": 0.0001,
"reward": 7.1191529631614685,
"reward_std": 0.0648374930024147,
"rewards/concensus_correctness_reward_func": 2.1649999916553497,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9893092103302479,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 30
},
{
"completion_length": 120.5625,
"epoch": 0.9142857142857143,
"grad_norm": 2.136101007461548,
"kl": 0.061591801641043276,
"learning_rate": 9.59581844897906e-07,
"loss": 0.0001,
"reward": 6.7411342561244965,
"reward_std": 0.005238220939645544,
"rewards/concensus_correctness_reward_func": 1.874749980866909,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9913843646645546,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 32
},
{
"completion_length": 124.40625,
"epoch": 0.9714285714285714,
"grad_norm": 1.9825115203857422,
"kl": 0.30299730040133,
"learning_rate": 9.529636049992233e-07,
"loss": 0.0003,
"reward": 6.669965773820877,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.920499987900257,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.999465811997652,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 34
},
{
"completion_length": 129.1875,
"epoch": 1.0285714285714285,
"grad_norm": 0.25820305943489075,
"kl": 0.10265114883077331,
"learning_rate": 9.458702688291071e-07,
"loss": 0.0001,
"reward": 6.62774994969368,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.8777499794960022,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 36
},
{
"completion_length": 123.28125,
"epoch": 1.0857142857142856,
"grad_norm": 1.8184473514556885,
"kl": 0.3953003305650782,
"learning_rate": 9.383092763176738e-07,
"loss": 0.0004,
"reward": 7.328000098466873,
"reward_std": 0.022097086533904076,
"rewards/concensus_correctness_reward_func": 2.2186249792575836,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 38
},
{
"completion_length": 135.28125,
"epoch": 1.1428571428571428,
"grad_norm": 160.05328369140625,
"kl": 35.685647501493804,
"learning_rate": 9.302885579019626e-07,
"loss": 0.0357,
"reward": 6.5664326548576355,
"reward_std": 0.14593601133674383,
"rewards/concensus_correctness_reward_func": 1.9196249842643738,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9788388833403587,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 40
},
{
"completion_length": 131.625,
"epoch": 1.2,
"grad_norm": 39835.796875,
"kl": 3820.177204770036,
"learning_rate": 9.218165262080022e-07,
"loss": 3.8202,
"reward": 7.211937814950943,
"reward_std": 0.0015021136496216059,
"rewards/concensus_correctness_reward_func": 2.2130000069737434,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9989378377795219,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 42
},
{
"completion_length": 127.25,
"epoch": 1.2571428571428571,
"grad_norm": 0.04504234343767166,
"kl": 0.05154561816016212,
"learning_rate": 9.129020672271281e-07,
"loss": 0.0001,
"reward": 6.667250007390976,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.9172499775886536,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 44
},
{
"completion_length": 118.40625,
"epoch": 1.3142857142857143,
"grad_norm": 23.088890075683594,
"kl": 2.3278085422934964,
"learning_rate": 9.035545309958046e-07,
"loss": 0.0024,
"reward": 6.5357484221458435,
"reward_std": 0.6359185018609423,
"rewards/concensus_correctness_reward_func": 1.8128749802708626,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.9924046844244003,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 46
},
{
"completion_length": 116.53125,
"epoch": 1.3714285714285714,
"grad_norm": 34.027565002441406,
"kl": 3.6760852289153263,
"learning_rate": 8.937837217887272e-07,
"loss": 0.0037,
"reward": 7.155503273010254,
"reward_std": 0.005231709423242137,
"rewards/concensus_correctness_reward_func": 2.1656249910593033,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9898783266544342,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 48
},
{
"completion_length": 133.34375,
"epoch": 1.4285714285714286,
"grad_norm": 225.2295379638672,
"kl": 1160.2006183795165,
"learning_rate": 8.83599887835493e-07,
"loss": 1.1602,
"reward": 6.594452530145645,
"reward_std": 0.10580936633050442,
"rewards/concensus_correctness_reward_func": 1.9199999868869781,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9596088528633118,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 50
},
{
"completion_length": 116.0625,
"epoch": 1.4857142857142858,
"grad_norm": 0.057164259254932404,
"kl": 0.04885981511324644,
"learning_rate": 8.73013710571623e-07,
"loss": 0.0,
"reward": 6.660125017166138,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.9101249799132347,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 52
},
{
"completion_length": 115.84375,
"epoch": 1.5428571428571427,
"grad_norm": 809270.9375,
"kl": 13683351.59927877,
"learning_rate": 8.620362934352108e-07,
"loss": 13683.3525,
"reward": 7.599349647760391,
"reward_std": 0.0883883461356163,
"rewards/concensus_correctness_reward_func": 2.4121249988675117,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.9997246712446213,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 54
},
{
"completion_length": 137.53125,
"epoch": 1.6,
"grad_norm": 27.686599731445312,
"kl": 1.2052045605960302,
"learning_rate": 8.506791502209496e-07,
"loss": 0.0012,
"reward": 6.557819962501526,
"reward_std": 0.09846582496538758,
"rewards/concensus_correctness_reward_func": 1.9089999869465828,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9378825686872005,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 56
},
{
"completion_length": 119.625,
"epoch": 1.657142857142857,
"grad_norm": 0.09275330603122711,
"kl": 0.4611619641073048,
"learning_rate": 8.389541930037516e-07,
"loss": 0.0005,
"reward": 7.025811791419983,
"reward_std": 0.20320578664541245,
"rewards/concensus_correctness_reward_func": 2.109499979764223,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9944368153810501,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 58
},
{
"completion_length": 125.25,
"epoch": 1.7142857142857144,
"grad_norm": 16.619831085205078,
"kl": 0.32900134613737464,
"learning_rate": 8.268737196446263e-07,
"loss": 0.0003,
"reward": 7.025674343109131,
"reward_std": 0.04453253000974655,
"rewards/concensus_correctness_reward_func": 2.0638749888166785,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9852368384599686,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 60
},
{
"completion_length": 122.96875,
"epoch": 1.7714285714285714,
"grad_norm": 0.030277982354164124,
"kl": 0.046715385746210814,
"learning_rate": 8.144504008919222e-07,
"loss": 0.0,
"reward": 6.91824996471405,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.0432499945163727,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 62
},
{
"completion_length": 119.65625,
"epoch": 1.8285714285714287,
"grad_norm": 0.02598767727613449,
"kl": 0.7929197051562369,
"learning_rate": 8.016972670914623e-07,
"loss": 0.0008,
"reward": 7.665361404418945,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.4198749884963036,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9954864755272865,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 64
},
{
"completion_length": 130.28125,
"epoch": 1.8857142857142857,
"grad_norm": 15.967117309570312,
"kl": 1.8949127969099209,
"learning_rate": 7.886276945195097e-07,
"loss": 0.0019,
"reward": 6.652400374412537,
"reward_std": 0.023298587650060654,
"rewards/concensus_correctness_reward_func": 1.918874979019165,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9835254140198231,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 66
},
{
"completion_length": 129.34375,
"epoch": 1.9428571428571428,
"grad_norm": 97.32298278808594,
"kl": 34.17437303136103,
"learning_rate": 7.752553913529018e-07,
"loss": 0.0342,
"reward": 7.380637466907501,
"reward_std": 0.013560200110077858,
"rewards/concensus_correctness_reward_func": 2.2669999822974205,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9886375181376934,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 68
},
{
"completion_length": 120.09375,
"epoch": 2.0,
"grad_norm": 6060796.5,
"kl": 790862.3630039439,
"learning_rate": 7.61594383291065e-07,
"loss": 790.8625,
"reward": 7.042518824338913,
"reward_std": 0.0015643856022506952,
"rewards/concensus_correctness_reward_func": 2.1686249747872353,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9988938048481941,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 70
},
{
"completion_length": 130.0,
"epoch": 2.057142857142857,
"grad_norm": 5.242808818817139,
"kl": 0.1041660463088192,
"learning_rate": 7.476589988449938e-07,
"loss": 0.0001,
"reward": 6.87304162979126,
"reward_std": 0.06393423862755299,
"rewards/concensus_correctness_reward_func": 2.043249987065792,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9938541650772095,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 72
},
{
"completion_length": 128.21875,
"epoch": 2.1142857142857143,
"grad_norm": 0.26571622490882874,
"kl": 0.12202630296815187,
"learning_rate": 7.334638543086203e-07,
"loss": 0.0001,
"reward": 7.147529572248459,
"reward_std": 0.029656609520316124,
"rewards/concensus_correctness_reward_func": 2.1684999763965607,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9985608533024788,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 74
},
{
"completion_length": 124.21875,
"epoch": 2.1714285714285713,
"grad_norm": 1.5463571548461914,
"kl": 8.507067229540553,
"learning_rate": 7.190238384283412e-07,
"loss": 0.0085,
"reward": 7.2041696310043335,
"reward_std": 0.06748650316148996,
"rewards/concensus_correctness_reward_func": 2.128124989569187,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9744821637868881,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 76
},
{
"completion_length": 121.34375,
"epoch": 2.2285714285714286,
"grad_norm": 0.041979704052209854,
"kl": 0.032122639124281704,
"learning_rate": 7.043540967867781e-07,
"loss": 0.0,
"reward": 6.918124943971634,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.043124981224537,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 78
},
{
"completion_length": 129.90625,
"epoch": 2.2857142857142856,
"grad_norm": 0.04424755647778511,
"kl": 50.4671565569588,
"learning_rate": 6.894700159171534e-07,
"loss": 0.0505,
"reward": 7.512331336736679,
"reward_std": 0.042842648923397064,
"rewards/concensus_correctness_reward_func": 2.4197499975562096,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9675813280045986,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 80
},
{
"completion_length": 122.9375,
"epoch": 2.342857142857143,
"grad_norm": 0.009761347435414791,
"kl": 0.03142669008229859,
"learning_rate": 6.743872071649411e-07,
"loss": 0.0,
"reward": 6.914698511362076,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.0433749929070473,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9963235296308994,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 82
},
{
"completion_length": 112.25,
"epoch": 2.4,
"grad_norm": 0.03034272789955139,
"kl": 0.044959868711885065,
"learning_rate": 6.59121490313722e-07,
"loss": 0.0,
"reward": 6.688625007867813,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.9386249845847487,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 84
},
{
"completion_length": 119.59375,
"epoch": 2.4571428571428573,
"grad_norm": 13.426407814025879,
"kl": 1.5327934579690918,
"learning_rate": 6.436888769924141e-07,
"loss": 0.0015,
"reward": 7.0252565741539,
"reward_std": 0.08249054872430861,
"rewards/concensus_correctness_reward_func": 2.0841249749064445,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9762878231704235,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 86
},
{
"completion_length": 127.625,
"epoch": 2.5142857142857142,
"grad_norm": 3.1005895137786865,
"kl": 0.7444930193014443,
"learning_rate": 6.281055538812861e-07,
"loss": 0.0007,
"reward": 6.580017119646072,
"reward_std": 0.06936378590762615,
"rewards/concensus_correctness_reward_func": 1.7627499774098396,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9930483475327492,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 88
},
{
"completion_length": 132.625,
"epoch": 2.571428571428571,
"grad_norm": 2.983717679977417,
"kl": 9.172500962711638,
"learning_rate": 6.123878657343647e-07,
"loss": 0.0092,
"reward": 6.720099925994873,
"reward_std": 0.280580037884647,
"rewards/concensus_correctness_reward_func": 1.92112497985363,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9864749535918236,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 90
},
{
"completion_length": 117.65625,
"epoch": 2.6285714285714286,
"grad_norm": 24.54834747314453,
"kl": 2860627968.304006,
"learning_rate": 5.96552298236044e-07,
"loss": 2860628.0,
"reward": 7.669991672039032,
"reward_std": 0.002275592749356292,
"rewards/concensus_correctness_reward_func": 2.421749994158745,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9982417523860931,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 92
},
{
"completion_length": 114.40625,
"epoch": 2.685714285714286,
"grad_norm": 8755.4248046875,
"kl": 1104.330926183262,
"learning_rate": 5.806154607098799e-07,
"loss": 1.1043,
"reward": 6.758044958114624,
"reward_std": 0.14052551635541022,
"rewards/concensus_correctness_reward_func": 2.0443749874830246,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9128886782564223,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.22265625,
"step": 94
},
{
"completion_length": 121.03125,
"epoch": 2.742857142857143,
"grad_norm": 50945.7890625,
"kl": 814.7917639114894,
"learning_rate": 5.645940686977032e-07,
"loss": 0.8148,
"reward": 6.610010415315628,
"reward_std": 0.08342374488711357,
"rewards/concensus_correctness_reward_func": 1.9189999848604202,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.968354269862175,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23828125,
"step": 96
},
{
"completion_length": 125.0,
"epoch": 2.8,
"grad_norm": 47.311058044433594,
"kl": 14.8274484872818,
"learning_rate": 5.485049264273241e-07,
"loss": 0.0148,
"reward": 7.000439822673798,
"reward_std": 0.09996174834668636,
"rewards/concensus_correctness_reward_func": 2.168874979019165,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.921408599242568,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 98
},
{
"completion_length": 128.9375,
"epoch": 2.857142857142857,
"grad_norm": 18.47180938720703,
"kl": 0.5244096268434078,
"learning_rate": 5.323649091872178e-07,
"loss": 0.0005,
"reward": 7.01810696721077,
"reward_std": 0.02690199576318264,
"rewards/concensus_correctness_reward_func": 2.166124999523163,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.976981982588768,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 100
},
{
"completion_length": 133.21875,
"epoch": 2.914285714285714,
"grad_norm": 91.73603057861328,
"kl": 6.441082598932553,
"learning_rate": 5.16190945626678e-07,
"loss": 0.0064,
"reward": 6.685827881097794,
"reward_std": 0.11496807099319994,
"rewards/concensus_correctness_reward_func": 2.0406249836087227,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9225466102361679,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.421875,
"rewards/xmlcount_reward_func": 1.17578125,
"step": 102
},
{
"completion_length": 116.25,
"epoch": 2.9714285714285715,
"grad_norm": 0.03504316136240959,
"kl": 2.1308261496014893,
"learning_rate": 5e-07,
"loss": 0.0021,
"reward": 7.162474691867828,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.162749983370304,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9997246712446213,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 104
},
{
"completion_length": 132.4375,
"epoch": 3.0285714285714285,
"grad_norm": 8.74528980255127,
"kl": 4.539779104059562,
"learning_rate": 4.838090543733221e-07,
"loss": 0.0045,
"reward": 6.900434046983719,
"reward_std": 0.3393835090100765,
"rewards/concensus_correctness_reward_func": 2.0893749855458736,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.9673090130090714,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 106
},
{
"completion_length": 128.25,
"epoch": 3.085714285714286,
"grad_norm": 14.53085708618164,
"kl": 2.8838787593413144,
"learning_rate": 4.676350908127821e-07,
"loss": 0.0029,
"reward": 6.811203122138977,
"reward_std": 0.011762974609155208,
"rewards/concensus_correctness_reward_func": 2.0439999774098396,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9390781422844157,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.234375,
"step": 108
},
{
"completion_length": 151.625,
"epoch": 3.142857142857143,
"grad_norm": 0.030175382271409035,
"kl": 0.15155013505136594,
"learning_rate": 4.5149507357267597e-07,
"loss": 0.0002,
"reward": 6.892297387123108,
"reward_std": 0.028420399874448776,
"rewards/concensus_correctness_reward_func": 2.0434999987483025,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9737974181771278,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 110
},
{
"completion_length": 151.1875,
"epoch": 3.2,
"grad_norm": 18.525300979614258,
"kl": 3.5956702362163924,
"learning_rate": 4.354059313022969e-07,
"loss": 0.0036,
"reward": 6.804599553346634,
"reward_std": 0.15393569320440292,
"rewards/concensus_correctness_reward_func": 2.0421249717473984,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9421620704233646,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 112
},
{
"completion_length": 130.84375,
"epoch": 3.257142857142857,
"grad_norm": 17.65426254272461,
"kl": 6.075715421116911,
"learning_rate": 4.193845392901201e-07,
"loss": 0.0061,
"reward": 7.065226078033447,
"reward_std": 0.11806740239262581,
"rewards/concensus_correctness_reward_func": 2.1596249863505363,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9602885581552982,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 114
},
{
"completion_length": 127.53125,
"epoch": 3.314285714285714,
"grad_norm": 0.08821436017751694,
"kl": 133.908757203375,
"learning_rate": 4.0344770176395606e-07,
"loss": 0.1339,
"reward": 6.4563820362091064,
"reward_std": 0.009752611629664898,
"rewards/concensus_correctness_reward_func": 1.9166249781847,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9772570580244064,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.21875,
"step": 116
},
{
"completion_length": 126.4375,
"epoch": 3.3714285714285714,
"grad_norm": 5.672122955322266,
"kl": 0.2845922822598368,
"learning_rate": 3.8761213426563543e-07,
"loss": 0.0003,
"reward": 6.744957059621811,
"reward_std": 0.23483475990360603,
"rewards/concensus_correctness_reward_func": 1.9836249835789204,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9878945499658585,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 118
},
{
"completion_length": 122.46875,
"epoch": 3.4285714285714284,
"grad_norm": 11.123212814331055,
"kl": 2.7103491379966727,
"learning_rate": 3.718944461187138e-07,
"loss": 0.0027,
"reward": 7.969624936580658,
"reward_std": 0.13258254528045654,
"rewards/concensus_correctness_reward_func": 2.5633749878033996,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 120
},
{
"completion_length": 118.375,
"epoch": 3.4857142857142858,
"grad_norm": 1.7454348802566528,
"kl": 0.32422800606582314,
"learning_rate": 3.563111230075859e-07,
"loss": 0.0003,
"reward": 6.972374975681305,
"reward_std": 0.04419417306780815,
"rewards/concensus_correctness_reward_func": 2.003624975681305,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 122
},
{
"completion_length": 111.71875,
"epoch": 3.5428571428571427,
"grad_norm": 0.019805356860160828,
"kl": 9161774.07055326,
"learning_rate": 3.408785096862782e-07,
"loss": 9161.7734,
"reward": 7.169749945402145,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.1697499975562096,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 124
},
{
"completion_length": 126.78125,
"epoch": 3.6,
"grad_norm": 5.305757522583008,
"kl": 0.40615585184423253,
"learning_rate": 3.2561279283505884e-07,
"loss": 0.0004,
"reward": 7.333453685045242,
"reward_std": 0.22105737775564194,
"rewards/concensus_correctness_reward_func": 2.2409999817609787,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9987036660313606,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.234375,
"step": 126
},
{
"completion_length": 120.75,
"epoch": 3.657142857142857,
"grad_norm": 3.495209217071533,
"kl": 0.09194360801484436,
"learning_rate": 3.105299840828466e-07,
"loss": 0.0001,
"reward": 6.911025762557983,
"reward_std": 0.00014028578880243003,
"rewards/concensus_correctness_reward_func": 2.036124996840954,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.999900795519352,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 128
},
{
"completion_length": 128.96875,
"epoch": 3.7142857142857144,
"grad_norm": 0.07500150054693222,
"kl": 0.16381528400233947,
"learning_rate": 2.95645903213222e-07,
"loss": 0.0002,
"reward": 6.640499949455261,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.8904999792575836,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 130
},
{
"completion_length": 119.8125,
"epoch": 3.7714285714285714,
"grad_norm": 0.019938524812459946,
"kl": 1.173585046082735,
"learning_rate": 2.8097616157165885e-07,
"loss": 0.0012,
"reward": 7.227047026157379,
"reward_std": 0.09221191157121211,
"rewards/concensus_correctness_reward_func": 2.294374980032444,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9834532365202904,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 132
},
{
"completion_length": 123.40625,
"epoch": 3.8285714285714287,
"grad_norm": 0.01626521907746792,
"kl": 0.05231146275764331,
"learning_rate": 2.665361456913797e-07,
"loss": 0.0001,
"reward": 6.836877226829529,
"reward_std": 0.002825208706781268,
"rewards/concensus_correctness_reward_func": 1.963874988257885,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9980022832751274,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 134
},
{
"completion_length": 118.125,
"epoch": 3.8857142857142857,
"grad_norm": 0.039282504469156265,
"kl": 0.046510634711012244,
"learning_rate": 2.523410011550064e-07,
"loss": 0.0,
"reward": 6.843999952077866,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 1.9689999967813492,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 136
},
{
"completion_length": 120.96875,
"epoch": 3.942857142857143,
"grad_norm": 252.50936889648438,
"kl": 87.47745934262639,
"learning_rate": 2.3840561670893495e-07,
"loss": 0.0875,
"reward": 6.647455930709839,
"reward_std": 0.030291149392724037,
"rewards/concensus_correctness_reward_func": 1.918874979019165,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.994205929338932,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 138
},
{
"completion_length": 130.15625,
"epoch": 4.0,
"grad_norm": 19.294851303100586,
"kl": 0.41531820816453546,
"learning_rate": 2.247446086470982e-07,
"loss": 0.0004,
"reward": 6.778987288475037,
"reward_std": 0.02229178324341774,
"rewards/concensus_correctness_reward_func": 2.0447499975562096,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9998623356223106,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 140
},
{
"completion_length": 120.90625,
"epoch": 4.057142857142857,
"grad_norm": 0.03657761588692665,
"kl": 0.06334134587086737,
"learning_rate": 2.113723054804904e-07,
"loss": 0.0001,
"reward": 6.919625014066696,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.044624984264374,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 142
},
{
"completion_length": 118.46875,
"epoch": 4.114285714285714,
"grad_norm": 80.59430694580078,
"kl": 18.28185724944342,
"learning_rate": 1.9830273290853766e-07,
"loss": 0.0183,
"reward": 7.364194869995117,
"reward_std": 0.07015768438577652,
"rewards/concensus_correctness_reward_func": 2.2896249666810036,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.98472610861063,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 144
},
{
"completion_length": 120.25,
"epoch": 4.171428571428572,
"grad_norm": 0.026064695790410042,
"kl": 0.45868788298685104,
"learning_rate": 1.8554959910807772e-07,
"loss": 0.0005,
"reward": 6.647747337818146,
"reward_std": 0.02520996890962124,
"rewards/concensus_correctness_reward_func": 1.9192499816417694,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9784973822534084,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 146
},
{
"completion_length": 125.9375,
"epoch": 4.228571428571429,
"grad_norm": 3.1762712001800537,
"kl": 6.2984363965224475,
"learning_rate": 1.7312628035537386e-07,
"loss": 0.0063,
"reward": 7.706711947917938,
"reward_std": 0.0663449972635135,
"rewards/concensus_correctness_reward_func": 2.3786249980330467,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.968712005764246,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 148
},
{
"completion_length": 128.21875,
"epoch": 4.285714285714286,
"grad_norm": 0.71825110912323,
"kl": 0.12271159124793485,
"learning_rate": 1.6104580699624837e-07,
"loss": 0.0001,
"reward": 6.778389781713486,
"reward_std": 0.022097086533904076,
"rewards/concensus_correctness_reward_func": 2.0452499836683273,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9987648203969002,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 150
},
{
"completion_length": 115.53125,
"epoch": 4.3428571428571425,
"grad_norm": 434.4410705566406,
"kl": 51.27761673851637,
"learning_rate": 1.493208497790504e-07,
"loss": 0.0513,
"reward": 6.786718785762787,
"reward_std": 0.027621358633041382,
"rewards/concensus_correctness_reward_func": 1.9312499752268195,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 152
},
{
"completion_length": 127.96875,
"epoch": 4.4,
"grad_norm": 1.3492827415466309,
"kl": 6.197932193404995,
"learning_rate": 1.3796370656478934e-07,
"loss": 0.0062,
"reward": 6.995270878076553,
"reward_std": 0.18507131934165955,
"rewards/concensus_correctness_reward_func": 2.110624987632036,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9627709090709686,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 154
},
{
"completion_length": 120.875,
"epoch": 4.457142857142857,
"grad_norm": 8.5289888381958,
"kl": 120.79435319965705,
"learning_rate": 1.2698628942837697e-07,
"loss": 0.1208,
"reward": 7.774048000574112,
"reward_std": 0.0011493656784296036,
"rewards/concensus_correctness_reward_func": 2.5381249859929085,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.985922958701849,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 156
},
{
"completion_length": 130.1875,
"epoch": 4.514285714285714,
"grad_norm": 33.96376037597656,
"kl": 2.418273651972413,
"learning_rate": 1.1640011216450691e-07,
"loss": 0.0024,
"reward": 6.574308484792709,
"reward_std": 0.1323229782283306,
"rewards/concensus_correctness_reward_func": 1.9178749769926071,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9611210152506828,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 158
},
{
"completion_length": 125.4375,
"epoch": 4.571428571428571,
"grad_norm": 2.4072132110595703,
"kl": 0.4212528702628333,
"learning_rate": 1.0621627821127288e-07,
"loss": 0.0004,
"reward": 7.118884056806564,
"reward_std": 0.07193531304073986,
"rewards/concensus_correctness_reward_func": 2.169749990105629,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9999153092503548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 160
},
{
"completion_length": 126.46875,
"epoch": 4.628571428571428,
"grad_norm": 0.01093831192702055,
"kl": 1.1463339874171652,
"learning_rate": 9.644546900419531e-08,
"loss": 0.0011,
"reward": 6.793500006198883,
"reward_std": 0.17306438088417053,
"rewards/concensus_correctness_reward_func": 1.9809999950230122,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 162
},
{
"completion_length": 129.53125,
"epoch": 4.685714285714286,
"grad_norm": 2244.8427734375,
"kl": 333.1012824997306,
"learning_rate": 8.70979327728718e-08,
"loss": 0.3331,
"reward": 6.814886599779129,
"reward_std": 0.025744116224814206,
"rewards/concensus_correctness_reward_func": 2.0433749854564667,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9511991124600172,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 164
},
{
"completion_length": 129.15625,
"epoch": 4.742857142857143,
"grad_norm": 17.648225784301758,
"kl": 2.1229274289216846,
"learning_rate": 7.81834737919978e-08,
"loss": 0.0021,
"reward": 7.373527824878693,
"reward_std": 0.051579396531451494,
"rewards/concensus_correctness_reward_func": 2.2849999964237213,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9791528545320034,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 166
},
{
"completion_length": 121.84375,
"epoch": 4.8,
"grad_norm": 17.635286331176758,
"kl": 0.524610364343971,
"learning_rate": 6.971144209803736e-08,
"loss": 0.0005,
"reward": 6.5660789012908936,
"reward_std": 0.13991801149677485,
"rewards/concensus_correctness_reward_func": 1.9189999923110008,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9634852148592472,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 168
},
{
"completion_length": 132.78125,
"epoch": 4.857142857142857,
"grad_norm": 0.019757816568017006,
"kl": 0.8419713787152432,
"learning_rate": 6.16907236823262e-08,
"loss": 0.0008,
"reward": 6.322874933481216,
"reward_std": 0.19533824920654297,
"rewards/concensus_correctness_reward_func": 1.7759999670088291,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.25,
"step": 170
},
{
"completion_length": 134.03125,
"epoch": 4.914285714285715,
"grad_norm": 241.08810424804688,
"kl": 24.628016122151166,
"learning_rate": 5.412973117089287e-08,
"loss": 0.0246,
"reward": 6.505185455083847,
"reward_std": 0.031045368872582912,
"rewards/concensus_correctness_reward_func": 1.7921249866485596,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9083729842677712,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2109375,
"step": 172
},
{
"completion_length": 127.25,
"epoch": 4.9714285714285715,
"grad_norm": 21.49008560180664,
"kl": 9.178197413566522,
"learning_rate": 4.703639500077655e-08,
"loss": 0.0092,
"reward": 7.088774770498276,
"reward_std": 0.07385746456566267,
"rewards/concensus_correctness_reward_func": 2.1410000026226044,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9985561221837997,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 174
},
{
"completion_length": 128.28125,
"epoch": 5.0285714285714285,
"grad_norm": 4.017098426818848,
"kl": 0.2039044737466611,
"learning_rate": 4.041815510209395e-08,
"loss": 0.0002,
"reward": 7.131375730037689,
"reward_std": 0.28885310888290405,
"rewards/concensus_correctness_reward_func": 2.13349998742342,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9978756718337536,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 176
},
{
"completion_length": 127.75,
"epoch": 5.085714285714285,
"grad_norm": 4.335840702056885,
"kl": 0.6839758672285825,
"learning_rate": 3.4281953094578875e-08,
"loss": 0.0007,
"reward": 7.12287500500679,
"reward_std": 0.06629125960171223,
"rewards/concensus_correctness_reward_func": 2.1697499826550484,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.234375,
"step": 178
},
{
"completion_length": 160.46875,
"epoch": 5.142857142857143,
"grad_norm": 720.6825561523438,
"kl": 144.5732550881803,
"learning_rate": 2.8634225006782864e-08,
"loss": 0.1446,
"reward": 6.793639570474625,
"reward_std": 0.17524122472968884,
"rewards/concensus_correctness_reward_func": 2.043374978005886,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9455770738422871,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2109375,
"step": 180
},
{
"completion_length": 129.84375,
"epoch": 5.2,
"grad_norm": 5.253899097442627,
"kl": 1.940689492301317,
"learning_rate": 2.348089452556956e-08,
"loss": 0.0019,
"reward": 6.638930112123489,
"reward_std": 0.035586774349212646,
"rewards/concensus_correctness_reward_func": 2.042999990284443,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9396800906397402,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.203125,
"step": 182
},
{
"completion_length": 124.84375,
"epoch": 5.257142857142857,
"grad_norm": 0.04068833962082863,
"kl": 0.04214784048963338,
"learning_rate": 1.882736678298491e-08,
"loss": 0.0,
"reward": 7.167124956846237,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.1671249717473984,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 184
},
{
"completion_length": 131.25,
"epoch": 5.314285714285714,
"grad_norm": 6659.9755859375,
"kl": 727.840228227491,
"learning_rate": 1.4678522687020412e-08,
"loss": 0.7278,
"reward": 6.89025542140007,
"reward_std": 0.024933231994509697,
"rewards/concensus_correctness_reward_func": 2.044499985873699,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9941929578781128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 186
},
{
"completion_length": 121.40625,
"epoch": 5.371428571428572,
"grad_norm": 26.308658599853516,
"kl": 0.40540319198044017,
"learning_rate": 1.1038713802214717e-08,
"loss": 0.0004,
"reward": 7.417273789644241,
"reward_std": 0.0028156833723187447,
"rewards/concensus_correctness_reward_func": 2.2954999804496765,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.996773824095726,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 188
},
{
"completion_length": 120.0625,
"epoch": 5.428571428571429,
"grad_norm": 0.20397737622261047,
"kl": 0.1569939429173246,
"learning_rate": 7.91175778546288e-09,
"loss": 0.0002,
"reward": 7.27496874332428,
"reward_std": 0.027621358633041382,
"rewards/concensus_correctness_reward_func": 2.294499985873699,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 190
},
{
"completion_length": 126.28125,
"epoch": 5.485714285714286,
"grad_norm": 0.02739715203642845,
"kl": 0.04604100895812735,
"learning_rate": 5.3009343818219975e-09,
"loss": 0.0,
"reward": 6.741377204656601,
"reward_std": 0.002825208706781268,
"rewards/concensus_correctness_reward_func": 1.8683749809861183,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9980022832751274,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 192
},
{
"completion_length": 125.25,
"epoch": 5.542857142857143,
"grad_norm": 0.047832246869802475,
"kl": 0.05312615679576993,
"learning_rate": 3.2089819845111944e-09,
"loss": 0.0001,
"reward": 6.788500040769577,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.0384999737143517,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 194
},
{
"completion_length": 130.9375,
"epoch": 5.6,
"grad_norm": 14.318893432617188,
"kl": 1.7824806793942116,
"learning_rate": 1.638094762715314e-09,
"loss": 0.0018,
"reward": 7.678025424480438,
"reward_std": 0.0315791592001915,
"rewards/concensus_correctness_reward_func": 2.328750006854534,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.974275503307581,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 196
},
{
"completion_length": 122.1875,
"epoch": 5.6571428571428575,
"grad_norm": 24.025115966796875,
"kl": 0.2889690902666189,
"learning_rate": 5.899203602046654e-10,
"loss": 0.0003,
"reward": 6.572003036737442,
"reward_std": 0.03199221845716238,
"rewards/concensus_correctness_reward_func": 1.8446249887347221,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9969093427062035,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 198
},
{
"completion_length": 145.59375,
"epoch": 5.714285714285714,
"grad_norm": 21.43385124206543,
"kl": 9.787296281545423,
"learning_rate": 6.555816718389895e-11,
"loss": 0.0098,
"reward": 7.064323574304581,
"reward_std": 0.1496257558465004,
"rewards/concensus_correctness_reward_func": 2.1701249852776527,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9606048539280891,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 200
},
{
"epoch": 5.714285714285714,
"step": 200,
"total_flos": 0.0,
"train_loss": 28842.747129663287,
"train_runtime": 1262.9855,
"train_samples_per_second": 2.534,
"train_steps_per_second": 0.158
}
],
"logging_steps": 2,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}