Files
Qwen2.5-0.5B-Instruct-Gensy…/trainer_state.json

824 lines
33 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5633802816901409,
"eval_steps": 500,
"global_step": 20,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 14.0,
"completions/mean_terminated_length": 14.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.028169014084507043,
"grad_norm": 32.06034851074219,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0176,
"num_tokens": 568.0,
"reward": 0.020783159881830215,
"reward_std": 0.012284406460821629,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.020783159881830215,
"rewards/question_recreation_reward_func/std": 0.027877027168869972,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 63.0,
"completions/max_terminated_length": 63.0,
"completions/mean_length": 29.5,
"completions/mean_terminated_length": 29.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.056338028169014086,
"grad_norm": 6.299616813659668,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.0791,
"num_tokens": 1198.0,
"reward": 0.02762589044868946,
"reward_std": 0.0009784403955563903,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.02762589231133461,
"rewards/question_recreation_reward_func/std": 0.0051710638217628,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 448.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 118.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.08450704225352113,
"grad_norm": 3.1016180515289307,
"kl": 8.319292101077735e-05,
"learning_rate": 9.931806517013612e-07,
"loss": 0.3379,
"num_tokens": 2182.0,
"reward": 0.03331246227025986,
"reward_std": 0.0052673472091555595,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.03331245854496956,
"rewards/question_recreation_reward_func/std": 0.009135501459240913,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 27.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 13.0,
"completions/mean_terminated_length": 13.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.11267605633802817,
"grad_norm": 29.29948616027832,
"kl": 0.05142414569854736,
"learning_rate": 9.729086208503173e-07,
"loss": -0.1736,
"num_tokens": 2746.0,
"reward": 0.14608332514762878,
"reward_std": 0.007730965036898851,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.14608332514762878,
"rewards/question_recreation_reward_func/std": 0.009648437611758709,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 13.5,
"completions/mean_terminated_length": 13.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.14084507042253522,
"grad_norm": 40.53683853149414,
"kl": 0.011120198392518432,
"learning_rate": 9.397368756032444e-07,
"loss": 0.0004,
"num_tokens": 3312.0,
"reward": 0.08978645503520966,
"reward_std": 0.013956054113805294,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.08978645503520966,
"rewards/question_recreation_reward_func/std": 0.08247601985931396,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.16901408450704225,
"grad_norm": 0.06918946653604507,
"kl": 0.0004225224256515503,
"learning_rate": 8.945702546981968e-07,
"loss": 0.0,
"num_tokens": 3856.0,
"reward": 0.19858156144618988,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.19858156144618988,
"rewards/question_recreation_reward_func/std": 0.0,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 8.25,
"completions/mean_terminated_length": 8.25,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.19718309859154928,
"grad_norm": 41.10600280761719,
"kl": 0.06056380271911621,
"learning_rate": 8.386407858128706e-07,
"loss": -0.0184,
"num_tokens": 4401.0,
"reward": 0.03791220486164093,
"reward_std": 0.047372184693813324,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.006662202067673206,
"rewards/question_recreation_reward_func/std": 0.004442098084837198,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.03125,
"rewards/xmlcount_reward_func/std": 0.0625,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 21.0,
"completions/max_terminated_length": 21.0,
"completions/mean_length": 11.25,
"completions/mean_terminated_length": 11.25,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.22535211267605634,
"grad_norm": 84.13429260253906,
"kl": 0.9762128964066505,
"learning_rate": 7.734740790612136e-07,
"loss": 0.1866,
"num_tokens": 4958.0,
"reward": 0.019636042416095734,
"reward_std": 0.0071492972783744335,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.019636042416095734,
"rewards/question_recreation_reward_func/std": 0.00888961460441351,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 110.0,
"completions/max_terminated_length": 110.0,
"completions/mean_length": 37.75,
"completions/mean_terminated_length": 37.75,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.2535211267605634,
"grad_norm": 4.597474098205566,
"kl": 0.008542275987565517,
"learning_rate": 7.008477123264847e-07,
"loss": -0.2221,
"num_tokens": 5621.0,
"reward": 0.08780995011329651,
"reward_std": 0.08147402852773666,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.025309953838586807,
"rewards/question_recreation_reward_func/std": 0.00799198541790247,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0625,
"rewards/xmlcount_reward_func/std": 0.125,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.28169014084507044,
"grad_norm": 0.32836952805519104,
"kl": 0.010951906442642212,
"learning_rate": 6.227427435703995e-07,
"loss": 0.0004,
"num_tokens": 6165.0,
"reward": 0.02834007889032364,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.02834007889032364,
"rewards/question_recreation_reward_func/std": 0.014024702832102776,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 26.0,
"completions/max_terminated_length": 26.0,
"completions/mean_length": 12.5,
"completions/mean_terminated_length": 12.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.30985915492957744,
"grad_norm": 14.266270637512207,
"kl": 0.016900939866900444,
"learning_rate": 5.412896727361662e-07,
"loss": -0.1861,
"num_tokens": 6727.0,
"reward": 0.14206652343273163,
"reward_std": 0.02492138370871544,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.14206652343273163,
"rewards/question_recreation_reward_func/std": 0.04201439768075943,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 448.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 123.5,
"completions/mean_terminated_length": 15.333333015441895,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.3380281690140845,
"grad_norm": 1.9125043153762817,
"kl": 0.014537290277075954,
"learning_rate": 4.5871032726383385e-07,
"loss": -0.3405,
"num_tokens": 7733.0,
"reward": 0.5619847178459167,
"reward_std": 0.6531730890274048,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 1.0,
"rewards/question_recreation_reward_func/mean": 0.030734695494174957,
"rewards/question_recreation_reward_func/std": 0.019535422325134277,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.03125,
"rewards/xmlcount_reward_func/std": 0.0625,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 27.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 17.5,
"completions/mean_terminated_length": 17.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.36619718309859156,
"grad_norm": 0.005306577309966087,
"kl": 5.5371059715980664e-05,
"learning_rate": 3.772572564296004e-07,
"loss": 0.0,
"num_tokens": 8315.0,
"reward": 0.01907399296760559,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.01907399296760559,
"rewards/question_recreation_reward_func/std": 0.003249133238568902,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.39436619718309857,
"grad_norm": 0.021956074982881546,
"kl": 6.914883852005005e-05,
"learning_rate": 2.9915228767351535e-07,
"loss": 0.0,
"num_tokens": 8859.0,
"reward": 0.014652014710009098,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.014652014710009098,
"rewards/question_recreation_reward_func/std": 0.0,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4225352112676056,
"grad_norm": 0.48415833711624146,
"kl": 0.0022591277956962585,
"learning_rate": 2.2652592093878665e-07,
"loss": 0.0001,
"num_tokens": 9403.0,
"reward": 0.1094527393579483,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.1094527393579483,
"rewards/question_recreation_reward_func/std": 0.0,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 19.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 13.5,
"completions/mean_terminated_length": 13.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4507042253521127,
"grad_norm": 0.04596748575568199,
"kl": 0.0005076723755337298,
"learning_rate": 1.6135921418712955e-07,
"loss": 0.0,
"num_tokens": 9969.0,
"reward": 0.018024075776338577,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.018024075776338577,
"rewards/question_recreation_reward_func/std": 0.009574447758495808,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 448.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 122.75,
"completions/mean_terminated_length": 14.333333015441895,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.4788732394366197,
"grad_norm": 14.307421684265137,
"kl": 0.3401991333812475,
"learning_rate": 1.0542974530180327e-07,
"loss": 0.5429,
"num_tokens": 10972.0,
"reward": 0.0627230554819107,
"reward_std": 0.0639922246336937,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.031473059207201004,
"rewards/question_recreation_reward_func/std": 0.02106659673154354,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.03125,
"rewards/xmlcount_reward_func/std": 0.0625,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 43.0,
"completions/max_terminated_length": 43.0,
"completions/mean_length": 25.0,
"completions/mean_terminated_length": 25.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5070422535211268,
"grad_norm": 8.994085311889648,
"kl": 0.03856681424076669,
"learning_rate": 6.026312439675551e-08,
"loss": -0.1128,
"num_tokens": 11584.0,
"reward": 0.10017985105514526,
"reward_std": 0.0002543535956647247,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.10017985105514526,
"rewards/question_recreation_reward_func/std": 0.00035971030592918396,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 448.0,
"completions/max_terminated_length": 52.0,
"completions/mean_length": 137.5,
"completions/mean_terminated_length": 34.0,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.5352112676056338,
"grad_norm": 7.788077354431152,
"kl": 0.012560278875753284,
"learning_rate": 2.7091379149682682e-08,
"loss": 0.4072,
"num_tokens": 12646.0,
"reward": 0.012618010863661766,
"reward_std": 0.005928314756602049,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.012618010863661766,
"rewards/question_recreation_reward_func/std": 0.006028006784617901,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 448.0,
"completions/max_terminated_length": 157.0,
"completions/mean_length": 270.5,
"completions/mean_terminated_length": 93.0,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.5633802816901409,
"grad_norm": 7.968515872955322,
"kl": 0.023016713559627533,
"learning_rate": 6.819348298638839e-09,
"loss": 0.244,
"num_tokens": 14240.0,
"reward": 0.11969352513551712,
"reward_std": 0.05395982041954994,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.11969352513551712,
"rewards/question_recreation_reward_func/std": 0.1334262639284134,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 20
},
{
"epoch": 0.5633802816901409,
"step": 20,
"total_flos": 0.0,
"train_loss": 0.030225585971913917,
"train_runtime": 15476.4298,
"train_samples_per_second": 0.005,
"train_steps_per_second": 0.001
}
],
"logging_steps": 1,
"max_steps": 20,
"num_input_tokens_seen": 14240,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}