Model: eurb1/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-camouflaged_gliding_salamander Source: Original Platform
824 lines
33 KiB
JSON
824 lines
33 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.5633802816901409,
|
|
"eval_steps": 500,
|
|
"global_step": 20,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 14.0,
|
|
"completions/mean_terminated_length": 14.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.028169014084507043,
|
|
"grad_norm": 32.06034851074219,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0176,
|
|
"num_tokens": 568.0,
|
|
"reward": 0.020783159881830215,
|
|
"reward_std": 0.012284406460821629,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.020783159881830215,
|
|
"rewards/question_recreation_reward_func/std": 0.027877027168869972,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 63.0,
|
|
"completions/max_terminated_length": 63.0,
|
|
"completions/mean_length": 29.5,
|
|
"completions/mean_terminated_length": 29.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.056338028169014086,
|
|
"grad_norm": 6.299616813659668,
|
|
"kl": 0.0,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0791,
|
|
"num_tokens": 1198.0,
|
|
"reward": 0.02762589044868946,
|
|
"reward_std": 0.0009784403955563903,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.02762589231133461,
|
|
"rewards/question_recreation_reward_func/std": 0.0051710638217628,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 118.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.08450704225352113,
|
|
"grad_norm": 3.1016180515289307,
|
|
"kl": 8.319292101077735e-05,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": 0.3379,
|
|
"num_tokens": 2182.0,
|
|
"reward": 0.03331246227025986,
|
|
"reward_std": 0.0052673472091555595,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.03331245854496956,
|
|
"rewards/question_recreation_reward_func/std": 0.009135501459240913,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 13.0,
|
|
"completions/mean_terminated_length": 13.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.11267605633802817,
|
|
"grad_norm": 29.29948616027832,
|
|
"kl": 0.05142414569854736,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": -0.1736,
|
|
"num_tokens": 2746.0,
|
|
"reward": 0.14608332514762878,
|
|
"reward_std": 0.007730965036898851,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.14608332514762878,
|
|
"rewards/question_recreation_reward_func/std": 0.009648437611758709,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.14084507042253522,
|
|
"grad_norm": 40.53683853149414,
|
|
"kl": 0.011120198392518432,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3312.0,
|
|
"reward": 0.08978645503520966,
|
|
"reward_std": 0.013956054113805294,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.08978645503520966,
|
|
"rewards/question_recreation_reward_func/std": 0.08247601985931396,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.16901408450704225,
|
|
"grad_norm": 0.06918946653604507,
|
|
"kl": 0.0004225224256515503,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3856.0,
|
|
"reward": 0.19858156144618988,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.19858156144618988,
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 9.0,
|
|
"completions/max_terminated_length": 9.0,
|
|
"completions/mean_length": 8.25,
|
|
"completions/mean_terminated_length": 8.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.19718309859154928,
|
|
"grad_norm": 41.10600280761719,
|
|
"kl": 0.06056380271911621,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": -0.0184,
|
|
"num_tokens": 4401.0,
|
|
"reward": 0.03791220486164093,
|
|
"reward_std": 0.047372184693813324,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.006662202067673206,
|
|
"rewards/question_recreation_reward_func/std": 0.004442098084837198,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.03125,
|
|
"rewards/xmlcount_reward_func/std": 0.0625,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 21.0,
|
|
"completions/max_terminated_length": 21.0,
|
|
"completions/mean_length": 11.25,
|
|
"completions/mean_terminated_length": 11.25,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.22535211267605634,
|
|
"grad_norm": 84.13429260253906,
|
|
"kl": 0.9762128964066505,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": 0.1866,
|
|
"num_tokens": 4958.0,
|
|
"reward": 0.019636042416095734,
|
|
"reward_std": 0.0071492972783744335,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.019636042416095734,
|
|
"rewards/question_recreation_reward_func/std": 0.00888961460441351,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 110.0,
|
|
"completions/max_terminated_length": 110.0,
|
|
"completions/mean_length": 37.75,
|
|
"completions/mean_terminated_length": 37.75,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.2535211267605634,
|
|
"grad_norm": 4.597474098205566,
|
|
"kl": 0.008542275987565517,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": -0.2221,
|
|
"num_tokens": 5621.0,
|
|
"reward": 0.08780995011329651,
|
|
"reward_std": 0.08147402852773666,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.025309953838586807,
|
|
"rewards/question_recreation_reward_func/std": 0.00799198541790247,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0625,
|
|
"rewards/xmlcount_reward_func/std": 0.125,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.28169014084507044,
|
|
"grad_norm": 0.32836952805519104,
|
|
"kl": 0.010951906442642212,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": 0.0004,
|
|
"num_tokens": 6165.0,
|
|
"reward": 0.02834007889032364,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.02834007889032364,
|
|
"rewards/question_recreation_reward_func/std": 0.014024702832102776,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 26.0,
|
|
"completions/max_terminated_length": 26.0,
|
|
"completions/mean_length": 12.5,
|
|
"completions/mean_terminated_length": 12.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.30985915492957744,
|
|
"grad_norm": 14.266270637512207,
|
|
"kl": 0.016900939866900444,
|
|
"learning_rate": 5.412896727361662e-07,
|
|
"loss": -0.1861,
|
|
"num_tokens": 6727.0,
|
|
"reward": 0.14206652343273163,
|
|
"reward_std": 0.02492138370871544,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.14206652343273163,
|
|
"rewards/question_recreation_reward_func/std": 0.04201439768075943,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 123.5,
|
|
"completions/mean_terminated_length": 15.333333015441895,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.3380281690140845,
|
|
"grad_norm": 1.9125043153762817,
|
|
"kl": 0.014537290277075954,
|
|
"learning_rate": 4.5871032726383385e-07,
|
|
"loss": -0.3405,
|
|
"num_tokens": 7733.0,
|
|
"reward": 0.5619847178459167,
|
|
"reward_std": 0.6531730890274048,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 1.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.030734695494174957,
|
|
"rewards/question_recreation_reward_func/std": 0.019535422325134277,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.03125,
|
|
"rewards/xmlcount_reward_func/std": 0.0625,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 17.5,
|
|
"completions/mean_terminated_length": 17.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.36619718309859156,
|
|
"grad_norm": 0.005306577309966087,
|
|
"kl": 5.5371059715980664e-05,
|
|
"learning_rate": 3.772572564296004e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8315.0,
|
|
"reward": 0.01907399296760559,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.01907399296760559,
|
|
"rewards/question_recreation_reward_func/std": 0.003249133238568902,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.39436619718309857,
|
|
"grad_norm": 0.021956074982881546,
|
|
"kl": 6.914883852005005e-05,
|
|
"learning_rate": 2.9915228767351535e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 8859.0,
|
|
"reward": 0.014652014710009098,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.014652014710009098,
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 8.0,
|
|
"completions/max_terminated_length": 8.0,
|
|
"completions/mean_length": 8.0,
|
|
"completions/mean_terminated_length": 8.0,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.4225352112676056,
|
|
"grad_norm": 0.48415833711624146,
|
|
"kl": 0.0022591277956962585,
|
|
"learning_rate": 2.2652592093878665e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 9403.0,
|
|
"reward": 0.1094527393579483,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.1094527393579483,
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 19.0,
|
|
"completions/max_terminated_length": 19.0,
|
|
"completions/mean_length": 13.5,
|
|
"completions/mean_terminated_length": 13.5,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.4507042253521127,
|
|
"grad_norm": 0.04596748575568199,
|
|
"kl": 0.0005076723755337298,
|
|
"learning_rate": 1.6135921418712955e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9969.0,
|
|
"reward": 0.018024075776338577,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.018024075776338577,
|
|
"rewards/question_recreation_reward_func/std": 0.009574447758495808,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 122.75,
|
|
"completions/mean_terminated_length": 14.333333015441895,
|
|
"completions/min_length": 8.0,
|
|
"completions/min_terminated_length": 8.0,
|
|
"epoch": 0.4788732394366197,
|
|
"grad_norm": 14.307421684265137,
|
|
"kl": 0.3401991333812475,
|
|
"learning_rate": 1.0542974530180327e-07,
|
|
"loss": 0.5429,
|
|
"num_tokens": 10972.0,
|
|
"reward": 0.0627230554819107,
|
|
"reward_std": 0.0639922246336937,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.031473059207201004,
|
|
"rewards/question_recreation_reward_func/std": 0.02106659673154354,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.03125,
|
|
"rewards/xmlcount_reward_func/std": 0.0625,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 43.0,
|
|
"completions/max_terminated_length": 43.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.5070422535211268,
|
|
"grad_norm": 8.994085311889648,
|
|
"kl": 0.03856681424076669,
|
|
"learning_rate": 6.026312439675551e-08,
|
|
"loss": -0.1128,
|
|
"num_tokens": 11584.0,
|
|
"reward": 0.10017985105514526,
|
|
"reward_std": 0.0002543535956647247,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.10017985105514526,
|
|
"rewards/question_recreation_reward_func/std": 0.00035971030592918396,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 52.0,
|
|
"completions/mean_length": 137.5,
|
|
"completions/mean_terminated_length": 34.0,
|
|
"completions/min_length": 19.0,
|
|
"completions/min_terminated_length": 19.0,
|
|
"epoch": 0.5352112676056338,
|
|
"grad_norm": 7.788077354431152,
|
|
"kl": 0.012560278875753284,
|
|
"learning_rate": 2.7091379149682682e-08,
|
|
"loss": 0.4072,
|
|
"num_tokens": 12646.0,
|
|
"reward": 0.012618010863661766,
|
|
"reward_std": 0.005928314756602049,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.012618010863661766,
|
|
"rewards/question_recreation_reward_func/std": 0.006028006784617901,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 157.0,
|
|
"completions/mean_length": 270.5,
|
|
"completions/mean_terminated_length": 93.0,
|
|
"completions/min_length": 29.0,
|
|
"completions/min_terminated_length": 29.0,
|
|
"epoch": 0.5633802816901409,
|
|
"grad_norm": 7.968515872955322,
|
|
"kl": 0.023016713559627533,
|
|
"learning_rate": 6.819348298638839e-09,
|
|
"loss": 0.244,
|
|
"num_tokens": 14240.0,
|
|
"reward": 0.11969352513551712,
|
|
"reward_std": 0.05395982041954994,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.11969352513551712,
|
|
"rewards/question_recreation_reward_func/std": 0.1334262639284134,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.5633802816901409,
|
|
"step": 20,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.030225585971913917,
|
|
"train_runtime": 15476.4298,
|
|
"train_samples_per_second": 0.005,
|
|
"train_steps_per_second": 0.001
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 20,
|
|
"num_input_tokens_seen": 14240,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 25,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|