1019 lines
40 KiB
JSON
1019 lines
40 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 16.571428571428573,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 50,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 0.5714285714285714,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 2.666666666666666e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 2080.0,
|
||
|
|
"reward": 3.9373488426208496,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.013348821084946394,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 1.2857142857142856,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 8e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 4160.0,
|
||
|
|
"reward": 3.9331103563308716,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.009110347600653768,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 1.8571428571428572,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.96430995261912e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 6240.0,
|
||
|
|
"reward": 3.979282855987549,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.05528285843320191,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 2.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.857876700217506e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 8320.0,
|
||
|
|
"reward": 3.9332165718078613,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.009216589853167534,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 3.2857142857142856,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.682599546705714e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 10400.0,
|
||
|
|
"reward": 3.979282855987549,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.05528285843320191,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 3.857142857142857,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.441606317040557e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 12480.0,
|
||
|
|
"reward": 3.9281322956085205,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.00413223123177886,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 4.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.139197541114644e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 14560.0,
|
||
|
|
"reward": 3.936775326728821,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.01277530868537724,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 5.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 6.780769710698569e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 16640.0,
|
||
|
|
"reward": 3.97958505153656,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0555851433891803,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 5.857142857142857,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 6.37271897891742e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 18720.0,
|
||
|
|
"reward": 3.9275587797164917,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0035587188322097063,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 6.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 5.922327020746735e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 20800.0,
|
||
|
|
"reward": 3.9328081607818604,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.008808062644675374,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 7.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 5.437631091350051e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 22880.0,
|
||
|
|
"reward": 3.927860975265503,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0038610037881881,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 7.857142857142857,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 4.927280601070113e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 24960.0,
|
||
|
|
"reward": 3.9373488426208496,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.013348821084946394,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 8.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 4.400382766496394e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 27040.0,
|
||
|
|
"reward": 3.97958505153656,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0555851433891803,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 9.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 3.866340091969303e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 29120.0,
|
||
|
|
"reward": 3.93169105052948,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.007690950063988566,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 9.857142857142858,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 3.33468258166748e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 31200.0,
|
||
|
|
"reward": 3.9275587797164917,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0035587188322097063,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 10.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 2.8148976764574644e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 33280.0,
|
||
|
|
"reward": 3.97958505153656,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0555851433891803,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 11.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 2.3162609502866607e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 35360.0,
|
||
|
|
"reward": 3.9344987869262695,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.010498687624931335,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 11.857142857142858,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 1.8476705873465095e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 37440.0,
|
||
|
|
"reward": 3.9332165718078613,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.009216589853167534,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 12.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 1.41748859376479e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 39520.0,
|
||
|
|
"reward": 3.937077522277832,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.013077593641355634,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 13.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 1.0333915774081696e-07,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 41600.0,
|
||
|
|
"reward": 3.9275587797164917,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0035587188322097063,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 13.857142857142858,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 7.022337586329597e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 43680.0,
|
||
|
|
"reward": 3.9373488426208496,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.013348821084946394,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 14.571428571428571,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 4.299246565604755e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 45760.0,
|
||
|
|
"reward": 3.929249405860901,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.005249343812465668,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 15.285714285714286,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 2.213236335683253e-08,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 47840.0,
|
||
|
|
"reward": 3.97958505153656,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.0555851433891803,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 15.857142857142858,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 8.0153179853653e-09,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 49920.0,
|
||
|
|
"reward": 3.979282855987549,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.05528285843320191,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"completions/clipped_ratio": 0.0,
|
||
|
|
"completions/max_length": 4.0,
|
||
|
|
"completions/max_terminated_length": 4.0,
|
||
|
|
"completions/mean_length": 4.0,
|
||
|
|
"completions/mean_terminated_length": 4.0,
|
||
|
|
"completions/min_length": 4.0,
|
||
|
|
"completions/min_terminated_length": 4.0,
|
||
|
|
"epoch": 16.571428571428573,
|
||
|
|
"grad_norm": 0.0,
|
||
|
|
"kl": 0.0,
|
||
|
|
"learning_rate": 8.932485507387344e-10,
|
||
|
|
"loss": 0.0,
|
||
|
|
"num_tokens": 52000.0,
|
||
|
|
"reward": 3.93846595287323,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
||
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
||
|
|
"rewards/consensus_reward_func/std": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
||
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
||
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
||
|
|
"rewards/question_recreation_reward_func/mean": 0.014465933665633202,
|
||
|
|
"rewards/question_recreation_reward_func/std": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
||
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
||
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 16.571428571428573,
|
||
|
|
"step": 50,
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_loss": 0.0,
|
||
|
|
"train_runtime": 792.9111,
|
||
|
|
"train_samples_per_second": 0.252,
|
||
|
|
"train_steps_per_second": 0.063
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 2,
|
||
|
|
"max_steps": 50,
|
||
|
|
"num_input_tokens_seen": 52000,
|
||
|
|
"num_train_epochs": 17,
|
||
|
|
"save_steps": 25,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|