Model: alsandeer33/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-flightless_arctic_kangaroo Source: Original Platform
1019 lines
42 KiB
JSON
1019 lines
42 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 16.571428571428573,
|
|
"eval_steps": 500,
|
|
"global_step": 50,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.5,
|
|
"completions/max_terminated_length": 147.5,
|
|
"completions/mean_length": 135.125,
|
|
"completions/mean_terminated_length": 135.125,
|
|
"completions/min_length": 120.5,
|
|
"completions/min_terminated_length": 120.5,
|
|
"epoch": 0.5714285714285714,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 2.666666666666666e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 3129.0,
|
|
"reward": 6.070932626724243,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5089328289031982,
|
|
"rewards/question_recreation_reward_func/std": 0.5670355260372162,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.5,
|
|
"completions/max_terminated_length": 148.5,
|
|
"completions/mean_length": 144.45833587646484,
|
|
"completions/mean_terminated_length": 144.45833587646484,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 1.2857142857142856,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 6316.0,
|
|
"reward": 5.576179265975952,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5141791701316833,
|
|
"rewards/question_recreation_reward_func/std": 0.5554790496826172,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 161.0,
|
|
"completions/max_terminated_length": 161.0,
|
|
"completions/mean_length": 141.16666412353516,
|
|
"completions/mean_terminated_length": 141.16666412353516,
|
|
"completions/min_length": 120.5,
|
|
"completions/min_terminated_length": 120.5,
|
|
"epoch": 1.8571428571428572,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.96430995261912e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 9464.0,
|
|
"reward": 5.596338510513306,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.534338653087616,
|
|
"rewards/question_recreation_reward_func/std": 0.5376994013786316,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 134.5,
|
|
"completions/max_terminated_length": 134.5,
|
|
"completions/mean_length": 121.625,
|
|
"completions/mean_terminated_length": 121.625,
|
|
"completions/min_length": 115.5,
|
|
"completions/min_terminated_length": 115.5,
|
|
"epoch": 2.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.857876700217506e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 12485.0,
|
|
"reward": 7.093775987625122,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5317758321762085,
|
|
"rewards/question_recreation_reward_func/std": 0.5406587421894073,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.0,
|
|
"completions/max_terminated_length": 148.0,
|
|
"completions/mean_length": 133.0,
|
|
"completions/mean_terminated_length": 133.0,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 3.2857142857142856,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.682599546705714e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 15605.0,
|
|
"reward": 5.623394966125488,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5613951683044434,
|
|
"rewards/question_recreation_reward_func/std": 0.5064572691917419,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 139.0,
|
|
"completions/max_terminated_length": 139.0,
|
|
"completions/mean_length": 129.04166412353516,
|
|
"completions/mean_terminated_length": 129.04166412353516,
|
|
"completions/min_length": 122.5,
|
|
"completions/min_terminated_length": 122.5,
|
|
"epoch": 3.857142857142857,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.441606317040557e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 18689.0,
|
|
"reward": 7.075747728347778,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5137476325035095,
|
|
"rewards/question_recreation_reward_func/std": 0.5614758431911469,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.0,
|
|
"completions/max_terminated_length": 144.0,
|
|
"completions/mean_length": 131.75,
|
|
"completions/mean_terminated_length": 131.75,
|
|
"completions/min_length": 121.5,
|
|
"completions/min_terminated_length": 121.5,
|
|
"epoch": 4.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.139197541114644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 21791.0,
|
|
"reward": 5.585848331451416,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5238484740257263,
|
|
"rewards/question_recreation_reward_func/std": 0.5498124361038208,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 133.5,
|
|
"completions/max_terminated_length": 133.5,
|
|
"completions/mean_length": 125.125,
|
|
"completions/mean_terminated_length": 125.125,
|
|
"completions/min_length": 115.5,
|
|
"completions/min_terminated_length": 115.5,
|
|
"epoch": 5.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 6.780769710698569e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 24827.0,
|
|
"reward": 6.1232874393463135,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5612876415252686,
|
|
"rewards/question_recreation_reward_func/std": 0.5065814107656479,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.0,
|
|
"completions/max_terminated_length": 148.0,
|
|
"completions/mean_length": 137.875,
|
|
"completions/mean_terminated_length": 137.875,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 5.857142857142857,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 6.37271897891742e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 27995.0,
|
|
"reward": 7.081830263137817,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5198305249214172,
|
|
"rewards/question_recreation_reward_func/std": 0.5544519424438477,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 155.5,
|
|
"completions/max_terminated_length": 155.5,
|
|
"completions/mean_length": 142.25,
|
|
"completions/mean_terminated_length": 142.25,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 6.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 5.922327020746735e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 31181.0,
|
|
"reward": 5.591153383255005,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5291535258293152,
|
|
"rewards/question_recreation_reward_func/std": 0.5381881296634674,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 152.0,
|
|
"completions/max_terminated_length": 152.0,
|
|
"completions/mean_length": 142.29166412353516,
|
|
"completions/mean_terminated_length": 142.29166412353516,
|
|
"completions/min_length": 128.5,
|
|
"completions/min_terminated_length": 128.5,
|
|
"epoch": 7.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 5.437631091350051e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 34351.0,
|
|
"reward": 7.0908520221710205,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5288519263267517,
|
|
"rewards/question_recreation_reward_func/std": 0.5440349578857422,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.5,
|
|
"completions/max_terminated_length": 146.5,
|
|
"completions/mean_length": 127.70833206176758,
|
|
"completions/mean_terminated_length": 127.70833206176758,
|
|
"completions/min_length": 120.5,
|
|
"completions/min_terminated_length": 120.5,
|
|
"epoch": 7.857142857142857,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 4.927280601070113e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 37429.0,
|
|
"reward": 6.077057361602783,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5150575637817383,
|
|
"rewards/question_recreation_reward_func/std": 0.5599632859230042,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 154.0,
|
|
"completions/max_terminated_length": 154.0,
|
|
"completions/mean_length": 134.0,
|
|
"completions/mean_terminated_length": 134.0,
|
|
"completions/min_length": 108.5,
|
|
"completions/min_terminated_length": 108.5,
|
|
"epoch": 8.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 4.400382766496394e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 40549.0,
|
|
"reward": 6.108230352401733,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5462303161621094,
|
|
"rewards/question_recreation_reward_func/std": 0.5239680707454681,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.5,
|
|
"completions/max_terminated_length": 145.5,
|
|
"completions/mean_length": 128.25,
|
|
"completions/mean_terminated_length": 128.25,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 9.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 3.866340091969303e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 43619.0,
|
|
"reward": 5.5803306102752686,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5183308720588684,
|
|
"rewards/question_recreation_reward_func/std": 0.5561836063861847,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 138.0,
|
|
"completions/max_terminated_length": 138.0,
|
|
"completions/mean_length": 132.70833587646484,
|
|
"completions/mean_terminated_length": 132.70833587646484,
|
|
"completions/min_length": 126.5,
|
|
"completions/min_terminated_length": 126.5,
|
|
"epoch": 9.857142857142858,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 3.33468258166748e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 46738.0,
|
|
"reward": 7.087554454803467,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5255548357963562,
|
|
"rewards/question_recreation_reward_func/std": 0.5478420853614807,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 139.5,
|
|
"completions/max_terminated_length": 139.5,
|
|
"completions/mean_length": 127.25,
|
|
"completions/mean_terminated_length": 127.25,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 10.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 2.8148976764574644e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 49804.0,
|
|
"reward": 5.6411614418029785,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.579161524772644,
|
|
"rewards/question_recreation_reward_func/std": 0.48594246804714203,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 161.5,
|
|
"completions/max_terminated_length": 161.5,
|
|
"completions/mean_length": 146.04166412353516,
|
|
"completions/mean_terminated_length": 146.04166412353516,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 11.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 2.3162609502866607e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 53020.0,
|
|
"reward": 6.080661296844482,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.518661230802536,
|
|
"rewards/question_recreation_reward_func/std": 0.5448050200939178,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 134.0,
|
|
"completions/max_terminated_length": 134.0,
|
|
"completions/mean_length": 125.20833206176758,
|
|
"completions/mean_terminated_length": 125.20833206176758,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 11.857142857142858,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 1.8476705873465095e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 56199.0,
|
|
"reward": 7.085609674453735,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5236097574234009,
|
|
"rewards/question_recreation_reward_func/std": 0.5500880181789398,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.5,
|
|
"completions/max_terminated_length": 142.5,
|
|
"completions/mean_length": 132.75,
|
|
"completions/mean_terminated_length": 132.75,
|
|
"completions/min_length": 124.5,
|
|
"completions/min_terminated_length": 124.5,
|
|
"epoch": 12.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 1.41748859376479e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 59309.0,
|
|
"reward": 5.591024875640869,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5290250182151794,
|
|
"rewards/question_recreation_reward_func/std": 0.5438350737094879,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 141.0,
|
|
"completions/max_terminated_length": 141.0,
|
|
"completions/mean_length": 133.58333587646484,
|
|
"completions/mean_terminated_length": 133.58333587646484,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 13.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 1.0333915774081696e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 62434.0,
|
|
"reward": 7.574448108673096,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 1.0,
|
|
"rewards/final_correctness_reward_func/std": 1.154700517654419,
|
|
"rewards/question_recreation_reward_func/mean": 0.5124481320381165,
|
|
"rewards/question_recreation_reward_func/std": 0.5629763901233673,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.5,
|
|
"completions/max_terminated_length": 140.5,
|
|
"completions/mean_length": 125.29166412353516,
|
|
"completions/mean_terminated_length": 125.29166412353516,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 13.857142857142858,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 7.022337586329597e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 65500.0,
|
|
"reward": 5.575525760650635,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5135260820388794,
|
|
"rewards/question_recreation_reward_func/std": 0.5617317259311676,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.5,
|
|
"completions/max_terminated_length": 140.5,
|
|
"completions/mean_length": 133.625,
|
|
"completions/mean_terminated_length": 133.625,
|
|
"completions/min_length": 126.5,
|
|
"completions/min_terminated_length": 126.5,
|
|
"epoch": 14.571428571428571,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 4.299246565604755e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 68617.0,
|
|
"reward": 7.085801124572754,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.5238010287284851,
|
|
"rewards/question_recreation_reward_func/std": 0.5443686544895172,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 136.5,
|
|
"completions/max_terminated_length": 136.5,
|
|
"completions/mean_length": 124.83333206176758,
|
|
"completions/mean_terminated_length": 124.83333206176758,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 15.285714285714286,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 2.213236335683253e-08,
|
|
"loss": 0.0,
|
|
"num_tokens": 71685.0,
|
|
"reward": 6.153791427612305,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.5,
|
|
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
|
|
"rewards/question_recreation_reward_func/mean": 0.591791570186615,
|
|
"rewards/question_recreation_reward_func/std": 0.47135844826698303,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 156.0,
|
|
"completions/max_terminated_length": 156.0,
|
|
"completions/mean_length": 137.25,
|
|
"completions/mean_terminated_length": 137.25,
|
|
"completions/min_length": 115.5,
|
|
"completions/min_terminated_length": 115.5,
|
|
"epoch": 15.857142857142858,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 8.0153179853653e-09,
|
|
"loss": 0.0,
|
|
"num_tokens": 74824.0,
|
|
"reward": 5.596338510513306,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.534338653087616,
|
|
"rewards/question_recreation_reward_func/std": 0.5376994013786316,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.0,
|
|
"completions/max_terminated_length": 144.0,
|
|
"completions/mean_length": 126.625,
|
|
"completions/mean_terminated_length": 126.625,
|
|
"completions/min_length": 108.5,
|
|
"completions/min_terminated_length": 108.5,
|
|
"epoch": 16.571428571428573,
|
|
"grad_norm": 0.0,
|
|
"kl": 0.0,
|
|
"learning_rate": 8.932485507387344e-10,
|
|
"loss": 0.0,
|
|
"num_tokens": 77885.0,
|
|
"reward": 5.580442428588867,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 2.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.5184425115585327,
|
|
"rewards/question_recreation_reward_func/std": 0.550556093454361,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 16.571428571428573,
|
|
"step": 50,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.0,
|
|
"train_runtime": 2413.8883,
|
|
"train_samples_per_second": 0.083,
|
|
"train_steps_per_second": 0.021
|
|
}
|
|
],
|
|
"logging_steps": 2,
|
|
"max_steps": 50,
|
|
"num_input_tokens_seen": 77885,
|
|
"num_train_epochs": 17,
|
|
"save_steps": 25,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|