Files
Qwen2.5-0.5B-Instruct-Gensy…/trainer_state.json
ModelHub XC ee2cfebfb7 初始化项目,由ModelHub XC社区提供模型
Model: alsandeer33/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-flightless_arctic_kangaroo
Source: Original Platform
2026-05-13 11:08:25 +08:00

1019 lines
42 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 16.571428571428573,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 147.5,
"completions/max_terminated_length": 147.5,
"completions/mean_length": 135.125,
"completions/mean_terminated_length": 135.125,
"completions/min_length": 120.5,
"completions/min_terminated_length": 120.5,
"epoch": 0.5714285714285714,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.666666666666666e-07,
"loss": 0.0,
"num_tokens": 3129.0,
"reward": 6.070932626724243,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5089328289031982,
"rewards/question_recreation_reward_func/std": 0.5670355260372162,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.5,
"completions/max_terminated_length": 148.5,
"completions/mean_length": 144.45833587646484,
"completions/mean_terminated_length": 144.45833587646484,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 1.2857142857142856,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 6316.0,
"reward": 5.576179265975952,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5141791701316833,
"rewards/question_recreation_reward_func/std": 0.5554790496826172,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 141.16666412353516,
"completions/mean_terminated_length": 141.16666412353516,
"completions/min_length": 120.5,
"completions/min_terminated_length": 120.5,
"epoch": 1.8571428571428572,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.96430995261912e-07,
"loss": 0.0,
"num_tokens": 9464.0,
"reward": 5.596338510513306,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.534338653087616,
"rewards/question_recreation_reward_func/std": 0.5376994013786316,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 134.5,
"completions/max_terminated_length": 134.5,
"completions/mean_length": 121.625,
"completions/mean_terminated_length": 121.625,
"completions/min_length": 115.5,
"completions/min_terminated_length": 115.5,
"epoch": 2.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.857876700217506e-07,
"loss": 0.0,
"num_tokens": 12485.0,
"reward": 7.093775987625122,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5317758321762085,
"rewards/question_recreation_reward_func/std": 0.5406587421894073,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 133.0,
"completions/mean_terminated_length": 133.0,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 3.2857142857142856,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.682599546705714e-07,
"loss": 0.0,
"num_tokens": 15605.0,
"reward": 5.623394966125488,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5613951683044434,
"rewards/question_recreation_reward_func/std": 0.5064572691917419,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 139.0,
"completions/max_terminated_length": 139.0,
"completions/mean_length": 129.04166412353516,
"completions/mean_terminated_length": 129.04166412353516,
"completions/min_length": 122.5,
"completions/min_terminated_length": 122.5,
"epoch": 3.857142857142857,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.441606317040557e-07,
"loss": 0.0,
"num_tokens": 18689.0,
"reward": 7.075747728347778,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5137476325035095,
"rewards/question_recreation_reward_func/std": 0.5614758431911469,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 131.75,
"completions/mean_terminated_length": 131.75,
"completions/min_length": 121.5,
"completions/min_terminated_length": 121.5,
"epoch": 4.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.139197541114644e-07,
"loss": 0.0,
"num_tokens": 21791.0,
"reward": 5.585848331451416,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5238484740257263,
"rewards/question_recreation_reward_func/std": 0.5498124361038208,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 133.5,
"completions/max_terminated_length": 133.5,
"completions/mean_length": 125.125,
"completions/mean_terminated_length": 125.125,
"completions/min_length": 115.5,
"completions/min_terminated_length": 115.5,
"epoch": 5.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 6.780769710698569e-07,
"loss": 0.0,
"num_tokens": 24827.0,
"reward": 6.1232874393463135,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5612876415252686,
"rewards/question_recreation_reward_func/std": 0.5065814107656479,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 137.875,
"completions/mean_terminated_length": 137.875,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 5.857142857142857,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 6.37271897891742e-07,
"loss": 0.0,
"num_tokens": 27995.0,
"reward": 7.081830263137817,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5198305249214172,
"rewards/question_recreation_reward_func/std": 0.5544519424438477,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 155.5,
"completions/max_terminated_length": 155.5,
"completions/mean_length": 142.25,
"completions/mean_terminated_length": 142.25,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 6.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 5.922327020746735e-07,
"loss": 0.0,
"num_tokens": 31181.0,
"reward": 5.591153383255005,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5291535258293152,
"rewards/question_recreation_reward_func/std": 0.5381881296634674,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 152.0,
"completions/max_terminated_length": 152.0,
"completions/mean_length": 142.29166412353516,
"completions/mean_terminated_length": 142.29166412353516,
"completions/min_length": 128.5,
"completions/min_terminated_length": 128.5,
"epoch": 7.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 5.437631091350051e-07,
"loss": 0.0,
"num_tokens": 34351.0,
"reward": 7.0908520221710205,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5288519263267517,
"rewards/question_recreation_reward_func/std": 0.5440349578857422,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 146.5,
"completions/max_terminated_length": 146.5,
"completions/mean_length": 127.70833206176758,
"completions/mean_terminated_length": 127.70833206176758,
"completions/min_length": 120.5,
"completions/min_terminated_length": 120.5,
"epoch": 7.857142857142857,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 4.927280601070113e-07,
"loss": 0.0,
"num_tokens": 37429.0,
"reward": 6.077057361602783,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5150575637817383,
"rewards/question_recreation_reward_func/std": 0.5599632859230042,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 154.0,
"completions/max_terminated_length": 154.0,
"completions/mean_length": 134.0,
"completions/mean_terminated_length": 134.0,
"completions/min_length": 108.5,
"completions/min_terminated_length": 108.5,
"epoch": 8.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 4.400382766496394e-07,
"loss": 0.0,
"num_tokens": 40549.0,
"reward": 6.108230352401733,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5462303161621094,
"rewards/question_recreation_reward_func/std": 0.5239680707454681,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 145.5,
"completions/max_terminated_length": 145.5,
"completions/mean_length": 128.25,
"completions/mean_terminated_length": 128.25,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 9.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.866340091969303e-07,
"loss": 0.0,
"num_tokens": 43619.0,
"reward": 5.5803306102752686,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5183308720588684,
"rewards/question_recreation_reward_func/std": 0.5561836063861847,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 138.0,
"completions/max_terminated_length": 138.0,
"completions/mean_length": 132.70833587646484,
"completions/mean_terminated_length": 132.70833587646484,
"completions/min_length": 126.5,
"completions/min_terminated_length": 126.5,
"epoch": 9.857142857142858,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.33468258166748e-07,
"loss": 0.0,
"num_tokens": 46738.0,
"reward": 7.087554454803467,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5255548357963562,
"rewards/question_recreation_reward_func/std": 0.5478420853614807,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 139.5,
"completions/max_terminated_length": 139.5,
"completions/mean_length": 127.25,
"completions/mean_terminated_length": 127.25,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 10.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.8148976764574644e-07,
"loss": 0.0,
"num_tokens": 49804.0,
"reward": 5.6411614418029785,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.579161524772644,
"rewards/question_recreation_reward_func/std": 0.48594246804714203,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.5,
"completions/max_terminated_length": 161.5,
"completions/mean_length": 146.04166412353516,
"completions/mean_terminated_length": 146.04166412353516,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 11.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.3162609502866607e-07,
"loss": 0.0,
"num_tokens": 53020.0,
"reward": 6.080661296844482,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.518661230802536,
"rewards/question_recreation_reward_func/std": 0.5448050200939178,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 134.0,
"completions/max_terminated_length": 134.0,
"completions/mean_length": 125.20833206176758,
"completions/mean_terminated_length": 125.20833206176758,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 11.857142857142858,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.8476705873465095e-07,
"loss": 0.0,
"num_tokens": 56199.0,
"reward": 7.085609674453735,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5236097574234009,
"rewards/question_recreation_reward_func/std": 0.5500880181789398,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 142.5,
"completions/max_terminated_length": 142.5,
"completions/mean_length": 132.75,
"completions/mean_terminated_length": 132.75,
"completions/min_length": 124.5,
"completions/min_terminated_length": 124.5,
"epoch": 12.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.41748859376479e-07,
"loss": 0.0,
"num_tokens": 59309.0,
"reward": 5.591024875640869,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5290250182151794,
"rewards/question_recreation_reward_func/std": 0.5438350737094879,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 133.58333587646484,
"completions/mean_terminated_length": 133.58333587646484,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 13.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1.0333915774081696e-07,
"loss": 0.0,
"num_tokens": 62434.0,
"reward": 7.574448108673096,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 1.0,
"rewards/final_correctness_reward_func/std": 1.154700517654419,
"rewards/question_recreation_reward_func/mean": 0.5124481320381165,
"rewards/question_recreation_reward_func/std": 0.5629763901233673,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.5,
"completions/max_terminated_length": 140.5,
"completions/mean_length": 125.29166412353516,
"completions/mean_terminated_length": 125.29166412353516,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 13.857142857142858,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 7.022337586329597e-08,
"loss": 0.0,
"num_tokens": 65500.0,
"reward": 5.575525760650635,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5135260820388794,
"rewards/question_recreation_reward_func/std": 0.5617317259311676,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.5,
"completions/max_terminated_length": 140.5,
"completions/mean_length": 133.625,
"completions/mean_terminated_length": 133.625,
"completions/min_length": 126.5,
"completions/min_terminated_length": 126.5,
"epoch": 14.571428571428571,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 4.299246565604755e-08,
"loss": 0.0,
"num_tokens": 68617.0,
"reward": 7.085801124572754,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 2.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.5238010287284851,
"rewards/question_recreation_reward_func/std": 0.5443686544895172,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 136.5,
"completions/max_terminated_length": 136.5,
"completions/mean_length": 124.83333206176758,
"completions/mean_terminated_length": 124.83333206176758,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 15.285714285714286,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2.213236335683253e-08,
"loss": 0.0,
"num_tokens": 71685.0,
"reward": 6.153791427612305,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.5,
"rewards/final_correctness_reward_func/std": 0.5773502588272095,
"rewards/question_recreation_reward_func/mean": 0.591791570186615,
"rewards/question_recreation_reward_func/std": 0.47135844826698303,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 156.0,
"completions/max_terminated_length": 156.0,
"completions/mean_length": 137.25,
"completions/mean_terminated_length": 137.25,
"completions/min_length": 115.5,
"completions/min_terminated_length": 115.5,
"epoch": 15.857142857142858,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 8.0153179853653e-09,
"loss": 0.0,
"num_tokens": 74824.0,
"reward": 5.596338510513306,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.534338653087616,
"rewards/question_recreation_reward_func/std": 0.5376994013786316,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 126.625,
"completions/mean_terminated_length": 126.625,
"completions/min_length": 108.5,
"completions/min_terminated_length": 108.5,
"epoch": 16.571428571428573,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 8.932485507387344e-10,
"loss": 0.0,
"num_tokens": 77885.0,
"reward": 5.580442428588867,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 1.9240000247955322,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 2.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.5184425115585327,
"rewards/question_recreation_reward_func/std": 0.550556093454361,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 1.1380000114440918,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 50
},
{
"epoch": 16.571428571428573,
"step": 50,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 2413.8883,
"train_samples_per_second": 0.083,
"train_steps_per_second": 0.021
}
],
"logging_steps": 2,
"max_steps": 50,
"num_input_tokens_seen": 77885,
"num_train_epochs": 17,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}