Model: carestudd/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-screeching_endangered_chinchilla Source: Original Platform
604 lines
24 KiB
JSON
604 lines
24 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 6.8,
|
|
"eval_steps": 500,
|
|
"global_step": 14,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 23.5,
|
|
"completions/mean_terminated_length": 23.5,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 0.4,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 45.845420837402344,
|
|
"kl": 0.0,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0,
|
|
"num_tokens": 606.0,
|
|
"reward": 0.27725791931152344,
|
|
"reward_std": 0.0050897058099508286,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.184007927775383,
|
|
"rewards/question_recreation_reward_func/std": 0.007569987326860428,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.09325000643730164,
|
|
"rewards/xmlcount_reward_func/std": 0.10767660290002823,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 24.0,
|
|
"completions/mean_terminated_length": 24.0,
|
|
"completions/min_length": 21.0,
|
|
"completions/min_terminated_length": 21.0,
|
|
"epoch": 0.8,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 9.568375587463379,
|
|
"kl": 0.0,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 1214.0,
|
|
"reward": 0.05984270200133324,
|
|
"reward_std": 5.343298471416347e-05,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.05984270200133324,
|
|
"rewards/question_recreation_reward_func/std": 0.0004468102415557951,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 21.5,
|
|
"completions/mean_terminated_length": 21.5,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 1.4,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 39.694496154785156,
|
|
"kl": 0.1073869839310646,
|
|
"learning_rate": 9.931806517013612e-07,
|
|
"loss": 0.0043,
|
|
"num_tokens": 1812.0,
|
|
"reward": 0.20595212280750275,
|
|
"reward_std": 0.0009102191543206573,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.020702123641967773,
|
|
"rewards/question_recreation_reward_func/std": 0.003408734453842044,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.18525001406669617,
|
|
"rewards/xmlcount_reward_func/std": 0.002061557024717331,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 221.0,
|
|
"completions/max_terminated_length": 221.0,
|
|
"completions/mean_length": 82.25,
|
|
"completions/mean_terminated_length": 82.25,
|
|
"completions/min_length": 27.0,
|
|
"completions/min_terminated_length": 27.0,
|
|
"epoch": 1.8,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 16.176429748535156,
|
|
"kl": 0.020354424137622118,
|
|
"learning_rate": 9.729086208503173e-07,
|
|
"loss": 0.0798,
|
|
"num_tokens": 2653.0,
|
|
"reward": 0.04737311601638794,
|
|
"reward_std": 0.010779645293951035,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.04737311601638794,
|
|
"rewards/question_recreation_reward_func/std": 0.019436875358223915,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 45.0,
|
|
"completions/max_terminated_length": 45.0,
|
|
"completions/mean_length": 28.25,
|
|
"completions/mean_terminated_length": 28.25,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 2.4,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 44.07194900512695,
|
|
"kl": 0.08810030668973923,
|
|
"learning_rate": 9.397368756032444e-07,
|
|
"loss": 0.0939,
|
|
"num_tokens": 3278.0,
|
|
"reward": 0.050381630659103394,
|
|
"reward_std": 0.006566238589584827,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.0503816232085228,
|
|
"rewards/question_recreation_reward_func/std": 0.007008333690464497,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 27.0,
|
|
"completions/max_terminated_length": 27.0,
|
|
"completions/mean_length": 25.0,
|
|
"completions/mean_terminated_length": 25.0,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 2.8,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 1.3027467727661133,
|
|
"kl": 0.08069050312042236,
|
|
"learning_rate": 8.945702546981968e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 3890.0,
|
|
"reward": 0.1409657597541809,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.04946577176451683,
|
|
"rewards/question_recreation_reward_func/std": 0.012337522581219673,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
|
|
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 157.0,
|
|
"completions/max_terminated_length": 157.0,
|
|
"completions/mean_length": 67.0,
|
|
"completions/mean_terminated_length": 67.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 3.4,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 18.420305252075195,
|
|
"kl": 0.03622829727828503,
|
|
"learning_rate": 8.386407858128706e-07,
|
|
"loss": -0.0354,
|
|
"num_tokens": 4670.0,
|
|
"reward": 0.044995490461587906,
|
|
"reward_std": 0.0019922610372304916,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.044995490461587906,
|
|
"rewards/question_recreation_reward_func/std": 0.014919609762728214,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.0,
|
|
"completions/max_terminated_length": 146.0,
|
|
"completions/mean_length": 54.75,
|
|
"completions/mean_terminated_length": 54.75,
|
|
"completions/min_length": 23.0,
|
|
"completions/min_terminated_length": 23.0,
|
|
"epoch": 3.8,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 11.040939331054688,
|
|
"kl": 0.025315589271485806,
|
|
"learning_rate": 7.734740790612136e-07,
|
|
"loss": 0.243,
|
|
"num_tokens": 5401.0,
|
|
"reward": 0.27007412910461426,
|
|
"reward_std": 0.010380705818533897,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.1785741150379181,
|
|
"rewards/question_recreation_reward_func/std": 0.012178990058600903,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
|
|
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 20.0,
|
|
"completions/max_terminated_length": 20.0,
|
|
"completions/mean_length": 17.0,
|
|
"completions/mean_terminated_length": 17.0,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 4.4,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.005074405111372471,
|
|
"kl": 0.15366993844509125,
|
|
"learning_rate": 7.008477123264847e-07,
|
|
"loss": 0.0061,
|
|
"num_tokens": 5981.0,
|
|
"reward": 0.05733615159988403,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.05733615159988403,
|
|
"rewards/question_recreation_reward_func/std": 0.0004914185265079141,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 389.0,
|
|
"completions/max_terminated_length": 389.0,
|
|
"completions/mean_length": 259.5,
|
|
"completions/mean_terminated_length": 259.5,
|
|
"completions/min_length": 166.0,
|
|
"completions/min_terminated_length": 166.0,
|
|
"epoch": 4.8,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 12.459712028503418,
|
|
"kl": 0.029772183392196894,
|
|
"learning_rate": 6.227427435703995e-07,
|
|
"loss": 0.2349,
|
|
"num_tokens": 7531.0,
|
|
"reward": 0.11735042929649353,
|
|
"reward_std": 0.018421867862343788,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.11735042929649353,
|
|
"rewards/question_recreation_reward_func/std": 0.01842377707362175,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 18.5,
|
|
"completions/mean_terminated_length": 18.5,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 5.4,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.039047498255968094,
|
|
"kl": 0.11775290966033936,
|
|
"learning_rate": 5.412896727361662e-07,
|
|
"loss": 0.0047,
|
|
"num_tokens": 8117.0,
|
|
"reward": 0.40121549367904663,
|
|
"reward_std": 0.0,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.30971550941467285,
|
|
"rewards/question_recreation_reward_func/std": 0.044006314128637314,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
|
|
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.0,
|
|
"completions/max_terminated_length": 143.0,
|
|
"completions/mean_length": 61.75,
|
|
"completions/mean_terminated_length": 61.75,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 5.8,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 24.296092987060547,
|
|
"kl": 0.0357817467302084,
|
|
"learning_rate": 4.5871032726383385e-07,
|
|
"loss": 0.0419,
|
|
"num_tokens": 8876.0,
|
|
"reward": 0.048355475068092346,
|
|
"reward_std": 0.007202191278338432,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.048355475068092346,
|
|
"rewards/question_recreation_reward_func/std": 0.020798446610569954,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 149.0,
|
|
"completions/max_terminated_length": 149.0,
|
|
"completions/mean_length": 64.25,
|
|
"completions/mean_terminated_length": 64.25,
|
|
"completions/min_length": 14.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 6.4,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 20.938844680786133,
|
|
"kl": 0.061262642964720726,
|
|
"learning_rate": 3.772572564296004e-07,
|
|
"loss": 0.0246,
|
|
"num_tokens": 9645.0,
|
|
"reward": 0.19921739399433136,
|
|
"reward_std": 0.01111482735723257,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.19921737909317017,
|
|
"rewards/question_recreation_reward_func/std": 0.08456551283597946,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.0,
|
|
"rewards/xmlcount_reward_func/std": 0.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 23.0,
|
|
"completions/max_terminated_length": 23.0,
|
|
"completions/mean_length": 21.0,
|
|
"completions/mean_terminated_length": 21.0,
|
|
"completions/min_length": 20.0,
|
|
"completions/min_terminated_length": 20.0,
|
|
"epoch": 6.8,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 12.05763053894043,
|
|
"kl": 0.054638177156448364,
|
|
"learning_rate": 2.9915228767351535e-07,
|
|
"loss": 0.0173,
|
|
"num_tokens": 10241.0,
|
|
"reward": 0.14335308969020844,
|
|
"reward_std": 0.0008261414477601647,
|
|
"rewards/concensus_correctness_reward_func/mean": 0.0,
|
|
"rewards/concensus_correctness_reward_func/std": 0.0,
|
|
"rewards/consensus_reward_func/mean": 0.0,
|
|
"rewards/consensus_reward_func/std": 0.0,
|
|
"rewards/cumulative_reward_2/mean": 0.0,
|
|
"rewards/cumulative_reward_2/std": 0.0,
|
|
"rewards/final_correctness_reward_func/mean": 0.0,
|
|
"rewards/final_correctness_reward_func/std": 0.0,
|
|
"rewards/question_recreation_reward_func/mean": 0.0513530895113945,
|
|
"rewards/question_recreation_reward_func/std": 0.007401337847113609,
|
|
"rewards/soft_format_reward_func/mean": 0.0,
|
|
"rewards/soft_format_reward_func/std": 0.0,
|
|
"rewards/strict_format_reward_func/mean": 0.0,
|
|
"rewards/strict_format_reward_func/std": 0.0,
|
|
"rewards/xmlcount_reward_func/mean": 0.09200000017881393,
|
|
"rewards/xmlcount_reward_func/std": 0.10623559355735779,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 6.8,
|
|
"step": 14,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.05132548208348453,
|
|
"train_runtime": 571.6229,
|
|
"train_samples_per_second": 0.14,
|
|
"train_steps_per_second": 0.035
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 20,
|
|
"num_input_tokens_seen": 10241,
|
|
"num_train_epochs": 7,
|
|
"save_steps": 25,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": false,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|