Files
Qwen2.5-0.5B-Instruct-Gensy…/trainer_state.json
ModelHub XC 3242972f48 初始化项目,由ModelHub XC社区提供模型
Model: carestudd/Qwen2.5-0.5B-Instruct-Gensyn-Swarm-screeching_endangered_chinchilla
Source: Original Platform
2026-05-30 03:15:40 +08:00

604 lines
24 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.8,
"eval_steps": 500,
"global_step": 14,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 27.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 23.5,
"completions/mean_terminated_length": 23.5,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 45.845420837402344,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"num_tokens": 606.0,
"reward": 0.27725791931152344,
"reward_std": 0.0050897058099508286,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.184007927775383,
"rewards/question_recreation_reward_func/std": 0.007569987326860428,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.09325000643730164,
"rewards/xmlcount_reward_func/std": 0.10767660290002823,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 27.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 24.0,
"completions/mean_terminated_length": 24.0,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 9.568375587463379,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 1214.0,
"reward": 0.05984270200133324,
"reward_std": 5.343298471416347e-05,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.05984270200133324,
"rewards/question_recreation_reward_func/std": 0.0004468102415557951,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 23.0,
"completions/max_terminated_length": 23.0,
"completions/mean_length": 21.5,
"completions/mean_terminated_length": 21.5,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 1.4,
"frac_reward_zero_std": 0.5,
"grad_norm": 39.694496154785156,
"kl": 0.1073869839310646,
"learning_rate": 9.931806517013612e-07,
"loss": 0.0043,
"num_tokens": 1812.0,
"reward": 0.20595212280750275,
"reward_std": 0.0009102191543206573,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.020702123641967773,
"rewards/question_recreation_reward_func/std": 0.003408734453842044,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.18525001406669617,
"rewards/xmlcount_reward_func/std": 0.002061557024717331,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 221.0,
"completions/max_terminated_length": 221.0,
"completions/mean_length": 82.25,
"completions/mean_terminated_length": 82.25,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 1.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 16.176429748535156,
"kl": 0.020354424137622118,
"learning_rate": 9.729086208503173e-07,
"loss": 0.0798,
"num_tokens": 2653.0,
"reward": 0.04737311601638794,
"reward_std": 0.010779645293951035,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.04737311601638794,
"rewards/question_recreation_reward_func/std": 0.019436875358223915,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 45.0,
"completions/max_terminated_length": 45.0,
"completions/mean_length": 28.25,
"completions/mean_terminated_length": 28.25,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 2.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 44.07194900512695,
"kl": 0.08810030668973923,
"learning_rate": 9.397368756032444e-07,
"loss": 0.0939,
"num_tokens": 3278.0,
"reward": 0.050381630659103394,
"reward_std": 0.006566238589584827,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.0503816232085228,
"rewards/question_recreation_reward_func/std": 0.007008333690464497,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 27.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 25.0,
"completions/mean_terminated_length": 25.0,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 2.8,
"frac_reward_zero_std": 1.0,
"grad_norm": 1.3027467727661133,
"kl": 0.08069050312042236,
"learning_rate": 8.945702546981968e-07,
"loss": 0.0032,
"num_tokens": 3890.0,
"reward": 0.1409657597541809,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.04946577176451683,
"rewards/question_recreation_reward_func/std": 0.012337522581219673,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 157.0,
"completions/max_terminated_length": 157.0,
"completions/mean_length": 67.0,
"completions/mean_terminated_length": 67.0,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 3.4,
"frac_reward_zero_std": 0.5,
"grad_norm": 18.420305252075195,
"kl": 0.03622829727828503,
"learning_rate": 8.386407858128706e-07,
"loss": -0.0354,
"num_tokens": 4670.0,
"reward": 0.044995490461587906,
"reward_std": 0.0019922610372304916,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.044995490461587906,
"rewards/question_recreation_reward_func/std": 0.014919609762728214,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 146.0,
"completions/max_terminated_length": 146.0,
"completions/mean_length": 54.75,
"completions/mean_terminated_length": 54.75,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 3.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 11.040939331054688,
"kl": 0.025315589271485806,
"learning_rate": 7.734740790612136e-07,
"loss": 0.243,
"num_tokens": 5401.0,
"reward": 0.27007412910461426,
"reward_std": 0.010380705818533897,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.1785741150379181,
"rewards/question_recreation_reward_func/std": 0.012178990058600903,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 20.0,
"completions/max_terminated_length": 20.0,
"completions/mean_length": 17.0,
"completions/mean_terminated_length": 17.0,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 4.4,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005074405111372471,
"kl": 0.15366993844509125,
"learning_rate": 7.008477123264847e-07,
"loss": 0.0061,
"num_tokens": 5981.0,
"reward": 0.05733615159988403,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.05733615159988403,
"rewards/question_recreation_reward_func/std": 0.0004914185265079141,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 259.5,
"completions/mean_terminated_length": 259.5,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 4.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 12.459712028503418,
"kl": 0.029772183392196894,
"learning_rate": 6.227427435703995e-07,
"loss": 0.2349,
"num_tokens": 7531.0,
"reward": 0.11735042929649353,
"reward_std": 0.018421867862343788,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.11735042929649353,
"rewards/question_recreation_reward_func/std": 0.01842377707362175,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 23.0,
"completions/max_terminated_length": 23.0,
"completions/mean_length": 18.5,
"completions/mean_terminated_length": 18.5,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 5.4,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.039047498255968094,
"kl": 0.11775290966033936,
"learning_rate": 5.412896727361662e-07,
"loss": 0.0047,
"num_tokens": 8117.0,
"reward": 0.40121549367904663,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.30971550941467285,
"rewards/question_recreation_reward_func/std": 0.044006314128637314,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.09149999916553497,
"rewards/xmlcount_reward_func/std": 0.10565510392189026,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 61.75,
"completions/mean_terminated_length": 61.75,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 5.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 24.296092987060547,
"kl": 0.0357817467302084,
"learning_rate": 4.5871032726383385e-07,
"loss": 0.0419,
"num_tokens": 8876.0,
"reward": 0.048355475068092346,
"reward_std": 0.007202191278338432,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.048355475068092346,
"rewards/question_recreation_reward_func/std": 0.020798446610569954,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 149.0,
"completions/max_terminated_length": 149.0,
"completions/mean_length": 64.25,
"completions/mean_terminated_length": 64.25,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 6.4,
"frac_reward_zero_std": 0.5,
"grad_norm": 20.938844680786133,
"kl": 0.061262642964720726,
"learning_rate": 3.772572564296004e-07,
"loss": 0.0246,
"num_tokens": 9645.0,
"reward": 0.19921739399433136,
"reward_std": 0.01111482735723257,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.19921737909317017,
"rewards/question_recreation_reward_func/std": 0.08456551283597946,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.0,
"rewards/xmlcount_reward_func/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 23.0,
"completions/max_terminated_length": 23.0,
"completions/mean_length": 21.0,
"completions/mean_terminated_length": 21.0,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 6.8,
"frac_reward_zero_std": 0.5,
"grad_norm": 12.05763053894043,
"kl": 0.054638177156448364,
"learning_rate": 2.9915228767351535e-07,
"loss": 0.0173,
"num_tokens": 10241.0,
"reward": 0.14335308969020844,
"reward_std": 0.0008261414477601647,
"rewards/concensus_correctness_reward_func/mean": 0.0,
"rewards/concensus_correctness_reward_func/std": 0.0,
"rewards/consensus_reward_func/mean": 0.0,
"rewards/consensus_reward_func/std": 0.0,
"rewards/cumulative_reward_2/mean": 0.0,
"rewards/cumulative_reward_2/std": 0.0,
"rewards/final_correctness_reward_func/mean": 0.0,
"rewards/final_correctness_reward_func/std": 0.0,
"rewards/question_recreation_reward_func/mean": 0.0513530895113945,
"rewards/question_recreation_reward_func/std": 0.007401337847113609,
"rewards/soft_format_reward_func/mean": 0.0,
"rewards/soft_format_reward_func/std": 0.0,
"rewards/strict_format_reward_func/mean": 0.0,
"rewards/strict_format_reward_func/std": 0.0,
"rewards/xmlcount_reward_func/mean": 0.09200000017881393,
"rewards/xmlcount_reward_func/std": 0.10623559355735779,
"step": 14
},
{
"epoch": 6.8,
"step": 14,
"total_flos": 0.0,
"train_loss": 0.05132548208348453,
"train_runtime": 571.6229,
"train_samples_per_second": 0.14,
"train_steps_per_second": 0.035
}
],
"logging_steps": 1,
"max_steps": 20,
"num_input_tokens_seen": 10241,
"num_train_epochs": 7,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}