483 lines
17 KiB
JSON
483 lines
17 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0666666666666667,
|
|
"eval_steps": 500,
|
|
"global_step": 160,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.2125,
|
|
"completions/max_length": 490.8,
|
|
"completions/max_terminated_length": 470.6,
|
|
"completions/mean_length": 414.4625,
|
|
"completions/mean_terminated_length": 395.1430999755859,
|
|
"completions/min_length": 310.5,
|
|
"completions/min_terminated_length": 310.5,
|
|
"entropy": 0.5249067967757582,
|
|
"epoch": 0.06666666666666667,
|
|
"frac_reward_zero_std": 0.05,
|
|
"grad_norm": 2.65625,
|
|
"kl": 0.03049815017875517,
|
|
"learning_rate": 9.4375e-06,
|
|
"loss": -0.010575222969055175,
|
|
"num_tokens": 46025.0,
|
|
"reward": 0.73009033203125,
|
|
"reward_std": 0.4704558838158846,
|
|
"rewards/JointRewardFunction/mean": 0.73009033203125,
|
|
"rewards/JointRewardFunction/std": 0.47045588716864584,
|
|
"step": 10,
|
|
"step_time": 21.721466124300058
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15,
|
|
"completions/max_length": 474.6,
|
|
"completions/max_terminated_length": 432.8,
|
|
"completions/mean_length": 372.65,
|
|
"completions/mean_terminated_length": 348.14678955078125,
|
|
"completions/min_length": 280.5,
|
|
"completions/min_terminated_length": 280.5,
|
|
"entropy": 0.4361519979313016,
|
|
"epoch": 0.13333333333333333,
|
|
"frac_reward_zero_std": 0.05,
|
|
"grad_norm": 4.15625,
|
|
"kl": 0.0652532160282135,
|
|
"learning_rate": 8.8125e-06,
|
|
"loss": 0.016564452648162843,
|
|
"num_tokens": 89597.0,
|
|
"reward": 0.95604248046875,
|
|
"reward_std": 0.5059975624084473,
|
|
"rewards/JointRewardFunction/mean": 0.95604248046875,
|
|
"rewards/JointRewardFunction/std": 0.5059975773096085,
|
|
"step": 20,
|
|
"step_time": 22.023339059400495
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0375,
|
|
"completions/max_length": 425.6,
|
|
"completions/max_terminated_length": 406.9,
|
|
"completions/mean_length": 317.9,
|
|
"completions/mean_terminated_length": 309.6845275878906,
|
|
"completions/min_length": 231.4,
|
|
"completions/min_terminated_length": 231.4,
|
|
"entropy": 0.45581948235630987,
|
|
"epoch": 0.2,
|
|
"frac_reward_zero_std": 0.1,
|
|
"grad_norm": 2.859375,
|
|
"kl": 0.1008026220370084,
|
|
"learning_rate": 8.1875e-06,
|
|
"loss": 0.01793680489063263,
|
|
"num_tokens": 126445.0,
|
|
"reward": 1.2108154296875,
|
|
"reward_std": 0.40027157836593685,
|
|
"rewards/JointRewardFunction/mean": 1.2108154296875,
|
|
"rewards/JointRewardFunction/std": 0.40027157838921995,
|
|
"step": 30,
|
|
"step_time": 19.79602696299935
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1125,
|
|
"completions/max_length": 493.9,
|
|
"completions/max_terminated_length": 468.6,
|
|
"completions/mean_length": 352.1875,
|
|
"completions/mean_terminated_length": 335.66607666015625,
|
|
"completions/min_length": 238.5,
|
|
"completions/min_terminated_length": 238.5,
|
|
"entropy": 0.4116522930562496,
|
|
"epoch": 0.26666666666666666,
|
|
"frac_reward_zero_std": 0.05,
|
|
"grad_norm": 2.5625,
|
|
"kl": 0.12701121605932714,
|
|
"learning_rate": 7.5625e-06,
|
|
"loss": 0.05010480284690857,
|
|
"num_tokens": 167932.0,
|
|
"reward": 1.2074462890625,
|
|
"reward_std": 0.42208707332611084,
|
|
"rewards/JointRewardFunction/mean": 1.2074462890625,
|
|
"rewards/JointRewardFunction/std": 0.4220870822668076,
|
|
"step": 40,
|
|
"step_time": 22.582551179301117
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 393.4,
|
|
"completions/max_terminated_length": 380.0,
|
|
"completions/mean_length": 300.8625,
|
|
"completions/mean_terminated_length": 296.27321472167966,
|
|
"completions/min_length": 226.3,
|
|
"completions/min_terminated_length": 226.3,
|
|
"entropy": 0.4190680437721312,
|
|
"epoch": 0.3333333333333333,
|
|
"frac_reward_zero_std": 0.35,
|
|
"grad_norm": 3.25,
|
|
"kl": 0.13846059744246303,
|
|
"learning_rate": 6.9375e-06,
|
|
"loss": 0.03549057841300964,
|
|
"num_tokens": 204717.0,
|
|
"reward": 1.26171875,
|
|
"reward_std": 0.38662562653189525,
|
|
"rewards/JointRewardFunction/mean": 1.26171875,
|
|
"rewards/JointRewardFunction/std": 0.3866256324923597,
|
|
"step": 50,
|
|
"step_time": 18.59559666490022
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 426.2,
|
|
"completions/max_terminated_length": 414.2,
|
|
"completions/mean_length": 285.9125,
|
|
"completions/mean_terminated_length": 280.5291687011719,
|
|
"completions/min_length": 190.9,
|
|
"completions/min_terminated_length": 190.9,
|
|
"entropy": 0.3493430153466761,
|
|
"epoch": 0.4,
|
|
"frac_reward_zero_std": 0.25,
|
|
"grad_norm": 2.375,
|
|
"kl": 0.15198964411392807,
|
|
"learning_rate": 6.3125e-06,
|
|
"loss": 0.01220681592822075,
|
|
"num_tokens": 240990.0,
|
|
"reward": 1.27255859375,
|
|
"reward_std": 0.34817005618242547,
|
|
"rewards/JointRewardFunction/mean": 1.27255859375,
|
|
"rewards/JointRewardFunction/std": 0.34817005618242547,
|
|
"step": 60,
|
|
"step_time": 19.868489251599385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.025,
|
|
"completions/max_length": 370.4,
|
|
"completions/max_terminated_length": 365.0,
|
|
"completions/mean_length": 277.2875,
|
|
"completions/mean_terminated_length": 272.9107177734375,
|
|
"completions/min_length": 179.9,
|
|
"completions/min_terminated_length": 179.9,
|
|
"entropy": 0.34267437979578974,
|
|
"epoch": 0.4666666666666667,
|
|
"frac_reward_zero_std": 0.55,
|
|
"grad_norm": 0.017333984375,
|
|
"kl": 0.18903981931507588,
|
|
"learning_rate": 5.6875e-06,
|
|
"loss": 0.019876784086227416,
|
|
"num_tokens": 276969.0,
|
|
"reward": 1.381591796875,
|
|
"reward_std": 0.2380124439485371,
|
|
"rewards/JointRewardFunction/mean": 1.381591796875,
|
|
"rewards/JointRewardFunction/std": 0.23801244990900158,
|
|
"step": 70,
|
|
"step_time": 17.71040062670145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 378.1,
|
|
"completions/max_terminated_length": 378.1,
|
|
"completions/mean_length": 271.3625,
|
|
"completions/mean_terminated_length": 271.3625,
|
|
"completions/min_length": 171.4,
|
|
"completions/min_terminated_length": 171.4,
|
|
"entropy": 0.3542415237054229,
|
|
"epoch": 0.5333333333333333,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 0.0751953125,
|
|
"kl": 0.19187260391190647,
|
|
"learning_rate": 5.0625e-06,
|
|
"loss": 0.0034067176282405855,
|
|
"num_tokens": 312350.0,
|
|
"reward": 1.40601806640625,
|
|
"reward_std": 0.2126459252787754,
|
|
"rewards/JointRewardFunction/mean": 1.40601806640625,
|
|
"rewards/JointRewardFunction/std": 0.21264592825900763,
|
|
"step": 80,
|
|
"step_time": 18.04391895070148
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05,
|
|
"completions/max_length": 357.6,
|
|
"completions/max_terminated_length": 341.1,
|
|
"completions/mean_length": 272.275,
|
|
"completions/mean_terminated_length": 260.9375,
|
|
"completions/min_length": 171.3,
|
|
"completions/min_terminated_length": 171.3,
|
|
"entropy": 0.31064137276262044,
|
|
"epoch": 0.6,
|
|
"frac_reward_zero_std": 0.6,
|
|
"grad_norm": 1.875,
|
|
"kl": 0.20775549318641423,
|
|
"learning_rate": 4.4375e-06,
|
|
"loss": 0.0008514203131198883,
|
|
"num_tokens": 348280.0,
|
|
"reward": 1.353466796875,
|
|
"reward_std": 0.21437984704971313,
|
|
"rewards/JointRewardFunction/mean": 1.353466796875,
|
|
"rewards/JointRewardFunction/std": 0.21437986195087433,
|
|
"step": 90,
|
|
"step_time": 17.224812426199787
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0625,
|
|
"completions/max_length": 386.0,
|
|
"completions/max_terminated_length": 357.8,
|
|
"completions/mean_length": 267.65,
|
|
"completions/mean_terminated_length": 250.7500030517578,
|
|
"completions/min_length": 169.7,
|
|
"completions/min_terminated_length": 169.7,
|
|
"entropy": 0.3662784457206726,
|
|
"epoch": 0.6666666666666666,
|
|
"frac_reward_zero_std": 0.7,
|
|
"grad_norm": 0.01239013671875,
|
|
"kl": 0.21083315466530622,
|
|
"learning_rate": 3.8125e-06,
|
|
"loss": 0.011665140837430954,
|
|
"num_tokens": 384576.0,
|
|
"reward": 1.34949951171875,
|
|
"reward_std": 0.23986690491437912,
|
|
"rewards/JointRewardFunction/mean": 1.34949951171875,
|
|
"rewards/JointRewardFunction/std": 0.2398669108748436,
|
|
"step": 100,
|
|
"step_time": 18.349693166400904
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 382.2,
|
|
"completions/max_terminated_length": 364.2,
|
|
"completions/mean_length": 275.575,
|
|
"completions/mean_terminated_length": 272.3714294433594,
|
|
"completions/min_length": 188.2,
|
|
"completions/min_terminated_length": 188.2,
|
|
"entropy": 0.37121466230601075,
|
|
"epoch": 0.7333333333333333,
|
|
"frac_reward_zero_std": 0.6,
|
|
"grad_norm": 0.0146484375,
|
|
"kl": 0.21329910093918442,
|
|
"learning_rate": 3.1875e-06,
|
|
"loss": 0.007188273221254348,
|
|
"num_tokens": 419570.0,
|
|
"reward": 1.350439453125,
|
|
"reward_std": 0.23384397297631948,
|
|
"rewards/JointRewardFunction/mean": 1.350439453125,
|
|
"rewards/JointRewardFunction/std": 0.23384397297631948,
|
|
"step": 110,
|
|
"step_time": 18.14425329649821
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 346.6,
|
|
"completions/max_terminated_length": 345.3,
|
|
"completions/mean_length": 252.15,
|
|
"completions/mean_terminated_length": 250.38750305175782,
|
|
"completions/min_length": 166.8,
|
|
"completions/min_terminated_length": 166.8,
|
|
"entropy": 0.4173679456114769,
|
|
"epoch": 0.8,
|
|
"frac_reward_zero_std": 0.45,
|
|
"grad_norm": 2.09375,
|
|
"kl": 0.21794578088447453,
|
|
"learning_rate": 2.5625e-06,
|
|
"loss": -0.0005294814705848694,
|
|
"num_tokens": 452006.0,
|
|
"reward": 1.35863037109375,
|
|
"reward_std": 0.24987269788980485,
|
|
"rewards/JointRewardFunction/mean": 1.35863037109375,
|
|
"rewards/JointRewardFunction/std": 0.24987269788980485,
|
|
"step": 120,
|
|
"step_time": 16.82072365879685
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0375,
|
|
"completions/max_length": 441.8,
|
|
"completions/max_terminated_length": 440.3,
|
|
"completions/mean_length": 301.7375,
|
|
"completions/mean_terminated_length": 296.75750122070315,
|
|
"completions/min_length": 191.0,
|
|
"completions/min_terminated_length": 191.0,
|
|
"entropy": 0.33190380278974774,
|
|
"epoch": 0.8666666666666667,
|
|
"frac_reward_zero_std": 0.5,
|
|
"grad_norm": 2.09375,
|
|
"kl": 0.18061227248981596,
|
|
"learning_rate": 1.9375e-06,
|
|
"loss": 0.008612716197967529,
|
|
"num_tokens": 490397.0,
|
|
"reward": 1.374853515625,
|
|
"reward_std": 0.2561936320271343,
|
|
"rewards/JointRewardFunction/mean": 1.374853515625,
|
|
"rewards/JointRewardFunction/std": 0.2561936320271343,
|
|
"step": 130,
|
|
"step_time": 20.63984096989916
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 340.1,
|
|
"completions/max_terminated_length": 340.1,
|
|
"completions/mean_length": 256.5,
|
|
"completions/mean_terminated_length": 256.5,
|
|
"completions/min_length": 179.7,
|
|
"completions/min_terminated_length": 179.7,
|
|
"entropy": 0.38072127737104894,
|
|
"epoch": 0.9333333333333333,
|
|
"frac_reward_zero_std": 0.7,
|
|
"grad_norm": 2.96875,
|
|
"kl": 0.21334810927510262,
|
|
"learning_rate": 1.3125000000000001e-06,
|
|
"loss": 0.012666280567646026,
|
|
"num_tokens": 523549.0,
|
|
"reward": 1.436767578125,
|
|
"reward_std": 0.153020023368299,
|
|
"rewards/JointRewardFunction/mean": 1.436767578125,
|
|
"rewards/JointRewardFunction/std": 0.153020023368299,
|
|
"step": 140,
|
|
"step_time": 16.57510228729916
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0125,
|
|
"completions/max_length": 371.9,
|
|
"completions/max_terminated_length": 370.6,
|
|
"completions/mean_length": 257.4125,
|
|
"completions/mean_terminated_length": 254.5607147216797,
|
|
"completions/min_length": 169.0,
|
|
"completions/min_terminated_length": 169.0,
|
|
"entropy": 0.3929610840976238,
|
|
"epoch": 1.0,
|
|
"frac_reward_zero_std": 0.3,
|
|
"grad_norm": 0.01513671875,
|
|
"kl": 0.21818328225053846,
|
|
"learning_rate": 6.875000000000001e-07,
|
|
"loss": 0.011788636445999146,
|
|
"num_tokens": 556454.0,
|
|
"reward": 1.348779296875,
|
|
"reward_std": 0.3280519276857376,
|
|
"rewards/JointRewardFunction/mean": 1.348779296875,
|
|
"rewards/JointRewardFunction/std": 0.32805192805826666,
|
|
"step": 150,
|
|
"step_time": 17.876252979701167
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 343.0,
|
|
"completions/max_terminated_length": 343.0,
|
|
"completions/mean_length": 240.45,
|
|
"completions/mean_terminated_length": 240.45,
|
|
"completions/min_length": 176.0,
|
|
"completions/min_terminated_length": 176.0,
|
|
"entropy": 0.4010548871010542,
|
|
"epoch": 1.0666666666666667,
|
|
"frac_reward_zero_std": 0.65,
|
|
"grad_norm": 0.025146484375,
|
|
"kl": 0.22879955088719725,
|
|
"learning_rate": 6.250000000000001e-08,
|
|
"loss": 0.022160810232162476,
|
|
"num_tokens": 587606.0,
|
|
"reward": 1.3880859375,
|
|
"reward_std": 0.237497678399086,
|
|
"rewards/JointRewardFunction/mean": 1.3880859375,
|
|
"rewards/JointRewardFunction/std": 0.2374976843595505,
|
|
"step": 160,
|
|
"step_time": 16.50347660660045
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 160,
|
|
"num_input_tokens_seen": 587606,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 10,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|