2735 lines
100 KiB
JSON
2735 lines
100 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 7.575757575757576,
|
|
"eval_steps": 500,
|
|
"global_step": 500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 177.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 177.8,
|
|
"completions/max_terminated_length": 177.8,
|
|
"completions/mean_length": 157.85000610351562,
|
|
"completions/mean_terminated_length": 157.85000610351562,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.07575757575757576,
|
|
"frac_reward_zero_std": 0.4000000059604645,
|
|
"grad_norm": 1.5840047597885132,
|
|
"kl": 0.0010059793893522702,
|
|
"learning_rate": 1.6000000000000001e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 73447.0,
|
|
"reward": 0.5880883574485779,
|
|
"reward_std": 0.020529226586222648,
|
|
"rewards/reward_function/mean": 0.5880883395671844,
|
|
"rewards/reward_function/std": 0.06562883183360099,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 176.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 176.2,
|
|
"completions/max_terminated_length": 176.2,
|
|
"completions/mean_length": 156.65000915527344,
|
|
"completions/mean_terminated_length": 156.65000915527344,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.15151515151515152,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 1.6390373706817627,
|
|
"kl": 0.0017284046276472508,
|
|
"learning_rate": 3.6000000000000003e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 146334.0,
|
|
"reward": 0.605418348312378,
|
|
"reward_std": 0.02508251890540123,
|
|
"rewards/reward_function/mean": 0.60541832447052,
|
|
"rewards/reward_function/std": 0.06859094277024269,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 176.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 176.4,
|
|
"completions/max_terminated_length": 176.4,
|
|
"completions/mean_length": 157.0000030517578,
|
|
"completions/mean_terminated_length": 157.0000030517578,
|
|
"completions/min_length": 140.8,
|
|
"completions/min_terminated_length": 140.8,
|
|
"epoch": 0.22727272727272727,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.7626600861549377,
|
|
"kl": 0.003397522373901059,
|
|
"learning_rate": 5.600000000000001e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 219198.0,
|
|
"reward": 0.5862850427627564,
|
|
"reward_std": 0.036518129706382754,
|
|
"rewards/reward_function/mean": 0.5862850069999694,
|
|
"rewards/reward_function/std": 0.08488646671175956,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 183.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 183.0,
|
|
"completions/max_terminated_length": 183.0,
|
|
"completions/mean_length": 156.98334045410155,
|
|
"completions/mean_terminated_length": 156.98334045410155,
|
|
"completions/min_length": 139.8,
|
|
"completions/min_terminated_length": 139.8,
|
|
"epoch": 0.30303030303030304,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.9624250531196594,
|
|
"kl": 0.007015585945919156,
|
|
"learning_rate": 7.600000000000001e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 291737.0,
|
|
"reward": 0.6001700401306153,
|
|
"reward_std": 0.025772593356668948,
|
|
"rewards/reward_function/mean": 0.6001700043678284,
|
|
"rewards/reward_function/std": 0.07909451425075531,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 172.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 172.2,
|
|
"completions/max_terminated_length": 172.2,
|
|
"completions/mean_length": 155.23333740234375,
|
|
"completions/mean_terminated_length": 155.23333740234375,
|
|
"completions/min_length": 138.2,
|
|
"completions/min_terminated_length": 138.2,
|
|
"epoch": 0.3787878787878788,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.5355867743492126,
|
|
"kl": 0.009492208405087391,
|
|
"learning_rate": 9.600000000000001e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 364535.0,
|
|
"reward": 0.5766633510589599,
|
|
"reward_std": 0.04085115455091,
|
|
"rewards/reward_function/mean": 0.576663339138031,
|
|
"rewards/reward_function/std": 0.10587597712874412,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 198.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 198.4,
|
|
"completions/max_terminated_length": 198.4,
|
|
"completions/mean_length": 153.6166748046875,
|
|
"completions/mean_terminated_length": 153.6166748046875,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.45454545454545453,
|
|
"frac_reward_zero_std": 0.13333333730697633,
|
|
"grad_norm": 0.6549646854400635,
|
|
"kl": 0.061492755884925525,
|
|
"learning_rate": 1.16e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 436992.0,
|
|
"reward": 0.5978150248527527,
|
|
"reward_std": 0.04262940138578415,
|
|
"rewards/reward_function/mean": 0.5978150129318237,
|
|
"rewards/reward_function/std": 0.09431936666369438,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 176.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 176.0,
|
|
"completions/max_terminated_length": 176.0,
|
|
"completions/mean_length": 158.86667175292968,
|
|
"completions/mean_terminated_length": 158.86667175292968,
|
|
"completions/min_length": 145.0,
|
|
"completions/min_terminated_length": 145.0,
|
|
"epoch": 0.5303030303030303,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.4725801348686218,
|
|
"kl": 2.357983988771836,
|
|
"learning_rate": 1.3600000000000002e-05,
|
|
"loss": 0.0024,
|
|
"num_tokens": 509788.0,
|
|
"reward": 0.6049700140953064,
|
|
"reward_std": 0.012831439916044473,
|
|
"rewards/reward_function/mean": 0.6049699783325195,
|
|
"rewards/reward_function/std": 0.08928216472268105,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 172.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 172.4,
|
|
"completions/max_terminated_length": 172.4,
|
|
"completions/mean_length": 154.5500030517578,
|
|
"completions/mean_terminated_length": 154.5500030517578,
|
|
"completions/min_length": 138.8,
|
|
"completions/min_terminated_length": 138.8,
|
|
"epoch": 0.6060606060606061,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.41579461097717285,
|
|
"kl": 0.10282722649474939,
|
|
"learning_rate": 1.5600000000000003e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 582789.0,
|
|
"reward": 0.5595033764839172,
|
|
"reward_std": 0.014409982354845852,
|
|
"rewards/reward_function/mean": 0.5595033466815948,
|
|
"rewards/reward_function/std": 0.053104204079136255,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 202.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 202.6,
|
|
"completions/max_terminated_length": 202.6,
|
|
"completions/mean_length": 158.20000305175782,
|
|
"completions/mean_terminated_length": 158.20000305175782,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.6818181818181818,
|
|
"frac_reward_zero_std": 0.33333333730697634,
|
|
"grad_norm": 0.03286667913198471,
|
|
"kl": 3726.0936788400013,
|
|
"learning_rate": 1.76e-05,
|
|
"loss": 3.7261,
|
|
"num_tokens": 655565.0,
|
|
"reward": 0.5829650402069092,
|
|
"reward_std": 0.031194474175572397,
|
|
"rewards/reward_function/mean": 0.5829649925231933,
|
|
"rewards/reward_function/std": 0.09724260903894902,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 180.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 180.4,
|
|
"completions/max_terminated_length": 180.4,
|
|
"completions/mean_length": 157.5500030517578,
|
|
"completions/mean_terminated_length": 157.5500030517578,
|
|
"completions/min_length": 141.2,
|
|
"completions/min_terminated_length": 141.2,
|
|
"epoch": 0.7575757575757576,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.45231395959854126,
|
|
"kl": 0.24270717451969784,
|
|
"learning_rate": 1.9600000000000002e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 727754.0,
|
|
"reward": 0.6422233581542969,
|
|
"reward_std": 0.015453202556818724,
|
|
"rewards/reward_function/mean": 0.642223310470581,
|
|
"rewards/reward_function/std": 0.08873879238963127,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 174.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 174.8,
|
|
"completions/max_terminated_length": 174.8,
|
|
"completions/mean_length": 153.8166717529297,
|
|
"completions/mean_terminated_length": 153.8166717529297,
|
|
"completions/min_length": 138.2,
|
|
"completions/min_terminated_length": 138.2,
|
|
"epoch": 0.8333333333333334,
|
|
"frac_reward_zero_std": 0.20000000596046447,
|
|
"grad_norm": 0.3569779694080353,
|
|
"kl": 0.25451052089532217,
|
|
"learning_rate": 1.9822222222222226e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 800059.0,
|
|
"reward": 0.6014716982841491,
|
|
"reward_std": 0.015854166075587272,
|
|
"rewards/reward_function/mean": 0.6014716625213623,
|
|
"rewards/reward_function/std": 0.10040064603090286,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 173.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 173.2,
|
|
"completions/max_terminated_length": 173.2,
|
|
"completions/mean_length": 153.4666717529297,
|
|
"completions/mean_terminated_length": 153.4666717529297,
|
|
"completions/min_length": 135.6,
|
|
"completions/min_terminated_length": 135.6,
|
|
"epoch": 0.9090909090909091,
|
|
"frac_reward_zero_std": 0.20000000596046447,
|
|
"grad_norm": 0.4230985939502716,
|
|
"kl": 0.10768474241097768,
|
|
"learning_rate": 1.9600000000000002e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 872639.0,
|
|
"reward": 0.6015583634376526,
|
|
"reward_std": 0.020151399821043015,
|
|
"rewards/reward_function/mean": 0.6015583276748657,
|
|
"rewards/reward_function/std": 0.1292761668562889,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 164.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 164.0,
|
|
"completions/max_terminated_length": 164.0,
|
|
"completions/mean_length": 148.58334045410157,
|
|
"completions/mean_terminated_length": 148.58334045410157,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.9848484848484849,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.25373604893684387,
|
|
"kl": 0.1025156612197558,
|
|
"learning_rate": 1.9377777777777778e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 944522.0,
|
|
"reward": 0.6054783701896668,
|
|
"reward_std": 0.014289665129035711,
|
|
"rewards/reward_function/mean": 0.6054783225059509,
|
|
"rewards/reward_function/std": 0.1043807715177536,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 166.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 166.8,
|
|
"completions/max_terminated_length": 166.8,
|
|
"completions/mean_length": 148.18333740234374,
|
|
"completions/mean_terminated_length": 148.18333740234374,
|
|
"completions/min_length": 129.8,
|
|
"completions/min_terminated_length": 129.8,
|
|
"epoch": 1.0606060606060606,
|
|
"frac_reward_zero_std": 0.20000000596046447,
|
|
"grad_norm": 0.40739330649375916,
|
|
"kl": 0.13809017241001129,
|
|
"learning_rate": 1.9155555555555558e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 1016593.0,
|
|
"reward": 0.625361704826355,
|
|
"reward_std": 0.020681749982759356,
|
|
"rewards/reward_function/mean": 0.6253616333007812,
|
|
"rewards/reward_function/std": 0.09655277617275715,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 161.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 161.6,
|
|
"completions/max_terminated_length": 161.6,
|
|
"completions/mean_length": 143.83333740234374,
|
|
"completions/mean_terminated_length": 143.83333740234374,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 1.1363636363636362,
|
|
"frac_reward_zero_std": 0.33333333730697634,
|
|
"grad_norm": 0.012142821215093136,
|
|
"kl": 0.1849093531568845,
|
|
"learning_rate": 1.8933333333333334e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1087987.0,
|
|
"reward": 0.6634533524513244,
|
|
"reward_std": 0.01583307459950447,
|
|
"rewards/reward_function/mean": 0.6634533286094666,
|
|
"rewards/reward_function/std": 0.12549073845148087,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 167.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 167.4,
|
|
"completions/max_terminated_length": 167.4,
|
|
"completions/mean_length": 149.76667175292968,
|
|
"completions/mean_terminated_length": 149.76667175292968,
|
|
"completions/min_length": 135.4,
|
|
"completions/min_terminated_length": 135.4,
|
|
"epoch": 1.2121212121212122,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.2965545952320099,
|
|
"kl": 0.14318528175354003,
|
|
"learning_rate": 1.8711111111111113e-05,
|
|
"loss": 0.0001,
|
|
"num_tokens": 1160193.0,
|
|
"reward": 0.5806800246238708,
|
|
"reward_std": 0.04468099344521761,
|
|
"rewards/reward_function/mean": 0.5806800127029419,
|
|
"rewards/reward_function/std": 0.11235176101326942,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 168.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 168.0,
|
|
"completions/max_terminated_length": 168.0,
|
|
"completions/mean_length": 148.08334045410157,
|
|
"completions/mean_terminated_length": 148.08334045410157,
|
|
"completions/min_length": 134.8,
|
|
"completions/min_terminated_length": 134.8,
|
|
"epoch": 1.2878787878787878,
|
|
"frac_reward_zero_std": 0.13333333730697633,
|
|
"grad_norm": 0.3983455300331116,
|
|
"kl": 0.16668486495812734,
|
|
"learning_rate": 1.848888888888889e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1232170.0,
|
|
"reward": 0.5667450308799744,
|
|
"reward_std": 0.02706171413883567,
|
|
"rewards/reward_function/mean": 0.5667450308799744,
|
|
"rewards/reward_function/std": 0.1369288980960846,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 159.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 159.6,
|
|
"completions/max_terminated_length": 159.6,
|
|
"completions/mean_length": 143.23333740234375,
|
|
"completions/mean_terminated_length": 143.23333740234375,
|
|
"completions/min_length": 127.2,
|
|
"completions/min_terminated_length": 127.2,
|
|
"epoch": 1.3636363636363638,
|
|
"frac_reward_zero_std": 0.20000000596046447,
|
|
"grad_norm": 0.3573758006095886,
|
|
"kl": 0.16684276660283406,
|
|
"learning_rate": 1.826666666666667e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1303880.0,
|
|
"reward": 0.6362017035484314,
|
|
"reward_std": 0.007156953122466803,
|
|
"rewards/reward_function/mean": 0.6362016916275024,
|
|
"rewards/reward_function/std": 0.07174314968287945,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 156.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 156.2,
|
|
"completions/max_terminated_length": 156.2,
|
|
"completions/mean_length": 142.61667175292968,
|
|
"completions/mean_terminated_length": 142.61667175292968,
|
|
"completions/min_length": 132.4,
|
|
"completions/min_terminated_length": 132.4,
|
|
"epoch": 1.4393939393939394,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.28054413199424744,
|
|
"kl": 0.16249675651391346,
|
|
"learning_rate": 1.8044444444444445e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1375937.0,
|
|
"reward": 0.5929333567619324,
|
|
"reward_std": 0.010427127918228507,
|
|
"rewards/reward_function/mean": 0.5929333448410035,
|
|
"rewards/reward_function/std": 0.0421123169362545,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 153.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 153.2,
|
|
"completions/max_terminated_length": 153.2,
|
|
"completions/mean_length": 141.6166748046875,
|
|
"completions/mean_terminated_length": 141.6166748046875,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 1.5151515151515151,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.35829174518585205,
|
|
"kl": 0.1883176525433858,
|
|
"learning_rate": 1.782222222222222e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1448046.0,
|
|
"reward": 0.5822266817092896,
|
|
"reward_std": 0.0035873036831617355,
|
|
"rewards/reward_function/mean": 0.5822266697883606,
|
|
"rewards/reward_function/std": 0.045498589798808095,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 150.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 150.8,
|
|
"completions/max_terminated_length": 150.8,
|
|
"completions/mean_length": 138.93334045410157,
|
|
"completions/mean_terminated_length": 138.93334045410157,
|
|
"completions/min_length": 124.2,
|
|
"completions/min_terminated_length": 124.2,
|
|
"epoch": 1.5909090909090908,
|
|
"frac_reward_zero_std": 0.6000000178813935,
|
|
"grad_norm": 0.24431772530078888,
|
|
"kl": 0.19910954435666403,
|
|
"learning_rate": 1.76e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1520398.0,
|
|
"reward": 0.6048667073249817,
|
|
"reward_std": 0.0037458556122146546,
|
|
"rewards/reward_function/mean": 0.6048666715621949,
|
|
"rewards/reward_function/std": 0.05969845354557037,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 150.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 150.0,
|
|
"completions/max_terminated_length": 150.0,
|
|
"completions/mean_length": 138.0500061035156,
|
|
"completions/mean_terminated_length": 138.0500061035156,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 1.6666666666666665,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.3577372431755066,
|
|
"kl": 0.20313620964686077,
|
|
"learning_rate": 1.737777777777778e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1591865.0,
|
|
"reward": 0.6223516702651978,
|
|
"reward_std": 0.005239984532818198,
|
|
"rewards/reward_function/mean": 0.6223516583442688,
|
|
"rewards/reward_function/std": 0.08074029944837094,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 147.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.6,
|
|
"completions/max_terminated_length": 147.6,
|
|
"completions/mean_length": 135.33334045410157,
|
|
"completions/mean_terminated_length": 135.33334045410157,
|
|
"completions/min_length": 126.8,
|
|
"completions/min_terminated_length": 126.8,
|
|
"epoch": 1.7424242424242424,
|
|
"frac_reward_zero_std": 0.3333333432674408,
|
|
"grad_norm": 0.4057120680809021,
|
|
"kl": 0.23306088149547577,
|
|
"learning_rate": 1.7155555555555557e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1663321.0,
|
|
"reward": 0.6518083691596985,
|
|
"reward_std": 0.004030292294919491,
|
|
"rewards/reward_function/mean": 0.6518083572387695,
|
|
"rewards/reward_function/std": 0.07627851068973542,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 148.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 148.2,
|
|
"completions/max_terminated_length": 148.2,
|
|
"completions/mean_length": 137.23334045410155,
|
|
"completions/mean_terminated_length": 137.23334045410155,
|
|
"completions/min_length": 126.8,
|
|
"completions/min_terminated_length": 126.8,
|
|
"epoch": 1.8181818181818183,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.33752211928367615,
|
|
"kl": 0.22376729945341747,
|
|
"learning_rate": 1.6933333333333336e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1735547.0,
|
|
"reward": 0.5559616804122924,
|
|
"reward_std": 0.004689847212284803,
|
|
"rewards/reward_function/mean": 0.55596165060997,
|
|
"rewards/reward_function/std": 0.07087234668433666,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.6,
|
|
"completions/max_terminated_length": 144.6,
|
|
"completions/mean_length": 134.8166717529297,
|
|
"completions/mean_terminated_length": 134.8166717529297,
|
|
"completions/min_length": 124.6,
|
|
"completions/min_terminated_length": 124.6,
|
|
"epoch": 1.893939393939394,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.383306622505188,
|
|
"kl": 0.21860195795694987,
|
|
"learning_rate": 1.6711111111111112e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1806636.0,
|
|
"reward": 0.619973337650299,
|
|
"reward_std": 0.007127840328030289,
|
|
"rewards/reward_function/mean": 0.6199733138084411,
|
|
"rewards/reward_function/std": 0.1044730719178915,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 149.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 149.0,
|
|
"completions/max_terminated_length": 149.0,
|
|
"completions/mean_length": 138.85000305175782,
|
|
"completions/mean_terminated_length": 138.85000305175782,
|
|
"completions/min_length": 129.8,
|
|
"completions/min_terminated_length": 129.8,
|
|
"epoch": 1.9696969696969697,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.3041795790195465,
|
|
"kl": 0.22460319399833678,
|
|
"learning_rate": 1.648888888888889e-05,
|
|
"loss": 0.0002,
|
|
"num_tokens": 1878715.0,
|
|
"reward": 0.600421690940857,
|
|
"reward_std": 0.003138695494271815,
|
|
"rewards/reward_function/mean": 0.6004216790199279,
|
|
"rewards/reward_function/std": 0.07153937965631485,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 149.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 149.4,
|
|
"completions/max_terminated_length": 149.4,
|
|
"completions/mean_length": 138.73333740234375,
|
|
"completions/mean_terminated_length": 138.73333740234375,
|
|
"completions/min_length": 129.6,
|
|
"completions/min_terminated_length": 129.6,
|
|
"epoch": 2.0454545454545454,
|
|
"frac_reward_zero_std": 0.06666666865348816,
|
|
"grad_norm": 0.4632037281990051,
|
|
"kl": 0.2490247219800949,
|
|
"learning_rate": 1.6266666666666668e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 1950283.0,
|
|
"reward": 0.6769016981124878,
|
|
"reward_std": 0.004545123921707273,
|
|
"rewards/reward_function/mean": 0.6769016504287719,
|
|
"rewards/reward_function/std": 0.10631415694952011,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 150.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 150.4,
|
|
"completions/max_terminated_length": 150.4,
|
|
"completions/mean_length": 141.00000610351563,
|
|
"completions/mean_terminated_length": 141.00000610351563,
|
|
"completions/min_length": 133.4,
|
|
"completions/min_terminated_length": 133.4,
|
|
"epoch": 2.121212121212121,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.3432351350784302,
|
|
"kl": 0.2777308980623881,
|
|
"learning_rate": 1.6044444444444444e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2022539.0,
|
|
"reward": 0.5752000212669373,
|
|
"reward_std": 0.003263407130725682,
|
|
"rewards/reward_function/mean": 0.5751999974250793,
|
|
"rewards/reward_function/std": 0.04959992915391922,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 159.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 159.6,
|
|
"completions/max_terminated_length": 159.6,
|
|
"completions/mean_length": 142.4166748046875,
|
|
"completions/mean_terminated_length": 142.4166748046875,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 2.196969696969697,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.2584984600543976,
|
|
"kl": 0.28019193609555565,
|
|
"learning_rate": 1.5822222222222224e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2094328.0,
|
|
"reward": 0.6064266800880432,
|
|
"reward_std": 0.005170531757175923,
|
|
"rewards/reward_function/mean": 0.6064266920089721,
|
|
"rewards/reward_function/std": 0.06116051897406578,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 151.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 151.8,
|
|
"completions/max_terminated_length": 151.8,
|
|
"completions/mean_length": 140.68333740234374,
|
|
"completions/mean_terminated_length": 140.68333740234374,
|
|
"completions/min_length": 130.4,
|
|
"completions/min_terminated_length": 130.4,
|
|
"epoch": 2.2727272727272725,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.5067008137702942,
|
|
"kl": 0.2740016082922618,
|
|
"learning_rate": 1.5600000000000003e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2166033.0,
|
|
"reward": 0.609345018863678,
|
|
"reward_std": 0.002939810324460268,
|
|
"rewards/reward_function/mean": 0.6093450069427491,
|
|
"rewards/reward_function/std": 0.07519036456942559,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 156.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 156.0,
|
|
"completions/max_terminated_length": 156.0,
|
|
"completions/mean_length": 141.85000305175782,
|
|
"completions/mean_terminated_length": 141.85000305175782,
|
|
"completions/min_length": 131.4,
|
|
"completions/min_terminated_length": 131.4,
|
|
"epoch": 2.3484848484848486,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.0041471216827631,
|
|
"kl": 0.30289856195449827,
|
|
"learning_rate": 1.537777777777778e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2238128.0,
|
|
"reward": 0.625088381767273,
|
|
"reward_std": 0.003446168079972267,
|
|
"rewards/reward_function/mean": 0.625088346004486,
|
|
"rewards/reward_function/std": 0.0934045672416687,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 154.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 154.0,
|
|
"completions/max_terminated_length": 154.0,
|
|
"completions/mean_length": 143.03333740234376,
|
|
"completions/mean_terminated_length": 143.03333740234376,
|
|
"completions/min_length": 132.6,
|
|
"completions/min_terminated_length": 132.6,
|
|
"epoch": 2.4242424242424243,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.4292708933353424,
|
|
"kl": 0.32506192127863565,
|
|
"learning_rate": 1.5155555555555557e-05,
|
|
"loss": 0.0003,
|
|
"num_tokens": 2310254.0,
|
|
"reward": 0.6129150271415711,
|
|
"reward_std": 0.001613644661847502,
|
|
"rewards/reward_function/mean": 0.6129150032997132,
|
|
"rewards/reward_function/std": 0.04124578349292278,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 150.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 150.6,
|
|
"completions/max_terminated_length": 150.6,
|
|
"completions/mean_length": 140.08333740234374,
|
|
"completions/mean_terminated_length": 140.08333740234374,
|
|
"completions/min_length": 130.2,
|
|
"completions/min_terminated_length": 130.2,
|
|
"epoch": 2.5,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.22632652521133423,
|
|
"kl": 0.3719263752301534,
|
|
"learning_rate": 1.4933333333333335e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2382007.0,
|
|
"reward": 0.6153733611106873,
|
|
"reward_std": 0.0011415929766371846,
|
|
"rewards/reward_function/mean": 0.6153733372688294,
|
|
"rewards/reward_function/std": 0.06577699668705464,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 153.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 153.6,
|
|
"completions/max_terminated_length": 153.6,
|
|
"completions/mean_length": 142.00000610351563,
|
|
"completions/mean_terminated_length": 142.00000610351563,
|
|
"completions/min_length": 132.2,
|
|
"completions/min_terminated_length": 132.2,
|
|
"epoch": 2.5757575757575757,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.24591365456581116,
|
|
"kl": 0.3840523103872935,
|
|
"learning_rate": 1.4711111111111111e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2453727.0,
|
|
"reward": 0.6065983414649964,
|
|
"reward_std": 0.0017149411884020082,
|
|
"rewards/reward_function/mean": 0.6065983414649964,
|
|
"rewards/reward_function/std": 0.07528561279177666,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 153.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 153.2,
|
|
"completions/max_terminated_length": 153.2,
|
|
"completions/mean_length": 138.4166717529297,
|
|
"completions/mean_terminated_length": 138.4166717529297,
|
|
"completions/min_length": 131.2,
|
|
"completions/min_terminated_length": 131.2,
|
|
"epoch": 2.6515151515151514,
|
|
"frac_reward_zero_std": 0.6666666746139527,
|
|
"grad_norm": 0.2127596139907837,
|
|
"kl": 0.39285261034965513,
|
|
"learning_rate": 1.448888888888889e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2525720.0,
|
|
"reward": 0.5957500219345093,
|
|
"reward_std": 0.000984994637838099,
|
|
"rewards/reward_function/mean": 0.5957500100135803,
|
|
"rewards/reward_function/std": 0.05670791454613209,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 150.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 150.4,
|
|
"completions/max_terminated_length": 150.4,
|
|
"completions/mean_length": 138.28334045410156,
|
|
"completions/mean_terminated_length": 138.28334045410156,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 2.7272727272727275,
|
|
"frac_reward_zero_std": 0.3333333432674408,
|
|
"grad_norm": 0.5221720933914185,
|
|
"kl": 0.42230349183082583,
|
|
"learning_rate": 1.4266666666666668e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2597141.0,
|
|
"reward": 0.6146900177001953,
|
|
"reward_std": 0.002093914127908647,
|
|
"rewards/reward_function/mean": 0.6146899938583374,
|
|
"rewards/reward_function/std": 0.09454492926597595,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.6,
|
|
"completions/max_terminated_length": 144.6,
|
|
"completions/mean_length": 136.4,
|
|
"completions/mean_terminated_length": 136.4,
|
|
"completions/min_length": 128.8,
|
|
"completions/min_terminated_length": 128.8,
|
|
"epoch": 2.8030303030303028,
|
|
"frac_reward_zero_std": 0.6666666746139527,
|
|
"grad_norm": 0.0012158072786405683,
|
|
"kl": 0.4219982922077179,
|
|
"learning_rate": 1.4044444444444445e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2668885.0,
|
|
"reward": 0.6260683536529541,
|
|
"reward_std": 0.001636023900937289,
|
|
"rewards/reward_function/mean": 0.6260683178901673,
|
|
"rewards/reward_function/std": 0.0843635703320615,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.0,
|
|
"completions/max_terminated_length": 145.0,
|
|
"completions/mean_length": 134.48334045410155,
|
|
"completions/mean_terminated_length": 134.48334045410155,
|
|
"completions/min_length": 128.8,
|
|
"completions/min_terminated_length": 128.8,
|
|
"epoch": 2.878787878787879,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.5890435576438904,
|
|
"kl": 0.4845229466756185,
|
|
"learning_rate": 1.3822222222222224e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 2739846.0,
|
|
"reward": 0.647468364238739,
|
|
"reward_std": 0.004450538125820458,
|
|
"rewards/reward_function/mean": 0.6474683403968811,
|
|
"rewards/reward_function/std": 0.0956076867878437,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.8,
|
|
"completions/max_terminated_length": 142.8,
|
|
"completions/mean_length": 136.56666870117186,
|
|
"completions/mean_terminated_length": 136.56666870117186,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 2.9545454545454546,
|
|
"frac_reward_zero_std": 0.5333333492279053,
|
|
"grad_norm": 0.5040601491928101,
|
|
"kl": 0.4712466796239217,
|
|
"learning_rate": 1.3600000000000002e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 2811368.0,
|
|
"reward": 0.5991516828536987,
|
|
"reward_std": 0.0033168070833198724,
|
|
"rewards/reward_function/mean": 0.5991516828536987,
|
|
"rewards/reward_function/std": 0.11686233524233103,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 149.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 149.6,
|
|
"completions/max_terminated_length": 149.6,
|
|
"completions/mean_length": 136.61666870117188,
|
|
"completions/mean_terminated_length": 136.61666870117188,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 3.0303030303030303,
|
|
"frac_reward_zero_std": 0.06666666865348816,
|
|
"grad_norm": 0.4343958795070648,
|
|
"kl": 4.539325646559397,
|
|
"learning_rate": 1.3377777777777778e-05,
|
|
"loss": 0.0046,
|
|
"num_tokens": 2882637.0,
|
|
"reward": 0.6381200432777405,
|
|
"reward_std": 0.002671318035572767,
|
|
"rewards/reward_function/mean": 0.6381200075149536,
|
|
"rewards/reward_function/std": 0.06200197748839855,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.0,
|
|
"completions/max_terminated_length": 145.0,
|
|
"completions/mean_length": 136.4666717529297,
|
|
"completions/mean_terminated_length": 136.4666717529297,
|
|
"completions/min_length": 129.6,
|
|
"completions/min_terminated_length": 129.6,
|
|
"epoch": 3.106060606060606,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.001530068926513195,
|
|
"kl": 0.42953559557596843,
|
|
"learning_rate": 1.3155555555555558e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 2954481.0,
|
|
"reward": 0.6389833569526673,
|
|
"reward_std": 0.0028340428718365727,
|
|
"rewards/reward_function/mean": 0.6389833331108093,
|
|
"rewards/reward_function/std": 0.07430336326360702,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 152.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 152.8,
|
|
"completions/max_terminated_length": 152.8,
|
|
"completions/mean_length": 137.0500030517578,
|
|
"completions/mean_terminated_length": 137.0500030517578,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 3.1818181818181817,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.2242085039615631,
|
|
"kl": 0.4064524511496226,
|
|
"learning_rate": 1.2933333333333334e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3025984.0,
|
|
"reward": 0.592305040359497,
|
|
"reward_std": 0.023412239202298225,
|
|
"rewards/reward_function/mean": 0.5923050284385681,
|
|
"rewards/reward_function/std": 0.06483886577188969,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.8,
|
|
"completions/max_terminated_length": 142.8,
|
|
"completions/mean_length": 136.6666687011719,
|
|
"completions/mean_terminated_length": 136.6666687011719,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 3.257575757575758,
|
|
"frac_reward_zero_std": 0.800000011920929,
|
|
"grad_norm": 0.5229790806770325,
|
|
"kl": 0.4055224259694417,
|
|
"learning_rate": 1.2711111111111112e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3097968.0,
|
|
"reward": 0.5731833577156067,
|
|
"reward_std": 0.01963214036077261,
|
|
"rewards/reward_function/mean": 0.5731833457946778,
|
|
"rewards/reward_function/std": 0.07965116798877717,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.8,
|
|
"completions/max_terminated_length": 143.8,
|
|
"completions/mean_length": 135.50000610351563,
|
|
"completions/mean_terminated_length": 135.50000610351563,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 3.3333333333333335,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.39335760474205017,
|
|
"kl": 0.43457045356432594,
|
|
"learning_rate": 1.2488888888888891e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3169202.0,
|
|
"reward": 0.6536967039108277,
|
|
"reward_std": 0.001967143564252183,
|
|
"rewards/reward_function/mean": 0.6536966800689697,
|
|
"rewards/reward_function/std": 0.0629568338394165,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.4,
|
|
"completions/max_terminated_length": 146.4,
|
|
"completions/mean_length": 137.25000610351563,
|
|
"completions/mean_terminated_length": 137.25000610351563,
|
|
"completions/min_length": 129.2,
|
|
"completions/min_terminated_length": 129.2,
|
|
"epoch": 3.409090909090909,
|
|
"frac_reward_zero_std": 0.5333333432674408,
|
|
"grad_norm": 0.4324550926685333,
|
|
"kl": 0.4297958453496297,
|
|
"learning_rate": 1.2266666666666667e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3240145.0,
|
|
"reward": 0.6059366822242737,
|
|
"reward_std": 0.0020971522550098597,
|
|
"rewards/reward_function/mean": 0.6059366822242737,
|
|
"rewards/reward_function/std": 0.12276976853609085,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.4,
|
|
"completions/max_terminated_length": 144.4,
|
|
"completions/mean_length": 136.2500030517578,
|
|
"completions/mean_terminated_length": 136.2500030517578,
|
|
"completions/min_length": 130.6,
|
|
"completions/min_terminated_length": 130.6,
|
|
"epoch": 3.484848484848485,
|
|
"frac_reward_zero_std": 0.5333333432674408,
|
|
"grad_norm": 0.29709550738334656,
|
|
"kl": 0.4481562276681264,
|
|
"learning_rate": 1.2044444444444445e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3311672.0,
|
|
"reward": 0.6084200322628022,
|
|
"reward_std": 0.002014898555353284,
|
|
"rewards/reward_function/mean": 0.6084200084209442,
|
|
"rewards/reward_function/std": 0.08303090147674083,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 141.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 141.8,
|
|
"completions/max_terminated_length": 141.8,
|
|
"completions/mean_length": 136.60000305175782,
|
|
"completions/mean_terminated_length": 136.60000305175782,
|
|
"completions/min_length": 130.8,
|
|
"completions/min_terminated_length": 130.8,
|
|
"epoch": 3.5606060606060606,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.4094958007335663,
|
|
"kl": 0.4330082913239797,
|
|
"learning_rate": 1.1822222222222225e-05,
|
|
"loss": 0.0004,
|
|
"num_tokens": 3383508.0,
|
|
"reward": 0.6278083682060241,
|
|
"reward_std": 0.0020803724364668597,
|
|
"rewards/reward_function/mean": 0.6278083443641662,
|
|
"rewards/reward_function/std": 0.05985546782612801,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.0,
|
|
"completions/max_terminated_length": 143.0,
|
|
"completions/mean_length": 136.03334045410156,
|
|
"completions/mean_terminated_length": 136.03334045410156,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 3.6363636363636362,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.38449370861053467,
|
|
"kl": 0.47435951630274453,
|
|
"learning_rate": 1.16e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3454898.0,
|
|
"reward": 0.5935583353042603,
|
|
"reward_std": 0.0033034421736374497,
|
|
"rewards/reward_function/mean": 0.5935583353042603,
|
|
"rewards/reward_function/std": 0.1149674504995346,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.0,
|
|
"completions/max_terminated_length": 143.0,
|
|
"completions/mean_length": 136.90000610351564,
|
|
"completions/mean_terminated_length": 136.90000610351564,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 3.712121212121212,
|
|
"frac_reward_zero_std": 0.7333333432674408,
|
|
"grad_norm": 0.20474444329738617,
|
|
"kl": 0.4580156147480011,
|
|
"learning_rate": 1.1377777777777779e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3527168.0,
|
|
"reward": 0.5878700256347656,
|
|
"reward_std": 0.0007033476096694358,
|
|
"rewards/reward_function/mean": 0.5878700017929077,
|
|
"rewards/reward_function/std": 0.03646513521671295,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.2,
|
|
"completions/max_terminated_length": 146.2,
|
|
"completions/mean_length": 136.83333740234374,
|
|
"completions/mean_terminated_length": 136.83333740234374,
|
|
"completions/min_length": 131.6,
|
|
"completions/min_terminated_length": 131.6,
|
|
"epoch": 3.787878787878788,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.3273075222969055,
|
|
"kl": 0.4593264718850454,
|
|
"learning_rate": 1.1155555555555556e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3598574.0,
|
|
"reward": 0.6440800189971924,
|
|
"reward_std": 0.002970032987650484,
|
|
"rewards/reward_function/mean": 0.6440800070762634,
|
|
"rewards/reward_function/std": 0.05392170324921608,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.0,
|
|
"completions/max_terminated_length": 146.0,
|
|
"completions/mean_length": 137.45000610351562,
|
|
"completions/mean_terminated_length": 137.45000610351562,
|
|
"completions/min_length": 131.6,
|
|
"completions/min_terminated_length": 131.6,
|
|
"epoch": 3.8636363636363638,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.2910541892051697,
|
|
"kl": 0.4974993328253428,
|
|
"learning_rate": 1.0933333333333334e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3670177.0,
|
|
"reward": 0.6269100069999695,
|
|
"reward_std": 0.0007708149147219956,
|
|
"rewards/reward_function/mean": 0.6269099950790405,
|
|
"rewards/reward_function/std": 0.06847313707694411,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.2,
|
|
"completions/max_terminated_length": 144.2,
|
|
"completions/mean_length": 136.33334045410157,
|
|
"completions/mean_terminated_length": 136.33334045410157,
|
|
"completions/min_length": 130.8,
|
|
"completions/min_terminated_length": 130.8,
|
|
"epoch": 3.9393939393939394,
|
|
"frac_reward_zero_std": 0.13333333730697633,
|
|
"grad_norm": 0.29939982295036316,
|
|
"kl": 0.5057354072729746,
|
|
"learning_rate": 1.0711111111111112e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3741225.0,
|
|
"reward": 0.6429433226585388,
|
|
"reward_std": 0.0031549638602882623,
|
|
"rewards/reward_function/mean": 0.6429433345794677,
|
|
"rewards/reward_function/std": 0.07378025688230991,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 147.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.4,
|
|
"completions/max_terminated_length": 147.4,
|
|
"completions/mean_length": 138.36667175292968,
|
|
"completions/mean_terminated_length": 138.36667175292968,
|
|
"completions/min_length": 130.8,
|
|
"completions/min_terminated_length": 130.8,
|
|
"epoch": 4.015151515151516,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.0023478628136217594,
|
|
"kl": 0.48426355322202047,
|
|
"learning_rate": 1.048888888888889e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3813143.0,
|
|
"reward": 0.6231333494186402,
|
|
"reward_std": 0.0028609800268895925,
|
|
"rewards/reward_function/mean": 0.6231333374977112,
|
|
"rewards/reward_function/std": 0.0803637184202671,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 147.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.8,
|
|
"completions/max_terminated_length": 147.8,
|
|
"completions/mean_length": 139.08333435058594,
|
|
"completions/mean_terminated_length": 139.08333435058594,
|
|
"completions/min_length": 132.6,
|
|
"completions/min_terminated_length": 132.6,
|
|
"epoch": 4.090909090909091,
|
|
"frac_reward_zero_std": 0.26666667461395266,
|
|
"grad_norm": 0.299233615398407,
|
|
"kl": 0.5079402208328248,
|
|
"learning_rate": 1.0266666666666668e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3884252.0,
|
|
"reward": 0.6081600427627564,
|
|
"reward_std": 0.0036574415396898987,
|
|
"rewards/reward_function/mean": 0.6081599950790405,
|
|
"rewards/reward_function/std": 0.08519913330674171,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.8,
|
|
"completions/max_terminated_length": 145.8,
|
|
"completions/mean_length": 137.45000305175782,
|
|
"completions/mean_terminated_length": 137.45000305175782,
|
|
"completions/min_length": 131.6,
|
|
"completions/min_terminated_length": 131.6,
|
|
"epoch": 4.166666666666667,
|
|
"frac_reward_zero_std": 0.3333333432674408,
|
|
"grad_norm": 0.25058528780937195,
|
|
"kl": 0.4812973976135254,
|
|
"learning_rate": 1.0044444444444446e-05,
|
|
"loss": 0.0005,
|
|
"num_tokens": 3956003.0,
|
|
"reward": 0.5873700261116028,
|
|
"reward_std": 0.0026831103255972265,
|
|
"rewards/reward_function/mean": 0.5873700022697449,
|
|
"rewards/reward_function/std": 0.0719571478664875,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.2,
|
|
"completions/max_terminated_length": 144.2,
|
|
"completions/mean_length": 136.8666748046875,
|
|
"completions/mean_terminated_length": 136.8666748046875,
|
|
"completions/min_length": 130.2,
|
|
"completions/min_terminated_length": 130.2,
|
|
"epoch": 4.242424242424242,
|
|
"frac_reward_zero_std": 0.6666666865348816,
|
|
"grad_norm": 0.22324170172214508,
|
|
"kl": 0.4731239755948385,
|
|
"learning_rate": 9.822222222222223e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4027967.0,
|
|
"reward": 0.5852800250053406,
|
|
"reward_std": 0.0007758166582789272,
|
|
"rewards/reward_function/mean": 0.5852800071239471,
|
|
"rewards/reward_function/std": 0.07820635661482811,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 168.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 168.4,
|
|
"completions/max_terminated_length": 168.4,
|
|
"completions/mean_length": 138.6166748046875,
|
|
"completions/mean_terminated_length": 138.6166748046875,
|
|
"completions/min_length": 128.8,
|
|
"completions/min_terminated_length": 128.8,
|
|
"epoch": 4.318181818181818,
|
|
"frac_reward_zero_std": 0.20000000596046447,
|
|
"grad_norm": 0.3128497898578644,
|
|
"kl": 0.5036711434523264,
|
|
"learning_rate": 9.600000000000001e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4099096.0,
|
|
"reward": 0.6271817207336425,
|
|
"reward_std": 0.0018659118562936784,
|
|
"rewards/reward_function/mean": 0.6271816611289978,
|
|
"rewards/reward_function/std": 0.11487905830144882,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.6,
|
|
"completions/max_terminated_length": 140.6,
|
|
"completions/mean_length": 134.8000030517578,
|
|
"completions/mean_terminated_length": 134.8000030517578,
|
|
"completions/min_length": 128.2,
|
|
"completions/min_terminated_length": 128.2,
|
|
"epoch": 4.393939393939394,
|
|
"frac_reward_zero_std": 0.7333333492279053,
|
|
"grad_norm": 0.22989916801452637,
|
|
"kl": 0.4845824003219604,
|
|
"learning_rate": 9.377777777777779e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4171172.0,
|
|
"reward": 0.6156550168991088,
|
|
"reward_std": 0.0006054541794583201,
|
|
"rewards/reward_function/mean": 0.615654981136322,
|
|
"rewards/reward_function/std": 0.06183821316808462,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 139.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 139.2,
|
|
"completions/max_terminated_length": 139.2,
|
|
"completions/mean_length": 135.33333740234374,
|
|
"completions/mean_terminated_length": 135.33333740234374,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 4.46969696969697,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.5052292943000793,
|
|
"kl": 0.4509253283341726,
|
|
"learning_rate": 9.155555555555557e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4242420.0,
|
|
"reward": 0.6587733507156373,
|
|
"reward_std": 0.0039598645540536385,
|
|
"rewards/reward_function/mean": 0.6587733268737793,
|
|
"rewards/reward_function/std": 0.10939907655119896,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.6,
|
|
"completions/max_terminated_length": 140.6,
|
|
"completions/mean_length": 135.4666717529297,
|
|
"completions/mean_terminated_length": 135.4666717529297,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 4.545454545454545,
|
|
"frac_reward_zero_std": 0.7333333492279053,
|
|
"grad_norm": 0.34855085611343384,
|
|
"kl": 0.6146595040957133,
|
|
"learning_rate": 8.933333333333333e-06,
|
|
"loss": 0.0006,
|
|
"num_tokens": 4314044.0,
|
|
"reward": 0.5829833626747132,
|
|
"reward_std": 0.02590584859426599,
|
|
"rewards/reward_function/mean": 0.5829833388328552,
|
|
"rewards/reward_function/std": 0.090936179459095,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.6,
|
|
"completions/max_terminated_length": 142.6,
|
|
"completions/mean_length": 136.7166748046875,
|
|
"completions/mean_terminated_length": 136.7166748046875,
|
|
"completions/min_length": 130.6,
|
|
"completions/min_terminated_length": 130.6,
|
|
"epoch": 4.621212121212121,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.0010158641962334514,
|
|
"kl": 0.4747675855954488,
|
|
"learning_rate": 8.711111111111111e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4385951.0,
|
|
"reward": 0.5888633489608764,
|
|
"reward_std": 0.000871180125977844,
|
|
"rewards/reward_function/mean": 0.5888633370399475,
|
|
"rewards/reward_function/std": 0.059521042928099635,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.2,
|
|
"completions/max_terminated_length": 143.2,
|
|
"completions/mean_length": 135.63333740234376,
|
|
"completions/mean_terminated_length": 135.63333740234376,
|
|
"completions/min_length": 129.8,
|
|
"completions/min_terminated_length": 129.8,
|
|
"epoch": 4.696969696969697,
|
|
"frac_reward_zero_std": 0.6666666746139527,
|
|
"grad_norm": 0.3419860005378723,
|
|
"kl": 0.4693106154600779,
|
|
"learning_rate": 8.48888888888889e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4457693.0,
|
|
"reward": 0.6256850123405456,
|
|
"reward_std": 0.00048030615434981885,
|
|
"rewards/reward_function/mean": 0.6256850123405456,
|
|
"rewards/reward_function/std": 0.07439599148929119,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.0,
|
|
"completions/max_terminated_length": 145.0,
|
|
"completions/mean_length": 135.26666870117188,
|
|
"completions/mean_terminated_length": 135.26666870117188,
|
|
"completions/min_length": 125.6,
|
|
"completions/min_terminated_length": 125.6,
|
|
"epoch": 4.7727272727272725,
|
|
"frac_reward_zero_std": 0.7333333432674408,
|
|
"grad_norm": 0.0016331294318661094,
|
|
"kl": 0.4828857759634654,
|
|
"learning_rate": 8.266666666666667e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4529181.0,
|
|
"reward": 0.6507667064666748,
|
|
"reward_std": 0.0004982158861821517,
|
|
"rewards/reward_function/mean": 0.6507666826248169,
|
|
"rewards/reward_function/std": 0.06604736782610417,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.4,
|
|
"completions/max_terminated_length": 144.4,
|
|
"completions/mean_length": 134.51666870117188,
|
|
"completions/mean_terminated_length": 134.51666870117188,
|
|
"completions/min_length": 126.6,
|
|
"completions/min_terminated_length": 126.6,
|
|
"epoch": 4.848484848484849,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.2499098777770996,
|
|
"kl": 0.494149374961853,
|
|
"learning_rate": 8.044444444444444e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4600312.0,
|
|
"reward": 0.6521166801452637,
|
|
"reward_std": 0.0006720453522575554,
|
|
"rewards/reward_function/mean": 0.6521166682243347,
|
|
"rewards/reward_function/std": 0.08324470967054368,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.6,
|
|
"completions/max_terminated_length": 145.6,
|
|
"completions/mean_length": 136.28333740234376,
|
|
"completions/mean_terminated_length": 136.28333740234376,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 4.924242424242424,
|
|
"frac_reward_zero_std": 0.5333333492279053,
|
|
"grad_norm": 0.23840579390525818,
|
|
"kl": 0.4895890514055888,
|
|
"learning_rate": 7.822222222222224e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4671817.0,
|
|
"reward": 0.578350019454956,
|
|
"reward_std": 0.0009398535461514257,
|
|
"rewards/reward_function/mean": 0.5783499956130982,
|
|
"rewards/reward_function/std": 0.08822323828935623,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.6,
|
|
"completions/max_terminated_length": 140.6,
|
|
"completions/mean_length": 134.86667175292968,
|
|
"completions/mean_terminated_length": 134.86667175292968,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 5.0,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.0008585389005020261,
|
|
"kl": 0.49821096857388814,
|
|
"learning_rate": 7.600000000000001e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4742753.0,
|
|
"reward": 0.6717733860015869,
|
|
"reward_std": 0.0035625058459118008,
|
|
"rewards/reward_function/mean": 0.6717733263969421,
|
|
"rewards/reward_function/std": 0.1110783264040947,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.4,
|
|
"completions/max_terminated_length": 142.4,
|
|
"completions/mean_length": 132.9666748046875,
|
|
"completions/mean_terminated_length": 132.9666748046875,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 5.075757575757576,
|
|
"frac_reward_zero_std": 0.5333333492279053,
|
|
"grad_norm": 0.23162811994552612,
|
|
"kl": 0.5019173324108124,
|
|
"learning_rate": 7.377777777777778e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4814159.0,
|
|
"reward": 0.6638583660125732,
|
|
"reward_std": 0.0006335714278975502,
|
|
"rewards/reward_function/mean": 0.6638583660125732,
|
|
"rewards/reward_function/std": 0.08800669051706791,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.2,
|
|
"completions/max_terminated_length": 140.2,
|
|
"completions/mean_length": 133.15000610351564,
|
|
"completions/mean_terminated_length": 133.15000610351564,
|
|
"completions/min_length": 124.8,
|
|
"completions/min_terminated_length": 124.8,
|
|
"epoch": 5.151515151515151,
|
|
"frac_reward_zero_std": 0.5333333432674408,
|
|
"grad_norm": 0.001869841362349689,
|
|
"kl": 0.502401218811671,
|
|
"learning_rate": 7.155555555555556e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4885236.0,
|
|
"reward": 0.6237933874130249,
|
|
"reward_std": 0.0005346706879208796,
|
|
"rewards/reward_function/mean": 0.6237933516502381,
|
|
"rewards/reward_function/std": 0.11264116615056992,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 141.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 141.4,
|
|
"completions/max_terminated_length": 141.4,
|
|
"completions/mean_length": 134.51666870117188,
|
|
"completions/mean_terminated_length": 134.51666870117188,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 5.2272727272727275,
|
|
"frac_reward_zero_std": 0.6666666746139527,
|
|
"grad_norm": 0.0013165439013391733,
|
|
"kl": 0.5192258059978485,
|
|
"learning_rate": 6.9333333333333344e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 4956535.0,
|
|
"reward": 0.627209997177124,
|
|
"reward_std": 0.00032304815831594167,
|
|
"rewards/reward_function/mean": 0.627209997177124,
|
|
"rewards/reward_function/std": 0.10075384378433228,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.4,
|
|
"completions/max_terminated_length": 140.4,
|
|
"completions/mean_length": 133.4666748046875,
|
|
"completions/mean_terminated_length": 133.4666748046875,
|
|
"completions/min_length": 126.8,
|
|
"completions/min_terminated_length": 126.8,
|
|
"epoch": 5.303030303030303,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.0013948202831670642,
|
|
"kl": 0.5109677731990814,
|
|
"learning_rate": 6.711111111111111e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5027323.0,
|
|
"reward": 0.6721283674240113,
|
|
"reward_std": 0.002642212545470102,
|
|
"rewards/reward_function/mean": 0.6721283793449402,
|
|
"rewards/reward_function/std": 0.1264987990260124,
|
|
"step": 350
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 170.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 170.4,
|
|
"completions/max_terminated_length": 170.4,
|
|
"completions/mean_length": 137.61667175292968,
|
|
"completions/mean_terminated_length": 137.61667175292968,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 5.378787878787879,
|
|
"frac_reward_zero_std": 0.8666666686534882,
|
|
"grad_norm": 0.001804351806640625,
|
|
"kl": 0.5009710172812144,
|
|
"learning_rate": 6.488888888888889e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5099384.0,
|
|
"reward": 0.5923767209053039,
|
|
"reward_std": 0.001375216245651245,
|
|
"rewards/reward_function/mean": 0.5923766851425171,
|
|
"rewards/reward_function/std": 0.04210694804787636,
|
|
"step": 355
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.2,
|
|
"completions/max_terminated_length": 143.2,
|
|
"completions/mean_length": 135.01667175292968,
|
|
"completions/mean_terminated_length": 135.01667175292968,
|
|
"completions/min_length": 127.2,
|
|
"completions/min_terminated_length": 127.2,
|
|
"epoch": 5.454545454545454,
|
|
"frac_reward_zero_std": 0.7333333492279053,
|
|
"grad_norm": 0.38603079319000244,
|
|
"kl": 0.4913855314254761,
|
|
"learning_rate": 6.266666666666668e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5170701.0,
|
|
"reward": 0.5950633525848389,
|
|
"reward_std": 0.004046973369258922,
|
|
"rewards/reward_function/mean": 0.5950633406639099,
|
|
"rewards/reward_function/std": 0.10473623871803284,
|
|
"step": 360
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.2,
|
|
"completions/max_terminated_length": 142.2,
|
|
"completions/mean_length": 134.9166717529297,
|
|
"completions/mean_terminated_length": 134.9166717529297,
|
|
"completions/min_length": 128.8,
|
|
"completions/min_terminated_length": 128.8,
|
|
"epoch": 5.53030303030303,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.0022605860140174627,
|
|
"kl": 0.4842663645744324,
|
|
"learning_rate": 6.044444444444445e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5242424.0,
|
|
"reward": 0.622570025920868,
|
|
"reward_std": 5.1546646864153446e-05,
|
|
"rewards/reward_function/mean": 0.6225700139999389,
|
|
"rewards/reward_function/std": 0.033699407684616746,
|
|
"step": 365
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.6,
|
|
"completions/max_terminated_length": 142.6,
|
|
"completions/mean_length": 135.10000610351562,
|
|
"completions/mean_terminated_length": 135.10000610351562,
|
|
"completions/min_length": 128.2,
|
|
"completions/min_terminated_length": 128.2,
|
|
"epoch": 5.606060606060606,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.28471168875694275,
|
|
"kl": 0.49788983861605324,
|
|
"learning_rate": 5.822222222222223e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5314082.0,
|
|
"reward": 0.5884933590888977,
|
|
"reward_std": 0.0018088188953697681,
|
|
"rewards/reward_function/mean": 0.5884933233261108,
|
|
"rewards/reward_function/std": 0.05092477286234498,
|
|
"step": 370
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 141.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 141.6,
|
|
"completions/max_terminated_length": 141.6,
|
|
"completions/mean_length": 134.83334045410157,
|
|
"completions/mean_terminated_length": 134.83334045410157,
|
|
"completions/min_length": 129.4,
|
|
"completions/min_terminated_length": 129.4,
|
|
"epoch": 5.681818181818182,
|
|
"frac_reward_zero_std": 0.800000011920929,
|
|
"grad_norm": 0.2497999668121338,
|
|
"kl": 0.5025030295054118,
|
|
"learning_rate": 5.600000000000001e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5385792.0,
|
|
"reward": 0.6160383701324463,
|
|
"reward_std": 0.000584107032045722,
|
|
"rewards/reward_function/mean": 0.6160383224487305,
|
|
"rewards/reward_function/std": 0.061161456257104875,
|
|
"step": 375
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 138.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 138.8,
|
|
"completions/max_terminated_length": 138.8,
|
|
"completions/mean_length": 133.60000305175782,
|
|
"completions/mean_terminated_length": 133.60000305175782,
|
|
"completions/min_length": 127.6,
|
|
"completions/min_terminated_length": 127.6,
|
|
"epoch": 5.757575757575758,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.5607179999351501,
|
|
"kl": 0.5027451872825622,
|
|
"learning_rate": 5.3777777777777784e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5457272.0,
|
|
"reward": 0.6048883557319641,
|
|
"reward_std": 0.001853011967614293,
|
|
"rewards/reward_function/mean": 0.6048883438110352,
|
|
"rewards/reward_function/std": 0.08679699413478374,
|
|
"step": 380
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.6,
|
|
"completions/max_terminated_length": 140.6,
|
|
"completions/mean_length": 134.86666870117188,
|
|
"completions/mean_terminated_length": 134.86666870117188,
|
|
"completions/min_length": 129.6,
|
|
"completions/min_terminated_length": 129.6,
|
|
"epoch": 5.833333333333333,
|
|
"frac_reward_zero_std": 0.33333333730697634,
|
|
"grad_norm": 0.0009953180560842156,
|
|
"kl": 0.5802711407343547,
|
|
"learning_rate": 5.155555555555556e-06,
|
|
"loss": 0.0006,
|
|
"num_tokens": 5528632.0,
|
|
"reward": 0.598478353023529,
|
|
"reward_std": 0.0020464868051931263,
|
|
"rewards/reward_function/mean": 0.598478353023529,
|
|
"rewards/reward_function/std": 0.06861904165707529,
|
|
"step": 385
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.6,
|
|
"completions/max_terminated_length": 142.6,
|
|
"completions/mean_length": 134.65000610351564,
|
|
"completions/mean_terminated_length": 134.65000610351564,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 5.909090909090909,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.2697948217391968,
|
|
"kl": 0.5500587046146392,
|
|
"learning_rate": 4.933333333333334e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5599959.0,
|
|
"reward": 0.6114783763885498,
|
|
"reward_std": 0.0005467232927912846,
|
|
"rewards/reward_function/mean": 0.6114783406257629,
|
|
"rewards/reward_function/std": 0.07442084513604641,
|
|
"step": 390
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.4,
|
|
"completions/max_terminated_length": 142.4,
|
|
"completions/mean_length": 135.63333740234376,
|
|
"completions/mean_terminated_length": 135.63333740234376,
|
|
"completions/min_length": 128.2,
|
|
"completions/min_terminated_length": 128.2,
|
|
"epoch": 5.984848484848484,
|
|
"frac_reward_zero_std": 0.800000011920929,
|
|
"grad_norm": 0.23243778944015503,
|
|
"kl": 0.47906127373377483,
|
|
"learning_rate": 4.711111111111111e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5671181.0,
|
|
"reward": 0.6451417088508606,
|
|
"reward_std": 0.0028338861418887975,
|
|
"rewards/reward_function/mean": 0.6451416611671448,
|
|
"rewards/reward_function/std": 0.10424772650003433,
|
|
"step": 395
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.8,
|
|
"completions/max_terminated_length": 140.8,
|
|
"completions/mean_length": 134.28334045410156,
|
|
"completions/mean_terminated_length": 134.28334045410156,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 6.0606060606060606,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.2668240964412689,
|
|
"kl": 0.49893170793851216,
|
|
"learning_rate": 4.488888888888889e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5742606.0,
|
|
"reward": 0.6422833681106568,
|
|
"reward_std": 0.001804861845448613,
|
|
"rewards/reward_function/mean": 0.6422833323478698,
|
|
"rewards/reward_function/std": 0.09786662720143795,
|
|
"step": 400
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 145.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 145.6,
|
|
"completions/max_terminated_length": 145.6,
|
|
"completions/mean_length": 136.43334045410157,
|
|
"completions/mean_terminated_length": 136.43334045410157,
|
|
"completions/min_length": 131.4,
|
|
"completions/min_terminated_length": 131.4,
|
|
"epoch": 6.136363636363637,
|
|
"frac_reward_zero_std": 0.8000000059604645,
|
|
"grad_norm": 0.3256749212741852,
|
|
"kl": 0.500635419289271,
|
|
"learning_rate": 4.266666666666668e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5814636.0,
|
|
"reward": 0.5908233761787415,
|
|
"reward_std": 0.00030224729562178254,
|
|
"rewards/reward_function/mean": 0.5908233284950256,
|
|
"rewards/reward_function/std": 0.06711971089243889,
|
|
"step": 405
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.6,
|
|
"completions/max_terminated_length": 142.6,
|
|
"completions/mean_length": 135.1666717529297,
|
|
"completions/mean_terminated_length": 135.1666717529297,
|
|
"completions/min_length": 129.2,
|
|
"completions/min_terminated_length": 129.2,
|
|
"epoch": 6.212121212121212,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.4264754354953766,
|
|
"kl": 0.5067307233810425,
|
|
"learning_rate": 4.044444444444445e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5886194.0,
|
|
"reward": 0.5957333445549011,
|
|
"reward_std": 0.002951131097506732,
|
|
"rewards/reward_function/mean": 0.5957333445549011,
|
|
"rewards/reward_function/std": 0.05839875200763345,
|
|
"step": 410
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.4,
|
|
"completions/max_terminated_length": 143.4,
|
|
"completions/mean_length": 135.5500030517578,
|
|
"completions/mean_terminated_length": 135.5500030517578,
|
|
"completions/min_length": 130.6,
|
|
"completions/min_terminated_length": 130.6,
|
|
"epoch": 6.287878787878788,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.7883802056312561,
|
|
"kl": 0.5081615746021271,
|
|
"learning_rate": 3.8222222222222224e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 5957895.0,
|
|
"reward": 0.579140043258667,
|
|
"reward_std": 0.0008852901773934718,
|
|
"rewards/reward_function/mean": 0.5791400074958801,
|
|
"rewards/reward_function/std": 0.06494581587612629,
|
|
"step": 415
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 144.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 144.8,
|
|
"completions/max_terminated_length": 144.8,
|
|
"completions/mean_length": 134.73333435058595,
|
|
"completions/mean_terminated_length": 134.73333435058595,
|
|
"completions/min_length": 128.2,
|
|
"completions/min_terminated_length": 128.2,
|
|
"epoch": 6.363636363636363,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.20063082873821259,
|
|
"kl": 0.49638415773709615,
|
|
"learning_rate": 3.6000000000000003e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6029167.0,
|
|
"reward": 0.6484000086784363,
|
|
"reward_std": 0.0009333359310403466,
|
|
"rewards/reward_function/mean": 0.6483999967575074,
|
|
"rewards/reward_function/std": 0.06488962545990944,
|
|
"step": 420
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.4,
|
|
"completions/max_terminated_length": 143.4,
|
|
"completions/mean_length": 135.68333740234374,
|
|
"completions/mean_terminated_length": 135.68333740234374,
|
|
"completions/min_length": 128.8,
|
|
"completions/min_terminated_length": 128.8,
|
|
"epoch": 6.4393939393939394,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.00192440883256495,
|
|
"kl": 0.506149830420812,
|
|
"learning_rate": 3.377777777777778e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6100760.0,
|
|
"reward": 0.656581723690033,
|
|
"reward_std": 0.00018239482160424813,
|
|
"rewards/reward_function/mean": 0.6565816640853882,
|
|
"rewards/reward_function/std": 0.09751839749515057,
|
|
"step": 425
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.8,
|
|
"completions/max_terminated_length": 142.8,
|
|
"completions/mean_length": 136.7166717529297,
|
|
"completions/mean_terminated_length": 136.7166717529297,
|
|
"completions/min_length": 131.4,
|
|
"completions/min_terminated_length": 131.4,
|
|
"epoch": 6.515151515151516,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.003226165659725666,
|
|
"kl": 0.481193604071935,
|
|
"learning_rate": 3.1555555555555555e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6172311.0,
|
|
"reward": 0.6063950300216675,
|
|
"reward_std": 0.0008966684341430664,
|
|
"rewards/reward_function/mean": 0.6063949942588807,
|
|
"rewards/reward_function/std": 0.07530169561505318,
|
|
"step": 430
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 147.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.4,
|
|
"completions/max_terminated_length": 147.4,
|
|
"completions/mean_length": 137.03333740234376,
|
|
"completions/mean_terminated_length": 137.03333740234376,
|
|
"completions/min_length": 127.8,
|
|
"completions/min_terminated_length": 127.8,
|
|
"epoch": 6.590909090909091,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.20924994349479675,
|
|
"kl": 0.5101174155871073,
|
|
"learning_rate": 2.9333333333333338e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6243641.0,
|
|
"reward": 0.6130650043487549,
|
|
"reward_std": 0.022456027381122113,
|
|
"rewards/reward_function/mean": 0.6130649983882904,
|
|
"rewards/reward_function/std": 0.09285207167267799,
|
|
"step": 435
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.2,
|
|
"completions/max_terminated_length": 143.2,
|
|
"completions/mean_length": 136.28333740234376,
|
|
"completions/mean_terminated_length": 136.28333740234376,
|
|
"completions/min_length": 129.4,
|
|
"completions/min_terminated_length": 129.4,
|
|
"epoch": 6.666666666666667,
|
|
"frac_reward_zero_std": 0.40000001192092893,
|
|
"grad_norm": 0.47669851779937744,
|
|
"kl": 0.49107330640157065,
|
|
"learning_rate": 2.7111111111111116e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6314638.0,
|
|
"reward": 0.6283350110054016,
|
|
"reward_std": 0.002342601466079941,
|
|
"rewards/reward_function/mean": 0.6283349990844727,
|
|
"rewards/reward_function/std": 0.11100482866168022,
|
|
"step": 440
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.4,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.4,
|
|
"completions/max_terminated_length": 143.4,
|
|
"completions/mean_length": 135.6166748046875,
|
|
"completions/mean_terminated_length": 135.6166748046875,
|
|
"completions/min_length": 130.6,
|
|
"completions/min_terminated_length": 130.6,
|
|
"epoch": 6.742424242424242,
|
|
"frac_reward_zero_std": 0.6000000178813935,
|
|
"grad_norm": 0.35882994532585144,
|
|
"kl": 0.48326287865638734,
|
|
"learning_rate": 2.488888888888889e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6386483.0,
|
|
"reward": 0.5984933614730835,
|
|
"reward_std": 0.00557331838645041,
|
|
"rewards/reward_function/mean": 0.5984933495521545,
|
|
"rewards/reward_function/std": 0.0850373286753893,
|
|
"step": 445
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 141.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 141.8,
|
|
"completions/max_terminated_length": 141.8,
|
|
"completions/mean_length": 135.9166748046875,
|
|
"completions/mean_terminated_length": 135.9166748046875,
|
|
"completions/min_length": 130.4,
|
|
"completions/min_terminated_length": 130.4,
|
|
"epoch": 6.818181818181818,
|
|
"frac_reward_zero_std": 0.7333333432674408,
|
|
"grad_norm": 0.0010997391073033214,
|
|
"kl": 0.5031669855117797,
|
|
"learning_rate": 2.266666666666667e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6458146.0,
|
|
"reward": 0.6327366828918457,
|
|
"reward_std": 0.0019782020128332077,
|
|
"rewards/reward_function/mean": 0.6327366590499878,
|
|
"rewards/reward_function/std": 0.04953039065003395,
|
|
"step": 450
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.2,
|
|
"completions/max_terminated_length": 146.2,
|
|
"completions/mean_length": 136.0166748046875,
|
|
"completions/mean_terminated_length": 136.0166748046875,
|
|
"completions/min_length": 127.2,
|
|
"completions/min_terminated_length": 127.2,
|
|
"epoch": 6.893939393939394,
|
|
"frac_reward_zero_std": 0.6666666746139527,
|
|
"grad_norm": 0.3914296329021454,
|
|
"kl": 0.5335827827453613,
|
|
"learning_rate": 2.0444444444444447e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6529779.0,
|
|
"reward": 0.596358346939087,
|
|
"reward_std": 0.020278603804763408,
|
|
"rewards/reward_function/mean": 0.5963583350181579,
|
|
"rewards/reward_function/std": 0.09048257786780596,
|
|
"step": 455
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.6,
|
|
"completions/max_terminated_length": 142.6,
|
|
"completions/mean_length": 135.78334045410156,
|
|
"completions/mean_terminated_length": 135.78334045410156,
|
|
"completions/min_length": 129.6,
|
|
"completions/min_terminated_length": 129.6,
|
|
"epoch": 6.96969696969697,
|
|
"frac_reward_zero_std": 0.5333333432674408,
|
|
"grad_norm": 0.0012995324796065688,
|
|
"kl": 0.48169001936912537,
|
|
"learning_rate": 1.8222222222222225e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6601226.0,
|
|
"reward": 0.6193716883659363,
|
|
"reward_std": 0.0011129752092529088,
|
|
"rewards/reward_function/mean": 0.6193716883659363,
|
|
"rewards/reward_function/std": 0.05945280194282532,
|
|
"step": 460
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 143.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 143.8,
|
|
"completions/max_terminated_length": 143.8,
|
|
"completions/mean_length": 134.60000610351562,
|
|
"completions/mean_terminated_length": 134.60000610351562,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 7.045454545454546,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.4167572855949402,
|
|
"kl": 0.48465609351793926,
|
|
"learning_rate": 1.6000000000000001e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6672610.0,
|
|
"reward": 0.6474766969680786,
|
|
"reward_std": 0.0035319807764608415,
|
|
"rewards/reward_function/mean": 0.6474766850471496,
|
|
"rewards/reward_function/std": 0.1136886328458786,
|
|
"step": 465
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.8,
|
|
"completions/max_terminated_length": 142.8,
|
|
"completions/mean_length": 136.10000610351562,
|
|
"completions/mean_terminated_length": 136.10000610351562,
|
|
"completions/min_length": 128.6,
|
|
"completions/min_terminated_length": 128.6,
|
|
"epoch": 7.121212121212121,
|
|
"frac_reward_zero_std": 0.4666666805744171,
|
|
"grad_norm": 0.25415971875190735,
|
|
"kl": 0.4857166389624278,
|
|
"learning_rate": 1.377777777777778e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6743756.0,
|
|
"reward": 0.6225350260734558,
|
|
"reward_std": 0.0013274424407427432,
|
|
"rewards/reward_function/mean": 0.6225350141525269,
|
|
"rewards/reward_function/std": 0.07253584116697312,
|
|
"step": 470
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.0,
|
|
"completions/max_terminated_length": 146.0,
|
|
"completions/mean_length": 135.9166748046875,
|
|
"completions/mean_terminated_length": 135.9166748046875,
|
|
"completions/min_length": 129.2,
|
|
"completions/min_terminated_length": 129.2,
|
|
"epoch": 7.196969696969697,
|
|
"frac_reward_zero_std": 0.4666666746139526,
|
|
"grad_norm": 0.0014292309060692787,
|
|
"kl": 0.4930973211924235,
|
|
"learning_rate": 1.1555555555555556e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6815079.0,
|
|
"reward": 0.5987317025661468,
|
|
"reward_std": 0.0013763949275016785,
|
|
"rewards/reward_function/mean": 0.59873166680336,
|
|
"rewards/reward_function/std": 0.11740763150155545,
|
|
"step": 475
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 142.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 142.8,
|
|
"completions/max_terminated_length": 142.8,
|
|
"completions/mean_length": 134.9166748046875,
|
|
"completions/mean_terminated_length": 134.9166748046875,
|
|
"completions/min_length": 127.6,
|
|
"completions/min_terminated_length": 127.6,
|
|
"epoch": 7.2727272727272725,
|
|
"frac_reward_zero_std": 0.6000000178813935,
|
|
"grad_norm": 0.26499706506729126,
|
|
"kl": 0.49674716194470725,
|
|
"learning_rate": 9.333333333333334e-07,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6886154.0,
|
|
"reward": 0.625700044631958,
|
|
"reward_std": 0.0038128262152895332,
|
|
"rewards/reward_function/mean": 0.6257000207901001,
|
|
"rewards/reward_function/std": 0.1047673098742962,
|
|
"step": 480
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.8,
|
|
"completions/max_terminated_length": 146.8,
|
|
"completions/mean_length": 136.3166717529297,
|
|
"completions/mean_terminated_length": 136.3166717529297,
|
|
"completions/min_length": 129.6,
|
|
"completions/min_terminated_length": 129.6,
|
|
"epoch": 7.348484848484849,
|
|
"frac_reward_zero_std": 0.600000011920929,
|
|
"grad_norm": 0.3776555359363556,
|
|
"kl": 0.5015994648138682,
|
|
"learning_rate": 7.111111111111112e-07,
|
|
"loss": 0.0005,
|
|
"num_tokens": 6958057.0,
|
|
"reward": 0.619901716709137,
|
|
"reward_std": 0.003150607304269215,
|
|
"rewards/reward_function/mean": 0.6199016690254211,
|
|
"rewards/reward_function/std": 0.07383745927363634,
|
|
"step": 485
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 146.6,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 146.6,
|
|
"completions/max_terminated_length": 146.6,
|
|
"completions/mean_length": 137.63334045410156,
|
|
"completions/mean_terminated_length": 137.63334045410156,
|
|
"completions/min_length": 131.2,
|
|
"completions/min_terminated_length": 131.2,
|
|
"epoch": 7.424242424242424,
|
|
"frac_reward_zero_std": 0.6666666805744171,
|
|
"grad_norm": 0.005205494351685047,
|
|
"kl": 0.5021458446979523,
|
|
"learning_rate": 4.88888888888889e-07,
|
|
"loss": 0.0005,
|
|
"num_tokens": 7029915.0,
|
|
"reward": 0.604265034198761,
|
|
"reward_std": 0.021144862449727953,
|
|
"rewards/reward_function/mean": 0.604265010356903,
|
|
"rewards/reward_function/std": 0.08077941089868546,
|
|
"step": 490
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 147.2,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 147.2,
|
|
"completions/max_terminated_length": 147.2,
|
|
"completions/mean_length": 137.2666748046875,
|
|
"completions/mean_terminated_length": 137.2666748046875,
|
|
"completions/min_length": 130.8,
|
|
"completions/min_terminated_length": 130.8,
|
|
"epoch": 7.5,
|
|
"frac_reward_zero_std": 0.8666666746139526,
|
|
"grad_norm": 0.26117172837257385,
|
|
"kl": 0.4589234252770742,
|
|
"learning_rate": 2.666666666666667e-07,
|
|
"loss": 0.0005,
|
|
"num_tokens": 7101851.0,
|
|
"reward": 0.5880266904830933,
|
|
"reward_std": 0.0015000006183981895,
|
|
"rewards/reward_function/mean": 0.5880266785621643,
|
|
"rewards/reward_function/std": 0.043198444740846755,
|
|
"step": 495
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completion_length": 140.8,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 140.8,
|
|
"completions/max_terminated_length": 140.8,
|
|
"completions/mean_length": 135.0000030517578,
|
|
"completions/mean_terminated_length": 135.0000030517578,
|
|
"completions/min_length": 127.2,
|
|
"completions/min_terminated_length": 127.2,
|
|
"epoch": 7.575757575757576,
|
|
"frac_reward_zero_std": 0.4000000059604645,
|
|
"grad_norm": 0.07097447663545609,
|
|
"kl": 0.4987126151720683,
|
|
"learning_rate": 4.444444444444445e-08,
|
|
"loss": 0.0005,
|
|
"num_tokens": 7172631.0,
|
|
"reward": 0.6335716843605042,
|
|
"reward_std": 0.003161613627889892,
|
|
"rewards/reward_function/mean": 0.6335716724395752,
|
|
"rewards/reward_function/std": 0.07902667857706547,
|
|
"step": 500
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 500,
|
|
"num_input_tokens_seen": 7172631,
|
|
"num_train_epochs": 8,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|