{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.818181818181818, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 177.8, "completions/clipped_ratio": 0.0, "completions/max_length": 177.8, "completions/max_terminated_length": 177.8, "completions/mean_length": 157.85000610351562, "completions/mean_terminated_length": 157.85000610351562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.07575757575757576, "frac_reward_zero_std": 0.4000000059604645, "grad_norm": 1.5840047597885132, "kl": 0.0010059793893522702, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "num_tokens": 73447.0, "reward": 0.5880883574485779, "reward_std": 0.020529226586222648, "rewards/reward_function/mean": 0.5880883395671844, "rewards/reward_function/std": 0.06562883183360099, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.2, "completions/clipped_ratio": 0.0, "completions/max_length": 176.2, "completions/max_terminated_length": 176.2, "completions/mean_length": 156.65000915527344, "completions/mean_terminated_length": 156.65000915527344, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.15151515151515152, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 1.6390373706817627, "kl": 0.0017284046276472508, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "num_tokens": 146334.0, "reward": 0.605418348312378, "reward_std": 0.02508251890540123, "rewards/reward_function/mean": 0.60541832447052, "rewards/reward_function/std": 0.06859094277024269, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.4, "completions/clipped_ratio": 0.0, "completions/max_length": 176.4, "completions/max_terminated_length": 176.4, "completions/mean_length": 157.0000030517578, "completions/mean_terminated_length": 157.0000030517578, "completions/min_length": 140.8, "completions/min_terminated_length": 140.8, "epoch": 0.22727272727272727, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.7626600861549377, "kl": 0.003397522373901059, "learning_rate": 5.600000000000001e-06, "loss": 0.0, "num_tokens": 219198.0, "reward": 0.5862850427627564, "reward_std": 0.036518129706382754, "rewards/reward_function/mean": 0.5862850069999694, "rewards/reward_function/std": 0.08488646671175956, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 183.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.98334045410155, "completions/mean_terminated_length": 156.98334045410155, "completions/min_length": 139.8, "completions/min_terminated_length": 139.8, "epoch": 0.30303030303030304, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.9624250531196594, "kl": 0.007015585945919156, "learning_rate": 7.600000000000001e-06, "loss": 0.0, "num_tokens": 291737.0, "reward": 0.6001700401306153, "reward_std": 0.025772593356668948, "rewards/reward_function/mean": 0.6001700043678284, "rewards/reward_function/std": 0.07909451425075531, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 172.2, "completions/clipped_ratio": 0.0, "completions/max_length": 172.2, "completions/max_terminated_length": 172.2, "completions/mean_length": 155.23333740234375, "completions/mean_terminated_length": 155.23333740234375, "completions/min_length": 138.2, "completions/min_terminated_length": 138.2, "epoch": 0.3787878787878788, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.5355867743492126, "kl": 0.009492208405087391, "learning_rate": 9.600000000000001e-06, "loss": 0.0, "num_tokens": 364535.0, "reward": 0.5766633510589599, "reward_std": 0.04085115455091, "rewards/reward_function/mean": 0.576663339138031, "rewards/reward_function/std": 0.10587597712874412, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 198.4, "completions/clipped_ratio": 0.0, "completions/max_length": 198.4, "completions/max_terminated_length": 198.4, "completions/mean_length": 153.6166748046875, "completions/mean_terminated_length": 153.6166748046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.45454545454545453, "frac_reward_zero_std": 0.13333333730697633, "grad_norm": 0.6549646854400635, "kl": 0.061492755884925525, "learning_rate": 1.16e-05, "loss": 0.0001, "num_tokens": 436992.0, "reward": 0.5978150248527527, "reward_std": 0.04262940138578415, "rewards/reward_function/mean": 0.5978150129318237, "rewards/reward_function/std": 0.09431936666369438, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.86667175292968, "completions/mean_terminated_length": 158.86667175292968, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5303030303030303, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.4725801348686218, "kl": 2.357983988771836, "learning_rate": 1.3600000000000002e-05, "loss": 0.0024, "num_tokens": 509788.0, "reward": 0.6049700140953064, "reward_std": 0.012831439916044473, "rewards/reward_function/mean": 0.6049699783325195, "rewards/reward_function/std": 0.08928216472268105, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 172.4, "completions/clipped_ratio": 0.0, "completions/max_length": 172.4, "completions/max_terminated_length": 172.4, "completions/mean_length": 154.5500030517578, "completions/mean_terminated_length": 154.5500030517578, "completions/min_length": 138.8, "completions/min_terminated_length": 138.8, "epoch": 0.6060606060606061, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.41579461097717285, "kl": 0.10282722649474939, "learning_rate": 1.5600000000000003e-05, "loss": 0.0001, "num_tokens": 582789.0, "reward": 0.5595033764839172, "reward_std": 0.014409982354845852, "rewards/reward_function/mean": 0.5595033466815948, "rewards/reward_function/std": 0.053104204079136255, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 202.6, "completions/clipped_ratio": 0.0, "completions/max_length": 202.6, "completions/max_terminated_length": 202.6, "completions/mean_length": 158.20000305175782, "completions/mean_terminated_length": 158.20000305175782, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6818181818181818, "frac_reward_zero_std": 0.33333333730697634, "grad_norm": 0.03286667913198471, "kl": 3726.0936788400013, "learning_rate": 1.76e-05, "loss": 3.7261, "num_tokens": 655565.0, "reward": 0.5829650402069092, "reward_std": 0.031194474175572397, "rewards/reward_function/mean": 0.5829649925231933, "rewards/reward_function/std": 0.09724260903894902, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 180.4, "completions/clipped_ratio": 0.0, "completions/max_length": 180.4, "completions/max_terminated_length": 180.4, "completions/mean_length": 157.5500030517578, "completions/mean_terminated_length": 157.5500030517578, "completions/min_length": 141.2, "completions/min_terminated_length": 141.2, "epoch": 0.7575757575757576, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.45231395959854126, "kl": 0.24270717451969784, "learning_rate": 1.9600000000000002e-05, "loss": 0.0002, "num_tokens": 727754.0, "reward": 0.6422233581542969, "reward_std": 0.015453202556818724, "rewards/reward_function/mean": 0.642223310470581, "rewards/reward_function/std": 0.08873879238963127, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 174.8, "completions/clipped_ratio": 0.0, "completions/max_length": 174.8, "completions/max_terminated_length": 174.8, "completions/mean_length": 153.8166717529297, "completions/mean_terminated_length": 153.8166717529297, "completions/min_length": 138.2, "completions/min_terminated_length": 138.2, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.20000000596046447, "grad_norm": 0.3569779694080353, "kl": 0.25451052089532217, "learning_rate": 1.9822222222222226e-05, "loss": 0.0003, "num_tokens": 800059.0, "reward": 0.6014716982841491, "reward_std": 0.015854166075587272, "rewards/reward_function/mean": 0.6014716625213623, "rewards/reward_function/std": 0.10040064603090286, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 173.2, "completions/clipped_ratio": 0.0, "completions/max_length": 173.2, "completions/max_terminated_length": 173.2, "completions/mean_length": 153.4666717529297, "completions/mean_terminated_length": 153.4666717529297, "completions/min_length": 135.6, "completions/min_terminated_length": 135.6, "epoch": 0.9090909090909091, "frac_reward_zero_std": 0.20000000596046447, "grad_norm": 0.4230985939502716, "kl": 0.10768474241097768, "learning_rate": 1.9600000000000002e-05, "loss": 0.0001, "num_tokens": 872639.0, "reward": 0.6015583634376526, "reward_std": 0.020151399821043015, "rewards/reward_function/mean": 0.6015583276748657, "rewards/reward_function/std": 0.1292761668562889, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 164.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 148.58334045410157, "completions/mean_terminated_length": 148.58334045410157, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.9848484848484849, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.25373604893684387, "kl": 0.1025156612197558, "learning_rate": 1.9377777777777778e-05, "loss": 0.0001, "num_tokens": 944522.0, "reward": 0.6054783701896668, "reward_std": 0.014289665129035711, "rewards/reward_function/mean": 0.6054783225059509, "rewards/reward_function/std": 0.1043807715177536, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 166.8, "completions/clipped_ratio": 0.0, "completions/max_length": 166.8, "completions/max_terminated_length": 166.8, "completions/mean_length": 148.18333740234374, "completions/mean_terminated_length": 148.18333740234374, "completions/min_length": 129.8, "completions/min_terminated_length": 129.8, "epoch": 1.0606060606060606, "frac_reward_zero_std": 0.20000000596046447, "grad_norm": 0.40739330649375916, "kl": 0.13809017241001129, "learning_rate": 1.9155555555555558e-05, "loss": 0.0001, "num_tokens": 1016593.0, "reward": 0.625361704826355, "reward_std": 0.020681749982759356, "rewards/reward_function/mean": 0.6253616333007812, "rewards/reward_function/std": 0.09655277617275715, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 161.6, "completions/clipped_ratio": 0.0, "completions/max_length": 161.6, "completions/max_terminated_length": 161.6, "completions/mean_length": 143.83333740234374, "completions/mean_terminated_length": 143.83333740234374, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 1.1363636363636362, "frac_reward_zero_std": 0.33333333730697634, "grad_norm": 0.012142821215093136, "kl": 0.1849093531568845, "learning_rate": 1.8933333333333334e-05, "loss": 0.0002, "num_tokens": 1087987.0, "reward": 0.6634533524513244, "reward_std": 0.01583307459950447, "rewards/reward_function/mean": 0.6634533286094666, "rewards/reward_function/std": 0.12549073845148087, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 167.4, "completions/clipped_ratio": 0.0, "completions/max_length": 167.4, "completions/max_terminated_length": 167.4, "completions/mean_length": 149.76667175292968, "completions/mean_terminated_length": 149.76667175292968, "completions/min_length": 135.4, "completions/min_terminated_length": 135.4, "epoch": 1.2121212121212122, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.2965545952320099, "kl": 0.14318528175354003, "learning_rate": 1.8711111111111113e-05, "loss": 0.0001, "num_tokens": 1160193.0, "reward": 0.5806800246238708, "reward_std": 0.04468099344521761, "rewards/reward_function/mean": 0.5806800127029419, "rewards/reward_function/std": 0.11235176101326942, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 168.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 148.08334045410157, "completions/mean_terminated_length": 148.08334045410157, "completions/min_length": 134.8, "completions/min_terminated_length": 134.8, "epoch": 1.2878787878787878, "frac_reward_zero_std": 0.13333333730697633, "grad_norm": 0.3983455300331116, "kl": 0.16668486495812734, "learning_rate": 1.848888888888889e-05, "loss": 0.0002, "num_tokens": 1232170.0, "reward": 0.5667450308799744, "reward_std": 0.02706171413883567, "rewards/reward_function/mean": 0.5667450308799744, "rewards/reward_function/std": 0.1369288980960846, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 159.6, "completions/clipped_ratio": 0.0, "completions/max_length": 159.6, "completions/max_terminated_length": 159.6, "completions/mean_length": 143.23333740234375, "completions/mean_terminated_length": 143.23333740234375, "completions/min_length": 127.2, "completions/min_terminated_length": 127.2, "epoch": 1.3636363636363638, "frac_reward_zero_std": 0.20000000596046447, "grad_norm": 0.3573758006095886, "kl": 0.16684276660283406, "learning_rate": 1.826666666666667e-05, "loss": 0.0002, "num_tokens": 1303880.0, "reward": 0.6362017035484314, "reward_std": 0.007156953122466803, "rewards/reward_function/mean": 0.6362016916275024, "rewards/reward_function/std": 0.07174314968287945, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 156.2, "completions/clipped_ratio": 0.0, "completions/max_length": 156.2, "completions/max_terminated_length": 156.2, "completions/mean_length": 142.61667175292968, "completions/mean_terminated_length": 142.61667175292968, "completions/min_length": 132.4, "completions/min_terminated_length": 132.4, "epoch": 1.4393939393939394, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.28054413199424744, "kl": 0.16249675651391346, "learning_rate": 1.8044444444444445e-05, "loss": 0.0002, "num_tokens": 1375937.0, "reward": 0.5929333567619324, "reward_std": 0.010427127918228507, "rewards/reward_function/mean": 0.5929333448410035, "rewards/reward_function/std": 0.0421123169362545, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 153.2, "completions/clipped_ratio": 0.0, "completions/max_length": 153.2, "completions/max_terminated_length": 153.2, "completions/mean_length": 141.6166748046875, "completions/mean_terminated_length": 141.6166748046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.5151515151515151, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.35829174518585205, "kl": 0.1883176525433858, "learning_rate": 1.782222222222222e-05, "loss": 0.0002, "num_tokens": 1448046.0, "reward": 0.5822266817092896, "reward_std": 0.0035873036831617355, "rewards/reward_function/mean": 0.5822266697883606, "rewards/reward_function/std": 0.045498589798808095, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.8, "completions/clipped_ratio": 0.0, "completions/max_length": 150.8, "completions/max_terminated_length": 150.8, "completions/mean_length": 138.93334045410157, "completions/mean_terminated_length": 138.93334045410157, "completions/min_length": 124.2, "completions/min_terminated_length": 124.2, "epoch": 1.5909090909090908, "frac_reward_zero_std": 0.6000000178813935, "grad_norm": 0.24431772530078888, "kl": 0.19910954435666403, "learning_rate": 1.76e-05, "loss": 0.0002, "num_tokens": 1520398.0, "reward": 0.6048667073249817, "reward_std": 0.0037458556122146546, "rewards/reward_function/mean": 0.6048666715621949, "rewards/reward_function/std": 0.05969845354557037, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.0500061035156, "completions/mean_terminated_length": 138.0500061035156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.3577372431755066, "kl": 0.20313620964686077, "learning_rate": 1.737777777777778e-05, "loss": 0.0002, "num_tokens": 1591865.0, "reward": 0.6223516702651978, "reward_std": 0.005239984532818198, "rewards/reward_function/mean": 0.6223516583442688, "rewards/reward_function/std": 0.08074029944837094, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 147.6, "completions/clipped_ratio": 0.0, "completions/max_length": 147.6, "completions/max_terminated_length": 147.6, "completions/mean_length": 135.33334045410157, "completions/mean_terminated_length": 135.33334045410157, "completions/min_length": 126.8, "completions/min_terminated_length": 126.8, "epoch": 1.7424242424242424, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.4057120680809021, "kl": 0.23306088149547577, "learning_rate": 1.7155555555555557e-05, "loss": 0.0002, "num_tokens": 1663321.0, "reward": 0.6518083691596985, "reward_std": 0.004030292294919491, "rewards/reward_function/mean": 0.6518083572387695, "rewards/reward_function/std": 0.07627851068973542, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 148.2, "completions/clipped_ratio": 0.0, "completions/max_length": 148.2, "completions/max_terminated_length": 148.2, "completions/mean_length": 137.23334045410155, "completions/mean_terminated_length": 137.23334045410155, "completions/min_length": 126.8, "completions/min_terminated_length": 126.8, "epoch": 1.8181818181818183, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.33752211928367615, "kl": 0.22376729945341747, "learning_rate": 1.6933333333333336e-05, "loss": 0.0002, "num_tokens": 1735547.0, "reward": 0.5559616804122924, "reward_std": 0.004689847212284803, "rewards/reward_function/mean": 0.55596165060997, "rewards/reward_function/std": 0.07087234668433666, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.6, "completions/clipped_ratio": 0.0, "completions/max_length": 144.6, "completions/max_terminated_length": 144.6, "completions/mean_length": 134.8166717529297, "completions/mean_terminated_length": 134.8166717529297, "completions/min_length": 124.6, "completions/min_terminated_length": 124.6, "epoch": 1.893939393939394, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.383306622505188, "kl": 0.21860195795694987, "learning_rate": 1.6711111111111112e-05, "loss": 0.0002, "num_tokens": 1806636.0, "reward": 0.619973337650299, "reward_std": 0.007127840328030289, "rewards/reward_function/mean": 0.6199733138084411, "rewards/reward_function/std": 0.1044730719178915, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 149.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 138.85000305175782, "completions/mean_terminated_length": 138.85000305175782, "completions/min_length": 129.8, "completions/min_terminated_length": 129.8, "epoch": 1.9696969696969697, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.3041795790195465, "kl": 0.22460319399833678, "learning_rate": 1.648888888888889e-05, "loss": 0.0002, "num_tokens": 1878715.0, "reward": 0.600421690940857, "reward_std": 0.003138695494271815, "rewards/reward_function/mean": 0.6004216790199279, "rewards/reward_function/std": 0.07153937965631485, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 149.4, "completions/clipped_ratio": 0.0, "completions/max_length": 149.4, "completions/max_terminated_length": 149.4, "completions/mean_length": 138.73333740234375, "completions/mean_terminated_length": 138.73333740234375, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "epoch": 2.0454545454545454, "frac_reward_zero_std": 0.06666666865348816, "grad_norm": 0.4632037281990051, "kl": 0.2490247219800949, "learning_rate": 1.6266666666666668e-05, "loss": 0.0003, "num_tokens": 1950283.0, "reward": 0.6769016981124878, "reward_std": 0.004545123921707273, "rewards/reward_function/mean": 0.6769016504287719, "rewards/reward_function/std": 0.10631415694952011, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.4, "completions/clipped_ratio": 0.0, "completions/max_length": 150.4, "completions/max_terminated_length": 150.4, "completions/mean_length": 141.00000610351563, "completions/mean_terminated_length": 141.00000610351563, "completions/min_length": 133.4, "completions/min_terminated_length": 133.4, "epoch": 2.121212121212121, "frac_reward_zero_std": 0.4666666805744171, "grad_norm": 0.3432351350784302, "kl": 0.2777308980623881, "learning_rate": 1.6044444444444444e-05, "loss": 0.0003, "num_tokens": 2022539.0, "reward": 0.5752000212669373, "reward_std": 0.003263407130725682, "rewards/reward_function/mean": 0.5751999974250793, "rewards/reward_function/std": 0.04959992915391922, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 159.6, "completions/clipped_ratio": 0.0, "completions/max_length": 159.6, "completions/max_terminated_length": 159.6, "completions/mean_length": 142.4166748046875, "completions/mean_terminated_length": 142.4166748046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 2.196969696969697, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.2584984600543976, "kl": 0.28019193609555565, "learning_rate": 1.5822222222222224e-05, "loss": 0.0003, "num_tokens": 2094328.0, "reward": 0.6064266800880432, "reward_std": 0.005170531757175923, "rewards/reward_function/mean": 0.6064266920089721, "rewards/reward_function/std": 0.06116051897406578, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 151.8, "completions/clipped_ratio": 0.0, "completions/max_length": 151.8, "completions/max_terminated_length": 151.8, "completions/mean_length": 140.68333740234374, "completions/mean_terminated_length": 140.68333740234374, "completions/min_length": 130.4, "completions/min_terminated_length": 130.4, "epoch": 2.2727272727272725, "frac_reward_zero_std": 0.4666666805744171, "grad_norm": 0.5067008137702942, "kl": 0.2740016082922618, "learning_rate": 1.5600000000000003e-05, "loss": 0.0003, "num_tokens": 2166033.0, "reward": 0.609345018863678, "reward_std": 0.002939810324460268, "rewards/reward_function/mean": 0.6093450069427491, "rewards/reward_function/std": 0.07519036456942559, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 156.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 141.85000305175782, "completions/mean_terminated_length": 141.85000305175782, "completions/min_length": 131.4, "completions/min_terminated_length": 131.4, "epoch": 2.3484848484848486, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.0041471216827631, "kl": 0.30289856195449827, "learning_rate": 1.537777777777778e-05, "loss": 0.0003, "num_tokens": 2238128.0, "reward": 0.625088381767273, "reward_std": 0.003446168079972267, "rewards/reward_function/mean": 0.625088346004486, "rewards/reward_function/std": 0.0934045672416687, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 154.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 143.03333740234376, "completions/mean_terminated_length": 143.03333740234376, "completions/min_length": 132.6, "completions/min_terminated_length": 132.6, "epoch": 2.4242424242424243, "frac_reward_zero_std": 0.4666666805744171, "grad_norm": 0.4292708933353424, "kl": 0.32506192127863565, "learning_rate": 1.5155555555555557e-05, "loss": 0.0003, "num_tokens": 2310254.0, "reward": 0.6129150271415711, "reward_std": 0.001613644661847502, "rewards/reward_function/mean": 0.6129150032997132, "rewards/reward_function/std": 0.04124578349292278, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.6, "completions/clipped_ratio": 0.0, "completions/max_length": 150.6, "completions/max_terminated_length": 150.6, "completions/mean_length": 140.08333740234374, "completions/mean_terminated_length": 140.08333740234374, "completions/min_length": 130.2, "completions/min_terminated_length": 130.2, "epoch": 2.5, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.22632652521133423, "kl": 0.3719263752301534, "learning_rate": 1.4933333333333335e-05, "loss": 0.0004, "num_tokens": 2382007.0, "reward": 0.6153733611106873, "reward_std": 0.0011415929766371846, "rewards/reward_function/mean": 0.6153733372688294, "rewards/reward_function/std": 0.06577699668705464, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 153.6, "completions/clipped_ratio": 0.0, "completions/max_length": 153.6, "completions/max_terminated_length": 153.6, "completions/mean_length": 142.00000610351563, "completions/mean_terminated_length": 142.00000610351563, "completions/min_length": 132.2, "completions/min_terminated_length": 132.2, "epoch": 2.5757575757575757, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.24591365456581116, "kl": 0.3840523103872935, "learning_rate": 1.4711111111111111e-05, "loss": 0.0004, "num_tokens": 2453727.0, "reward": 0.6065983414649964, "reward_std": 0.0017149411884020082, "rewards/reward_function/mean": 0.6065983414649964, "rewards/reward_function/std": 0.07528561279177666, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 153.2, "completions/clipped_ratio": 0.0, "completions/max_length": 153.2, "completions/max_terminated_length": 153.2, "completions/mean_length": 138.4166717529297, "completions/mean_terminated_length": 138.4166717529297, "completions/min_length": 131.2, "completions/min_terminated_length": 131.2, "epoch": 2.6515151515151514, "frac_reward_zero_std": 0.6666666746139527, "grad_norm": 0.2127596139907837, "kl": 0.39285261034965513, "learning_rate": 1.448888888888889e-05, "loss": 0.0004, "num_tokens": 2525720.0, "reward": 0.5957500219345093, "reward_std": 0.000984994637838099, "rewards/reward_function/mean": 0.5957500100135803, "rewards/reward_function/std": 0.05670791454613209, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 150.4, "completions/clipped_ratio": 0.0, "completions/max_length": 150.4, "completions/max_terminated_length": 150.4, "completions/mean_length": 138.28334045410156, "completions/mean_terminated_length": 138.28334045410156, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 2.7272727272727275, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.5221720933914185, "kl": 0.42230349183082583, "learning_rate": 1.4266666666666668e-05, "loss": 0.0004, "num_tokens": 2597141.0, "reward": 0.6146900177001953, "reward_std": 0.002093914127908647, "rewards/reward_function/mean": 0.6146899938583374, "rewards/reward_function/std": 0.09454492926597595, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.6, "completions/clipped_ratio": 0.0, "completions/max_length": 144.6, "completions/max_terminated_length": 144.6, "completions/mean_length": 136.4, "completions/mean_terminated_length": 136.4, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "epoch": 2.8030303030303028, "frac_reward_zero_std": 0.6666666746139527, "grad_norm": 0.0012158072786405683, "kl": 0.4219982922077179, "learning_rate": 1.4044444444444445e-05, "loss": 0.0004, "num_tokens": 2668885.0, "reward": 0.6260683536529541, "reward_std": 0.001636023900937289, "rewards/reward_function/mean": 0.6260683178901673, "rewards/reward_function/std": 0.0843635703320615, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 134.48334045410155, "completions/mean_terminated_length": 134.48334045410155, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "epoch": 2.878787878787879, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.5890435576438904, "kl": 0.4845229466756185, "learning_rate": 1.3822222222222224e-05, "loss": 0.0005, "num_tokens": 2739846.0, "reward": 0.647468364238739, "reward_std": 0.004450538125820458, "rewards/reward_function/mean": 0.6474683403968811, "rewards/reward_function/std": 0.0956076867878437, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.8, "completions/clipped_ratio": 0.0, "completions/max_length": 142.8, "completions/max_terminated_length": 142.8, "completions/mean_length": 136.56666870117186, "completions/mean_terminated_length": 136.56666870117186, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 2.9545454545454546, "frac_reward_zero_std": 0.5333333492279053, "grad_norm": 0.5040601491928101, "kl": 0.4712466796239217, "learning_rate": 1.3600000000000002e-05, "loss": 0.0005, "num_tokens": 2811368.0, "reward": 0.5991516828536987, "reward_std": 0.0033168070833198724, "rewards/reward_function/mean": 0.5991516828536987, "rewards/reward_function/std": 0.11686233524233103, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 149.6, "completions/clipped_ratio": 0.0, "completions/max_length": 149.6, "completions/max_terminated_length": 149.6, "completions/mean_length": 136.61666870117188, "completions/mean_terminated_length": 136.61666870117188, "completions/min_length": 128.6, "completions/min_terminated_length": 128.6, "epoch": 3.0303030303030303, "frac_reward_zero_std": 0.06666666865348816, "grad_norm": 0.4343958795070648, "kl": 4.539325646559397, "learning_rate": 1.3377777777777778e-05, "loss": 0.0046, "num_tokens": 2882637.0, "reward": 0.6381200432777405, "reward_std": 0.002671318035572767, "rewards/reward_function/mean": 0.6381200075149536, "rewards/reward_function/std": 0.06200197748839855, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 136.4666717529297, "completions/mean_terminated_length": 136.4666717529297, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "epoch": 3.106060606060606, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.001530068926513195, "kl": 0.42953559557596843, "learning_rate": 1.3155555555555558e-05, "loss": 0.0004, "num_tokens": 2954481.0, "reward": 0.6389833569526673, "reward_std": 0.0028340428718365727, "rewards/reward_function/mean": 0.6389833331108093, "rewards/reward_function/std": 0.07430336326360702, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 152.8, "completions/clipped_ratio": 0.0, "completions/max_length": 152.8, "completions/max_terminated_length": 152.8, "completions/mean_length": 137.0500030517578, "completions/mean_terminated_length": 137.0500030517578, "completions/min_length": 128.6, "completions/min_terminated_length": 128.6, "epoch": 3.1818181818181817, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.2242085039615631, "kl": 0.4064524511496226, "learning_rate": 1.2933333333333334e-05, "loss": 0.0004, "num_tokens": 3025984.0, "reward": 0.592305040359497, "reward_std": 0.023412239202298225, "rewards/reward_function/mean": 0.5923050284385681, "rewards/reward_function/std": 0.06483886577188969, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.8, "completions/clipped_ratio": 0.0, "completions/max_length": 142.8, "completions/max_terminated_length": 142.8, "completions/mean_length": 136.6666687011719, "completions/mean_terminated_length": 136.6666687011719, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 3.257575757575758, "frac_reward_zero_std": 0.800000011920929, "grad_norm": 0.5229790806770325, "kl": 0.4055224259694417, "learning_rate": 1.2711111111111112e-05, "loss": 0.0004, "num_tokens": 3097968.0, "reward": 0.5731833577156067, "reward_std": 0.01963214036077261, "rewards/reward_function/mean": 0.5731833457946778, "rewards/reward_function/std": 0.07965116798877717, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.8, "completions/clipped_ratio": 0.0, "completions/max_length": 143.8, "completions/max_terminated_length": 143.8, "completions/mean_length": 135.50000610351563, "completions/mean_terminated_length": 135.50000610351563, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 3.3333333333333335, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.39335760474205017, "kl": 0.43457045356432594, "learning_rate": 1.2488888888888891e-05, "loss": 0.0004, "num_tokens": 3169202.0, "reward": 0.6536967039108277, "reward_std": 0.001967143564252183, "rewards/reward_function/mean": 0.6536966800689697, "rewards/reward_function/std": 0.0629568338394165, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 146.4, "completions/clipped_ratio": 0.0, "completions/max_length": 146.4, "completions/max_terminated_length": 146.4, "completions/mean_length": 137.25000610351563, "completions/mean_terminated_length": 137.25000610351563, "completions/min_length": 129.2, "completions/min_terminated_length": 129.2, "epoch": 3.409090909090909, "frac_reward_zero_std": 0.5333333432674408, "grad_norm": 0.4324550926685333, "kl": 0.4297958453496297, "learning_rate": 1.2266666666666667e-05, "loss": 0.0004, "num_tokens": 3240145.0, "reward": 0.6059366822242737, "reward_std": 0.0020971522550098597, "rewards/reward_function/mean": 0.6059366822242737, "rewards/reward_function/std": 0.12276976853609085, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.4, "completions/clipped_ratio": 0.0, "completions/max_length": 144.4, "completions/max_terminated_length": 144.4, "completions/mean_length": 136.2500030517578, "completions/mean_terminated_length": 136.2500030517578, "completions/min_length": 130.6, "completions/min_terminated_length": 130.6, "epoch": 3.484848484848485, "frac_reward_zero_std": 0.5333333432674408, "grad_norm": 0.29709550738334656, "kl": 0.4481562276681264, "learning_rate": 1.2044444444444445e-05, "loss": 0.0005, "num_tokens": 3311672.0, "reward": 0.6084200322628022, "reward_std": 0.002014898555353284, "rewards/reward_function/mean": 0.6084200084209442, "rewards/reward_function/std": 0.08303090147674083, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 141.8, "completions/clipped_ratio": 0.0, "completions/max_length": 141.8, "completions/max_terminated_length": 141.8, "completions/mean_length": 136.60000305175782, "completions/mean_terminated_length": 136.60000305175782, "completions/min_length": 130.8, "completions/min_terminated_length": 130.8, "epoch": 3.5606060606060606, "frac_reward_zero_std": 0.4666666805744171, "grad_norm": 0.4094958007335663, "kl": 0.4330082913239797, "learning_rate": 1.1822222222222225e-05, "loss": 0.0004, "num_tokens": 3383508.0, "reward": 0.6278083682060241, "reward_std": 0.0020803724364668597, "rewards/reward_function/mean": 0.6278083443641662, "rewards/reward_function/std": 0.05985546782612801, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 136.03334045410156, "completions/mean_terminated_length": 136.03334045410156, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 3.6363636363636362, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.38449370861053467, "kl": 0.47435951630274453, "learning_rate": 1.16e-05, "loss": 0.0005, "num_tokens": 3454898.0, "reward": 0.5935583353042603, "reward_std": 0.0033034421736374497, "rewards/reward_function/mean": 0.5935583353042603, "rewards/reward_function/std": 0.1149674504995346, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 136.90000610351564, "completions/mean_terminated_length": 136.90000610351564, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 3.712121212121212, "frac_reward_zero_std": 0.7333333432674408, "grad_norm": 0.20474444329738617, "kl": 0.4580156147480011, "learning_rate": 1.1377777777777779e-05, "loss": 0.0005, "num_tokens": 3527168.0, "reward": 0.5878700256347656, "reward_std": 0.0007033476096694358, "rewards/reward_function/mean": 0.5878700017929077, "rewards/reward_function/std": 0.03646513521671295, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 146.2, "completions/clipped_ratio": 0.0, "completions/max_length": 146.2, "completions/max_terminated_length": 146.2, "completions/mean_length": 136.83333740234374, "completions/mean_terminated_length": 136.83333740234374, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "epoch": 3.787878787878788, "frac_reward_zero_std": 0.4666666805744171, "grad_norm": 0.3273075222969055, "kl": 0.4593264718850454, "learning_rate": 1.1155555555555556e-05, "loss": 0.0005, "num_tokens": 3598574.0, "reward": 0.6440800189971924, "reward_std": 0.002970032987650484, "rewards/reward_function/mean": 0.6440800070762634, "rewards/reward_function/std": 0.05392170324921608, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 146.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 137.45000610351562, "completions/mean_terminated_length": 137.45000610351562, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "epoch": 3.8636363636363638, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.2910541892051697, "kl": 0.4974993328253428, "learning_rate": 1.0933333333333334e-05, "loss": 0.0005, "num_tokens": 3670177.0, "reward": 0.6269100069999695, "reward_std": 0.0007708149147219956, "rewards/reward_function/mean": 0.6269099950790405, "rewards/reward_function/std": 0.06847313707694411, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.2, "completions/clipped_ratio": 0.0, "completions/max_length": 144.2, "completions/max_terminated_length": 144.2, "completions/mean_length": 136.33334045410157, "completions/mean_terminated_length": 136.33334045410157, "completions/min_length": 130.8, "completions/min_terminated_length": 130.8, "epoch": 3.9393939393939394, "frac_reward_zero_std": 0.13333333730697633, "grad_norm": 0.29939982295036316, "kl": 0.5057354072729746, "learning_rate": 1.0711111111111112e-05, "loss": 0.0005, "num_tokens": 3741225.0, "reward": 0.6429433226585388, "reward_std": 0.0031549638602882623, "rewards/reward_function/mean": 0.6429433345794677, "rewards/reward_function/std": 0.07378025688230991, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 147.4, "completions/clipped_ratio": 0.0, "completions/max_length": 147.4, "completions/max_terminated_length": 147.4, "completions/mean_length": 138.36667175292968, "completions/mean_terminated_length": 138.36667175292968, "completions/min_length": 130.8, "completions/min_terminated_length": 130.8, "epoch": 4.015151515151516, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.0023478628136217594, "kl": 0.48426355322202047, "learning_rate": 1.048888888888889e-05, "loss": 0.0005, "num_tokens": 3813143.0, "reward": 0.6231333494186402, "reward_std": 0.0028609800268895925, "rewards/reward_function/mean": 0.6231333374977112, "rewards/reward_function/std": 0.0803637184202671, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 147.8, "completions/clipped_ratio": 0.0, "completions/max_length": 147.8, "completions/max_terminated_length": 147.8, "completions/mean_length": 139.08333435058594, "completions/mean_terminated_length": 139.08333435058594, "completions/min_length": 132.6, "completions/min_terminated_length": 132.6, "epoch": 4.090909090909091, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.299233615398407, "kl": 0.5079402208328248, "learning_rate": 1.0266666666666668e-05, "loss": 0.0005, "num_tokens": 3884252.0, "reward": 0.6081600427627564, "reward_std": 0.0036574415396898987, "rewards/reward_function/mean": 0.6081599950790405, "rewards/reward_function/std": 0.08519913330674171, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.8, "completions/clipped_ratio": 0.0, "completions/max_length": 145.8, "completions/max_terminated_length": 145.8, "completions/mean_length": 137.45000305175782, "completions/mean_terminated_length": 137.45000305175782, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "epoch": 4.166666666666667, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.25058528780937195, "kl": 0.4812973976135254, "learning_rate": 1.0044444444444446e-05, "loss": 0.0005, "num_tokens": 3956003.0, "reward": 0.5873700261116028, "reward_std": 0.0026831103255972265, "rewards/reward_function/mean": 0.5873700022697449, "rewards/reward_function/std": 0.0719571478664875, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.2, "completions/clipped_ratio": 0.0, "completions/max_length": 144.2, "completions/max_terminated_length": 144.2, "completions/mean_length": 136.8666748046875, "completions/mean_terminated_length": 136.8666748046875, "completions/min_length": 130.2, "completions/min_terminated_length": 130.2, "epoch": 4.242424242424242, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.22324170172214508, "kl": 0.4731239755948385, "learning_rate": 9.822222222222223e-06, "loss": 0.0005, "num_tokens": 4027967.0, "reward": 0.5852800250053406, "reward_std": 0.0007758166582789272, "rewards/reward_function/mean": 0.5852800071239471, "rewards/reward_function/std": 0.07820635661482811, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 168.4, "completions/clipped_ratio": 0.0, "completions/max_length": 168.4, "completions/max_terminated_length": 168.4, "completions/mean_length": 138.6166748046875, "completions/mean_terminated_length": 138.6166748046875, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "epoch": 4.318181818181818, "frac_reward_zero_std": 0.20000000596046447, "grad_norm": 0.3128497898578644, "kl": 0.5036711434523264, "learning_rate": 9.600000000000001e-06, "loss": 0.0005, "num_tokens": 4099096.0, "reward": 0.6271817207336425, "reward_std": 0.0018659118562936784, "rewards/reward_function/mean": 0.6271816611289978, "rewards/reward_function/std": 0.11487905830144882, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.6, "completions/clipped_ratio": 0.0, "completions/max_length": 140.6, "completions/max_terminated_length": 140.6, "completions/mean_length": 134.8000030517578, "completions/mean_terminated_length": 134.8000030517578, "completions/min_length": 128.2, "completions/min_terminated_length": 128.2, "epoch": 4.393939393939394, "frac_reward_zero_std": 0.7333333492279053, "grad_norm": 0.22989916801452637, "kl": 0.4845824003219604, "learning_rate": 9.377777777777779e-06, "loss": 0.0005, "num_tokens": 4171172.0, "reward": 0.6156550168991088, "reward_std": 0.0006054541794583201, "rewards/reward_function/mean": 0.615654981136322, "rewards/reward_function/std": 0.06183821316808462, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 139.2, "completions/clipped_ratio": 0.0, "completions/max_length": 139.2, "completions/max_terminated_length": 139.2, "completions/mean_length": 135.33333740234374, "completions/mean_terminated_length": 135.33333740234374, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 4.46969696969697, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.5052292943000793, "kl": 0.4509253283341726, "learning_rate": 9.155555555555557e-06, "loss": 0.0005, "num_tokens": 4242420.0, "reward": 0.6587733507156373, "reward_std": 0.0039598645540536385, "rewards/reward_function/mean": 0.6587733268737793, "rewards/reward_function/std": 0.10939907655119896, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.6, "completions/clipped_ratio": 0.0, "completions/max_length": 140.6, "completions/max_terminated_length": 140.6, "completions/mean_length": 135.4666717529297, "completions/mean_terminated_length": 135.4666717529297, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 4.545454545454545, "frac_reward_zero_std": 0.7333333492279053, "grad_norm": 0.34855085611343384, "kl": 0.6146595040957133, "learning_rate": 8.933333333333333e-06, "loss": 0.0006, "num_tokens": 4314044.0, "reward": 0.5829833626747132, "reward_std": 0.02590584859426599, "rewards/reward_function/mean": 0.5829833388328552, "rewards/reward_function/std": 0.090936179459095, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.6, "completions/clipped_ratio": 0.0, "completions/max_length": 142.6, "completions/max_terminated_length": 142.6, "completions/mean_length": 136.7166748046875, "completions/mean_terminated_length": 136.7166748046875, "completions/min_length": 130.6, "completions/min_terminated_length": 130.6, "epoch": 4.621212121212121, "frac_reward_zero_std": 0.8666666746139526, "grad_norm": 0.0010158641962334514, "kl": 0.4747675855954488, "learning_rate": 8.711111111111111e-06, "loss": 0.0005, "num_tokens": 4385951.0, "reward": 0.5888633489608764, "reward_std": 0.000871180125977844, "rewards/reward_function/mean": 0.5888633370399475, "rewards/reward_function/std": 0.059521042928099635, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.2, "completions/clipped_ratio": 0.0, "completions/max_length": 143.2, "completions/max_terminated_length": 143.2, "completions/mean_length": 135.63333740234376, "completions/mean_terminated_length": 135.63333740234376, "completions/min_length": 129.8, "completions/min_terminated_length": 129.8, "epoch": 4.696969696969697, "frac_reward_zero_std": 0.6666666746139527, "grad_norm": 0.3419860005378723, "kl": 0.4693106154600779, "learning_rate": 8.48888888888889e-06, "loss": 0.0005, "num_tokens": 4457693.0, "reward": 0.6256850123405456, "reward_std": 0.00048030615434981885, "rewards/reward_function/mean": 0.6256850123405456, "rewards/reward_function/std": 0.07439599148929119, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 135.26666870117188, "completions/mean_terminated_length": 135.26666870117188, "completions/min_length": 125.6, "completions/min_terminated_length": 125.6, "epoch": 4.7727272727272725, "frac_reward_zero_std": 0.7333333432674408, "grad_norm": 0.0016331294318661094, "kl": 0.4828857759634654, "learning_rate": 8.266666666666667e-06, "loss": 0.0005, "num_tokens": 4529181.0, "reward": 0.6507667064666748, "reward_std": 0.0004982158861821517, "rewards/reward_function/mean": 0.6507666826248169, "rewards/reward_function/std": 0.06604736782610417, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.4, "completions/clipped_ratio": 0.0, "completions/max_length": 144.4, "completions/max_terminated_length": 144.4, "completions/mean_length": 134.51666870117188, "completions/mean_terminated_length": 134.51666870117188, "completions/min_length": 126.6, "completions/min_terminated_length": 126.6, "epoch": 4.848484848484849, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.2499098777770996, "kl": 0.494149374961853, "learning_rate": 8.044444444444444e-06, "loss": 0.0005, "num_tokens": 4600312.0, "reward": 0.6521166801452637, "reward_std": 0.0006720453522575554, "rewards/reward_function/mean": 0.6521166682243347, "rewards/reward_function/std": 0.08324470967054368, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.6, "completions/clipped_ratio": 0.0, "completions/max_length": 145.6, "completions/max_terminated_length": 145.6, "completions/mean_length": 136.28333740234376, "completions/mean_terminated_length": 136.28333740234376, "completions/min_length": 128.6, "completions/min_terminated_length": 128.6, "epoch": 4.924242424242424, "frac_reward_zero_std": 0.5333333492279053, "grad_norm": 0.23840579390525818, "kl": 0.4895890514055888, "learning_rate": 7.822222222222224e-06, "loss": 0.0005, "num_tokens": 4671817.0, "reward": 0.578350019454956, "reward_std": 0.0009398535461514257, "rewards/reward_function/mean": 0.5783499956130982, "rewards/reward_function/std": 0.08822323828935623, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.6, "completions/clipped_ratio": 0.0, "completions/max_length": 140.6, "completions/max_terminated_length": 140.6, "completions/mean_length": 134.86667175292968, "completions/mean_terminated_length": 134.86667175292968, "completions/min_length": 128.6, "completions/min_terminated_length": 128.6, "epoch": 5.0, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.0008585389005020261, "kl": 0.49821096857388814, "learning_rate": 7.600000000000001e-06, "loss": 0.0005, "num_tokens": 4742753.0, "reward": 0.6717733860015869, "reward_std": 0.0035625058459118008, "rewards/reward_function/mean": 0.6717733263969421, "rewards/reward_function/std": 0.1110783264040947, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.4, "completions/clipped_ratio": 0.0, "completions/max_length": 142.4, "completions/max_terminated_length": 142.4, "completions/mean_length": 132.9666748046875, "completions/mean_terminated_length": 132.9666748046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 5.075757575757576, "frac_reward_zero_std": 0.5333333492279053, "grad_norm": 0.23162811994552612, "kl": 0.5019173324108124, "learning_rate": 7.377777777777778e-06, "loss": 0.0005, "num_tokens": 4814159.0, "reward": 0.6638583660125732, "reward_std": 0.0006335714278975502, "rewards/reward_function/mean": 0.6638583660125732, "rewards/reward_function/std": 0.08800669051706791, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.2, "completions/clipped_ratio": 0.0, "completions/max_length": 140.2, "completions/max_terminated_length": 140.2, "completions/mean_length": 133.15000610351564, "completions/mean_terminated_length": 133.15000610351564, "completions/min_length": 124.8, "completions/min_terminated_length": 124.8, "epoch": 5.151515151515151, "frac_reward_zero_std": 0.5333333432674408, "grad_norm": 0.001869841362349689, "kl": 0.502401218811671, "learning_rate": 7.155555555555556e-06, "loss": 0.0005, "num_tokens": 4885236.0, "reward": 0.6237933874130249, "reward_std": 0.0005346706879208796, "rewards/reward_function/mean": 0.6237933516502381, "rewards/reward_function/std": 0.11264116615056992, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 141.4, "completions/clipped_ratio": 0.0, "completions/max_length": 141.4, "completions/max_terminated_length": 141.4, "completions/mean_length": 134.51666870117188, "completions/mean_terminated_length": 134.51666870117188, "completions/min_length": 128.6, "completions/min_terminated_length": 128.6, "epoch": 5.2272727272727275, "frac_reward_zero_std": 0.6666666746139527, "grad_norm": 0.0013165439013391733, "kl": 0.5192258059978485, "learning_rate": 6.9333333333333344e-06, "loss": 0.0005, "num_tokens": 4956535.0, "reward": 0.627209997177124, "reward_std": 0.00032304815831594167, "rewards/reward_function/mean": 0.627209997177124, "rewards/reward_function/std": 0.10075384378433228, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.4, "completions/clipped_ratio": 0.0, "completions/max_length": 140.4, "completions/max_terminated_length": 140.4, "completions/mean_length": 133.4666748046875, "completions/mean_terminated_length": 133.4666748046875, "completions/min_length": 126.8, "completions/min_terminated_length": 126.8, "epoch": 5.303030303030303, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.0013948202831670642, "kl": 0.5109677731990814, "learning_rate": 6.711111111111111e-06, "loss": 0.0005, "num_tokens": 5027323.0, "reward": 0.6721283674240113, "reward_std": 0.002642212545470102, "rewards/reward_function/mean": 0.6721283793449402, "rewards/reward_function/std": 0.1264987990260124, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 170.4, "completions/clipped_ratio": 0.0, "completions/max_length": 170.4, "completions/max_terminated_length": 170.4, "completions/mean_length": 137.61667175292968, "completions/mean_terminated_length": 137.61667175292968, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 5.378787878787879, "frac_reward_zero_std": 0.8666666686534882, "grad_norm": 0.001804351806640625, "kl": 0.5009710172812144, "learning_rate": 6.488888888888889e-06, "loss": 0.0005, "num_tokens": 5099384.0, "reward": 0.5923767209053039, "reward_std": 0.001375216245651245, "rewards/reward_function/mean": 0.5923766851425171, "rewards/reward_function/std": 0.04210694804787636, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.2, "completions/clipped_ratio": 0.0, "completions/max_length": 143.2, "completions/max_terminated_length": 143.2, "completions/mean_length": 135.01667175292968, "completions/mean_terminated_length": 135.01667175292968, "completions/min_length": 127.2, "completions/min_terminated_length": 127.2, "epoch": 5.454545454545454, "frac_reward_zero_std": 0.7333333492279053, "grad_norm": 0.38603079319000244, "kl": 0.4913855314254761, "learning_rate": 6.266666666666668e-06, "loss": 0.0005, "num_tokens": 5170701.0, "reward": 0.5950633525848389, "reward_std": 0.004046973369258922, "rewards/reward_function/mean": 0.5950633406639099, "rewards/reward_function/std": 0.10473623871803284, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.2, "completions/clipped_ratio": 0.0, "completions/max_length": 142.2, "completions/max_terminated_length": 142.2, "completions/mean_length": 134.9166717529297, "completions/mean_terminated_length": 134.9166717529297, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "epoch": 5.53030303030303, "frac_reward_zero_std": 0.8666666746139526, "grad_norm": 0.0022605860140174627, "kl": 0.4842663645744324, "learning_rate": 6.044444444444445e-06, "loss": 0.0005, "num_tokens": 5242424.0, "reward": 0.622570025920868, "reward_std": 5.1546646864153446e-05, "rewards/reward_function/mean": 0.6225700139999389, "rewards/reward_function/std": 0.033699407684616746, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.6, "completions/clipped_ratio": 0.0, "completions/max_length": 142.6, "completions/max_terminated_length": 142.6, "completions/mean_length": 135.10000610351562, "completions/mean_terminated_length": 135.10000610351562, "completions/min_length": 128.2, "completions/min_terminated_length": 128.2, "epoch": 5.606060606060606, "frac_reward_zero_std": 0.8666666746139526, "grad_norm": 0.28471168875694275, "kl": 0.49788983861605324, "learning_rate": 5.822222222222223e-06, "loss": 0.0005, "num_tokens": 5314082.0, "reward": 0.5884933590888977, "reward_std": 0.0018088188953697681, "rewards/reward_function/mean": 0.5884933233261108, "rewards/reward_function/std": 0.05092477286234498, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 141.6, "completions/clipped_ratio": 0.0, "completions/max_length": 141.6, "completions/max_terminated_length": 141.6, "completions/mean_length": 134.83334045410157, "completions/mean_terminated_length": 134.83334045410157, "completions/min_length": 129.4, "completions/min_terminated_length": 129.4, "epoch": 5.681818181818182, "frac_reward_zero_std": 0.800000011920929, "grad_norm": 0.2497999668121338, "kl": 0.5025030295054118, "learning_rate": 5.600000000000001e-06, "loss": 0.0005, "num_tokens": 5385792.0, "reward": 0.6160383701324463, "reward_std": 0.000584107032045722, "rewards/reward_function/mean": 0.6160383224487305, "rewards/reward_function/std": 0.061161456257104875, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 138.8, "completions/clipped_ratio": 0.0, "completions/max_length": 138.8, "completions/max_terminated_length": 138.8, "completions/mean_length": 133.60000305175782, "completions/mean_terminated_length": 133.60000305175782, "completions/min_length": 127.6, "completions/min_terminated_length": 127.6, "epoch": 5.757575757575758, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.5607179999351501, "kl": 0.5027451872825622, "learning_rate": 5.3777777777777784e-06, "loss": 0.0005, "num_tokens": 5457272.0, "reward": 0.6048883557319641, "reward_std": 0.001853011967614293, "rewards/reward_function/mean": 0.6048883438110352, "rewards/reward_function/std": 0.08679699413478374, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.6, "completions/clipped_ratio": 0.0, "completions/max_length": 140.6, "completions/max_terminated_length": 140.6, "completions/mean_length": 134.86666870117188, "completions/mean_terminated_length": 134.86666870117188, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "epoch": 5.833333333333333, "frac_reward_zero_std": 0.33333333730697634, "grad_norm": 0.0009953180560842156, "kl": 0.5802711407343547, "learning_rate": 5.155555555555556e-06, "loss": 0.0006, "num_tokens": 5528632.0, "reward": 0.598478353023529, "reward_std": 0.0020464868051931263, "rewards/reward_function/mean": 0.598478353023529, "rewards/reward_function/std": 0.06861904165707529, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.6, "completions/clipped_ratio": 0.0, "completions/max_length": 142.6, "completions/max_terminated_length": 142.6, "completions/mean_length": 134.65000610351564, "completions/mean_terminated_length": 134.65000610351564, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 5.909090909090909, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.2697948217391968, "kl": 0.5500587046146392, "learning_rate": 4.933333333333334e-06, "loss": 0.0005, "num_tokens": 5599959.0, "reward": 0.6114783763885498, "reward_std": 0.0005467232927912846, "rewards/reward_function/mean": 0.6114783406257629, "rewards/reward_function/std": 0.07442084513604641, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.4, "completions/clipped_ratio": 0.0, "completions/max_length": 142.4, "completions/max_terminated_length": 142.4, "completions/mean_length": 135.63333740234376, "completions/mean_terminated_length": 135.63333740234376, "completions/min_length": 128.2, "completions/min_terminated_length": 128.2, "epoch": 5.984848484848484, "frac_reward_zero_std": 0.800000011920929, "grad_norm": 0.23243778944015503, "kl": 0.47906127373377483, "learning_rate": 4.711111111111111e-06, "loss": 0.0005, "num_tokens": 5671181.0, "reward": 0.6451417088508606, "reward_std": 0.0028338861418887975, "rewards/reward_function/mean": 0.6451416611671448, "rewards/reward_function/std": 0.10424772650003433, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 140.8, "completions/clipped_ratio": 0.0, "completions/max_length": 140.8, "completions/max_terminated_length": 140.8, "completions/mean_length": 134.28334045410156, "completions/mean_terminated_length": 134.28334045410156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 6.0606060606060606, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.2668240964412689, "kl": 0.49893170793851216, "learning_rate": 4.488888888888889e-06, "loss": 0.0005, "num_tokens": 5742606.0, "reward": 0.6422833681106568, "reward_std": 0.001804861845448613, "rewards/reward_function/mean": 0.6422833323478698, "rewards/reward_function/std": 0.09786662720143795, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 145.6, "completions/clipped_ratio": 0.0, "completions/max_length": 145.6, "completions/max_terminated_length": 145.6, "completions/mean_length": 136.43334045410157, "completions/mean_terminated_length": 136.43334045410157, "completions/min_length": 131.4, "completions/min_terminated_length": 131.4, "epoch": 6.136363636363637, "frac_reward_zero_std": 0.8000000059604645, "grad_norm": 0.3256749212741852, "kl": 0.500635419289271, "learning_rate": 4.266666666666668e-06, "loss": 0.0005, "num_tokens": 5814636.0, "reward": 0.5908233761787415, "reward_std": 0.00030224729562178254, "rewards/reward_function/mean": 0.5908233284950256, "rewards/reward_function/std": 0.06711971089243889, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.6, "completions/clipped_ratio": 0.0, "completions/max_length": 142.6, "completions/max_terminated_length": 142.6, "completions/mean_length": 135.1666717529297, "completions/mean_terminated_length": 135.1666717529297, "completions/min_length": 129.2, "completions/min_terminated_length": 129.2, "epoch": 6.212121212121212, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.4264754354953766, "kl": 0.5067307233810425, "learning_rate": 4.044444444444445e-06, "loss": 0.0005, "num_tokens": 5886194.0, "reward": 0.5957333445549011, "reward_std": 0.002951131097506732, "rewards/reward_function/mean": 0.5957333445549011, "rewards/reward_function/std": 0.05839875200763345, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.4, "completions/clipped_ratio": 0.0, "completions/max_length": 143.4, "completions/max_terminated_length": 143.4, "completions/mean_length": 135.5500030517578, "completions/mean_terminated_length": 135.5500030517578, "completions/min_length": 130.6, "completions/min_terminated_length": 130.6, "epoch": 6.287878787878788, "frac_reward_zero_std": 0.600000011920929, "grad_norm": 0.7883802056312561, "kl": 0.5081615746021271, "learning_rate": 3.8222222222222224e-06, "loss": 0.0005, "num_tokens": 5957895.0, "reward": 0.579140043258667, "reward_std": 0.0008852901773934718, "rewards/reward_function/mean": 0.5791400074958801, "rewards/reward_function/std": 0.06494581587612629, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 144.8, "completions/clipped_ratio": 0.0, "completions/max_length": 144.8, "completions/max_terminated_length": 144.8, "completions/mean_length": 134.73333435058595, "completions/mean_terminated_length": 134.73333435058595, "completions/min_length": 128.2, "completions/min_terminated_length": 128.2, "epoch": 6.363636363636363, "frac_reward_zero_std": 0.6666666805744171, "grad_norm": 0.20063082873821259, "kl": 0.49638415773709615, "learning_rate": 3.6000000000000003e-06, "loss": 0.0005, "num_tokens": 6029167.0, "reward": 0.6484000086784363, "reward_std": 0.0009333359310403466, "rewards/reward_function/mean": 0.6483999967575074, "rewards/reward_function/std": 0.06488962545990944, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.4, "completions/clipped_ratio": 0.0, "completions/max_length": 143.4, "completions/max_terminated_length": 143.4, "completions/mean_length": 135.68333740234374, "completions/mean_terminated_length": 135.68333740234374, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "epoch": 6.4393939393939394, "frac_reward_zero_std": 0.8666666746139526, "grad_norm": 0.00192440883256495, "kl": 0.506149830420812, "learning_rate": 3.377777777777778e-06, "loss": 0.0005, "num_tokens": 6100760.0, "reward": 0.656581723690033, "reward_std": 0.00018239482160424813, "rewards/reward_function/mean": 0.6565816640853882, "rewards/reward_function/std": 0.09751839749515057, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 142.8, "completions/clipped_ratio": 0.0, "completions/max_length": 142.8, "completions/max_terminated_length": 142.8, "completions/mean_length": 136.7166717529297, "completions/mean_terminated_length": 136.7166717529297, "completions/min_length": 131.4, "completions/min_terminated_length": 131.4, "epoch": 6.515151515151516, "frac_reward_zero_std": 0.8666666746139526, "grad_norm": 0.003226165659725666, "kl": 0.481193604071935, "learning_rate": 3.1555555555555555e-06, "loss": 0.0005, "num_tokens": 6172311.0, "reward": 0.6063950300216675, "reward_std": 0.0008966684341430664, "rewards/reward_function/mean": 0.6063949942588807, "rewards/reward_function/std": 0.07530169561505318, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 147.4, "completions/clipped_ratio": 0.0, "completions/max_length": 147.4, "completions/max_terminated_length": 147.4, "completions/mean_length": 137.03333740234376, "completions/mean_terminated_length": 137.03333740234376, "completions/min_length": 127.8, "completions/min_terminated_length": 127.8, "epoch": 6.590909090909091, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.20924994349479675, "kl": 0.5101174155871073, "learning_rate": 2.9333333333333338e-06, "loss": 0.0005, "num_tokens": 6243641.0, "reward": 0.6130650043487549, "reward_std": 0.022456027381122113, "rewards/reward_function/mean": 0.6130649983882904, "rewards/reward_function/std": 0.09285207167267799, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.2, "completions/clipped_ratio": 0.0, "completions/max_length": 143.2, "completions/max_terminated_length": 143.2, "completions/mean_length": 136.28333740234376, "completions/mean_terminated_length": 136.28333740234376, "completions/min_length": 129.4, "completions/min_terminated_length": 129.4, "epoch": 6.666666666666667, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.47669851779937744, "kl": 0.49107330640157065, "learning_rate": 2.7111111111111116e-06, "loss": 0.0005, "num_tokens": 6314638.0, "reward": 0.6283350110054016, "reward_std": 0.002342601466079941, "rewards/reward_function/mean": 0.6283349990844727, "rewards/reward_function/std": 0.11100482866168022, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 143.4, "completions/clipped_ratio": 0.0, "completions/max_length": 143.4, "completions/max_terminated_length": 143.4, "completions/mean_length": 135.6166748046875, "completions/mean_terminated_length": 135.6166748046875, "completions/min_length": 130.6, "completions/min_terminated_length": 130.6, "epoch": 6.742424242424242, "frac_reward_zero_std": 0.6000000178813935, "grad_norm": 0.35882994532585144, "kl": 0.48326287865638734, "learning_rate": 2.488888888888889e-06, "loss": 0.0005, "num_tokens": 6386483.0, "reward": 0.5984933614730835, "reward_std": 0.00557331838645041, "rewards/reward_function/mean": 0.5984933495521545, "rewards/reward_function/std": 0.0850373286753893, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 141.8, "completions/clipped_ratio": 0.0, "completions/max_length": 141.8, "completions/max_terminated_length": 141.8, "completions/mean_length": 135.9166748046875, "completions/mean_terminated_length": 135.9166748046875, "completions/min_length": 130.4, "completions/min_terminated_length": 130.4, "epoch": 6.818181818181818, "frac_reward_zero_std": 0.7333333432674408, "grad_norm": 0.0010997391073033214, "kl": 0.5031669855117797, "learning_rate": 2.266666666666667e-06, "loss": 0.0005, "num_tokens": 6458146.0, "reward": 0.6327366828918457, "reward_std": 0.0019782020128332077, "rewards/reward_function/mean": 0.6327366590499878, "rewards/reward_function/std": 0.04953039065003395, "step": 450 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 6458146, "num_train_epochs": 8, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }