{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7575757575757576, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 177.8, "completions/clipped_ratio": 0.0, "completions/max_length": 177.8, "completions/max_terminated_length": 177.8, "completions/mean_length": 157.85000610351562, "completions/mean_terminated_length": 157.85000610351562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.07575757575757576, "frac_reward_zero_std": 0.4000000059604645, "grad_norm": 1.5840047597885132, "kl": 0.0010059793893522702, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "num_tokens": 73447.0, "reward": 0.5880883574485779, "reward_std": 0.020529226586222648, "rewards/reward_function/mean": 0.5880883395671844, "rewards/reward_function/std": 0.06562883183360099, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.2, "completions/clipped_ratio": 0.0, "completions/max_length": 176.2, "completions/max_terminated_length": 176.2, "completions/mean_length": 156.65000915527344, "completions/mean_terminated_length": 156.65000915527344, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.15151515151515152, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 1.6390373706817627, "kl": 0.0017284046276472508, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "num_tokens": 146334.0, "reward": 0.605418348312378, "reward_std": 0.02508251890540123, "rewards/reward_function/mean": 0.60541832447052, "rewards/reward_function/std": 0.06859094277024269, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.4, "completions/clipped_ratio": 0.0, "completions/max_length": 176.4, "completions/max_terminated_length": 176.4, "completions/mean_length": 157.0000030517578, "completions/mean_terminated_length": 157.0000030517578, "completions/min_length": 140.8, "completions/min_terminated_length": 140.8, "epoch": 0.22727272727272727, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.7626600861549377, "kl": 0.003397522373901059, "learning_rate": 5.600000000000001e-06, "loss": 0.0, "num_tokens": 219198.0, "reward": 0.5862850427627564, "reward_std": 0.036518129706382754, "rewards/reward_function/mean": 0.5862850069999694, "rewards/reward_function/std": 0.08488646671175956, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 183.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.98334045410155, "completions/mean_terminated_length": 156.98334045410155, "completions/min_length": 139.8, "completions/min_terminated_length": 139.8, "epoch": 0.30303030303030304, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.9624250531196594, "kl": 0.007015585945919156, "learning_rate": 7.600000000000001e-06, "loss": 0.0, "num_tokens": 291737.0, "reward": 0.6001700401306153, "reward_std": 0.025772593356668948, "rewards/reward_function/mean": 0.6001700043678284, "rewards/reward_function/std": 0.07909451425075531, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 172.2, "completions/clipped_ratio": 0.0, "completions/max_length": 172.2, "completions/max_terminated_length": 172.2, "completions/mean_length": 155.23333740234375, "completions/mean_terminated_length": 155.23333740234375, "completions/min_length": 138.2, "completions/min_terminated_length": 138.2, "epoch": 0.3787878787878788, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.5355867743492126, "kl": 0.009492208405087391, "learning_rate": 9.600000000000001e-06, "loss": 0.0, "num_tokens": 364535.0, "reward": 0.5766633510589599, "reward_std": 0.04085115455091, "rewards/reward_function/mean": 0.576663339138031, "rewards/reward_function/std": 0.10587597712874412, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 198.4, "completions/clipped_ratio": 0.0, "completions/max_length": 198.4, "completions/max_terminated_length": 198.4, "completions/mean_length": 153.6166748046875, "completions/mean_terminated_length": 153.6166748046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.45454545454545453, "frac_reward_zero_std": 0.13333333730697633, "grad_norm": 0.6549646854400635, "kl": 0.061492755884925525, "learning_rate": 1.16e-05, "loss": 0.0001, "num_tokens": 436992.0, "reward": 0.5978150248527527, "reward_std": 0.04262940138578415, "rewards/reward_function/mean": 0.5978150129318237, "rewards/reward_function/std": 0.09431936666369438, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 176.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.86667175292968, "completions/mean_terminated_length": 158.86667175292968, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5303030303030303, "frac_reward_zero_std": 0.40000001192092893, "grad_norm": 0.4725801348686218, "kl": 2.357983988771836, "learning_rate": 1.3600000000000002e-05, "loss": 0.0024, "num_tokens": 509788.0, "reward": 0.6049700140953064, "reward_std": 0.012831439916044473, "rewards/reward_function/mean": 0.6049699783325195, "rewards/reward_function/std": 0.08928216472268105, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 172.4, "completions/clipped_ratio": 0.0, "completions/max_length": 172.4, "completions/max_terminated_length": 172.4, "completions/mean_length": 154.5500030517578, "completions/mean_terminated_length": 154.5500030517578, "completions/min_length": 138.8, "completions/min_terminated_length": 138.8, "epoch": 0.6060606060606061, "frac_reward_zero_std": 0.4666666746139526, "grad_norm": 0.41579461097717285, "kl": 0.10282722649474939, "learning_rate": 1.5600000000000003e-05, "loss": 0.0001, "num_tokens": 582789.0, "reward": 0.5595033764839172, "reward_std": 0.014409982354845852, "rewards/reward_function/mean": 0.5595033466815948, "rewards/reward_function/std": 0.053104204079136255, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 202.6, "completions/clipped_ratio": 0.0, "completions/max_length": 202.6, "completions/max_terminated_length": 202.6, "completions/mean_length": 158.20000305175782, "completions/mean_terminated_length": 158.20000305175782, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6818181818181818, "frac_reward_zero_std": 0.33333333730697634, "grad_norm": 0.03286667913198471, "kl": 3726.0936788400013, "learning_rate": 1.76e-05, "loss": 3.7261, "num_tokens": 655565.0, "reward": 0.5829650402069092, "reward_std": 0.031194474175572397, "rewards/reward_function/mean": 0.5829649925231933, "rewards/reward_function/std": 0.09724260903894902, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 180.4, "completions/clipped_ratio": 0.0, "completions/max_length": 180.4, "completions/max_terminated_length": 180.4, "completions/mean_length": 157.5500030517578, "completions/mean_terminated_length": 157.5500030517578, "completions/min_length": 141.2, "completions/min_terminated_length": 141.2, "epoch": 0.7575757575757576, "frac_reward_zero_std": 0.26666667461395266, "grad_norm": 0.45231395959854126, "kl": 0.24270717451969784, "learning_rate": 1.9600000000000002e-05, "loss": 0.0002, "num_tokens": 727754.0, "reward": 0.6422233581542969, "reward_std": 0.015453202556818724, "rewards/reward_function/mean": 0.642223310470581, "rewards/reward_function/std": 0.08873879238963127, "step": 50 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 727754, "num_train_epochs": 8, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }