{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.28, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 892.4, "completions/max_terminated_length": 819.9, "completions/mean_length": 573.1125, "completions/mean_terminated_length": 551.0821472167969, "completions/min_length": 313.1, "completions/min_terminated_length": 313.1, "entropy": 0.7912669345736504, "epoch": 0.08, "frac_reward_zero_std": 0.35, "grad_norm": 2.359375, "kl": 0.04604918217446539, "learning_rate": 9.4375e-06, "loss": 0.021108362078666686, "num_tokens": 66493.0, "reward": 0.6003784224390983, "reward_std": 0.5601114392280578, "rewards/JointRewardFunction/mean": 0.6003784224390983, "rewards/JointRewardFunction/std": 0.5601114451885223, "step": 10, "step_time": 37.44416529300106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 800.3, "completions/max_terminated_length": 673.7, "completions/mean_length": 473.2875, "completions/mean_terminated_length": 445.1142974853516, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 1.0140388168394565, "epoch": 0.16, "frac_reward_zero_std": 0.4, "grad_norm": 3.75, "kl": 0.25841885171830653, "learning_rate": 8.8125e-06, "loss": 0.03933271169662476, "num_tokens": 125000.0, "reward": 0.9494775831699371, "reward_std": 0.4877780556678772, "rewards/JointRewardFunction/mean": 0.9494775831699371, "rewards/JointRewardFunction/std": 0.4877780944108963, "step": 20, "step_time": 35.173660528497564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 951.5, "completions/max_terminated_length": 813.4, "completions/mean_length": 634.375, "completions/mean_terminated_length": 576.2744140625, "completions/min_length": 331.8, "completions/min_terminated_length": 331.8, "entropy": 1.309154535830021, "epoch": 0.24, "frac_reward_zero_std": 0.375, "grad_norm": 1.859375, "kl": 0.2194912993349135, "learning_rate": 8.1875e-06, "loss": 0.026669433712959288, "num_tokens": 196194.0, "reward": 1.0899658679962159, "reward_std": 0.5111429423093796, "rewards/JointRewardFunction/mean": 1.0899658679962159, "rewards/JointRewardFunction/std": 0.5111429691314697, "step": 30, "step_time": 41.286807323301765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 848.0, "completions/max_terminated_length": 783.4, "completions/mean_length": 569.7125, "completions/mean_terminated_length": 549.1333435058593, "completions/min_length": 369.3, "completions/min_terminated_length": 369.3, "entropy": 1.1440452575683593, "epoch": 0.32, "frac_reward_zero_std": 0.25, "grad_norm": 2.671875, "kl": 0.13953614169731737, "learning_rate": 7.5625e-06, "loss": 0.013277828693389893, "num_tokens": 262345.0, "reward": 0.8887524664402008, "reward_std": 0.5226060330867768, "rewards/JointRewardFunction/mean": 0.8887524664402008, "rewards/JointRewardFunction/std": 0.5226060688495636, "step": 40, "step_time": 37.13389427080001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.8, "completions/max_terminated_length": 875.8, "completions/mean_length": 560.675, "completions/mean_terminated_length": 560.675, "completions/min_length": 320.4, "completions/min_terminated_length": 320.4, "entropy": 1.1498947571963072, "epoch": 0.4, "frac_reward_zero_std": 0.15, "grad_norm": 2.875, "kl": 0.09845088529400528, "learning_rate": 6.9375e-06, "loss": 0.03778347373008728, "num_tokens": 328029.0, "reward": 0.8338769674301147, "reward_std": 0.5376368969678879, "rewards/JointRewardFunction/mean": 0.8338769674301147, "rewards/JointRewardFunction/std": 0.5376369208097458, "step": 50, "step_time": 38.24463103190137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.7, "completions/max_terminated_length": 829.7, "completions/mean_length": 531.85, "completions/mean_terminated_length": 531.85, "completions/min_length": 287.5, "completions/min_terminated_length": 287.5, "entropy": 1.1666041024029254, "epoch": 0.48, "frac_reward_zero_std": 0.075, "grad_norm": 2.015625, "kl": 0.0893499652389437, "learning_rate": 6.3125e-06, "loss": 0.008259650319814682, "num_tokens": 390917.0, "reward": 0.6566748261451721, "reward_std": 0.5261385977268219, "rewards/JointRewardFunction/mean": 0.6566748261451721, "rewards/JointRewardFunction/std": 0.5261386096477508, "step": 60, "step_time": 36.22042608499832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.1, "completions/max_terminated_length": 760.1, "completions/mean_length": 505.25, "completions/mean_terminated_length": 505.25, "completions/min_length": 287.4, "completions/min_terminated_length": 287.4, "entropy": 1.1663248613476753, "epoch": 0.56, "frac_reward_zero_std": 0.2, "grad_norm": 1.9296875, "kl": 0.09666758836247027, "learning_rate": 5.6875e-06, "loss": 0.04623619616031647, "num_tokens": 451793.0, "reward": 0.8685888767242431, "reward_std": 0.5478669673204422, "rewards/JointRewardFunction/mean": 0.8685888767242431, "rewards/JointRewardFunction/std": 0.5478669852018356, "step": 70, "step_time": 33.734915479502526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.8, "completions/max_terminated_length": 692.8, "completions/mean_length": 482.775, "completions/mean_terminated_length": 482.775, "completions/min_length": 305.5, "completions/min_terminated_length": 305.5, "entropy": 1.1494709253311157, "epoch": 0.64, "frac_reward_zero_std": 0.125, "grad_norm": 2.765625, "kl": 0.11551248212344944, "learning_rate": 5.0625e-06, "loss": -0.016333292424678802, "num_tokens": 511029.0, "reward": 1.0234729290008544, "reward_std": 0.49485546052455903, "rewards/JointRewardFunction/mean": 1.0234729290008544, "rewards/JointRewardFunction/std": 0.4948554873466492, "step": 80, "step_time": 31.04121985129932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.5, "completions/max_terminated_length": 633.5, "completions/mean_length": 459.7625, "completions/mean_terminated_length": 459.7625, "completions/min_length": 280.1, "completions/min_terminated_length": 280.1, "entropy": 1.1554153360426427, "epoch": 0.72, "frac_reward_zero_std": 0.325, "grad_norm": 1.78125, "kl": 0.1084105208516121, "learning_rate": 4.4375e-06, "loss": -0.037684041261672976, "num_tokens": 568430.0, "reward": 1.0063842952251434, "reward_std": 0.45139331221580503, "rewards/JointRewardFunction/mean": 1.0063842952251434, "rewards/JointRewardFunction/std": 0.45139334285631777, "step": 90, "step_time": 28.41928261620196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.2, "completions/max_terminated_length": 694.2, "completions/mean_length": 483.575, "completions/mean_terminated_length": 483.575, "completions/min_length": 270.1, "completions/min_terminated_length": 270.1, "entropy": 1.1369776532053948, "epoch": 0.8, "frac_reward_zero_std": 0.4, "grad_norm": 2.25, "kl": 0.12164497645571828, "learning_rate": 3.8125e-06, "loss": -0.011769261211156845, "num_tokens": 627782.0, "reward": 1.045616489648819, "reward_std": 0.48985774293541906, "rewards/JointRewardFunction/mean": 1.045616489648819, "rewards/JointRewardFunction/std": 0.48985776007175447, "step": 100, "step_time": 30.974922098598473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.8, "completions/max_terminated_length": 711.8, "completions/mean_length": 478.5875, "completions/mean_terminated_length": 478.5875, "completions/min_length": 269.1, "completions/min_terminated_length": 269.1, "entropy": 1.1295112110674381, "epoch": 0.88, "frac_reward_zero_std": 0.275, "grad_norm": 1.9765625, "kl": 0.11388859800063074, "learning_rate": 3.1875e-06, "loss": 0.00085725337266922, "num_tokens": 686613.0, "reward": 0.9904785633087159, "reward_std": 0.5183025985956192, "rewards/JointRewardFunction/mean": 0.9904785633087159, "rewards/JointRewardFunction/std": 0.5183026134967804, "step": 110, "step_time": 31.47189465950178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.1, "completions/max_terminated_length": 681.1, "completions/mean_length": 474.575, "completions/mean_terminated_length": 474.575, "completions/min_length": 306.6, "completions/min_terminated_length": 306.6, "entropy": 1.0753874629735947, "epoch": 0.96, "frac_reward_zero_std": 0.225, "grad_norm": 2.3125, "kl": 0.11223098039627075, "learning_rate": 2.5625e-06, "loss": 0.02290368378162384, "num_tokens": 744951.0, "reward": 1.1014428853988647, "reward_std": 0.43211724162101744, "rewards/JointRewardFunction/mean": 1.1014428853988647, "rewards/JointRewardFunction/std": 0.4321172535419464, "step": 120, "step_time": 30.187947676197656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.8, "completions/max_terminated_length": 700.8, "completions/mean_length": 482.3, "completions/mean_terminated_length": 482.3, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 1.100510736554861, "epoch": 1.04, "frac_reward_zero_std": 0.175, "grad_norm": 1.890625, "kl": 0.11114813778549433, "learning_rate": 1.9375e-06, "loss": -0.00960662066936493, "num_tokens": 804153.0, "reward": 1.0441894710063935, "reward_std": 0.4884881317615509, "rewards/JointRewardFunction/mean": 1.0441894710063935, "rewards/JointRewardFunction/std": 0.48848815858364103, "step": 130, "step_time": 31.431871901799603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.8, "completions/max_terminated_length": 654.8, "completions/mean_length": 455.1875, "completions/mean_terminated_length": 455.1875, "completions/min_length": 274.1, "completions/min_terminated_length": 274.1, "entropy": 1.1068335216492415, "epoch": 1.12, "frac_reward_zero_std": 0.35, "grad_norm": 2.296875, "kl": 0.11968475547619165, "learning_rate": 1.3125000000000001e-06, "loss": -0.020959584414958952, "num_tokens": 861216.0, "reward": 1.0450488328933716, "reward_std": 0.4309545159339905, "rewards/JointRewardFunction/mean": 1.0450488328933716, "rewards/JointRewardFunction/std": 0.43095452189445493, "step": 140, "step_time": 29.483576541099684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.2, "completions/max_terminated_length": 677.2, "completions/mean_length": 466.3125, "completions/mean_terminated_length": 466.3125, "completions/min_length": 296.8, "completions/min_terminated_length": 296.8, "entropy": 1.0722076326608658, "epoch": 1.2, "frac_reward_zero_std": 0.45, "grad_norm": 0.85546875, "kl": 0.1184447065461427, "learning_rate": 6.875000000000001e-07, "loss": 0.02653493583202362, "num_tokens": 919097.0, "reward": 1.2055200517177582, "reward_std": 0.3585283608641475, "rewards/JointRewardFunction/mean": 1.2055200517177582, "rewards/JointRewardFunction/std": 0.35852835562545804, "step": 150, "step_time": 30.24508252329979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 470.9375, "completions/mean_terminated_length": 470.9375, "completions/min_length": 331.2, "completions/min_terminated_length": 331.2, "entropy": 1.0674828842282296, "epoch": 1.28, "frac_reward_zero_std": 0.425, "grad_norm": 1.875, "kl": 0.12561513194814325, "learning_rate": 6.250000000000001e-08, "loss": -0.007510318607091904, "num_tokens": 977674.0, "reward": 1.1753845453262328, "reward_std": 0.39318075180053713, "rewards/JointRewardFunction/mean": 1.1753845453262328, "rewards/JointRewardFunction/std": 0.3931807607412338, "step": 160, "step_time": 29.98714039499864 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 977674, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }