{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0666666666666667, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2125, "completions/max_length": 490.8, "completions/max_terminated_length": 470.6, "completions/mean_length": 414.4625, "completions/mean_terminated_length": 395.1430999755859, "completions/min_length": 310.5, "completions/min_terminated_length": 310.5, "entropy": 0.5249067967757582, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.05, "grad_norm": 2.65625, "kl": 0.03049815017875517, "learning_rate": 9.4375e-06, "loss": -0.010575222969055175, "num_tokens": 46025.0, "reward": 0.73009033203125, "reward_std": 0.4704558838158846, "rewards/JointRewardFunction/mean": 0.73009033203125, "rewards/JointRewardFunction/std": 0.47045588716864584, "step": 10, "step_time": 21.721466124300058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15, "completions/max_length": 474.6, "completions/max_terminated_length": 432.8, "completions/mean_length": 372.65, "completions/mean_terminated_length": 348.14678955078125, "completions/min_length": 280.5, "completions/min_terminated_length": 280.5, "entropy": 0.4361519979313016, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.05, "grad_norm": 4.15625, "kl": 0.0652532160282135, "learning_rate": 8.8125e-06, "loss": 0.016564452648162843, "num_tokens": 89597.0, "reward": 0.95604248046875, "reward_std": 0.5059975624084473, "rewards/JointRewardFunction/mean": 0.95604248046875, "rewards/JointRewardFunction/std": 0.5059975773096085, "step": 20, "step_time": 22.023339059400495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 425.6, "completions/max_terminated_length": 406.9, "completions/mean_length": 317.9, "completions/mean_terminated_length": 309.6845275878906, "completions/min_length": 231.4, "completions/min_terminated_length": 231.4, "entropy": 0.45581948235630987, "epoch": 0.2, "frac_reward_zero_std": 0.1, "grad_norm": 2.859375, "kl": 0.1008026220370084, "learning_rate": 8.1875e-06, "loss": 0.01793680489063263, "num_tokens": 126445.0, "reward": 1.2108154296875, "reward_std": 0.40027157836593685, "rewards/JointRewardFunction/mean": 1.2108154296875, "rewards/JointRewardFunction/std": 0.40027157838921995, "step": 30, "step_time": 19.79602696299935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1125, "completions/max_length": 493.9, "completions/max_terminated_length": 468.6, "completions/mean_length": 352.1875, "completions/mean_terminated_length": 335.66607666015625, "completions/min_length": 238.5, "completions/min_terminated_length": 238.5, "entropy": 0.4116522930562496, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.05, "grad_norm": 2.5625, "kl": 0.12701121605932714, "learning_rate": 7.5625e-06, "loss": 0.05010480284690857, "num_tokens": 167932.0, "reward": 1.2074462890625, "reward_std": 0.42208707332611084, "rewards/JointRewardFunction/mean": 1.2074462890625, "rewards/JointRewardFunction/std": 0.4220870822668076, "step": 40, "step_time": 22.582551179301117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 393.4, "completions/max_terminated_length": 380.0, "completions/mean_length": 300.8625, "completions/mean_terminated_length": 296.27321472167966, "completions/min_length": 226.3, "completions/min_terminated_length": 226.3, "entropy": 0.4190680437721312, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.35, "grad_norm": 3.25, "kl": 0.13846059744246303, "learning_rate": 6.9375e-06, "loss": 0.03549057841300964, "num_tokens": 204717.0, "reward": 1.26171875, "reward_std": 0.38662562653189525, "rewards/JointRewardFunction/mean": 1.26171875, "rewards/JointRewardFunction/std": 0.3866256324923597, "step": 50, "step_time": 18.59559666490022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 426.2, "completions/max_terminated_length": 414.2, "completions/mean_length": 285.9125, "completions/mean_terminated_length": 280.5291687011719, "completions/min_length": 190.9, "completions/min_terminated_length": 190.9, "entropy": 0.3493430153466761, "epoch": 0.4, "frac_reward_zero_std": 0.25, "grad_norm": 2.375, "kl": 0.15198964411392807, "learning_rate": 6.3125e-06, "loss": 0.01220681592822075, "num_tokens": 240990.0, "reward": 1.27255859375, "reward_std": 0.34817005618242547, "rewards/JointRewardFunction/mean": 1.27255859375, "rewards/JointRewardFunction/std": 0.34817005618242547, "step": 60, "step_time": 19.868489251599385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 370.4, "completions/max_terminated_length": 365.0, "completions/mean_length": 277.2875, "completions/mean_terminated_length": 272.9107177734375, "completions/min_length": 179.9, "completions/min_terminated_length": 179.9, "entropy": 0.34267437979578974, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.55, "grad_norm": 0.017333984375, "kl": 0.18903981931507588, "learning_rate": 5.6875e-06, "loss": 0.019876784086227416, "num_tokens": 276969.0, "reward": 1.381591796875, "reward_std": 0.2380124439485371, "rewards/JointRewardFunction/mean": 1.381591796875, "rewards/JointRewardFunction/std": 0.23801244990900158, "step": 70, "step_time": 17.71040062670145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.1, "completions/max_terminated_length": 378.1, "completions/mean_length": 271.3625, "completions/mean_terminated_length": 271.3625, "completions/min_length": 171.4, "completions/min_terminated_length": 171.4, "entropy": 0.3542415237054229, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 0.0751953125, "kl": 0.19187260391190647, "learning_rate": 5.0625e-06, "loss": 0.0034067176282405855, "num_tokens": 312350.0, "reward": 1.40601806640625, "reward_std": 0.2126459252787754, "rewards/JointRewardFunction/mean": 1.40601806640625, "rewards/JointRewardFunction/std": 0.21264592825900763, "step": 80, "step_time": 18.04391895070148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 357.6, "completions/max_terminated_length": 341.1, "completions/mean_length": 272.275, "completions/mean_terminated_length": 260.9375, "completions/min_length": 171.3, "completions/min_terminated_length": 171.3, "entropy": 0.31064137276262044, "epoch": 0.6, "frac_reward_zero_std": 0.6, "grad_norm": 1.875, "kl": 0.20775549318641423, "learning_rate": 4.4375e-06, "loss": 0.0008514203131198883, "num_tokens": 348280.0, "reward": 1.353466796875, "reward_std": 0.21437984704971313, "rewards/JointRewardFunction/mean": 1.353466796875, "rewards/JointRewardFunction/std": 0.21437986195087433, "step": 90, "step_time": 17.224812426199787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 386.0, "completions/max_terminated_length": 357.8, "completions/mean_length": 267.65, "completions/mean_terminated_length": 250.7500030517578, "completions/min_length": 169.7, "completions/min_terminated_length": 169.7, "entropy": 0.3662784457206726, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.7, "grad_norm": 0.01239013671875, "kl": 0.21083315466530622, "learning_rate": 3.8125e-06, "loss": 0.011665140837430954, "num_tokens": 384576.0, "reward": 1.34949951171875, "reward_std": 0.23986690491437912, "rewards/JointRewardFunction/mean": 1.34949951171875, "rewards/JointRewardFunction/std": 0.2398669108748436, "step": 100, "step_time": 18.349693166400904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 382.2, "completions/max_terminated_length": 364.2, "completions/mean_length": 275.575, "completions/mean_terminated_length": 272.3714294433594, "completions/min_length": 188.2, "completions/min_terminated_length": 188.2, "entropy": 0.37121466230601075, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.6, "grad_norm": 0.0146484375, "kl": 0.21329910093918442, "learning_rate": 3.1875e-06, "loss": 0.007188273221254348, "num_tokens": 419570.0, "reward": 1.350439453125, "reward_std": 0.23384397297631948, "rewards/JointRewardFunction/mean": 1.350439453125, "rewards/JointRewardFunction/std": 0.23384397297631948, "step": 110, "step_time": 18.14425329649821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 346.6, "completions/max_terminated_length": 345.3, "completions/mean_length": 252.15, "completions/mean_terminated_length": 250.38750305175782, "completions/min_length": 166.8, "completions/min_terminated_length": 166.8, "entropy": 0.4173679456114769, "epoch": 0.8, "frac_reward_zero_std": 0.45, "grad_norm": 2.09375, "kl": 0.21794578088447453, "learning_rate": 2.5625e-06, "loss": -0.0005294814705848694, "num_tokens": 452006.0, "reward": 1.35863037109375, "reward_std": 0.24987269788980485, "rewards/JointRewardFunction/mean": 1.35863037109375, "rewards/JointRewardFunction/std": 0.24987269788980485, "step": 120, "step_time": 16.82072365879685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 441.8, "completions/max_terminated_length": 440.3, "completions/mean_length": 301.7375, "completions/mean_terminated_length": 296.75750122070315, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.33190380278974774, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 2.09375, "kl": 0.18061227248981596, "learning_rate": 1.9375e-06, "loss": 0.008612716197967529, "num_tokens": 490397.0, "reward": 1.374853515625, "reward_std": 0.2561936320271343, "rewards/JointRewardFunction/mean": 1.374853515625, "rewards/JointRewardFunction/std": 0.2561936320271343, "step": 130, "step_time": 20.63984096989916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.1, "completions/max_terminated_length": 340.1, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 179.7, "completions/min_terminated_length": 179.7, "entropy": 0.38072127737104894, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.7, "grad_norm": 2.96875, "kl": 0.21334810927510262, "learning_rate": 1.3125000000000001e-06, "loss": 0.012666280567646026, "num_tokens": 523549.0, "reward": 1.436767578125, "reward_std": 0.153020023368299, "rewards/JointRewardFunction/mean": 1.436767578125, "rewards/JointRewardFunction/std": 0.153020023368299, "step": 140, "step_time": 16.57510228729916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 371.9, "completions/max_terminated_length": 370.6, "completions/mean_length": 257.4125, "completions/mean_terminated_length": 254.5607147216797, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3929610840976238, "epoch": 1.0, "frac_reward_zero_std": 0.3, "grad_norm": 0.01513671875, "kl": 0.21818328225053846, "learning_rate": 6.875000000000001e-07, "loss": 0.011788636445999146, "num_tokens": 556454.0, "reward": 1.348779296875, "reward_std": 0.3280519276857376, "rewards/JointRewardFunction/mean": 1.348779296875, "rewards/JointRewardFunction/std": 0.32805192805826666, "step": 150, "step_time": 17.876252979701167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 240.45, "completions/mean_terminated_length": 240.45, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4010548871010542, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.65, "grad_norm": 0.025146484375, "kl": 0.22879955088719725, "learning_rate": 6.250000000000001e-08, "loss": 0.022160810232162476, "num_tokens": 587606.0, "reward": 1.3880859375, "reward_std": 0.237497678399086, "rewards/JointRewardFunction/mean": 1.3880859375, "rewards/JointRewardFunction/std": 0.2374976843595505, "step": 160, "step_time": 16.50347660660045 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 587606, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }