{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0666666666666667, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7375, "completions/max_length": 512.0, "completions/max_terminated_length": 424.3, "completions/mean_length": 490.4875, "completions/mean_terminated_length": 405.65333557128906, "completions/min_length": 428.2, "completions/min_terminated_length": 377.0, "entropy": 0.25217165537178515, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.4, "grad_norm": 1.5703125, "kl": 0.0473901923673111, "learning_rate": 9.4375e-06, "loss": -0.0006930597126483917, "num_tokens": 52107.0, "reward": 0.675, "reward_std": 0.399344927072525, "rewards/JointRewardFunction/mean": 0.675, "rewards/JointRewardFunction/std": 0.39934495091438293, "step": 10, "step_time": 22.542225440999847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8625, "completions/max_length": 512.0, "completions/max_terminated_length": 284.7, "completions/mean_length": 504.1875, "completions/mean_terminated_length": 272.675, "completions/min_length": 467.3, "completions/min_terminated_length": 262.5, "entropy": 0.202899154368788, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.55, "grad_norm": 1.546875, "kl": 0.09602957724127918, "learning_rate": 8.8125e-06, "loss": -1.602950505912304e-05, "num_tokens": 106202.0, "reward": 0.7, "reward_std": 0.3110164314508438, "rewards/JointRewardFunction/mean": 0.7, "rewards/JointRewardFunction/std": 0.3110164374113083, "step": 20, "step_time": 23.115930349700285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8625, "completions/max_length": 512.0, "completions/max_terminated_length": 228.9, "completions/mean_length": 495.1875, "completions/mean_terminated_length": 212.45, "completions/min_length": 461.5, "completions/min_terminated_length": 205.5, "entropy": 0.21361660687252879, "epoch": 0.2, "frac_reward_zero_std": 0.65, "grad_norm": 1.6640625, "kl": 0.09101211386732758, "learning_rate": 8.1875e-06, "loss": 0.0017554668709635734, "num_tokens": 157233.0, "reward": 0.8625, "reward_std": 0.2857582807540894, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.28575828671455383, "step": 30, "step_time": 22.91968024349999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 512.0, "completions/max_terminated_length": 141.1, "completions/mean_length": 506.8625, "completions/mean_terminated_length": 132.6, "completions/min_length": 482.5, "completions/min_terminated_length": 124.1, "entropy": 0.1852631143294275, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.55, "grad_norm": 1.90625, "kl": 0.08399276400450617, "learning_rate": 7.5625e-06, "loss": 0.000752098485827446, "num_tokens": 211094.0, "reward": 0.7875, "reward_std": 0.35561010539531707, "rewards/JointRewardFunction/mean": 0.7875, "rewards/JointRewardFunction/std": 0.35561011731624603, "step": 40, "step_time": 23.1765345180007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7375, "completions/max_length": 512.0, "completions/max_terminated_length": 379.3, "completions/mean_length": 490.375, "completions/mean_terminated_length": 355.89500122070314, "completions/min_length": 437.5, "completions/min_terminated_length": 335.1, "entropy": 0.20920381098985671, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.65, "grad_norm": 1.671875, "kl": 0.08546415464952588, "learning_rate": 6.9375e-06, "loss": 0.002159162983298302, "num_tokens": 263040.0, "reward": 0.8625, "reward_std": 0.2857582807540894, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.28575828671455383, "step": 50, "step_time": 23.374498161900373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.525, "completions/max_length": 512.0, "completions/max_terminated_length": 472.5, "completions/mean_length": 468.2875, "completions/mean_terminated_length": 420.48833923339845, "completions/min_length": 354.2, "completions/min_terminated_length": 354.2, "entropy": 0.2077056860551238, "epoch": 0.4, "frac_reward_zero_std": 0.75, "grad_norm": 1.4609375, "kl": 0.09320587411057205, "learning_rate": 6.3125e-06, "loss": -0.0018056023865938187, "num_tokens": 313903.0, "reward": 0.8625, "reward_std": 0.21504760384559632, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.2150476098060608, "step": 60, "step_time": 23.37301885729976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6625, "completions/max_length": 512.0, "completions/max_terminated_length": 405.9, "completions/mean_length": 482.5875, "completions/mean_terminated_length": 375.9566711425781, "completions/min_length": 381.3, "completions/min_terminated_length": 330.1, "entropy": 0.2098280948586762, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.2734375, "kl": 0.07904624061193317, "learning_rate": 5.6875e-06, "loss": 0.002197714149951935, "num_tokens": 366306.0, "reward": 0.7875, "reward_std": 0.35561010539531707, "rewards/JointRewardFunction/mean": 0.7875, "rewards/JointRewardFunction/std": 0.35561011731624603, "step": 70, "step_time": 23.226104836099694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.475, "completions/max_length": 512.0, "completions/max_terminated_length": 476.6, "completions/mean_length": 462.95, "completions/mean_terminated_length": 421.48095703125, "completions/min_length": 355.3, "completions/min_terminated_length": 355.3, "entropy": 0.23389623733237386, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.55, "grad_norm": 1.421875, "kl": 1.7520212520845235, "learning_rate": 5.0625e-06, "loss": 0.014839766919612885, "num_tokens": 417014.0, "reward": 0.825, "reward_std": 0.3429849535226822, "rewards/JointRewardFunction/mean": 0.825, "rewards/JointRewardFunction/std": 0.34298495948314667, "step": 80, "step_time": 23.28830157520024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6125, "completions/max_length": 512.0, "completions/max_terminated_length": 400.4, "completions/mean_length": 469.4375, "completions/mean_terminated_length": 354.97833557128905, "completions/min_length": 361.2, "completions/min_terminated_length": 310.0, "entropy": 0.23284959373995662, "epoch": 0.6, "frac_reward_zero_std": 0.8, "grad_norm": 0.00860595703125, "kl": 0.09338290304876865, "learning_rate": 4.4375e-06, "loss": -0.00816301703453064, "num_tokens": 468717.0, "reward": 0.775, "reward_std": 0.2112731784582138, "rewards/JointRewardFunction/mean": 0.775, "rewards/JointRewardFunction/std": 0.2112731844186783, "step": 90, "step_time": 23.190502532400387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7125, "completions/max_length": 512.0, "completions/max_terminated_length": 375.5, "completions/mean_length": 486.9625, "completions/mean_terminated_length": 343.10833435058595, "completions/min_length": 421.6, "completions/min_terminated_length": 319.2, "entropy": 0.23229737337678671, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.45, "grad_norm": 2.265625, "kl": 0.10509764784947037, "learning_rate": 3.8125e-06, "loss": 0.012743420898914337, "num_tokens": 522558.0, "reward": 0.7, "reward_std": 0.3737070143222809, "rewards/JointRewardFunction/mean": 0.7, "rewards/JointRewardFunction/std": 0.37370702624320984, "step": 100, "step_time": 23.302893278999363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 470.8, "completions/mean_length": 468.9, "completions/mean_terminated_length": 418.7083374023438, "completions/min_length": 370.4, "completions/min_terminated_length": 370.4, "entropy": 0.21385292476043105, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.8, "grad_norm": 1.65625, "kl": 0.10132698961533607, "learning_rate": 3.1875e-06, "loss": 0.004469546675682068, "num_tokens": 573018.0, "reward": 0.8625, "reward_std": 0.22768060266971588, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.22768060266971588, "step": 110, "step_time": 23.21463985899991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.475, "completions/max_length": 512.0, "completions/max_terminated_length": 498.2, "completions/mean_length": 464.075, "completions/mean_terminated_length": 425.3933380126953, "completions/min_length": 350.8, "completions/min_terminated_length": 350.8, "entropy": 0.24488159762695433, "epoch": 0.8, "frac_reward_zero_std": 0.6, "grad_norm": 1.1953125, "kl": 0.11085290028713643, "learning_rate": 2.5625e-06, "loss": 0.007788118720054626, "num_tokens": 622408.0, "reward": 0.85, "reward_std": 0.3265853762626648, "rewards/JointRewardFunction/mean": 0.85, "rewards/JointRewardFunction/std": 0.3265853762626648, "step": 120, "step_time": 23.28342122000013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.575, "completions/max_length": 512.0, "completions/max_terminated_length": 386.7, "completions/mean_length": 473.75, "completions/mean_terminated_length": 338.4150054931641, "completions/min_length": 396.3, "completions/min_terminated_length": 293.9, "entropy": 0.20693162837997078, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.7, "grad_norm": 0.0274658203125, "kl": 0.1018260810058564, "learning_rate": 1.9375e-06, "loss": 0.004177199304103851, "num_tokens": 674560.0, "reward": 0.7125, "reward_std": 0.28489942848682404, "rewards/JointRewardFunction/mean": 0.7125, "rewards/JointRewardFunction/std": 0.284899440407753, "step": 130, "step_time": 23.369472190498527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 474.9, "completions/mean_length": 473.4625, "completions/mean_terminated_length": 436.53857421875, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "entropy": 0.21715571610257028, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 2.65625, "kl": 0.09930992983281613, "learning_rate": 1.3125000000000001e-06, "loss": -0.003046867996454239, "num_tokens": 725069.0, "reward": 0.8625, "reward_std": 0.22220884561538695, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.22220885157585143, "step": 140, "step_time": 23.435090316800597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.525, "completions/max_length": 512.0, "completions/max_terminated_length": 465.8, "completions/mean_length": 467.0125, "completions/mean_terminated_length": 418.62000427246096, "completions/min_length": 369.5, "completions/min_terminated_length": 369.5, "entropy": 0.23989124922081828, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.031982421875, "kl": 0.11200284436345101, "learning_rate": 6.875000000000001e-07, "loss": 0.0012525198981165886, "num_tokens": 774742.0, "reward": 0.9125, "reward_std": 0.16875659823417663, "rewards/JointRewardFunction/mean": 0.9125, "rewards/JointRewardFunction/std": 0.1687566041946411, "step": 150, "step_time": 23.458841795299485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.475, "completions/max_length": 512.0, "completions/max_terminated_length": 451.2, "completions/mean_length": 453.7375, "completions/mean_terminated_length": 395.75953063964846, "completions/min_length": 338.2, "completions/min_terminated_length": 338.2, "entropy": 0.24999441923573612, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.85, "grad_norm": 0.0250244140625, "kl": 0.10539310625754297, "learning_rate": 6.250000000000001e-08, "loss": 0.012749123573303222, "num_tokens": 822957.0, "reward": 0.8625, "reward_std": 0.1759178400039673, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.17591784596443177, "step": 160, "step_time": 23.16400302819966 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 822957, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }