{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7375, "completions/max_length": 512.0, "completions/max_terminated_length": 284.9, "completions/mean_length": 491.525, "completions/mean_terminated_length": 261.77666931152345, "completions/min_length": 438.2, "completions/min_terminated_length": 233.4, "entropy": 0.23946007741615177, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.45, "grad_norm": 1.453125, "kl": 0.05540826761134667, "learning_rate": 9.100000000000001e-06, "loss": 0.0011166661977767944, "num_tokens": 52190.0, "reward": 0.7, "reward_std": 0.35391277372837066, "rewards/JointRewardFunction/mean": 0.7, "rewards/JointRewardFunction/std": 0.3539127916097641, "step": 10, "step_time": 22.393758795900045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7625, "completions/max_length": 512.0, "completions/max_terminated_length": 380.1, "completions/mean_length": 495.3625, "completions/mean_terminated_length": 357.5166717529297, "completions/min_length": 439.9, "completions/min_terminated_length": 337.5, "entropy": 0.15637785932049156, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 0.024169921875, "kl": 0.07800487959757448, "learning_rate": 8.1e-06, "loss": -0.0069251880049705505, "num_tokens": 105579.0, "reward": 0.775, "reward_std": 0.3446744382381439, "rewards/JointRewardFunction/mean": 0.775, "rewards/JointRewardFunction/std": 0.3446744501590729, "step": 20, "step_time": 22.42510473759985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 512.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 500.8125, "completions/mean_terminated_length": 368.27000122070314, "completions/min_length": 451.0, "completions/min_terminated_length": 348.6, "entropy": 0.16634994922205806, "epoch": 0.2, "frac_reward_zero_std": 0.65, "grad_norm": 1.5390625, "kl": 0.10815342329442501, "learning_rate": 7.100000000000001e-06, "loss": 0.005232430994510651, "num_tokens": 157060.0, "reward": 0.8875, "reward_std": 0.2394672751426697, "rewards/JointRewardFunction/mean": 0.8875, "rewards/JointRewardFunction/std": 0.23946728110313414, "step": 30, "step_time": 22.14527411999982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8875, "completions/max_length": 512.0, "completions/max_terminated_length": 262.4, "completions/mean_length": 502.0125, "completions/mean_terminated_length": 256.325, "completions/min_length": 451.3, "completions/min_terminated_length": 246.5, "entropy": 0.15448929518461227, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.45, "grad_norm": 0.02001953125, "kl": 0.08631132342852652, "learning_rate": 6.1e-06, "loss": 0.0021702755242586137, "num_tokens": 210533.0, "reward": 0.7625, "reward_std": 0.3475900113582611, "rewards/JointRewardFunction/mean": 0.7625, "rewards/JointRewardFunction/std": 0.34759002923965454, "step": 40, "step_time": 22.401593531500158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.925, "completions/max_length": 512.0, "completions/max_terminated_length": 235.5, "completions/mean_length": 509.125, "completions/mean_terminated_length": 234.8, "completions/min_length": 490.1, "completions/min_terminated_length": 234.1, "entropy": 0.15993599854409696, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.6, "grad_norm": 1.3125, "kl": 0.10001510812435299, "learning_rate": 5.1e-06, "loss": -0.001868080161511898, "num_tokens": 263979.0, "reward": 0.8375, "reward_std": 0.29291952252388, "rewards/JointRewardFunction/mean": 0.8375, "rewards/JointRewardFunction/std": 0.2929195284843445, "step": 50, "step_time": 22.454355105500007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9125, "completions/max_length": 512.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 505.65, "completions/mean_terminated_length": 219.93333435058594, "completions/min_length": 474.0, "completions/min_terminated_length": 218.0, "entropy": 0.15155332698486745, "epoch": 0.4, "frac_reward_zero_std": 0.65, "grad_norm": 1.265625, "kl": 0.0995997670572251, "learning_rate": 4.1e-06, "loss": 0.0030021272599697114, "num_tokens": 317831.0, "reward": 0.8125, "reward_std": 0.32402919232845306, "rewards/JointRewardFunction/mean": 0.8125, "rewards/JointRewardFunction/std": 0.32402920424938203, "step": 60, "step_time": 22.315176106599846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9125, "completions/max_length": 512.0, "completions/max_terminated_length": 182.4, "completions/mean_length": 505.3875, "completions/mean_terminated_length": 174.23333435058595, "completions/min_length": 475.2, "completions/min_terminated_length": 168.0, "entropy": 0.13972499519586562, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.65, "grad_norm": 0.0235595703125, "kl": 0.11641144650056959, "learning_rate": 3.1000000000000004e-06, "loss": 0.002840310521423817, "num_tokens": 372058.0, "reward": 0.8, "reward_std": 0.3038551896810532, "rewards/JointRewardFunction/mean": 0.8, "rewards/JointRewardFunction/std": 0.30385519564151764, "step": 70, "step_time": 22.520917878499766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8875, "completions/max_length": 512.0, "completions/max_terminated_length": 238.4, "completions/mean_length": 504.55, "completions/mean_terminated_length": 225.6, "completions/min_length": 466.8, "completions/min_terminated_length": 210.8, "entropy": 0.14867580346763135, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.7, "grad_norm": 1.109375, "kl": 0.09885827025864273, "learning_rate": 2.1000000000000002e-06, "loss": 9.710401645861566e-05, "num_tokens": 426094.0, "reward": 0.8625, "reward_std": 0.2558668524026871, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.25586686432361605, "step": 80, "step_time": 22.384258029100057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 512.0, "completions/max_terminated_length": 144.1, "completions/mean_length": 508.4875, "completions/mean_terminated_length": 139.3, "completions/min_length": 492.9, "completions/min_terminated_length": 134.5, "entropy": 0.1391168820671737, "epoch": 0.6, "frac_reward_zero_std": 0.65, "grad_norm": 1.1953125, "kl": 0.11537287631072105, "learning_rate": 1.1e-06, "loss": -0.004437017068266868, "num_tokens": 480921.0, "reward": 0.7875, "reward_std": 0.29291952252388, "rewards/JointRewardFunction/mean": 0.7875, "rewards/JointRewardFunction/std": 0.2929195284843445, "step": 90, "step_time": 22.70512801439986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9125, "completions/max_length": 512.0, "completions/max_terminated_length": 230.5, "completions/mean_length": 504.3625, "completions/mean_terminated_length": 220.3, "completions/min_length": 466.1, "completions/min_terminated_length": 210.1, "entropy": 0.1445184069685638, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.0107421875, "kl": 0.09200373792555183, "learning_rate": 1.0000000000000001e-07, "loss": 0.002389365620911121, "num_tokens": 536154.0, "reward": 0.8625, "reward_std": 0.22220884561538695, "rewards/JointRewardFunction/mean": 0.8625, "rewards/JointRewardFunction/std": 0.22220885157585143, "step": 100, "step_time": 22.48313269170003 } ], "logging_steps": 10, "max_steps": 100, "num_input_tokens_seen": 536154, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }