{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26666666666666666, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7875, "completions/max_length": 512.0, "completions/max_terminated_length": 240.3, "completions/mean_length": 483.275, "completions/mean_terminated_length": 200.9800018310547, "completions/min_length": 428.0, "completions/min_terminated_length": 172.0, "entropy": 0.44431554432958364, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.35, "grad_norm": 1.53125, "kl": 0.038827025642240186, "learning_rate": 9.4375e-06, "loss": 0.0020309146493673325, "num_tokens": 51530.0, "reward": 0.45, "reward_std": 0.3909654438495636, "rewards/JointRewardFunction/mean": 0.45, "rewards/JointRewardFunction/std": 0.3909654557704926, "step": 10, "step_time": 21.882423204801306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8, "completions/max_length": 512.0, "completions/max_terminated_length": 386.5, "completions/mean_length": 502.575, "completions/mean_terminated_length": 372.61000061035156, "completions/min_length": 457.8, "completions/min_terminated_length": 355.4, "entropy": 0.259922884311527, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.35, "grad_norm": 2.890625, "kl": 0.0678297683596611, "learning_rate": 8.8125e-06, "loss": 0.0040422692894935604, "num_tokens": 105496.0, "reward": 0.575, "reward_std": 0.4570506900548935, "rewards/JointRewardFunction/mean": 0.575, "rewards/JointRewardFunction/std": 0.45705069601535797, "step": 20, "step_time": 22.13749420610038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5875, "completions/max_length": 512.0, "completions/max_terminated_length": 422.9, "completions/mean_length": 473.825, "completions/mean_terminated_length": 379.08262634277344, "completions/min_length": 386.8, "completions/min_terminated_length": 335.6, "entropy": 0.25187050821259616, "epoch": 0.2, "frac_reward_zero_std": 0.35, "grad_norm": 0.09423828125, "kl": 0.11119869479443878, "learning_rate": 8.1875e-06, "loss": 0.0015535765327513219, "num_tokens": 154818.0, "reward": 0.7875, "reward_std": 0.3947398692369461, "rewards/JointRewardFunction/mean": 0.7875, "rewards/JointRewardFunction/std": 0.39473988115787506, "step": 30, "step_time": 21.91056177379942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.825, "completions/max_length": 512.0, "completions/max_terminated_length": 429.5, "completions/mean_length": 496.7, "completions/mean_terminated_length": 396.95, "completions/min_length": 420.2, "completions/min_terminated_length": 369.0, "entropy": 0.24834579518064856, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.45, "grad_norm": 2.140625, "kl": 0.08241275593172759, "learning_rate": 7.5625e-06, "loss": 0.007686867564916611, "num_tokens": 207866.0, "reward": 0.6625, "reward_std": 0.41830068826675415, "rewards/JointRewardFunction/mean": 0.6625, "rewards/JointRewardFunction/std": 0.41830070614814757, "step": 40, "step_time": 21.8902738727018 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 207866, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }