{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.28, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.1, "completions/max_terminated_length": 697.1, "completions/mean_length": 444.4875, "completions/mean_terminated_length": 444.4875, "completions/min_length": 229.5, "completions/min_terminated_length": 229.5, "entropy": 0.7007812947034836, "epoch": 0.08, "frac_reward_zero_std": 0.275, "grad_norm": 2.53125, "kl": 0.07940242243275861, "learning_rate": 9.4375e-06, "loss": 0.008688435703516007, "num_tokens": 56203.0, "reward": 0.6825366348028183, "reward_std": 0.42148933857679366, "rewards/JointRewardFunction/mean": 0.6825366348028183, "rewards/JointRewardFunction/std": 0.4214893504977226, "step": 10, "step_time": 29.87573713920101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.1, "completions/max_terminated_length": 655.1, "completions/mean_length": 424.4125, "completions/mean_terminated_length": 424.4125, "completions/min_length": 240.3, "completions/min_terminated_length": 240.3, "entropy": 0.7277415141463279, "epoch": 0.16, "frac_reward_zero_std": 0.225, "grad_norm": 1.765625, "kl": 0.09351013670675457, "learning_rate": 8.8125e-06, "loss": 0.0028222408145666122, "num_tokens": 110800.0, "reward": 0.7511120676994324, "reward_std": 0.4809179216623306, "rewards/JointRewardFunction/mean": 0.7511120676994324, "rewards/JointRewardFunction/std": 0.4809179395437241, "step": 20, "step_time": 29.239146613800404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.9, "completions/max_terminated_length": 681.9, "completions/mean_length": 482.1, "completions/mean_terminated_length": 482.1, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.8137854047119617, "epoch": 0.24, "frac_reward_zero_std": 0.275, "grad_norm": 2.34375, "kl": 0.08033584761433303, "learning_rate": 8.1875e-06, "loss": 0.019247914850711822, "num_tokens": 169812.0, "reward": 0.8227136462926865, "reward_std": 0.3767334818840027, "rewards/JointRewardFunction/mean": 0.8227136462926865, "rewards/JointRewardFunction/std": 0.37673348784446714, "step": 30, "step_time": 30.690014854901893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.3, "completions/max_terminated_length": 699.3, "completions/mean_length": 469.3375, "completions/mean_terminated_length": 469.3375, "completions/min_length": 286.5, "completions/min_terminated_length": 286.5, "entropy": 0.8683047238737345, "epoch": 0.32, "frac_reward_zero_std": 0.2, "grad_norm": 2.296875, "kl": 0.08547290097922086, "learning_rate": 7.5625e-06, "loss": 0.005851123481988907, "num_tokens": 227933.0, "reward": 0.8848474144935607, "reward_std": 0.4506288319826126, "rewards/JointRewardFunction/mean": 0.8848474144935607, "rewards/JointRewardFunction/std": 0.4506288468837738, "step": 40, "step_time": 31.253191193498786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.1, "completions/max_terminated_length": 713.1, "completions/mean_length": 510.825, "completions/mean_terminated_length": 510.825, "completions/min_length": 317.9, "completions/min_terminated_length": 317.9, "entropy": 0.8964162725955248, "epoch": 0.4, "frac_reward_zero_std": 0.2, "grad_norm": 1.71875, "kl": 0.07821776089258492, "learning_rate": 6.9375e-06, "loss": 0.013301727175712586, "num_tokens": 289629.0, "reward": 1.038847678899765, "reward_std": 0.3691652432084084, "rewards/JointRewardFunction/mean": 1.038847678899765, "rewards/JointRewardFunction/std": 0.36916525810956957, "step": 50, "step_time": 31.799948546098314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 523.425, "completions/mean_terminated_length": 523.425, "completions/min_length": 309.1, "completions/min_terminated_length": 309.1, "entropy": 0.8802642215043306, "epoch": 0.48, "frac_reward_zero_std": 0.175, "grad_norm": 2.0, "kl": 0.0804219183512032, "learning_rate": 6.3125e-06, "loss": 0.03207117319107056, "num_tokens": 351843.0, "reward": 0.9487841844558715, "reward_std": 0.42972691655158995, "rewards/JointRewardFunction/mean": 0.9487841844558715, "rewards/JointRewardFunction/std": 0.4297269403934479, "step": 60, "step_time": 32.83071275450202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 715.3, "completions/max_terminated_length": 695.1, "completions/mean_length": 510.3, "completions/mean_terminated_length": 503.83750305175784, "completions/min_length": 295.9, "completions/min_terminated_length": 295.9, "entropy": 0.9274463646113873, "epoch": 0.56, "frac_reward_zero_std": 0.3, "grad_norm": 2.265625, "kl": 0.08897386915050447, "learning_rate": 5.6875e-06, "loss": 0.07122264504432678, "num_tokens": 413123.0, "reward": 1.0958777070045471, "reward_std": 0.3831350475549698, "rewards/JointRewardFunction/mean": 1.0958777070045471, "rewards/JointRewardFunction/std": 0.3831350475549698, "step": 70, "step_time": 31.748691623102424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.4, "completions/max_terminated_length": 608.4, "completions/mean_length": 440.2875, "completions/mean_terminated_length": 440.2875, "completions/min_length": 326.3, "completions/min_terminated_length": 326.3, "entropy": 0.8611678160727024, "epoch": 0.64, "frac_reward_zero_std": 0.275, "grad_norm": 2.8125, "kl": 0.0876699925865978, "learning_rate": 5.0625e-06, "loss": 0.019712889194488527, "num_tokens": 468960.0, "reward": 1.0908459782600404, "reward_std": 0.35941318422555923, "rewards/JointRewardFunction/mean": 1.0908459782600404, "rewards/JointRewardFunction/std": 0.3594131987541914, "step": 80, "step_time": 27.2742907967011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.7, "completions/max_terminated_length": 626.7, "completions/mean_length": 434.05, "completions/mean_terminated_length": 434.05, "completions/min_length": 263.7, "completions/min_terminated_length": 263.7, "entropy": 0.8135976739227772, "epoch": 0.72, "frac_reward_zero_std": 0.375, "grad_norm": 1.6953125, "kl": 0.09378819759003817, "learning_rate": 4.4375e-06, "loss": 0.06376264691352844, "num_tokens": 524304.0, "reward": 1.109246850013733, "reward_std": 0.35286828482057897, "rewards/JointRewardFunction/mean": 1.109246850013733, "rewards/JointRewardFunction/std": 0.3528682907810435, "step": 90, "step_time": 28.181775099898367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 444.325, "completions/mean_terminated_length": 444.325, "completions/min_length": 307.4, "completions/min_terminated_length": 307.4, "entropy": 0.9360679395496845, "epoch": 0.8, "frac_reward_zero_std": 0.275, "grad_norm": 2.3125, "kl": 0.08875131588429212, "learning_rate": 3.8125e-06, "loss": 0.03808712363243103, "num_tokens": 580516.0, "reward": 1.0944311678409577, "reward_std": 0.3305914536118507, "rewards/JointRewardFunction/mean": 1.0944311678409577, "rewards/JointRewardFunction/std": 0.3305914685130119, "step": 100, "step_time": 28.45329053500027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.9, "completions/max_terminated_length": 650.9, "completions/mean_length": 442.6875, "completions/mean_terminated_length": 442.6875, "completions/min_length": 262.7, "completions/min_terminated_length": 262.7, "entropy": 0.9691729046404362, "epoch": 0.88, "frac_reward_zero_std": 0.275, "grad_norm": 2.21875, "kl": 0.08224498964846134, "learning_rate": 3.1875e-06, "loss": 0.027349627017974852, "num_tokens": 636475.0, "reward": 1.031976318359375, "reward_std": 0.43738164007663727, "rewards/JointRewardFunction/mean": 1.031976318359375, "rewards/JointRewardFunction/std": 0.4373816430568695, "step": 110, "step_time": 29.40713436929982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.8, "completions/max_terminated_length": 587.8, "completions/mean_length": 423.65, "completions/mean_terminated_length": 423.65, "completions/min_length": 273.3, "completions/min_terminated_length": 273.3, "entropy": 0.886437576636672, "epoch": 0.96, "frac_reward_zero_std": 0.425, "grad_norm": 1.96875, "kl": 0.08013711464591325, "learning_rate": 2.5625e-06, "loss": 0.029777994751930235, "num_tokens": 690739.0, "reward": 1.1266614079475403, "reward_std": 0.2993911794852465, "rewards/JointRewardFunction/mean": 1.1266614079475403, "rewards/JointRewardFunction/std": 0.29939117534086107, "step": 120, "step_time": 26.532396802499353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.8, "completions/max_terminated_length": 548.8, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 296.6, "completions/min_terminated_length": 296.6, "entropy": 0.8794000033289194, "epoch": 1.04, "frac_reward_zero_std": 0.475, "grad_norm": 1.1875, "kl": 0.0789637949783355, "learning_rate": 1.9375e-06, "loss": -0.010777493566274643, "num_tokens": 744437.0, "reward": 1.1596606612205504, "reward_std": 0.2929948531091213, "rewards/JointRewardFunction/mean": 1.1596606612205504, "rewards/JointRewardFunction/std": 0.29299486204981806, "step": 130, "step_time": 25.31505165609851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.5, "completions/max_terminated_length": 600.5, "completions/mean_length": 419.575, "completions/mean_terminated_length": 419.575, "completions/min_length": 260.8, "completions/min_terminated_length": 260.8, "entropy": 0.9347391467541456, "epoch": 1.12, "frac_reward_zero_std": 0.275, "grad_norm": 2.765625, "kl": 0.08032974656671285, "learning_rate": 1.3125000000000001e-06, "loss": 0.02551303505897522, "num_tokens": 798651.0, "reward": 1.0246130645275116, "reward_std": 0.39613366425037383, "rewards/JointRewardFunction/mean": 1.0246130645275116, "rewards/JointRewardFunction/std": 0.3961336761713028, "step": 140, "step_time": 27.637355703599678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.5, "completions/max_terminated_length": 558.5, "completions/mean_length": 394.375, "completions/mean_terminated_length": 394.375, "completions/min_length": 255.9, "completions/min_terminated_length": 255.9, "entropy": 0.841333020478487, "epoch": 1.2, "frac_reward_zero_std": 0.5, "grad_norm": 0.11767578125, "kl": 0.08763318308629095, "learning_rate": 6.875000000000001e-07, "loss": 0.025534918904304503, "num_tokens": 850777.0, "reward": 1.1621960639953612, "reward_std": 0.28392985463142395, "rewards/JointRewardFunction/mean": 1.1621960639953612, "rewards/JointRewardFunction/std": 0.2839298591017723, "step": 150, "step_time": 25.526650765698285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.3, "completions/max_terminated_length": 569.3, "completions/mean_length": 417.65, "completions/mean_terminated_length": 417.65, "completions/min_length": 253.4, "completions/min_terminated_length": 253.4, "entropy": 0.9602293692529201, "epoch": 1.28, "frac_reward_zero_std": 0.35, "grad_norm": 1.8671875, "kl": 0.08848655968904495, "learning_rate": 6.250000000000001e-08, "loss": 0.002139208652079105, "num_tokens": 905091.0, "reward": 1.0879150748252868, "reward_std": 0.3240631450258661, "rewards/JointRewardFunction/mean": 1.0879150748252868, "rewards/JointRewardFunction/std": 0.32406314978725276, "step": 160, "step_time": 26.06666781829881 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 905091, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }