{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5875, "completions/max_length": 512.0, "completions/max_terminated_length": 440.4, "completions/mean_length": 468.6125, "completions/mean_terminated_length": 379.02000122070314, "completions/min_length": 369.0, "completions/min_terminated_length": 317.8, "entropy": 0.2740801006555557, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.15, "grad_norm": 1.3671875, "kl": 0.04017456619621953, "learning_rate": 9.7e-06, "loss": -0.004630821943283081, "num_tokens": 50357.0, "reward": 0.803729248046875, "reward_std": 0.41876387000083926, "rewards/JointRewardFunction/mean": 0.803729248046875, "rewards/JointRewardFunction/std": 0.41876387894153594, "step": 10, "step_time": 23.422285906800244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 328.2, "completions/mean_length": 493.75, "completions/mean_terminated_length": 310.2800018310547, "completions/min_length": 446.2, "completions/min_terminated_length": 292.6, "entropy": 0.1757997965440154, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.65, "grad_norm": 0.1865234375, "kl": 0.064147645724006, "learning_rate": 9.366666666666668e-06, "loss": -0.00039904499426484107, "num_tokens": 103617.0, "reward": 0.85, "reward_std": 0.3472102493047714, "rewards/JointRewardFunction/mean": 0.85, "rewards/JointRewardFunction/std": 0.3472102552652359, "step": 20, "step_time": 23.712279441399733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6625, "completions/max_length": 512.0, "completions/max_terminated_length": 376.5, "completions/mean_length": 476.5875, "completions/mean_terminated_length": 333.94667358398436, "completions/min_length": 374.9, "completions/min_terminated_length": 272.5, "entropy": 0.21253416435793043, "epoch": 0.2, "frac_reward_zero_std": 0.3, "grad_norm": 2.28125, "kl": 0.07378085452364758, "learning_rate": 9.033333333333334e-06, "loss": -0.012095230817794799, "num_tokens": 153160.0, "reward": 0.95, "reward_std": 0.43348987102508546, "rewards/JointRewardFunction/mean": 0.95, "rewards/JointRewardFunction/std": 0.43348987102508546, "step": 30, "step_time": 23.327648415401928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.85, "completions/max_length": 512.0, "completions/max_terminated_length": 284.1, "completions/mean_length": 503.7625, "completions/mean_terminated_length": 278.8300018310547, "completions/min_length": 477.8, "completions/min_terminated_length": 273.0, "entropy": 0.21520083369687198, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.35, "grad_norm": 0.255859375, "kl": 0.06080276914872229, "learning_rate": 8.700000000000001e-06, "loss": -0.0005816968623548746, "num_tokens": 206773.0, "reward": 0.7875, "reward_std": 0.4616557478904724, "rewards/JointRewardFunction/mean": 0.7875, "rewards/JointRewardFunction/std": 0.46165576577186584, "step": 40, "step_time": 23.59478929130273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7625, "completions/max_length": 512.0, "completions/max_terminated_length": 325.7, "completions/mean_length": 490.0375, "completions/mean_terminated_length": 299.44667053222656, "completions/min_length": 433.4, "completions/min_terminated_length": 279.8, "entropy": 0.25695387944579123, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.45, "grad_norm": 1.6953125, "kl": 0.08556670525576919, "learning_rate": 8.366666666666667e-06, "loss": -0.004827765375375747, "num_tokens": 258692.0, "reward": 0.9375, "reward_std": 0.3981345325708389, "rewards/JointRewardFunction/mean": 0.9375, "rewards/JointRewardFunction/std": 0.3981345325708389, "step": 50, "step_time": 23.633761547702306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 332.6, "completions/mean_length": 492.95, "completions/mean_terminated_length": 307.1166687011719, "completions/min_length": 437.8, "completions/min_terminated_length": 284.2, "entropy": 0.2603289651684463, "epoch": 0.4, "frac_reward_zero_std": 0.55, "grad_norm": 1.671875, "kl": 0.060612542228773235, "learning_rate": 8.033333333333335e-06, "loss": 0.0008684337139129638, "num_tokens": 311528.0, "reward": 0.925, "reward_std": 0.2958350956439972, "rewards/JointRewardFunction/mean": 0.925, "rewards/JointRewardFunction/std": 0.29583510756492615, "step": 60, "step_time": 23.58587597120204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.725, "completions/max_length": 512.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 492.8125, "completions/mean_terminated_length": 313.85, "completions/min_length": 433.5, "completions/min_terminated_length": 279.9, "entropy": 0.27547385785728695, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.6, "grad_norm": 0.99609375, "kl": 0.0518098235828802, "learning_rate": 7.7e-06, "loss": -0.0017207100987434386, "num_tokens": 364749.0, "reward": 0.9125, "reward_std": 0.3383516758680344, "rewards/JointRewardFunction/mean": 0.9125, "rewards/JointRewardFunction/std": 0.3383516877889633, "step": 70, "step_time": 23.567738218297016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7375, "completions/max_length": 512.0, "completions/max_terminated_length": 449.5, "completions/mean_length": 487.675, "completions/mean_terminated_length": 422.1333343505859, "completions/min_length": 393.1, "completions/min_terminated_length": 393.1, "entropy": 0.2849867718294263, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.4, "grad_norm": 0.042724609375, "kl": 0.054288532945793125, "learning_rate": 7.3666666666666676e-06, "loss": 0.01214314103126526, "num_tokens": 417435.0, "reward": 0.9625, "reward_std": 0.44988160133361815, "rewards/JointRewardFunction/mean": 0.9625, "rewards/JointRewardFunction/std": 0.4498816192150116, "step": 80, "step_time": 23.797617995494512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7125, "completions/max_length": 512.0, "completions/max_terminated_length": 330.2, "completions/mean_length": 487.5125, "completions/mean_terminated_length": 302.90834045410156, "completions/min_length": 421.4, "completions/min_terminated_length": 267.8, "entropy": 0.2951860463246703, "epoch": 0.6, "frac_reward_zero_std": 0.45, "grad_norm": 1.265625, "kl": 0.04278905827086419, "learning_rate": 7.033333333333334e-06, "loss": -0.0006712859496474266, "num_tokens": 470584.0, "reward": 0.9375, "reward_std": 0.31479085683822633, "rewards/JointRewardFunction/mean": 0.9375, "rewards/JointRewardFunction/std": 0.3147908627986908, "step": 90, "step_time": 23.514957720592793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 459.9, "completions/mean_length": 472.3125, "completions/mean_terminated_length": 418.8316680908203, "completions/min_length": 365.9, "completions/min_terminated_length": 365.9, "entropy": 0.37076061628758905, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.4, "grad_norm": 2.15625, "kl": 0.05282424986362457, "learning_rate": 6.700000000000001e-06, "loss": -0.009071560204029083, "num_tokens": 523253.0, "reward": 0.9815673828125, "reward_std": 0.4160596996545792, "rewards/JointRewardFunction/mean": 0.9815673828125, "rewards/JointRewardFunction/std": 0.41605971157550814, "step": 100, "step_time": 23.664498128004197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6125, "completions/max_length": 509.7, "completions/max_terminated_length": 484.8, "completions/mean_length": 475.7, "completions/mean_terminated_length": 437.1983367919922, "completions/min_length": 383.8, "completions/min_terminated_length": 383.8, "entropy": 0.4262677112594247, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.4, "grad_norm": 2.0625, "kl": 0.03714689936023206, "learning_rate": 6.366666666666668e-06, "loss": 0.008247312903404237, "num_tokens": 574257.0, "reward": 1.037060546875, "reward_std": 0.3255626171827316, "rewards/JointRewardFunction/mean": 1.037060546875, "rewards/JointRewardFunction/std": 0.3255626350641251, "step": 110, "step_time": 23.18678354880103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.575, "completions/max_length": 512.0, "completions/max_terminated_length": 388.5, "completions/mean_length": 472.7625, "completions/mean_terminated_length": 342.3550079345703, "completions/min_length": 396.3, "completions/min_terminated_length": 293.9, "entropy": 0.4861995566636324, "epoch": 0.8, "frac_reward_zero_std": 0.35, "grad_norm": 1.40625, "kl": 0.039038634288590404, "learning_rate": 6.033333333333335e-06, "loss": 0.012050890922546386, "num_tokens": 624342.0, "reward": 0.925, "reward_std": 0.4410230278968811, "rewards/JointRewardFunction/mean": 0.925, "rewards/JointRewardFunction/std": 0.441023051738739, "step": 120, "step_time": 23.72601753709896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5375, "completions/max_length": 511.6, "completions/max_terminated_length": 476.8, "completions/mean_length": 470.0375, "completions/mean_terminated_length": 427.0658416748047, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.41965660247951747, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.55, "grad_norm": 1.4453125, "kl": 0.03558391091646627, "learning_rate": 5.7e-06, "loss": 0.0031338028609752657, "num_tokens": 676197.0, "reward": 1.029736328125, "reward_std": 0.3752464294433594, "rewards/JointRewardFunction/mean": 1.029736328125, "rewards/JointRewardFunction/std": 0.37524643540382385, "step": 130, "step_time": 23.592583017596915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 470.4, "completions/mean_length": 468.3375, "completions/mean_terminated_length": 422.97833862304685, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.4238532094284892, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.4, "grad_norm": 0.048095703125, "kl": 0.047195866727270185, "learning_rate": 5.366666666666666e-06, "loss": 0.011741240322589875, "num_tokens": 726296.0, "reward": 1.0271484375, "reward_std": 0.39577038288116456, "rewards/JointRewardFunction/mean": 1.0271484375, "rewards/JointRewardFunction/std": 0.39577039480209353, "step": 140, "step_time": 23.827525900093313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.475, "completions/max_length": 507.6, "completions/max_terminated_length": 417.3, "completions/mean_length": 452.0375, "completions/mean_terminated_length": 360.2283416748047, "completions/min_length": 351.9, "completions/min_terminated_length": 300.7, "entropy": 0.405802302993834, "epoch": 1.0, "frac_reward_zero_std": 0.5, "grad_norm": 1.3125, "kl": 0.05754059529863298, "learning_rate": 5.033333333333333e-06, "loss": 0.020150861144065856, "num_tokens": 774771.0, "reward": 1.0125, "reward_std": 0.37200968265533446, "rewards/JointRewardFunction/mean": 1.0125, "rewards/JointRewardFunction/std": 0.3720097005367279, "step": 150, "step_time": 23.477148605001275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.35, "completions/max_length": 504.7, "completions/max_terminated_length": 460.6, "completions/mean_length": 418.725, "completions/mean_terminated_length": 376.1097686767578, "completions/min_length": 290.2, "completions/min_terminated_length": 290.2, "entropy": 0.4137630261480808, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.55, "grad_norm": 1.5234375, "kl": 0.04866745978360996, "learning_rate": 4.7e-06, "loss": -0.012348555028438568, "num_tokens": 820185.0, "reward": 1.1374755859375, "reward_std": 0.2693489044904709, "rewards/JointRewardFunction/mean": 1.1374755859375, "rewards/JointRewardFunction/std": 0.2693489044904709, "step": 160, "step_time": 23.104779590805993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 512.0, "completions/max_terminated_length": 459.3, "completions/mean_length": 452.575, "completions/mean_terminated_length": 405.1588165283203, "completions/min_length": 342.3, "completions/min_terminated_length": 342.3, "entropy": 0.3708974776789546, "epoch": 1.1333333333333333, "frac_reward_zero_std": 0.8, "grad_norm": 0.032470703125, "kl": 0.05273645754205063, "learning_rate": 4.366666666666667e-06, "loss": 0.0020671430975198746, "num_tokens": 868979.0, "reward": 1.2064208984375, "reward_std": 0.1077505886554718, "rewards/JointRewardFunction/mean": 1.2064208984375, "rewards/JointRewardFunction/std": 0.10775059163570404, "step": 170, "step_time": 23.706487477812335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3875, "completions/max_length": 512.0, "completions/max_terminated_length": 481.1, "completions/mean_length": 445.025, "completions/mean_terminated_length": 411.8959655761719, "completions/min_length": 341.8, "completions/min_terminated_length": 341.8, "entropy": 0.3909618055447936, "epoch": 1.2, "frac_reward_zero_std": 0.55, "grad_norm": 1.7109375, "kl": 0.045815252687316385, "learning_rate": 4.033333333333333e-06, "loss": 0.004306042194366455, "num_tokens": 917385.0, "reward": 1.075, "reward_std": 0.3337466180324554, "rewards/JointRewardFunction/mean": 1.075, "rewards/JointRewardFunction/std": 0.3337466180324554, "step": 180, "step_time": 23.554991897323635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6125, "completions/max_length": 512.0, "completions/max_terminated_length": 471.2, "completions/mean_length": 468.075, "completions/mean_terminated_length": 422.6735778808594, "completions/min_length": 385.9, "completions/min_terminated_length": 385.9, "entropy": 0.36821637134999036, "epoch": 1.2666666666666666, "frac_reward_zero_std": 0.55, "grad_norm": 1.8671875, "kl": 0.04221574537805282, "learning_rate": 3.7e-06, "loss": 0.005308620631694794, "num_tokens": 968751.0, "reward": 0.95, "reward_std": 0.3438155859708786, "rewards/JointRewardFunction/mean": 0.95, "rewards/JointRewardFunction/std": 0.34381560385227206, "step": 190, "step_time": 23.511522994117694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 497.9, "completions/max_terminated_length": 470.5, "completions/mean_length": 431.05, "completions/mean_terminated_length": 400.3317901611328, "completions/min_length": 318.5, "completions/min_terminated_length": 318.5, "entropy": 0.41096227150410414, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.65, "grad_norm": 0.06640625, "kl": 0.04462224093731493, "learning_rate": 3.366666666666667e-06, "loss": 0.0076727248728275296, "num_tokens": 1016343.0, "reward": 1.125, "reward_std": 0.21674493551254273, "rewards/JointRewardFunction/mean": 1.125, "rewards/JointRewardFunction/std": 0.21674493551254273, "step": 200, "step_time": 23.060737183006133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3875, "completions/max_length": 512.0, "completions/max_terminated_length": 482.7, "completions/mean_length": 458.65, "completions/mean_terminated_length": 428.9585815429688, "completions/min_length": 363.5, "completions/min_terminated_length": 363.5, "entropy": 0.3947587950155139, "epoch": 1.4, "frac_reward_zero_std": 0.65, "grad_norm": 9.3125, "kl": 0.10990666588768364, "learning_rate": 3.0333333333333337e-06, "loss": 0.010126692801713943, "num_tokens": 1066311.0, "reward": 1.111083984375, "reward_std": 0.28541127145290374, "rewards/JointRewardFunction/mean": 1.111083984375, "rewards/JointRewardFunction/std": 0.2854112803936005, "step": 210, "step_time": 23.734341114398557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 511.2, "completions/max_terminated_length": 499.2, "completions/mean_length": 448.1, "completions/mean_terminated_length": 415.37428894042966, "completions/min_length": 327.8, "completions/min_terminated_length": 327.8, "entropy": 0.4135129824280739, "epoch": 1.4666666666666668, "frac_reward_zero_std": 0.4, "grad_norm": 2.0, "kl": 0.04890955399023369, "learning_rate": 2.7000000000000004e-06, "loss": 0.005412362515926361, "num_tokens": 1115131.0, "reward": 1.0, "reward_std": 0.407365021109581, "rewards/JointRewardFunction/mean": 1.0, "rewards/JointRewardFunction/std": 0.4073650389909744, "step": 220, "step_time": 23.674485975824062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3, "completions/max_length": 493.0, "completions/max_terminated_length": 468.3, "completions/mean_length": 425.175, "completions/mean_terminated_length": 400.7392883300781, "completions/min_length": 313.4, "completions/min_terminated_length": 313.4, "entropy": 0.40025499686598776, "epoch": 1.5333333333333332, "frac_reward_zero_std": 0.6, "grad_norm": 0.045166015625, "kl": 0.044459241011645646, "learning_rate": 2.3666666666666667e-06, "loss": 0.0074857622385025024, "num_tokens": 1162029.0, "reward": 1.0625, "reward_std": 0.332049286365509, "rewards/JointRewardFunction/mean": 1.0625, "rewards/JointRewardFunction/std": 0.3320492923259735, "step": 230, "step_time": 22.975970137718832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5125, "completions/max_length": 507.5, "completions/max_terminated_length": 417.0, "completions/mean_length": 443.7875, "completions/mean_terminated_length": 338.87250366210935, "completions/min_length": 315.1, "completions/min_terminated_length": 263.9, "entropy": 0.3852341592311859, "epoch": 1.6, "frac_reward_zero_std": 0.5, "grad_norm": 2.109375, "kl": 0.039597276959102604, "learning_rate": 2.0333333333333335e-06, "loss": 0.015155516564846039, "num_tokens": 1211952.0, "reward": 1.03134765625, "reward_std": 0.30796128809452056, "rewards/JointRewardFunction/mean": 1.03134765625, "rewards/JointRewardFunction/std": 0.30796128809452056, "step": 240, "step_time": 23.477432061507717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4125, "completions/max_length": 499.5, "completions/max_terminated_length": 454.8, "completions/mean_length": 440.175, "completions/mean_terminated_length": 402.6975067138672, "completions/min_length": 342.6, "completions/min_terminated_length": 342.6, "entropy": 0.4049293929710984, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.55, "grad_norm": 0.06689453125, "kl": 0.057483326562214644, "learning_rate": 1.7000000000000002e-06, "loss": 0.005497528612613678, "num_tokens": 1260194.0, "reward": 1.100634765625, "reward_std": 0.30466278195381163, "rewards/JointRewardFunction/mean": 1.100634765625, "rewards/JointRewardFunction/std": 0.30466278940439223, "step": 250, "step_time": 23.31596095281129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 455.5875, "completions/mean_terminated_length": 423.4404815673828, "completions/min_length": 368.3, "completions/min_terminated_length": 368.3, "entropy": 0.37657185792922976, "epoch": 1.7333333333333334, "frac_reward_zero_std": 0.55, "grad_norm": 1.78125, "kl": 0.044227164250332865, "learning_rate": 1.3666666666666668e-06, "loss": 0.01007101833820343, "num_tokens": 1312481.0, "reward": 1.0563232421875, "reward_std": 0.3230230301618576, "rewards/JointRewardFunction/mean": 1.0563232421875, "rewards/JointRewardFunction/std": 0.3230230316519737, "step": 260, "step_time": 23.83616074830352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 511.2, "completions/max_terminated_length": 478.9, "completions/mean_length": 419.7, "completions/mean_terminated_length": 390.71238708496094, "completions/min_length": 292.4, "completions/min_terminated_length": 292.4, "entropy": 0.3890227179042995, "epoch": 1.8, "frac_reward_zero_std": 0.85, "grad_norm": 0.0517578125, "kl": 0.04973521351348609, "learning_rate": 1.0333333333333333e-06, "loss": 0.00801372081041336, "num_tokens": 1358297.0, "reward": 1.1375, "reward_std": 0.12793734967708587, "rewards/JointRewardFunction/mean": 1.1375, "rewards/JointRewardFunction/std": 0.12793734967708587, "step": 270, "step_time": 23.87891690217657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45, "completions/max_length": 512.0, "completions/max_terminated_length": 464.2, "completions/mean_length": 440.9125, "completions/mean_terminated_length": 394.0228607177734, "completions/min_length": 329.2, "completions/min_terminated_length": 329.2, "entropy": 0.3685274325311184, "epoch": 1.8666666666666667, "frac_reward_zero_std": 0.6, "grad_norm": 0.134765625, "kl": 0.05201944473665208, "learning_rate": 7.000000000000001e-07, "loss": 0.013003082573413849, "num_tokens": 1406762.0, "reward": 1.123974609375, "reward_std": 0.25532945692539216, "rewards/JointRewardFunction/mean": 1.123974609375, "rewards/JointRewardFunction/std": 0.25532945692539216, "step": 280, "step_time": 23.774091701293948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4125, "completions/max_length": 508.9, "completions/max_terminated_length": 447.5, "completions/mean_length": 443.3125, "completions/mean_terminated_length": 385.1429840087891, "completions/min_length": 311.9, "completions/min_terminated_length": 311.9, "entropy": 0.3619745412841439, "epoch": 1.9333333333333333, "frac_reward_zero_std": 0.55, "grad_norm": 1.296875, "kl": 0.04537691879086196, "learning_rate": 3.666666666666667e-07, "loss": 0.015167883038520813, "num_tokens": 1456063.0, "reward": 1.1, "reward_std": 0.2966939479112625, "rewards/JointRewardFunction/mean": 1.1, "rewards/JointRewardFunction/std": 0.296693953871727, "step": 290, "step_time": 23.711561490091846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3625, "completions/max_length": 511.4, "completions/max_terminated_length": 471.0, "completions/mean_length": 445.95, "completions/mean_terminated_length": 418.78345947265626, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "entropy": 0.3853150447830558, "epoch": 2.0, "frac_reward_zero_std": 0.6, "grad_norm": 1.8515625, "kl": 0.04785980319138616, "learning_rate": 3.333333333333334e-08, "loss": -0.0016140155494213104, "num_tokens": 1504095.0, "reward": 1.1375, "reward_std": 0.2394672751426697, "rewards/JointRewardFunction/mean": 1.1375, "rewards/JointRewardFunction/std": 0.23946728110313414, "step": 300, "step_time": 23.609792864409975 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 1504095, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }