{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4125, "completions/max_length": 499.4, "completions/max_terminated_length": 416.1, "completions/mean_length": 440.2, "completions/mean_terminated_length": 360.0207153320313, "completions/min_length": 351.3, "completions/min_terminated_length": 300.1, "entropy": 0.35336310751736166, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.15, "grad_norm": 2.46875, "kl": 0.04995681893542496, "learning_rate": 9.7e-06, "loss": 0.007881630957126618, "num_tokens": 48280.0, "reward": 0.6638354301452637, "reward_std": 0.4613180309534073, "rewards/JointRewardFunction/mean": 0.6638354301452637, "rewards/JointRewardFunction/std": 0.46131803542375566, "step": 10, "step_time": 21.880370603101618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4125, "completions/max_length": 505.1, "completions/max_terminated_length": 485.6, "completions/mean_length": 454.75, "completions/mean_terminated_length": 429.1646453857422, "completions/min_length": 359.6, "completions/min_terminated_length": 359.6, "entropy": 0.40911334455013276, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.1, "grad_norm": 1.859375, "kl": 0.06663629063405097, "learning_rate": 9.366666666666668e-06, "loss": -0.007935921847820281, "num_tokens": 97272.0, "reward": 0.742634254693985, "reward_std": 0.5053252905607224, "rewards/JointRewardFunction/mean": 0.742634254693985, "rewards/JointRewardFunction/std": 0.505325311422348, "step": 20, "step_time": 22.897377014198717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3625, "completions/max_length": 496.7, "completions/max_terminated_length": 476.1, "completions/mean_length": 449.0625, "completions/mean_terminated_length": 423.77500610351564, "completions/min_length": 347.1, "completions/min_terminated_length": 347.1, "entropy": 0.5090887371450663, "epoch": 0.2, "frac_reward_zero_std": 0.25, "grad_norm": 1.40625, "kl": 0.05061149680987, "learning_rate": 9.033333333333334e-06, "loss": -0.0031354159116744997, "num_tokens": 145589.0, "reward": 0.9517577826976776, "reward_std": 0.356577847735025, "rewards/JointRewardFunction/mean": 0.9517577826976776, "rewards/JointRewardFunction/std": 0.35657785963267086, "step": 30, "step_time": 22.42113570249967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 508.9, "completions/max_terminated_length": 420.2, "completions/mean_length": 447.0, "completions/mean_terminated_length": 361.18226623535156, "completions/min_length": 350.8, "completions/min_terminated_length": 299.6, "entropy": 0.5721639156341553, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.1, "grad_norm": 2.765625, "kl": 0.05449348199181259, "learning_rate": 8.700000000000001e-06, "loss": -0.0012190598994493485, "num_tokens": 195445.0, "reward": 0.9749389350414276, "reward_std": 0.41350028812885287, "rewards/JointRewardFunction/mean": 0.9749389350414276, "rewards/JointRewardFunction/std": 0.4135002911090851, "step": 40, "step_time": 22.95661881720298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1625, "completions/max_length": 467.3, "completions/max_terminated_length": 454.5, "completions/mean_length": 390.775, "completions/mean_terminated_length": 377.6511932373047, "completions/min_length": 296.3, "completions/min_terminated_length": 296.3, "entropy": 0.6591073881834746, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.45, "grad_norm": 3.03125, "kl": 0.08812928411643953, "learning_rate": 8.366666666666667e-06, "loss": -0.008682972937822341, "num_tokens": 240011.0, "reward": 1.1314965546131135, "reward_std": 0.2849295660853386, "rewards/JointRewardFunction/mean": 1.1314965546131135, "rewards/JointRewardFunction/std": 0.2849295552819967, "step": 50, "step_time": 21.223023884699796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.175, "completions/max_length": 473.6, "completions/max_terminated_length": 447.8, "completions/mean_length": 379.525, "completions/mean_terminated_length": 354.0325042724609, "completions/min_length": 264.9, "completions/min_terminated_length": 264.9, "entropy": 0.6001430394127965, "epoch": 0.4, "frac_reward_zero_std": 0.35, "grad_norm": 2.71875, "kl": 0.07532973024062813, "learning_rate": 8.033333333333335e-06, "loss": 0.010028349608182907, "num_tokens": 283417.0, "reward": 1.055295366048813, "reward_std": 0.3583150297403336, "rewards/JointRewardFunction/mean": 1.055295366048813, "rewards/JointRewardFunction/std": 0.358315047621727, "step": 60, "step_time": 21.51059049019932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 491.9, "completions/max_terminated_length": 471.2, "completions/mean_length": 376.875, "completions/mean_terminated_length": 364.50536193847654, "completions/min_length": 270.7, "completions/min_terminated_length": 270.7, "entropy": 0.5387065753340721, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.6, "grad_norm": 1.9375, "kl": 0.07996222919318825, "learning_rate": 7.7e-06, "loss": 0.01158405989408493, "num_tokens": 326415.0, "reward": 1.21389399766922, "reward_std": 0.20652158036828042, "rewards/JointRewardFunction/mean": 1.21389399766922, "rewards/JointRewardFunction/std": 0.20652157738804816, "step": 70, "step_time": 22.325782465600422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 452.9, "completions/max_terminated_length": 424.3, "completions/mean_length": 341.4625, "completions/mean_terminated_length": 331.04286193847656, "completions/min_length": 254.4, "completions/min_terminated_length": 254.4, "entropy": 0.6442078746855259, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.6, "grad_norm": 2.8125, "kl": 0.12920588869601488, "learning_rate": 7.3666666666666676e-06, "loss": 0.026441246271133423, "num_tokens": 366144.0, "reward": 1.1669726014137267, "reward_std": 0.2815501570701599, "rewards/JointRewardFunction/mean": 1.1669726014137267, "rewards/JointRewardFunction/std": 0.28155014626681807, "step": 80, "step_time": 20.707106610499977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 458.6, "completions/max_terminated_length": 436.6, "completions/mean_length": 344.7875, "completions/mean_terminated_length": 332.6904815673828, "completions/min_length": 257.1, "completions/min_terminated_length": 257.1, "entropy": 0.6171283535659313, "epoch": 0.6, "frac_reward_zero_std": 0.45, "grad_norm": 0.1728515625, "kl": 0.11114234835840761, "learning_rate": 7.033333333333334e-06, "loss": -0.0013597654178738594, "num_tokens": 406607.0, "reward": 1.0987597286701203, "reward_std": 0.3017842784523964, "rewards/JointRewardFunction/mean": 1.0987597286701203, "rewards/JointRewardFunction/std": 0.3017842710018158, "step": 90, "step_time": 21.128214740598196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 445.1, "completions/max_terminated_length": 418.1, "completions/mean_length": 350.3625, "completions/mean_terminated_length": 341.49822387695315, "completions/min_length": 258.7, "completions/min_terminated_length": 258.7, "entropy": 0.7072938833385706, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 2.6875, "kl": 0.10777388750575483, "learning_rate": 6.700000000000001e-06, "loss": 0.009119665622711182, "num_tokens": 446820.0, "reward": 1.167529249191284, "reward_std": 0.27103030947037043, "rewards/JointRewardFunction/mean": 1.167529249191284, "rewards/JointRewardFunction/std": 0.2710303008556366, "step": 100, "step_time": 20.569578059400374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 497.2, "completions/max_terminated_length": 483.4, "completions/mean_length": 386.8, "completions/mean_terminated_length": 376.6464324951172, "completions/min_length": 274.3, "completions/min_terminated_length": 274.3, "entropy": 0.5864197112619877, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.55, "grad_norm": 1.953125, "kl": 0.10330515620298683, "learning_rate": 6.366666666666668e-06, "loss": 0.004881048575043678, "num_tokens": 491392.0, "reward": 1.1497362732887269, "reward_std": 0.2508866846153978, "rewards/JointRewardFunction/mean": 1.1497362732887269, "rewards/JointRewardFunction/std": 0.25088667746749704, "step": 110, "step_time": 22.38237403490093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 489.8, "completions/max_terminated_length": 455.9, "completions/mean_length": 402.7, "completions/mean_terminated_length": 384.0738189697266, "completions/min_length": 297.5, "completions/min_terminated_length": 297.5, "entropy": 0.5570502711459995, "epoch": 0.8, "frac_reward_zero_std": 0.45, "grad_norm": 2.265625, "kl": 0.08270290573127567, "learning_rate": 6.033333333333335e-06, "loss": 0.013686606287956237, "num_tokens": 537628.0, "reward": 1.0768773972988128, "reward_std": 0.32933869063854215, "rewards/JointRewardFunction/mean": 1.0768773972988128, "rewards/JointRewardFunction/std": 0.32933869063854215, "step": 120, "step_time": 22.374344250299693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1625, "completions/max_length": 474.2, "completions/max_terminated_length": 460.2, "completions/mean_length": 382.75, "completions/mean_terminated_length": 366.1206024169922, "completions/min_length": 286.6, "completions/min_terminated_length": 286.6, "entropy": 0.5301733467727899, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.6, "grad_norm": 2.484375, "kl": 0.09477438307367266, "learning_rate": 5.7e-06, "loss": -0.0015326094813644885, "num_tokens": 583628.0, "reward": 1.1464037537574767, "reward_std": 0.26481162309646605, "rewards/JointRewardFunction/mean": 1.1464037537574767, "rewards/JointRewardFunction/std": 0.264811622351408, "step": 130, "step_time": 21.819646276899583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.175, "completions/max_length": 492.1, "completions/max_terminated_length": 456.2, "completions/mean_length": 383.6375, "completions/mean_terminated_length": 364.0131011962891, "completions/min_length": 279.7, "completions/min_terminated_length": 279.7, "entropy": 0.45302344355732205, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.25, "grad_norm": 3.4375, "kl": 0.07856867800001055, "learning_rate": 5.366666666666666e-06, "loss": 0.027430105209350585, "num_tokens": 628107.0, "reward": 1.1271068811416627, "reward_std": 0.3177687225921545, "rewards/JointRewardFunction/mean": 1.1271068811416627, "rewards/JointRewardFunction/std": 0.31776872408227064, "step": 140, "step_time": 22.27468511669831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.175, "completions/max_length": 488.7, "completions/max_terminated_length": 460.8, "completions/mean_length": 384.025, "completions/mean_terminated_length": 360.3232177734375, "completions/min_length": 279.9, "completions/min_terminated_length": 279.9, "entropy": 0.4796965181827545, "epoch": 1.0, "frac_reward_zero_std": 0.4, "grad_norm": 0.08740234375, "kl": 0.09200209667906165, "learning_rate": 5.033333333333333e-06, "loss": -0.009858855605125427, "num_tokens": 671557.0, "reward": 1.1115014195442199, "reward_std": 0.3146645646542311, "rewards/JointRewardFunction/mean": 1.1115014195442199, "rewards/JointRewardFunction/std": 0.3146645750850439, "step": 150, "step_time": 22.151629410100213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.225, "completions/max_length": 501.1, "completions/max_terminated_length": 447.8, "completions/mean_length": 400.075, "completions/mean_terminated_length": 361.0351318359375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.4785544477403164, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 2.0, "kl": 0.0879205574747175, "learning_rate": 4.7e-06, "loss": 0.017533975839614867, "num_tokens": 716935.0, "reward": 1.1597753405570983, "reward_std": 0.29188001044094564, "rewards/JointRewardFunction/mean": 1.1597753405570983, "rewards/JointRewardFunction/std": 0.29188000336289405, "step": 160, "step_time": 22.98355916679975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 467.4, "completions/max_terminated_length": 460.7, "completions/mean_length": 363.275, "completions/mean_terminated_length": 354.8253631591797, "completions/min_length": 251.6, "completions/min_terminated_length": 251.6, "entropy": 0.5262451708316803, "epoch": 1.1333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 0.060546875, "kl": 0.09955117981880904, "learning_rate": 4.366666666666667e-06, "loss": -0.007418201863765716, "num_tokens": 757865.0, "reward": 1.1451659560203553, "reward_std": 0.3273365020751953, "rewards/JointRewardFunction/mean": 1.1451659560203553, "rewards/JointRewardFunction/std": 0.3273364961147308, "step": 170, "step_time": 21.108805562603084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 452.5, "completions/max_terminated_length": 448.2, "completions/mean_length": 337.25, "completions/mean_terminated_length": 335.1607147216797, "completions/min_length": 241.7, "completions/min_terminated_length": 241.7, "entropy": 0.5567022401839494, "epoch": 1.2, "frac_reward_zero_std": 0.7, "grad_norm": 0.068359375, "kl": 0.09737768466584384, "learning_rate": 4.033333333333333e-06, "loss": 0.018832828104496, "num_tokens": 796505.0, "reward": 1.2489745497703553, "reward_std": 0.11990191522636451, "rewards/JointRewardFunction/mean": 1.2489745497703553, "rewards/JointRewardFunction/std": 0.11990190770593472, "step": 180, "step_time": 20.626747212697957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 473.7, "completions/max_terminated_length": 441.8, "completions/mean_length": 373.1875, "completions/mean_terminated_length": 352.4538177490234, "completions/min_length": 259.1, "completions/min_terminated_length": 259.1, "entropy": 0.5100005997344852, "epoch": 1.2666666666666666, "frac_reward_zero_std": 0.7, "grad_norm": 0.039794921875, "kl": 0.09228390976786613, "learning_rate": 3.7e-06, "loss": -0.000696965865790844, "num_tokens": 839088.0, "reward": 1.23026362657547, "reward_std": 0.14840476661920549, "rewards/JointRewardFunction/mean": 1.23026362657547, "rewards/JointRewardFunction/std": 0.14840476512908934, "step": 190, "step_time": 21.574640466592246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 480.2, "completions/max_terminated_length": 459.8, "completions/mean_length": 389.75, "completions/mean_terminated_length": 372.77679443359375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.5184403497725725, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.55, "grad_norm": 2.1875, "kl": 0.08728140082675964, "learning_rate": 3.366666666666667e-06, "loss": 0.021845726668834685, "num_tokens": 883220.0, "reward": 1.181435489654541, "reward_std": 0.24119414222077468, "rewards/JointRewardFunction/mean": 1.181435489654541, "rewards/JointRewardFunction/std": 0.24119413328007794, "step": 200, "step_time": 21.928263508094822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.075, "completions/max_length": 460.3, "completions/max_terminated_length": 452.6, "completions/mean_length": 375.4625, "completions/mean_terminated_length": 366.30833740234374, "completions/min_length": 278.2, "completions/min_terminated_length": 278.2, "entropy": 0.5214369297027588, "epoch": 1.4, "frac_reward_zero_std": 0.55, "grad_norm": 1.4375, "kl": 0.0887975221965462, "learning_rate": 3.0333333333333337e-06, "loss": 0.01691504716873169, "num_tokens": 926517.0, "reward": 1.2304613709449768, "reward_std": 0.189005006296793, "rewards/JointRewardFunction/mean": 1.2304613709449768, "rewards/JointRewardFunction/std": 0.18900499549345112, "step": 210, "step_time": 21.30131801480311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 470.3, "completions/max_terminated_length": 448.9, "completions/mean_length": 365.95, "completions/mean_terminated_length": 351.73179016113284, "completions/min_length": 272.3, "completions/min_terminated_length": 272.3, "entropy": 0.5099331840872765, "epoch": 1.4666666666666668, "frac_reward_zero_std": 0.6, "grad_norm": 2.0, "kl": 0.09267634809948504, "learning_rate": 2.7000000000000004e-06, "loss": -0.004276449233293534, "num_tokens": 969621.0, "reward": 1.0979979991912843, "reward_std": 0.326889356970787, "rewards/JointRewardFunction/mean": 1.0979979991912843, "rewards/JointRewardFunction/std": 0.32688935101032257, "step": 220, "step_time": 21.54177276209375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 437.0, "completions/max_terminated_length": 435.8, "completions/mean_length": 325.4375, "completions/mean_terminated_length": 323.2982147216797, "completions/min_length": 243.3, "completions/min_terminated_length": 243.3, "entropy": 0.527520533464849, "epoch": 1.5333333333333332, "frac_reward_zero_std": 0.5, "grad_norm": 3.609375, "kl": 0.09654896147549152, "learning_rate": 2.3666666666666667e-06, "loss": 0.004230192676186561, "num_tokens": 1007468.0, "reward": 1.1674120664596557, "reward_std": 0.3256095230579376, "rewards/JointRewardFunction/mean": 1.1674120664596557, "rewards/JointRewardFunction/std": 0.32560951411724093, "step": 230, "step_time": 20.321823318899987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 471.3, "completions/max_terminated_length": 445.0, "completions/mean_length": 387.2, "completions/mean_terminated_length": 362.8750030517578, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.5140835266560316, "epoch": 1.6, "frac_reward_zero_std": 0.35, "grad_norm": 1.65625, "kl": 0.08105785646475852, "learning_rate": 2.0333333333333335e-06, "loss": 0.0024391064420342446, "num_tokens": 1053144.0, "reward": 1.1341503262519836, "reward_std": 0.3189578216522932, "rewards/JointRewardFunction/mean": 1.1341503262519836, "rewards/JointRewardFunction/std": 0.3189578127115965, "step": 240, "step_time": 21.733667162401254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 476.7, "completions/max_terminated_length": 449.8, "completions/mean_length": 376.975, "completions/mean_terminated_length": 357.80071716308595, "completions/min_length": 280.1, "completions/min_terminated_length": 280.1, "entropy": 0.5188658468425273, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.65, "grad_norm": 1.40625, "kl": 0.09223291147500276, "learning_rate": 1.7000000000000002e-06, "loss": 0.003794506937265396, "num_tokens": 1096622.0, "reward": 1.2100683093070983, "reward_std": 0.1948750299634412, "rewards/JointRewardFunction/mean": 1.2100683093070983, "rewards/JointRewardFunction/std": 0.19487502239644527, "step": 250, "step_time": 21.73429901890122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 478.8, "completions/max_terminated_length": 468.3, "completions/mean_length": 366.125, "completions/mean_terminated_length": 358.8738159179687, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.547977139428258, "epoch": 1.7333333333333334, "frac_reward_zero_std": 0.45, "grad_norm": 1.734375, "kl": 0.09155708220787347, "learning_rate": 1.3666666666666668e-06, "loss": -0.014781329035758971, "num_tokens": 1138920.0, "reward": 1.15995112657547, "reward_std": 0.32106488235294817, "rewards/JointRewardFunction/mean": 1.15995112657547, "rewards/JointRewardFunction/std": 0.32106486298143866, "step": 260, "step_time": 21.99600164630174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1375, "completions/max_length": 463.6, "completions/max_terminated_length": 438.9, "completions/mean_length": 375.1, "completions/mean_terminated_length": 358.83917541503905, "completions/min_length": 273.2, "completions/min_terminated_length": 273.2, "entropy": 0.5359004020690918, "epoch": 1.8, "frac_reward_zero_std": 0.6, "grad_norm": 1.9140625, "kl": 0.087837297283113, "learning_rate": 1.0333333333333333e-06, "loss": 0.012757700681686402, "num_tokens": 1182736.0, "reward": 1.213478970527649, "reward_std": 0.17389502958394587, "rewards/JointRewardFunction/mean": 1.213478970527649, "rewards/JointRewardFunction/std": 0.17389501919969916, "step": 270, "step_time": 21.312464608701703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0875, "completions/max_length": 480.7, "completions/max_terminated_length": 456.9, "completions/mean_length": 372.65, "completions/mean_terminated_length": 360.74524841308596, "completions/min_length": 277.9, "completions/min_terminated_length": 277.9, "entropy": 0.5030612323433161, "epoch": 1.8666666666666667, "frac_reward_zero_std": 0.6, "grad_norm": 1.4140625, "kl": 0.08702772008255125, "learning_rate": 7.000000000000001e-07, "loss": 0.019132000207901, "num_tokens": 1225656.0, "reward": 1.2122631311416625, "reward_std": 0.19066368174389936, "rewards/JointRewardFunction/mean": 1.2122631311416625, "rewards/JointRewardFunction/std": 0.19066367280320265, "step": 280, "step_time": 21.90045202969777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2125, "completions/max_length": 477.3, "completions/max_terminated_length": 440.9, "completions/mean_length": 382.5, "completions/mean_terminated_length": 354.6571441650391, "completions/min_length": 282.6, "completions/min_terminated_length": 282.6, "entropy": 0.5357337659224868, "epoch": 1.9333333333333333, "frac_reward_zero_std": 0.5, "grad_norm": 1.765625, "kl": 0.0854645582381636, "learning_rate": 3.666666666666667e-07, "loss": -0.011803697794675827, "num_tokens": 1272012.0, "reward": 1.1109179258346558, "reward_std": 0.31746507063508034, "rewards/JointRewardFunction/mean": 1.1109179258346558, "rewards/JointRewardFunction/std": 0.31746507063508034, "step": 290, "step_time": 21.865317538700765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.175, "completions/max_length": 476.9, "completions/max_terminated_length": 435.9, "completions/mean_length": 365.8, "completions/mean_terminated_length": 339.0707153320312, "completions/min_length": 260.3, "completions/min_terminated_length": 260.3, "entropy": 0.5455268980935216, "epoch": 2.0, "frac_reward_zero_std": 0.45, "grad_norm": 1.546875, "kl": 0.08734047506004572, "learning_rate": 3.333333333333334e-08, "loss": -0.005074360966682434, "num_tokens": 1314476.0, "reward": 1.0770751595497132, "reward_std": 0.38892474174499514, "rewards/JointRewardFunction/mean": 1.0770751595497132, "rewards/JointRewardFunction/std": 0.3889247328042984, "step": 300, "step_time": 21.719864126896574 } ], "logging_steps": 10, "max_steps": 300, "num_input_tokens_seen": 1314476, "num_train_epochs": 2, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }