{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00075, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10096.0, "completions/max_terminated_length": 10096.0, "completions/mean_length": 8672.71875, "completions/mean_terminated_length": 8672.71875, "completions/min_length": 3020.0, "completions/min_terminated_length": 3020.0, "entropy": 0.49113161116838455, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.241949200630188, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0633, "num_tokens": 306152.0, "reward": -0.4408680200576782, "reward_std": 0.3989785313606262, "rewards/rollout_eval_reward_func/mean": 0.11064532399177551, "rewards/rollout_eval_reward_func/std": 0.21571724116802216, "rewards/rollout_reward_func/mean": -0.4408680200576782, "rewards/rollout_reward_func/std": 0.44763946533203125, "sampling/importance_sampling_ratio/max": 1.2819759845733643, "sampling/importance_sampling_ratio/mean": 0.9992397427558899, "sampling/importance_sampling_ratio/min": 0.7715137004852295, "sampling/sampling_logp_difference/max": 0.2594008445739746, "sampling/sampling_logp_difference/mean": 0.01546277105808258, "step": 1, "step_time": 73.26994180099973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49113161116838455, "epoch": 2e-05, "grad_norm": 1.2400784492492676, "kl": 0.0, "learning_rate": 2.8571428571428573e-06, "loss": -0.0633, "step": 2, "step_time": 30.109230951999052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 10009.0, "completions/max_terminated_length": 10009.0, "completions/mean_length": 7330.1875, "completions/mean_terminated_length": 7330.1875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.5131296459585428, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.102152943611145, "kl": 0.0009028113518070313, "learning_rate": 5.7142857142857145e-06, "loss": -0.2347, "num_tokens": 569740.0, "reward": -0.48799318075180054, "reward_std": 0.5598607063293457, "rewards/rollout_eval_reward_func/mean": 0.22929370403289795, "rewards/rollout_eval_reward_func/std": 0.26715749502182007, "rewards/rollout_reward_func/mean": -0.48799318075180054, "rewards/rollout_reward_func/std": 0.5559459924697876, "sampling/importance_sampling_ratio/max": 1.2627520561218262, "sampling/importance_sampling_ratio/mean": 1.0006182193756104, "sampling/importance_sampling_ratio/min": 0.7627776861190796, "sampling/sampling_logp_difference/max": 0.27078866958618164, "sampling/sampling_logp_difference/mean": 0.014230873435735703, "step": 3, "step_time": 68.85090976999709 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "entropy": 0.5151741988956928, "epoch": 4e-05, "grad_norm": 1.0848904848098755, "kl": 0.0004950130587531021, "learning_rate": 8.571428571428573e-06, "loss": -0.2336, "step": 4, "step_time": 28.428488818004553 }, { "clip_ratio/high_max": 0.0010416667209938169, "clip_ratio/high_mean": 0.0005208333604969084, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010416667209938169, "completions/clipped_ratio": 0.0, "completions/max_length": 10323.0, "completions/max_terminated_length": 10323.0, "completions/mean_length": 8267.125, "completions/mean_terminated_length": 8267.125, "completions/min_length": 1640.0, "completions/min_terminated_length": 1640.0, "entropy": 0.5123504158109426, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.1476984024047852, "kl": 0.0007431179510604125, "learning_rate": 1.1428571428571429e-05, "loss": -0.0418, "num_tokens": 862728.0, "reward": -0.46075016260147095, "reward_std": 0.5065791606903076, "rewards/rollout_eval_reward_func/mean": 0.128683939576149, "rewards/rollout_eval_reward_func/std": 0.2396152913570404, "rewards/rollout_reward_func/mean": -0.46075016260147095, "rewards/rollout_reward_func/std": 0.5104123950004578, "sampling/importance_sampling_ratio/max": 1.3248213529586792, "sampling/importance_sampling_ratio/mean": 1.0001360177993774, "sampling/importance_sampling_ratio/min": 0.6914317011833191, "sampling/sampling_logp_difference/max": 0.3689908981323242, "sampling/sampling_logp_difference/mean": 0.016226449981331825, "step": 5, "step_time": 75.37122915000327 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0032900729565881193, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00459215632872656, "entropy": 0.5106779877096415, "epoch": 6e-05, "grad_norm": 1.0145094394683838, "kl": 0.0013804795053147245, "learning_rate": 1.4285714285714285e-05, "loss": -0.045, "step": 6, "step_time": 29.551835642994774 }, { "clip_ratio/high_max": 0.0024003623984754086, "clip_ratio/high_mean": 0.0012001811992377043, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017210145597346127, "completions/clipped_ratio": 0.0, "completions/max_length": 10088.0, "completions/max_terminated_length": 10088.0, "completions/mean_length": 8518.21875, "completions/mean_terminated_length": 8518.21875, "completions/min_length": 4084.0, "completions/min_terminated_length": 4084.0, "entropy": 0.5038529355078936, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.5022886991500854, "kl": 0.002840353590727318, "learning_rate": 1.7142857142857145e-05, "loss": -0.0036, "num_tokens": 1164601.0, "reward": -0.41255950927734375, "reward_std": 0.46968239545822144, "rewards/rollout_eval_reward_func/mean": 0.11216971278190613, "rewards/rollout_eval_reward_func/std": 0.2204883098602295, "rewards/rollout_reward_func/mean": -0.41255950927734375, "rewards/rollout_reward_func/std": 0.5122336149215698, "sampling/importance_sampling_ratio/max": 1.4158059358596802, "sampling/importance_sampling_ratio/mean": 1.0018370151519775, "sampling/importance_sampling_ratio/min": 0.7707551121711731, "sampling/sampling_logp_difference/max": 0.3476989269256592, "sampling/sampling_logp_difference/mean": 0.017664402723312378, "step": 7, "step_time": 77.99332059699736 }, { "clip_ratio/high_max": 0.005842391517944634, "clip_ratio/high_mean": 0.0034420291776768863, "clip_ratio/low_mean": 0.0051097974355798215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008551826613256708, "entropy": 0.5001224614679813, "epoch": 8e-05, "grad_norm": 1.3377231359481812, "kl": 0.006958273006603122, "learning_rate": 2e-05, "loss": -0.0079, "step": 8, "step_time": 30.119341139003154 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.00046641789958812296, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015080846205819398, "completions/clipped_ratio": 0.0, "completions/max_length": 9987.0, "completions/max_terminated_length": 9987.0, "completions/mean_length": 8235.9375, "completions/mean_terminated_length": 8235.9375, "completions/min_length": 2028.0, "completions/min_terminated_length": 2028.0, "entropy": 0.5665333420038223, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.413293719291687, "kl": 0.012357485480606556, "learning_rate": 2.2857142857142858e-05, "loss": -0.0089, "num_tokens": 1456974.0, "reward": -0.2786320447921753, "reward_std": 0.4699662923812866, "rewards/rollout_eval_reward_func/mean": 0.12322154641151428, "rewards/rollout_eval_reward_func/std": 0.23254993557929993, "rewards/rollout_reward_func/mean": -0.2786320447921753, "rewards/rollout_reward_func/std": 0.510530948638916, "sampling/importance_sampling_ratio/max": 1.6322839260101318, "sampling/importance_sampling_ratio/mean": 0.9981738328933716, "sampling/importance_sampling_ratio/min": 0.6440463662147522, "sampling/sampling_logp_difference/max": 0.48998022079467773, "sampling/sampling_logp_difference/mean": 0.02640429511666298, "step": 9, "step_time": 80.34681812299641 }, { "clip_ratio/high_max": 0.028179825632832944, "clip_ratio/high_mean": 0.01559113833354786, "clip_ratio/low_mean": 0.01464278216008097, "clip_ratio/low_min": 0.006223290809430182, "clip_ratio/region_mean": 0.03023392061004415, "entropy": 0.5607042815536261, "epoch": 0.0001, "grad_norm": 1.2342119216918945, "kl": 0.03045007959008217, "learning_rate": 2.5714285714285714e-05, "loss": -0.0159, "step": 10, "step_time": 28.650263912999435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 10076.0, "completions/max_terminated_length": 10076.0, "completions/mean_length": 8311.21875, "completions/mean_terminated_length": 8311.21875, "completions/min_length": 1530.0, "completions/min_terminated_length": 1530.0, "entropy": 0.4887528121471405, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 1.4407812356948853, "kl": 0.04280303395353258, "learning_rate": 2.857142857142857e-05, "loss": -0.0508, "num_tokens": 1751757.0, "reward": -0.26280224323272705, "reward_std": 0.4824950098991394, "rewards/rollout_eval_reward_func/mean": 0.1091209352016449, "rewards/rollout_eval_reward_func/std": 0.22141531109809875, "rewards/rollout_reward_func/mean": -0.26280224323272705, "rewards/rollout_reward_func/std": 0.4825066328048706, "sampling/importance_sampling_ratio/max": 2.2060391902923584, "sampling/importance_sampling_ratio/mean": 1.003042221069336, "sampling/importance_sampling_ratio/min": 0.505047619342804, "sampling/sampling_logp_difference/max": 0.79119873046875, "sampling/sampling_logp_difference/mean": 0.03998423367738724, "step": 11, "step_time": 81.20211481799561 }, { "clip_ratio/high_max": 0.031166458851657808, "clip_ratio/high_mean": 0.01714572956552729, "clip_ratio/low_mean": 0.018567851395346224, "clip_ratio/low_min": 0.005885701393708587, "clip_ratio/region_mean": 0.0357135811354965, "entropy": 0.47410433553159237, "epoch": 0.00012, "grad_norm": 1.048365831375122, "kl": 0.08051084214821458, "learning_rate": 3.142857142857143e-05, "loss": -0.0558, "step": 12, "step_time": 29.28374841400546 }, { "clip_ratio/high_max": 0.001953125, "clip_ratio/high_mean": 0.0009765625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 10209.0, "completions/max_terminated_length": 10209.0, "completions/mean_length": 8161.71875, "completions/mean_terminated_length": 8161.71875, "completions/min_length": 1827.0, "completions/min_terminated_length": 1827.0, "entropy": 0.43679925985634327, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 1.0560696125030518, "kl": 0.09263498219661415, "learning_rate": 3.428571428571429e-05, "loss": 0.1042, "num_tokens": 2042327.0, "reward": -0.02590048871934414, "reward_std": 0.6161512732505798, "rewards/rollout_eval_reward_func/mean": 0.16006097197532654, "rewards/rollout_eval_reward_func/std": 0.2864827811717987, "rewards/rollout_reward_func/mean": -0.02590048871934414, "rewards/rollout_reward_func/std": 0.6041470170021057, "sampling/importance_sampling_ratio/max": 2.7582640647888184, "sampling/importance_sampling_ratio/mean": 0.9981331825256348, "sampling/importance_sampling_ratio/min": 0.361401230096817, "sampling/sampling_logp_difference/max": 1.0177664756774902, "sampling/sampling_logp_difference/mean": 0.06089622899889946, "step": 13, "step_time": 85.01218143400365 }, { "clip_ratio/high_max": 0.012486383900977671, "clip_ratio/high_mean": 0.007805692031979561, "clip_ratio/low_mean": 0.030729168094694614, "clip_ratio/low_min": 0.015625000465661287, "clip_ratio/region_mean": 0.038534860184881836, "entropy": 0.41658624820411205, "epoch": 0.00014, "grad_norm": 1.044942855834961, "kl": 0.16313170175999403, "learning_rate": 3.7142857142857143e-05, "loss": 0.1002, "step": 14, "step_time": 28.990433916003894 }, { "clip_ratio/high_max": 0.00596590933855623, "clip_ratio/high_mean": 0.0035037880297750235, "clip_ratio/low_mean": 0.0005122950533404946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004016083083115518, "completions/clipped_ratio": 0.0, "completions/max_length": 10134.0, "completions/max_terminated_length": 10134.0, "completions/mean_length": 8323.34375, "completions/mean_terminated_length": 8323.34375, "completions/min_length": 1934.0, "completions/min_terminated_length": 1934.0, "entropy": 0.44160761684179306, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 1.2544862031936646, "kl": 0.21658248733729124, "learning_rate": 4e-05, "loss": 0.1199, "num_tokens": 2337711.0, "reward": -0.0776321142911911, "reward_std": 0.5812347531318665, "rewards/rollout_eval_reward_func/mean": 0.14151422679424286, "rewards/rollout_eval_reward_func/std": 0.2538794279098511, "rewards/rollout_reward_func/mean": -0.0776321142911911, "rewards/rollout_reward_func/std": 0.5845968723297119, "sampling/importance_sampling_ratio/max": 1.8725090026855469, "sampling/importance_sampling_ratio/mean": 0.9912927150726318, "sampling/importance_sampling_ratio/min": 0.1565917581319809, "sampling/sampling_logp_difference/max": 1.8541131019592285, "sampling/sampling_logp_difference/mean": 0.06762713938951492, "step": 15, "step_time": 87.63701662399762 }, { "clip_ratio/high_max": 0.033285985700786114, "clip_ratio/high_mean": 0.02006899402476847, "clip_ratio/low_mean": 0.017902423918712884, "clip_ratio/low_min": 0.008303140406496823, "clip_ratio/region_mean": 0.03797141805989668, "entropy": 0.43832515366375446, "epoch": 0.00016, "grad_norm": 1.1862040758132935, "kl": 0.2433762801811099, "learning_rate": 4.2857142857142856e-05, "loss": 0.1137, "step": 16, "step_time": 30.26940473900322 }, { "clip_ratio/high_max": 0.005208333604969084, "clip_ratio/high_mean": 0.002604166802484542, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250001629814506, "completions/clipped_ratio": 0.0, "completions/max_length": 10099.0, "completions/max_terminated_length": 10099.0, "completions/mean_length": 8931.625, "completions/mean_terminated_length": 8931.625, "completions/min_length": 2013.0, "completions/min_terminated_length": 2013.0, "entropy": 0.4058182891458273, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 1.206827998161316, "kl": 0.1886005294509232, "learning_rate": 4.5714285714285716e-05, "loss": -0.0944, "num_tokens": 2652765.0, "reward": -0.0752231553196907, "reward_std": 0.48041343688964844, "rewards/rollout_eval_reward_func/mean": 0.10861280560493469, "rewards/rollout_eval_reward_func/std": 0.2368263602256775, "rewards/rollout_reward_func/mean": -0.0752231553196907, "rewards/rollout_reward_func/std": 0.5091694593429565, "sampling/importance_sampling_ratio/max": 2.2689177989959717, "sampling/importance_sampling_ratio/mean": 1.0046234130859375, "sampling/importance_sampling_ratio/min": 0.1846628040075302, "sampling/sampling_logp_difference/max": 1.6892237663269043, "sampling/sampling_logp_difference/mean": 0.06120520830154419, "step": 17, "step_time": 96.5394253049999 }, { "clip_ratio/high_max": 0.0221070961561054, "clip_ratio/high_mean": 0.013136881520040333, "clip_ratio/low_mean": 0.005389189289417118, "clip_ratio/low_min": 0.002066256827674806, "clip_ratio/region_mean": 0.01852607080945745, "entropy": 0.40752917528152466, "epoch": 0.00018, "grad_norm": 1.039859652519226, "kl": 0.20007089478895068, "learning_rate": 4.8571428571428576e-05, "loss": -0.1064, "step": 18, "step_time": 29.607819763001316 }, { "clip_ratio/high_max": 0.00424107164144516, "clip_ratio/high_mean": 0.00212053582072258, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00212053582072258, "completions/clipped_ratio": 0.0, "completions/max_length": 9868.0, "completions/max_terminated_length": 9868.0, "completions/mean_length": 7739.625, "completions/mean_terminated_length": 7739.625, "completions/min_length": 1494.0, "completions/min_terminated_length": 1494.0, "entropy": 0.3824189379811287, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 1.1822205781936646, "kl": 0.1706448094919324, "learning_rate": 5.142857142857143e-05, "loss": -0.1187, "num_tokens": 2929452.0, "reward": 0.1796756088733673, "reward_std": 0.6716787815093994, "rewards/rollout_eval_reward_func/mean": 0.25978150963783264, "rewards/rollout_eval_reward_func/std": 0.31619328260421753, "rewards/rollout_reward_func/mean": 0.1796756088733673, "rewards/rollout_reward_func/std": 0.6625394821166992, "sampling/importance_sampling_ratio/max": 1.8655627965927124, "sampling/importance_sampling_ratio/mean": 1.0000479221343994, "sampling/importance_sampling_ratio/min": 0.33482789993286133, "sampling/sampling_logp_difference/max": 1.0941386222839355, "sampling/sampling_logp_difference/mean": 0.04819408059120178, "step": 19, "step_time": 92.65558583299753 }, { "clip_ratio/high_max": 0.030015080701559782, "clip_ratio/high_mean": 0.018132540630176663, "clip_ratio/low_mean": 0.03180725604761392, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.049939796910621226, "entropy": 0.3580914381891489, "epoch": 0.0002, "grad_norm": 1.152976155281067, "kl": 0.2634436935186386, "learning_rate": 5.428571428571428e-05, "loss": -0.1272, "step": 20, "step_time": 28.27301450500272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020833335001952946, "clip_ratio/low_min": 0.0010416667209938169, "clip_ratio/region_mean": 0.0020833335001952946, "completions/clipped_ratio": 0.0, "completions/max_length": 10426.0, "completions/max_terminated_length": 10426.0, "completions/mean_length": 7911.40625, "completions/mean_terminated_length": 7911.40625, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "entropy": 0.3455618601292372, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.9142677187919617, "kl": 0.2354841867927462, "learning_rate": 5.714285714285714e-05, "loss": -0.0904, "num_tokens": 3211621.0, "reward": 0.09562171995639801, "reward_std": 0.6017146706581116, "rewards/rollout_eval_reward_func/mean": 0.1835619956254959, "rewards/rollout_eval_reward_func/std": 0.2800058424472809, "rewards/rollout_reward_func/mean": 0.09562171995639801, "rewards/rollout_reward_func/std": 0.5979344248771667, "sampling/importance_sampling_ratio/max": 1.7227435111999512, "sampling/importance_sampling_ratio/mean": 0.9981924295425415, "sampling/importance_sampling_ratio/min": 0.38243889808654785, "sampling/sampling_logp_difference/max": 0.961186408996582, "sampling/sampling_logp_difference/mean": 0.04361895099282265, "step": 21, "step_time": 94.40408171299714 }, { "clip_ratio/high_max": 0.03222161578014493, "clip_ratio/high_mean": 0.0181941413320601, "clip_ratio/low_mean": 0.02708333428017795, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.04527747584506869, "entropy": 0.3229655371978879, "epoch": 0.00022, "grad_norm": 0.8647798895835876, "kl": 0.21354854525998235, "learning_rate": 6e-05, "loss": -0.1008, "step": 22, "step_time": 30.11174104199381 }, { "clip_ratio/high_max": 0.0011160714784637094, "clip_ratio/high_mean": 0.0005580357392318547, "clip_ratio/low_mean": 0.0010995370685122907, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016575728077441454, "completions/clipped_ratio": 0.0, "completions/max_length": 10470.0, "completions/max_terminated_length": 10470.0, "completions/mean_length": 7568.375, "completions/mean_terminated_length": 7568.375, "completions/min_length": 2202.0, "completions/min_terminated_length": 2202.0, "entropy": 0.28525836300104856, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 1.0814907550811768, "kl": 0.35280791157856584, "learning_rate": 6.285714285714286e-05, "loss": 0.016, "num_tokens": 3482436.0, "reward": 0.2800288200378418, "reward_std": 0.7106037139892578, "rewards/rollout_eval_reward_func/mean": 0.33079269528388977, "rewards/rollout_eval_reward_func/std": 0.3271085023880005, "rewards/rollout_reward_func/mean": 0.2800288200378418, "rewards/rollout_reward_func/std": 0.6996307373046875, "sampling/importance_sampling_ratio/max": 1.6482936143875122, "sampling/importance_sampling_ratio/mean": 1.0002542734146118, "sampling/importance_sampling_ratio/min": 0.2758394777774811, "sampling/sampling_logp_difference/max": 1.2879362106323242, "sampling/sampling_logp_difference/mean": 0.0332026481628418, "step": 23, "step_time": 93.99063302500326 }, { "clip_ratio/high_max": 0.01396139187272638, "clip_ratio/high_mean": 0.007690923230256885, "clip_ratio/low_mean": 0.01880787085974589, "clip_ratio/low_min": 0.0031250001629814506, "clip_ratio/region_mean": 0.02649879432283342, "entropy": 0.2676102966070175, "epoch": 0.00024, "grad_norm": 0.8727543354034424, "kl": 0.3772396189160645, "learning_rate": 6.571428571428571e-05, "loss": 0.0057, "step": 24, "step_time": 29.4178187339985 }, { "clip_ratio/high_max": 0.004613095428794622, "clip_ratio/high_mean": 0.002985895553138107, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035067289136350155, "completions/clipped_ratio": 0.0, "completions/max_length": 9569.0, "completions/max_terminated_length": 9569.0, "completions/mean_length": 7533.28125, "completions/mean_terminated_length": 7533.28125, "completions/min_length": 2449.0, "completions/min_terminated_length": 2449.0, "entropy": 0.2505391649901867, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 1.017386555671692, "kl": 0.21523633878678083, "learning_rate": 6.857142857142858e-05, "loss": 0.0242, "num_tokens": 3751699.0, "reward": 0.3911706805229187, "reward_std": 0.638326108455658, "rewards/rollout_eval_reward_func/mean": 0.36318597197532654, "rewards/rollout_eval_reward_func/std": 0.3184514343738556, "rewards/rollout_reward_func/mean": 0.3911706805229187, "rewards/rollout_reward_func/std": 0.6562069654464722, "sampling/importance_sampling_ratio/max": 1.5404945611953735, "sampling/importance_sampling_ratio/mean": 0.9984301328659058, "sampling/importance_sampling_ratio/min": 0.4790920615196228, "sampling/sampling_logp_difference/max": 0.7358624935150146, "sampling/sampling_logp_difference/mean": 0.025531694293022156, "step": 25, "step_time": 92.37763964700025 }, { "clip_ratio/high_max": 0.02074831852223724, "clip_ratio/high_mean": 0.014075796061661094, "clip_ratio/low_mean": 0.024038826406467706, "clip_ratio/low_min": 0.004687500186264515, "clip_ratio/region_mean": 0.0381146224681288, "entropy": 0.24146342556923628, "epoch": 0.00026, "grad_norm": 1.08539617061615, "kl": 0.242179695982486, "learning_rate": 7.142857142857143e-05, "loss": 0.0152, "step": 26, "step_time": 27.09601488199405 }, { "clip_ratio/high_max": 0.004924242617562413, "clip_ratio/high_mean": 0.0024621213087812066, "clip_ratio/low_mean": 0.0015625000814907253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004024621390271932, "completions/clipped_ratio": 0.0, "completions/max_length": 9714.0, "completions/max_terminated_length": 9714.0, "completions/mean_length": 7341.125, "completions/mean_terminated_length": 7341.125, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.24662253353744745, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 1.0926475524902344, "kl": 0.201888975687325, "learning_rate": 7.428571428571429e-05, "loss": -0.0651, "num_tokens": 4014835.0, "reward": 0.26619410514831543, "reward_std": 0.6366387009620667, "rewards/rollout_eval_reward_func/mean": 0.31529471278190613, "rewards/rollout_eval_reward_func/std": 0.3177616000175476, "rewards/rollout_reward_func/mean": 0.26619410514831543, "rewards/rollout_reward_func/std": 0.6645346879959106, "sampling/importance_sampling_ratio/max": 1.7210402488708496, "sampling/importance_sampling_ratio/mean": 0.9990845918655396, "sampling/importance_sampling_ratio/min": 0.46208029985427856, "sampling/sampling_logp_difference/max": 0.7720166444778442, "sampling/sampling_logp_difference/mean": 0.024712545797228813, "step": 27, "step_time": 90.07543276499928 }, { "clip_ratio/high_max": 0.033208509092219174, "clip_ratio/high_mean": 0.018557379313278943, "clip_ratio/low_mean": 0.035281969350762665, "clip_ratio/low_min": 0.011458333698101342, "clip_ratio/region_mean": 0.05383934878045693, "entropy": 0.24193121027201414, "epoch": 0.00028, "grad_norm": 0.9876235127449036, "kl": 0.26401366433128715, "learning_rate": 7.714285714285715e-05, "loss": -0.073, "step": 28, "step_time": 27.219164144002207 }, { "clip_ratio/high_max": 0.0010775862028822303, "clip_ratio/high_mean": 0.0005387931014411151, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005387931014411151, "completions/clipped_ratio": 0.0, "completions/max_length": 9855.0, "completions/max_terminated_length": 9855.0, "completions/mean_length": 7361.875, "completions/mean_terminated_length": 7361.875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "entropy": 0.21860306337475777, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.9292377233505249, "kl": 0.20060417288914323, "learning_rate": 8e-05, "loss": -0.0977, "num_tokens": 4278506.0, "reward": 0.30670806765556335, "reward_std": 0.652392566204071, "rewards/rollout_eval_reward_func/mean": 0.3278709352016449, "rewards/rollout_eval_reward_func/std": 0.31351709365844727, "rewards/rollout_reward_func/mean": 0.30670806765556335, "rewards/rollout_reward_func/std": 0.6815608143806458, "sampling/importance_sampling_ratio/max": 1.4481010437011719, "sampling/importance_sampling_ratio/mean": 1.0026426315307617, "sampling/importance_sampling_ratio/min": 0.5693169832229614, "sampling/sampling_logp_difference/max": 0.5633178949356079, "sampling/sampling_logp_difference/mean": 0.01894025132060051, "step": 29, "step_time": 88.37378997200358 }, { "clip_ratio/high_max": 0.02580322092399001, "clip_ratio/high_mean": 0.015042814193293452, "clip_ratio/low_mean": 0.015608090267051011, "clip_ratio/low_min": 0.0020833334419876337, "clip_ratio/region_mean": 0.030650904460344464, "entropy": 0.2232473948970437, "epoch": 0.0003, "grad_norm": 0.6086679697036743, "kl": 0.19415233470499516, "learning_rate": 8.285714285714287e-05, "loss": -0.1081, "step": 30, "step_time": 28.619991764000588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005208333604969084, "completions/clipped_ratio": 0.0, "completions/max_length": 10726.0, "completions/max_terminated_length": 10726.0, "completions/mean_length": 7164.65625, "completions/mean_terminated_length": 7164.65625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.23761425912380219, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 1.050552487373352, "kl": 0.25638002483174205, "learning_rate": 8.571428571428571e-05, "loss": 0.012, "num_tokens": 4536097.0, "reward": 0.3345087766647339, "reward_std": 0.5485296249389648, "rewards/rollout_eval_reward_func/mean": 0.3090701103210449, "rewards/rollout_eval_reward_func/std": 0.32714226841926575, "rewards/rollout_reward_func/mean": 0.3345087766647339, "rewards/rollout_reward_func/std": 0.6012357473373413, "sampling/importance_sampling_ratio/max": 1.438549518585205, "sampling/importance_sampling_ratio/mean": 1.0011037588119507, "sampling/importance_sampling_ratio/min": 0.6349728107452393, "sampling/sampling_logp_difference/max": 0.45417308807373047, "sampling/sampling_logp_difference/mean": 0.015337169170379639, "step": 31, "step_time": 92.49027231299806 }, { "clip_ratio/high_max": 0.03391559107694775, "clip_ratio/high_mean": 0.018867517996113747, "clip_ratio/low_mean": 0.044338769221212715, "clip_ratio/low_min": 0.008333333535119891, "clip_ratio/region_mean": 0.06320628756657243, "entropy": 0.22916866652667522, "epoch": 0.00032, "grad_norm": 1.028586745262146, "kl": 0.3105860697105527, "learning_rate": 8.857142857142857e-05, "loss": 0.0055, "step": 32, "step_time": 29.399824877003994 }, { "clip_ratio/high_max": 0.0024519230937585235, "clip_ratio/high_mean": 0.0012259615468792617, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012259615468792617, "completions/clipped_ratio": 0.0, "completions/max_length": 10128.0, "completions/max_terminated_length": 10128.0, "completions/mean_length": 7357.46875, "completions/mean_terminated_length": 7357.46875, "completions/min_length": 1917.0, "completions/min_terminated_length": 1917.0, "entropy": 0.2557551637291908, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.9717881083488464, "kl": 0.2046954189427197, "learning_rate": 9.142857142857143e-05, "loss": 0.0245, "num_tokens": 4799621.0, "reward": 0.35216301679611206, "reward_std": 0.6164546608924866, "rewards/rollout_eval_reward_func/mean": 0.3365091383457184, "rewards/rollout_eval_reward_func/std": 0.3354848027229309, "rewards/rollout_reward_func/mean": 0.35216301679611206, "rewards/rollout_reward_func/std": 0.6309141516685486, "sampling/importance_sampling_ratio/max": 1.333243727684021, "sampling/importance_sampling_ratio/mean": 1.0005223751068115, "sampling/importance_sampling_ratio/min": 0.7339702248573303, "sampling/sampling_logp_difference/max": 0.30928683280944824, "sampling/sampling_logp_difference/mean": 0.014704002998769283, "step": 33, "step_time": 89.53553034700417 }, { "clip_ratio/high_max": 0.01991061063017696, "clip_ratio/high_mean": 0.011966300604399294, "clip_ratio/low_mean": 0.02272569522028789, "clip_ratio/low_min": 0.009722222457639873, "clip_ratio/region_mean": 0.03469199570827186, "entropy": 0.2428069869056344, "epoch": 0.00034, "grad_norm": 0.685612142086029, "kl": 0.2513351505622268, "learning_rate": 9.428571428571429e-05, "loss": 0.0129, "step": 34, "step_time": 28.25809028400181 }, { "clip_ratio/high_max": 0.0021990741370245814, "clip_ratio/high_mean": 0.0010995370685122907, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021412037895061076, "completions/clipped_ratio": 0.0, "completions/max_length": 9665.0, "completions/max_terminated_length": 9665.0, "completions/mean_length": 8000.09375, "completions/mean_terminated_length": 8000.09375, "completions/min_length": 4295.0, "completions/min_terminated_length": 4295.0, "entropy": 0.2354184165596962, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 1.040405035018921, "kl": 0.1770010399632156, "learning_rate": 9.714285714285715e-05, "loss": 0.1517, "num_tokens": 5084103.0, "reward": 0.3385156989097595, "reward_std": 0.5189785957336426, "rewards/rollout_eval_reward_func/mean": 0.23996442556381226, "rewards/rollout_eval_reward_func/std": 0.31991085410118103, "rewards/rollout_reward_func/mean": 0.3385156989097595, "rewards/rollout_reward_func/std": 0.5693588852882385, "sampling/importance_sampling_ratio/max": 1.4071576595306396, "sampling/importance_sampling_ratio/mean": 0.9996304512023926, "sampling/importance_sampling_ratio/min": 0.5387703776359558, "sampling/sampling_logp_difference/max": 0.6184659004211426, "sampling/sampling_logp_difference/mean": 0.015029089525341988, "step": 35, "step_time": 95.05921310200392 }, { "clip_ratio/high_max": 0.026263557723723352, "clip_ratio/high_mean": 0.014173445466440171, "clip_ratio/low_mean": 0.02787990286014974, "clip_ratio/low_min": 0.007291667046956718, "clip_ratio/region_mean": 0.04205334832658991, "entropy": 0.21858789399266243, "epoch": 0.00036, "grad_norm": 1.0455042123794556, "kl": 0.2051441869698465, "learning_rate": 0.0001, "loss": 0.1403, "step": 36, "step_time": 27.85193802100366 }, { "clip_ratio/high_max": 0.007164939888752997, "clip_ratio/high_mean": 0.0035824699443764985, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005144969967659563, "completions/clipped_ratio": 0.0, "completions/max_length": 9514.0, "completions/max_terminated_length": 9514.0, "completions/mean_length": 6029.25, "completions/mean_terminated_length": 6029.25, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "entropy": 0.21716525312513113, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.8604521751403809, "kl": 0.23097522975876927, "learning_rate": 9.999736485702831e-05, "loss": -0.0709, "num_tokens": 5305345.0, "reward": 0.41453179717063904, "reward_std": 0.7797224521636963, "rewards/rollout_eval_reward_func/mean": 0.4568089246749878, "rewards/rollout_eval_reward_func/std": 0.28734299540519714, "rewards/rollout_reward_func/mean": 0.41453179717063904, "rewards/rollout_reward_func/std": 0.755694568157196, "sampling/importance_sampling_ratio/max": 1.4738141298294067, "sampling/importance_sampling_ratio/mean": 1.000828742980957, "sampling/importance_sampling_ratio/min": 0.7324953079223633, "sampling/sampling_logp_difference/max": 0.3878536820411682, "sampling/sampling_logp_difference/mean": 0.013184964656829834, "step": 37, "step_time": 76.87407001600332 }, { "clip_ratio/high_max": 0.04774210066534579, "clip_ratio/high_mean": 0.02752261853311211, "clip_ratio/low_mean": 0.03158482233993709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.059107440523803234, "entropy": 0.21499714627861977, "epoch": 0.00038, "grad_norm": 1.026845932006836, "kl": 0.3676267918199301, "learning_rate": 9.998945979845876e-05, "loss": -0.0694, "step": 38, "step_time": 27.58343887600313 }, { "clip_ratio/high_max": 0.006285919691435993, "clip_ratio/high_mean": 0.0031429598457179964, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004184626566711813, "completions/clipped_ratio": 0.0, "completions/max_length": 9353.0, "completions/max_terminated_length": 9353.0, "completions/mean_length": 6221.78125, "completions/mean_terminated_length": 6221.78125, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "entropy": 0.21314978785812855, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 1.1063776016235352, "kl": 0.28443425707519054, "learning_rate": 9.997628593527586e-05, "loss": 0.1657, "num_tokens": 5533203.0, "reward": 0.5931290984153748, "reward_std": 0.5068180561065674, "rewards/rollout_eval_reward_func/mean": 0.4369918704032898, "rewards/rollout_eval_reward_func/std": 0.2919425666332245, "rewards/rollout_reward_func/mean": 0.5931290984153748, "rewards/rollout_reward_func/std": 0.6152276396751404, "sampling/importance_sampling_ratio/max": 1.4768017530441284, "sampling/importance_sampling_ratio/mean": 0.9989122152328491, "sampling/importance_sampling_ratio/min": 0.7442160248756409, "sampling/sampling_logp_difference/max": 0.3898787498474121, "sampling/sampling_logp_difference/mean": 0.011076296679675579, "step": 39, "step_time": 80.26773473300273 }, { "clip_ratio/high_max": 0.03581550612580031, "clip_ratio/high_mean": 0.021467003040015697, "clip_ratio/low_mean": 0.019476010755170137, "clip_ratio/low_min": 0.0031250000465661287, "clip_ratio/region_mean": 0.04094301396980882, "entropy": 0.2001811731606722, "epoch": 0.0004, "grad_norm": 0.8571550250053406, "kl": 0.39517259504646063, "learning_rate": 9.995784511894694e-05, "loss": 0.1561, "step": 40, "step_time": 26.113719172002675 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0013586956774815917, "clip_ratio/low_mean": 0.003238224715460092, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0045969203929416835, "completions/clipped_ratio": 0.0, "completions/max_length": 9876.0, "completions/max_terminated_length": 9876.0, "completions/mean_length": 7216.5625, "completions/mean_terminated_length": 7216.5625, "completions/min_length": 1879.0, "completions/min_terminated_length": 1879.0, "entropy": 0.2681358586996794, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 1.3309671878814697, "kl": 0.2410402470268309, "learning_rate": 9.993413994116206e-05, "loss": 0.1903, "num_tokens": 5792478.0, "reward": 0.471214234828949, "reward_std": 0.5625734329223633, "rewards/rollout_eval_reward_func/mean": 0.3643292784690857, "rewards/rollout_eval_reward_func/std": 0.34053289890289307, "rewards/rollout_reward_func/mean": 0.471214234828949, "rewards/rollout_reward_func/std": 0.6072424650192261, "sampling/importance_sampling_ratio/max": 1.8356192111968994, "sampling/importance_sampling_ratio/mean": 1.0007987022399902, "sampling/importance_sampling_ratio/min": 0.4829617738723755, "sampling/sampling_logp_difference/max": 0.7278177738189697, "sampling/sampling_logp_difference/mean": 0.014709306880831718, "step": 41, "step_time": 87.47009326799707 }, { "clip_ratio/high_max": 0.034506134572438896, "clip_ratio/high_mean": 0.01836913888109848, "clip_ratio/low_mean": 0.03956068912521005, "clip_ratio/low_min": 0.012500000651925802, "clip_ratio/region_mean": 0.05792982783168554, "entropy": 0.27205855678766966, "epoch": 0.00042, "grad_norm": 1.0188957452774048, "kl": 0.30527770798653364, "learning_rate": 9.990517373346957e-05, "loss": 0.1841, "step": 42, "step_time": 27.952364619004584 }, { "clip_ratio/high_max": 0.005300949211232364, "clip_ratio/high_mean": 0.002650474605616182, "clip_ratio/low_mean": 0.0015625000814907253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004212974687106907, "completions/clipped_ratio": 0.0, "completions/max_length": 10094.0, "completions/max_terminated_length": 10094.0, "completions/mean_length": 6369.84375, "completions/mean_terminated_length": 6369.84375, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "entropy": 0.24548510648310184, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.8371849656105042, "kl": 0.22607734380289912, "learning_rate": 9.98709505668081e-05, "loss": -0.1383, "num_tokens": 6024570.0, "reward": 0.5083565711975098, "reward_std": 0.7129669785499573, "rewards/rollout_eval_reward_func/mean": 0.4181910753250122, "rewards/rollout_eval_reward_func/std": 0.3106958866119385, "rewards/rollout_reward_func/mean": 0.5083565711975098, "rewards/rollout_reward_func/std": 0.679851770401001, "sampling/importance_sampling_ratio/max": 1.6035348176956177, "sampling/importance_sampling_ratio/mean": 1.0009479522705078, "sampling/importance_sampling_ratio/min": 0.7113155722618103, "sampling/sampling_logp_difference/max": 0.4722104072570801, "sampling/sampling_logp_difference/mean": 0.010827964171767235, "step": 43, "step_time": 81.8608712560017 }, { "clip_ratio/high_max": 0.022805775748565793, "clip_ratio/high_mean": 0.01218413794413209, "clip_ratio/low_mean": 0.026488096278626472, "clip_ratio/low_min": 0.0020833334419876337, "clip_ratio/region_mean": 0.03867223463021219, "entropy": 0.2484031356871128, "epoch": 0.00044, "grad_norm": 0.6352972388267517, "kl": 0.24903920874930918, "learning_rate": 9.983147525093428e-05, "loss": -0.1456, "step": 44, "step_time": 28.312056484001005 }, { "clip_ratio/high_max": 0.0020833334419876337, "clip_ratio/high_mean": 0.0010416667209938169, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 10030.0, "completions/max_terminated_length": 10030.0, "completions/mean_length": 7470.40625, "completions/mean_terminated_length": 7470.40625, "completions/min_length": 3212.0, "completions/min_terminated_length": 3212.0, "entropy": 0.26859680097550154, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.9950742125511169, "kl": 0.2715269709005952, "learning_rate": 9.978675333374685e-05, "loss": 0.1354, "num_tokens": 6292193.0, "reward": 0.31536591053009033, "reward_std": 0.626213550567627, "rewards/rollout_eval_reward_func/mean": 0.2950965166091919, "rewards/rollout_eval_reward_func/std": 0.3288768529891968, "rewards/rollout_reward_func/mean": 0.31536591053009033, "rewards/rollout_reward_func/std": 0.6272794604301453, "sampling/importance_sampling_ratio/max": 1.2761257886886597, "sampling/importance_sampling_ratio/mean": 0.9995177388191223, "sampling/importance_sampling_ratio/min": 0.6398259401321411, "sampling/sampling_logp_difference/max": 0.44655919075012207, "sampling/sampling_logp_difference/mean": 0.01289924792945385, "step": 45, "step_time": 89.98842330299703 }, { "clip_ratio/high_max": 0.029475471819750965, "clip_ratio/high_mean": 0.017039196158293635, "clip_ratio/low_mean": 0.035884891636669636, "clip_ratio/low_min": 0.014583333861082792, "clip_ratio/region_mean": 0.05292408773675561, "entropy": 0.25596251618117094, "epoch": 0.00046, "grad_norm": 1.0492225885391235, "kl": 0.4555607410147786, "learning_rate": 9.973679110050689e-05, "loss": 0.1236, "step": 46, "step_time": 28.10059149600238 }, { "clip_ratio/high_max": 0.005558473523706198, "clip_ratio/high_mean": 0.002779236761853099, "clip_ratio/low_mean": 0.0031250001629814506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005904236924834549, "completions/clipped_ratio": 0.0, "completions/max_length": 10171.0, "completions/max_terminated_length": 10171.0, "completions/mean_length": 7720.34375, "completions/mean_terminated_length": 7720.34375, "completions/min_length": 2255.0, "completions/min_terminated_length": 2255.0, "entropy": 0.21848125476390123, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.9580699801445007, "kl": 0.2126072864048183, "learning_rate": 9.968159557295458e-05, "loss": 0.2391, "num_tokens": 6567972.0, "reward": 0.585047721862793, "reward_std": 0.4849390387535095, "rewards/rollout_eval_reward_func/mean": 0.35200709104537964, "rewards/rollout_eval_reward_func/std": 0.33855971693992615, "rewards/rollout_reward_func/mean": 0.585047721862793, "rewards/rollout_reward_func/std": 0.4694308936595917, "sampling/importance_sampling_ratio/max": 1.3900582790374756, "sampling/importance_sampling_ratio/mean": 1.0005149841308594, "sampling/importance_sampling_ratio/min": 0.5463369488716125, "sampling/sampling_logp_difference/max": 0.6045193672180176, "sampling/sampling_logp_difference/mean": 0.012745920568704605, "step": 47, "step_time": 91.15270540599704 }, { "clip_ratio/high_max": 0.03133936191443354, "clip_ratio/high_mean": 0.017232181096915156, "clip_ratio/low_mean": 0.04218750132713467, "clip_ratio/low_min": 0.01145833358168602, "clip_ratio/region_mean": 0.059419682365842164, "entropy": 0.23045554850250483, "epoch": 0.00048, "grad_norm": 1.2474925518035889, "kl": 0.18294932693243027, "learning_rate": 9.962117450832225e-05, "loss": 0.238, "step": 48, "step_time": 29.046616760999314 }, { "clip_ratio/high_max": 0.006842764443717897, "clip_ratio/high_mean": 0.0034213822218589485, "clip_ratio/low_mean": 0.0015625000232830644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004983882245142013, "completions/clipped_ratio": 0.0, "completions/max_length": 10017.0, "completions/max_terminated_length": 10017.0, "completions/mean_length": 7918.40625, "completions/mean_terminated_length": 7918.40625, "completions/min_length": 1876.0, "completions/min_terminated_length": 1876.0, "entropy": 0.24847039952874184, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 1.1504110097885132, "kl": 0.3213502997532487, "learning_rate": 9.955553639824423e-05, "loss": 0.1906, "num_tokens": 6849638.0, "reward": 0.39189645648002625, "reward_std": 0.5209037065505981, "rewards/rollout_eval_reward_func/mean": 0.2815040647983551, "rewards/rollout_eval_reward_func/std": 0.332853227853775, "rewards/rollout_reward_func/mean": 0.39189645648002625, "rewards/rollout_reward_func/std": 0.5881980061531067, "sampling/importance_sampling_ratio/max": 1.4030216932296753, "sampling/importance_sampling_ratio/mean": 0.9992052316665649, "sampling/importance_sampling_ratio/min": 0.6490213871002197, "sampling/sampling_logp_difference/max": 0.43228960037231445, "sampling/sampling_logp_difference/mean": 0.011766092851758003, "step": 49, "step_time": 91.98302743600289 }, { "clip_ratio/high_max": 0.030021664802916348, "clip_ratio/high_mean": 0.01896916568512097, "clip_ratio/low_mean": 0.02840909146470949, "clip_ratio/low_min": 0.0031250001629814506, "clip_ratio/region_mean": 0.04737825732445344, "entropy": 0.22083801217377186, "epoch": 0.0005, "grad_norm": 1.493245005607605, "kl": 0.6161252139136195, "learning_rate": 9.948469046756344e-05, "loss": 0.1882, "step": 50, "step_time": 29.706524382998396 }, { "clip_ratio/high_max": 0.007615459966473281, "clip_ratio/high_mean": 0.0038077299832366407, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0048493967042304575, "completions/clipped_ratio": 0.0, "completions/max_length": 10057.0, "completions/max_terminated_length": 10057.0, "completions/mean_length": 7061.03125, "completions/mean_terminated_length": 7061.03125, "completions/min_length": 2525.0, "completions/min_terminated_length": 2525.0, "entropy": 0.24380221962928772, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 1.1951247453689575, "kl": 0.28687382210046053, "learning_rate": 9.940864667303489e-05, "loss": 0.1425, "num_tokens": 7103728.0, "reward": 0.406146377325058, "reward_std": 0.6755715608596802, "rewards/rollout_eval_reward_func/mean": 0.3859247863292694, "rewards/rollout_eval_reward_func/std": 0.33643871545791626, "rewards/rollout_reward_func/mean": 0.406146377325058, "rewards/rollout_reward_func/std": 0.6774359345436096, "sampling/importance_sampling_ratio/max": 1.367674469947815, "sampling/importance_sampling_ratio/mean": 0.9991032481193542, "sampling/importance_sampling_ratio/min": 0.6542518734931946, "sampling/sampling_logp_difference/max": 0.4242628812789917, "sampling/sampling_logp_difference/mean": 0.012621527537703514, "step": 51, "step_time": 85.52853098199739 }, { "clip_ratio/high_max": 0.023708798456937075, "clip_ratio/high_mean": 0.015155438333749771, "clip_ratio/low_mean": 0.02644535672152415, "clip_ratio/low_min": 0.009695513173937798, "clip_ratio/region_mean": 0.04160079546272755, "entropy": 0.24589570611715317, "epoch": 0.00052, "grad_norm": 0.6901561617851257, "kl": 0.2809536149725318, "learning_rate": 9.932741570192633e-05, "loss": 0.1278, "step": 52, "step_time": 28.923457664002854 }, { "clip_ratio/high_max": 0.0011160714784637094, "clip_ratio/high_mean": 0.0005580357392318547, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001078869099728763, "completions/clipped_ratio": 0.0, "completions/max_length": 10169.0, "completions/max_terminated_length": 10169.0, "completions/mean_length": 7814.28125, "completions/mean_terminated_length": 7814.28125, "completions/min_length": 1989.0, "completions/min_terminated_length": 1989.0, "entropy": 0.21275948453694582, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.7513790130615234, "kl": 0.23512054793536663, "learning_rate": 9.924100897051629e-05, "loss": 0.1945, "num_tokens": 7382261.0, "reward": 0.42753756046295166, "reward_std": 0.49785035848617554, "rewards/rollout_eval_reward_func/mean": 0.26969003677368164, "rewards/rollout_eval_reward_func/std": 0.3341839611530304, "rewards/rollout_reward_func/mean": 0.42753756046295166, "rewards/rollout_reward_func/std": 0.49307680130004883, "sampling/importance_sampling_ratio/max": 1.3325144052505493, "sampling/importance_sampling_ratio/mean": 0.9995752573013306, "sampling/importance_sampling_ratio/min": 0.6147154569625854, "sampling/sampling_logp_difference/max": 0.48659586906433105, "sampling/sampling_logp_difference/mean": 0.010477245785295963, "step": 53, "step_time": 89.77598898800352 }, { "clip_ratio/high_max": 0.014756215270608664, "clip_ratio/high_mean": 0.007378107635304332, "clip_ratio/low_mean": 0.026041667733807117, "clip_ratio/low_min": 0.008333333651535213, "clip_ratio/region_mean": 0.03341977560194209, "entropy": 0.20128578413277864, "epoch": 0.00054, "grad_norm": 0.570249080657959, "kl": 0.24723996873944998, "learning_rate": 9.914943862248966e-05, "loss": 0.1836, "step": 54, "step_time": 28.66781206799169 }, { "clip_ratio/high_max": 0.005409664008766413, "clip_ratio/high_mean": 0.0027048320043832064, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003225665364880115, "completions/clipped_ratio": 0.0, "completions/max_length": 9612.0, "completions/max_terminated_length": 9612.0, "completions/mean_length": 7466.40625, "completions/mean_terminated_length": 7466.40625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "entropy": 0.2242852784693241, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.7776551246643066, "kl": 0.22325131320394576, "learning_rate": 9.905271752723088e-05, "loss": 0.0206, "num_tokens": 7648812.0, "reward": 0.40199464559555054, "reward_std": 0.5598407983779907, "rewards/rollout_eval_reward_func/mean": 0.3375253975391388, "rewards/rollout_eval_reward_func/std": 0.351279616355896, "rewards/rollout_reward_func/mean": 0.40199464559555054, "rewards/rollout_reward_func/std": 0.5975609421730042, "sampling/importance_sampling_ratio/max": 1.317135214805603, "sampling/importance_sampling_ratio/mean": 0.9976714849472046, "sampling/importance_sampling_ratio/min": 0.6417545676231384, "sampling/sampling_logp_difference/max": 0.4435492753982544, "sampling/sampling_logp_difference/mean": 0.012365585193037987, "step": 55, "step_time": 90.48158546899867 }, { "clip_ratio/high_max": 0.02967093954794109, "clip_ratio/high_mean": 0.01639796979725361, "clip_ratio/low_mean": 0.017361111822538078, "clip_ratio/low_min": 0.0031250001629814506, "clip_ratio/region_mean": 0.03375908185262233, "entropy": 0.2281673550605774, "epoch": 0.00056, "grad_norm": 0.48372626304626465, "kl": 0.23605143558233976, "learning_rate": 9.895085927801542e-05, "loss": 0.0086, "step": 56, "step_time": 27.291444884000157 }, { "clip_ratio/high_max": 0.003557769814506173, "clip_ratio/high_mean": 0.0017788849072530866, "clip_ratio/low_mean": 0.0015625000814907253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003341384930536151, "completions/clipped_ratio": 0.0, "completions/max_length": 10322.0, "completions/max_terminated_length": 10322.0, "completions/mean_length": 6645.3125, "completions/mean_terminated_length": 6645.3125, "completions/min_length": 1995.0, "completions/min_terminated_length": 1995.0, "entropy": 0.22339679207652807, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.7947802543640137, "kl": 0.33472199738025665, "learning_rate": 9.884387819009922e-05, "loss": 0.0286, "num_tokens": 7889241.0, "reward": 0.3272937536239624, "reward_std": 0.7356898784637451, "rewards/rollout_eval_reward_func/mean": 0.38528963923454285, "rewards/rollout_eval_reward_func/std": 0.3158987760543823, "rewards/rollout_reward_func/mean": 0.3272937536239624, "rewards/rollout_reward_func/std": 0.7287615537643433, "sampling/importance_sampling_ratio/max": 1.519856333732605, "sampling/importance_sampling_ratio/mean": 1.0008394718170166, "sampling/importance_sampling_ratio/min": 0.6888355612754822, "sampling/sampling_logp_difference/max": 0.41861581802368164, "sampling/sampling_logp_difference/mean": 0.01188460923731327, "step": 57, "step_time": 83.6079965079989 }, { "clip_ratio/high_max": 0.02337649872060865, "clip_ratio/high_mean": 0.012729916197713464, "clip_ratio/low_mean": 0.03550771565642208, "clip_ratio/low_min": 0.013886852888390422, "clip_ratio/region_mean": 0.048237632028758526, "entropy": 0.23247116059064865, "epoch": 0.00058, "grad_norm": 0.6895915269851685, "kl": 0.30278117302805185, "learning_rate": 9.873178929870695e-05, "loss": 0.0178, "step": 58, "step_time": 29.01562165299947 }, { "clip_ratio/high_max": 0.006458333344198763, "clip_ratio/high_mean": 0.00375000003259629, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004791666753590107, "completions/clipped_ratio": 0.0, "completions/max_length": 10121.0, "completions/max_terminated_length": 10121.0, "completions/mean_length": 7354.9375, "completions/mean_terminated_length": 7354.9375, "completions/min_length": 1114.0, "completions/min_terminated_length": 1114.0, "entropy": 0.2855970785021782, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 1.1581138372421265, "kl": 0.30998079385608435, "learning_rate": 9.86146083569188e-05, "loss": -0.077, "num_tokens": 8152533.0, "reward": 0.12930195033550262, "reward_std": 0.6660091876983643, "rewards/rollout_eval_reward_func/mean": 0.33841466903686523, "rewards/rollout_eval_reward_func/std": 0.3268774151802063, "rewards/rollout_reward_func/mean": 0.12930195033550262, "rewards/rollout_reward_func/std": 0.7711123824119568, "sampling/importance_sampling_ratio/max": 1.4381568431854248, "sampling/importance_sampling_ratio/mean": 0.9980136156082153, "sampling/importance_sampling_ratio/min": 0.7020198106765747, "sampling/sampling_logp_difference/max": 0.36336231231689453, "sampling/sampling_logp_difference/mean": 0.016959059983491898, "step": 59, "step_time": 87.87077508199764 }, { "clip_ratio/high_max": 0.048061754438094795, "clip_ratio/high_mean": 0.031483913655392826, "clip_ratio/low_mean": 0.04418836906552315, "clip_ratio/low_min": 0.007291666814126074, "clip_ratio/region_mean": 0.07567228260450065, "entropy": 0.26963882334530354, "epoch": 0.0006, "grad_norm": 1.0022964477539062, "kl": 0.30027929320931435, "learning_rate": 9.84923518334567e-05, "loss": -0.0828, "step": 60, "step_time": 28.71259851099967 }, { "clip_ratio/high_max": 0.01005121401976794, "clip_ratio/high_mean": 0.005546440428588539, "clip_ratio/low_mean": 0.0020026409183628857, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007549081346951425, "completions/clipped_ratio": 0.0, "completions/max_length": 10379.0, "completions/max_terminated_length": 10379.0, "completions/mean_length": 6955.875, "completions/mean_terminated_length": 6955.875, "completions/min_length": 2081.0, "completions/min_terminated_length": 2081.0, "entropy": 0.23451983137056231, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 1.1702102422714233, "kl": 0.28609952982515097, "learning_rate": 9.83650369103696e-05, "loss": 0.0631, "num_tokens": 8403186.0, "reward": 0.3940112888813019, "reward_std": 0.67255699634552, "rewards/rollout_eval_reward_func/mean": 0.3715701103210449, "rewards/rollout_eval_reward_func/std": 0.3261474072933197, "rewards/rollout_reward_func/mean": 0.3940112888813019, "rewards/rollout_reward_func/std": 0.6762000322341919, "sampling/importance_sampling_ratio/max": 1.3093942403793335, "sampling/importance_sampling_ratio/mean": 1.0008020401000977, "sampling/importance_sampling_ratio/min": 0.5961512923240662, "sampling/sampling_logp_difference/max": 0.5172607898712158, "sampling/sampling_logp_difference/mean": 0.014042183756828308, "step": 61, "step_time": 86.51144317899707 }, { "clip_ratio/high_max": 0.051156656933017075, "clip_ratio/high_mean": 0.03779221937293187, "clip_ratio/low_mean": 0.05211732583120465, "clip_ratio/low_min": 0.024354460649192333, "clip_ratio/region_mean": 0.08990954549517483, "entropy": 0.21548824943602085, "epoch": 0.00062, "grad_norm": 1.1680642366409302, "kl": 0.5453370595350862, "learning_rate": 9.823268148061883e-05, "loss": 0.0666, "step": 62, "step_time": 28.28093677799916 }, { "clip_ratio/high_max": 0.009642903693020344, "clip_ratio/high_mean": 0.004821451846510172, "clip_ratio/low_mean": 0.0010416667209938169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005863118567503989, "completions/clipped_ratio": 0.0, "completions/max_length": 9611.0, "completions/max_terminated_length": 9611.0, "completions/mean_length": 5474.5, "completions/mean_terminated_length": 5474.5, "completions/min_length": 1264.0, "completions/min_terminated_length": 1264.0, "entropy": 0.20994199626147747, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 1.5162609815597534, "kl": 0.3376044826582074, "learning_rate": 9.809530414556335e-05, "loss": 0.1386, "num_tokens": 8606212.0, "reward": 0.6158473491668701, "reward_std": 0.705132782459259, "rewards/rollout_eval_reward_func/mean": 0.5119410753250122, "rewards/rollout_eval_reward_func/std": 0.2654803693294525, "rewards/rollout_reward_func/mean": 0.6158473491668701, "rewards/rollout_reward_func/std": 0.6767383813858032, "sampling/importance_sampling_ratio/max": 1.9123412370681763, "sampling/importance_sampling_ratio/mean": 0.9994137287139893, "sampling/importance_sampling_ratio/min": 0.6006231904029846, "sampling/sampling_logp_difference/max": 0.6483283042907715, "sampling/sampling_logp_difference/mean": 0.015111252665519714, "step": 63, "step_time": 74.64896667399807 }, { "clip_ratio/high_max": 0.05132549628615379, "clip_ratio/high_mean": 0.030718339723534882, "clip_ratio/low_mean": 0.028882576967589557, "clip_ratio/low_min": 0.0031250000465661287, "clip_ratio/region_mean": 0.05960091657470912, "entropy": 0.20065013086423278, "epoch": 0.00064, "grad_norm": 1.244667649269104, "kl": 0.453593029640615, "learning_rate": 9.79529242123455e-05, "loss": 0.1234, "step": 64, "step_time": 24.8986287849948 }, { "clip_ratio/high_max": 0.0077537596225738525, "clip_ratio/high_mean": 0.0038768798112869263, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004397713171783835, "completions/clipped_ratio": 0.0, "completions/max_length": 10299.0, "completions/max_terminated_length": 10299.0, "completions/mean_length": 6500.03125, "completions/mean_terminated_length": 6500.03125, "completions/min_length": 1712.0, "completions/min_terminated_length": 1712.0, "entropy": 0.14482268318533897, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.9947667121887207, "kl": 0.2627185983583331, "learning_rate": 9.780556169117757e-05, "loss": 0.0665, "num_tokens": 8841902.0, "reward": 0.6678237915039062, "reward_std": 0.5866862535476685, "rewards/rollout_eval_reward_func/mean": 0.5013973712921143, "rewards/rollout_eval_reward_func/std": 0.27832266688346863, "rewards/rollout_reward_func/mean": 0.6678237915039062, "rewards/rollout_reward_func/std": 0.5921808481216431, "sampling/importance_sampling_ratio/max": 1.4597694873809814, "sampling/importance_sampling_ratio/mean": 0.99915611743927, "sampling/importance_sampling_ratio/min": 0.27695003151893616, "sampling/sampling_logp_difference/max": 1.2839181423187256, "sampling/sampling_logp_difference/mean": 0.010844534263014793, "step": 65, "step_time": 80.86000475000401 }, { "clip_ratio/high_max": 0.025044884881936014, "clip_ratio/high_mean": 0.014345359115395695, "clip_ratio/low_mean": 0.02013494382845238, "clip_ratio/low_min": 0.0020833334419876337, "clip_ratio/region_mean": 0.03448030271101743, "entropy": 0.13131517032161355, "epoch": 0.00066, "grad_norm": 0.4750834107398987, "kl": 0.34219094878062606, "learning_rate": 9.765323729252955e-05, "loss": 0.0561, "step": 66, "step_time": 28.661124781996477 }, { "clip_ratio/high_max": 0.009476827806793153, "clip_ratio/high_mean": 0.0062825315981172025, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062825315981172025, "completions/clipped_ratio": 0.0, "completions/max_length": 10269.0, "completions/max_terminated_length": 10269.0, "completions/mean_length": 6501.5, "completions/mean_terminated_length": 6501.5, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.14845013478770852, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.9048762917518616, "kl": 0.3337597157806158, "learning_rate": 9.749597242421838e-05, "loss": 0.0677, "num_tokens": 9077833.0, "reward": 0.6164548397064209, "reward_std": 0.5683386325836182, "rewards/rollout_eval_reward_func/mean": 0.4744664430618286, "rewards/rollout_eval_reward_func/std": 0.29613611102104187, "rewards/rollout_reward_func/mean": 0.6164548397064209, "rewards/rollout_reward_func/std": 0.6236394643783569, "sampling/importance_sampling_ratio/max": 1.9927904605865479, "sampling/importance_sampling_ratio/mean": 1.0013047456741333, "sampling/importance_sampling_ratio/min": 0.5228504538536072, "sampling/sampling_logp_difference/max": 0.6895358562469482, "sampling/sampling_logp_difference/mean": 0.011342051438987255, "step": 67, "step_time": 79.70687562199964 }, { "clip_ratio/high_max": 0.0376884457655251, "clip_ratio/high_mean": 0.026128767582122236, "clip_ratio/low_mean": 0.026416301843710244, "clip_ratio/low_min": 0.007291666814126074, "clip_ratio/region_mean": 0.0525450695422478, "entropy": 0.15412914380431175, "epoch": 0.00068, "grad_norm": 0.8491650223731995, "kl": 0.3899666126817465, "learning_rate": 9.733378918839942e-05, "loss": 0.0638, "step": 68, "step_time": 27.40538086699962 }, { "clip_ratio/high_max": 0.006514550419524312, "clip_ratio/high_mean": 0.003257275209762156, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003257275209762156, "completions/clipped_ratio": 0.0, "completions/max_length": 10202.0, "completions/max_terminated_length": 10202.0, "completions/mean_length": 5860.65625, "completions/mean_terminated_length": 5860.65625, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "entropy": 0.16269859950989485, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.9143983721733093, "kl": 0.47777214366942644, "learning_rate": 9.716671037846007e-05, "loss": 0.1152, "num_tokens": 9293397.0, "reward": 0.6956244707107544, "reward_std": 0.5506021976470947, "rewards/rollout_eval_reward_func/mean": 0.5125762224197388, "rewards/rollout_eval_reward_func/std": 0.2857610881328583, "rewards/rollout_reward_func/mean": 0.6956244707107544, "rewards/rollout_reward_func/std": 0.5588130354881287, "sampling/importance_sampling_ratio/max": 1.4407436847686768, "sampling/importance_sampling_ratio/mean": 1.0004699230194092, "sampling/importance_sampling_ratio/min": 0.5672728419303894, "sampling/sampling_logp_difference/max": 0.5669147968292236, "sampling/sampling_logp_difference/mean": 0.010705020278692245, "step": 69, "step_time": 77.93740563500614 }, { "clip_ratio/high_max": 0.04634982522111386, "clip_ratio/high_mean": 0.029493737209122628, "clip_ratio/low_mean": 0.01730769290588796, "clip_ratio/low_min": 0.004166666767559946, "clip_ratio/region_mean": 0.046801429823972285, "entropy": 0.1784980888478458, "epoch": 0.0007, "grad_norm": 0.7063129544258118, "kl": 0.3514184970408678, "learning_rate": 9.699475947581644e-05, "loss": 0.1049, "step": 70, "step_time": 27.06573885999751 }, { "clip_ratio/high_max": 0.0018382353009656072, "clip_ratio/high_mean": 0.0009191176504828036, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009191176504828036, "completions/clipped_ratio": 0.0, "completions/max_length": 10593.0, "completions/max_terminated_length": 10593.0, "completions/mean_length": 6225.28125, "completions/mean_terminated_length": 6225.28125, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "entropy": 0.17604797054082155, "epoch": 0.00071, "frac_reward_zero_std": 0.25, "grad_norm": 0.7393732070922852, "kl": 0.20170354284346104, "learning_rate": 9.681796064661319e-05, "loss": 0.0413, "num_tokens": 9520372.0, "reward": 0.8103519678115845, "reward_std": 0.3980957269668579, "rewards/rollout_eval_reward_func/mean": 0.5907012224197388, "rewards/rollout_eval_reward_func/std": 0.20860876142978668, "rewards/rollout_reward_func/mean": 0.8103519678115845, "rewards/rollout_reward_func/std": 0.49828964471817017, "sampling/importance_sampling_ratio/max": 1.5257266759872437, "sampling/importance_sampling_ratio/mean": 0.9991195201873779, "sampling/importance_sampling_ratio/min": 0.6470949649810791, "sampling/sampling_logp_difference/max": 0.43526220321655273, "sampling/sampling_logp_difference/mean": 0.010150602087378502, "step": 71, "step_time": 79.67722541299918 }, { "clip_ratio/high_max": 0.01454339677002281, "clip_ratio/high_mean": 0.008729609136935323, "clip_ratio/low_mean": 0.009114583488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01784419280011207, "entropy": 0.18162205442786217, "epoch": 0.00072, "grad_norm": 0.44046881794929504, "kl": 0.20182663016021252, "learning_rate": 9.663633873832725e-05, "loss": 0.0328, "step": 72, "step_time": 28.538212690000364 }, { "clip_ratio/high_max": 0.004232634324580431, "clip_ratio/high_mean": 0.0021163171622902155, "clip_ratio/low_mean": 0.0005208333604969084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002637150522787124, "completions/clipped_ratio": 0.0, "completions/max_length": 10142.0, "completions/max_terminated_length": 10142.0, "completions/mean_length": 6840.59375, "completions/mean_terminated_length": 6840.59375, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.20604060776531696, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.6912176609039307, "kl": 0.26183000626042485, "learning_rate": 9.644991927627566e-05, "loss": -0.0088, "num_tokens": 9767000.0, "reward": 0.7756590247154236, "reward_std": 0.5361872911453247, "rewards/rollout_eval_reward_func/mean": 0.5909552574157715, "rewards/rollout_eval_reward_func/std": 0.2465948760509491, "rewards/rollout_reward_func/mean": 0.7756590247154236, "rewards/rollout_reward_func/std": 0.5321318507194519, "sampling/importance_sampling_ratio/max": 1.2645851373672485, "sampling/importance_sampling_ratio/mean": 1.0009121894836426, "sampling/importance_sampling_ratio/min": 0.6386132836341858, "sampling/sampling_logp_difference/max": 0.4484562873840332, "sampling/sampling_logp_difference/mean": 0.010102368891239166, "step": 73, "step_time": 82.14820753000458 }, { "clip_ratio/high_max": 0.02557993505615741, "clip_ratio/high_mean": 0.01748599053826183, "clip_ratio/low_mean": 0.009895833674818277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027381824154872447, "entropy": 0.22169003915041685, "epoch": 0.00074, "grad_norm": 0.42521047592163086, "kl": 0.23322301171720028, "learning_rate": 9.625872846002834e-05, "loss": -0.0155, "step": 74, "step_time": 28.134478513999056 }, { "clip_ratio/high_max": 0.008986742584966123, "clip_ratio/high_mean": 0.005014204594772309, "clip_ratio/low_mean": 0.002018229220993817, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007032433815766126, "completions/clipped_ratio": 0.0, "completions/max_length": 9857.0, "completions/max_terminated_length": 9857.0, "completions/mean_length": 7195.375, "completions/mean_terminated_length": 7195.375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "entropy": 0.27833056077361107, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.8185970187187195, "kl": 0.24367811996489763, "learning_rate": 9.606279315972582e-05, "loss": -0.1492, "num_tokens": 10025024.0, "reward": 0.2978099584579468, "reward_std": 0.6597497463226318, "rewards/rollout_eval_reward_func/mean": 0.32901421189308167, "rewards/rollout_eval_reward_func/std": 0.3157320022583008, "rewards/rollout_reward_func/mean": 0.2978099584579468, "rewards/rollout_reward_func/std": 0.690675675868988, "sampling/importance_sampling_ratio/max": 1.4156017303466797, "sampling/importance_sampling_ratio/mean": 1.0000808238983154, "sampling/importance_sampling_ratio/min": 0.6558278799057007, "sampling/sampling_logp_difference/max": 0.4218568801879883, "sampling/sampling_logp_difference/mean": 0.013327672146260738, "step": 75, "step_time": 88.96669697499601 }, { "epoch": 0.00075, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 9194.0, "eval_completions/max_terminated_length": 9194.0, "eval_completions/mean_length": 7026.0375, "eval_completions/mean_terminated_length": 7026.0375, "eval_completions/min_length": 4333.95, "eval_completions/min_terminated_length": 4333.95, "eval_entropy": 0.3085056647658348, "eval_frac_reward_zero_std": 1.0, "eval_kl": 0.22236853390932082, "eval_loss": 0.0002063037100015208, "eval_num_tokens": 10025024.0, "eval_reward": 0.35444250535219907, "eval_reward_std": 0.0, "eval_rewards/rollout_eval_reward_func/mean": 0.3484247986227274, "eval_rewards/rollout_eval_reward_func/std": 0.26531881298869847, "eval_rewards/rollout_reward_func/mean": 0.35444250535219907, "eval_rewards/rollout_reward_func/std": 0.5791118375957012, "eval_runtime": 161.4965, "eval_samples_per_second": 0.062, "eval_sampling/importance_sampling_ratio/max": 1.1964155852794647, "eval_sampling/importance_sampling_ratio/mean": 1.0003154128789902, "eval_sampling/importance_sampling_ratio/min": 0.7968822807073593, "eval_sampling/sampling_logp_difference/max": 0.2617991387844086, "eval_sampling/sampling_logp_difference/mean": 0.01210988024249673, "eval_steps_per_second": 0.019, "step": 75 } ], "logging_steps": 1.0, "max_steps": 300, "num_input_tokens_seen": 10025024, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }