{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12000711153253527, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 948.4, "completions/max_terminated_length": 903.8, "completions/mean_length": 255.7796875, "completions/mean_terminated_length": 250.9670440673828, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "entropy": 0.9540863061944643, "epoch": 0.0005333649401446012, "frac_reward_zero_std": 0.15, "grad_norm": 1.4098669290542603, "kl": 0.0, "learning_rate": 3.571428571428571e-08, "loss": 0.3423, "num_tokens": 261131.0, "reward": 0.2078125, "reward_std": 0.3793635904788971, "rewards/equation_reward_func/mean": 0.009375, "rewards/equation_reward_func/std": 0.07295562326908112, "rewards/format_reward_func/mean": 0.1984375, "rewards/format_reward_func/std": 0.39687987565994265, "sampling/importance_sampling_ratio/max": 1.432898497581482, "sampling/importance_sampling_ratio/mean": 0.9999992489814759, "sampling/importance_sampling_ratio/min": 0.7043134570121765, "sampling/sampling_logp_difference/max": 0.4017820119857788, "sampling/sampling_logp_difference/mean": 0.017648475617170332, "step": 2 }, { "clip_ratio/high_max": 5.39583884852214e-06, "clip_ratio/high_mean": 5.39583884852214e-06, "clip_ratio/low_mean": 1.2291138975544729e-05, "clip_ratio/low_min": 1.2291138975544729e-05, "clip_ratio/region_mean": 1.7686977824066868e-05, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.25, "completions/mean_length": 269.384765625, "completions/mean_terminated_length": 257.3567123413086, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.9880843129422929, "epoch": 0.0010667298802892024, "frac_reward_zero_std": 0.078125, "grad_norm": 1.4387495517730713, "kl": 0.0005483530710787616, "learning_rate": 1.0714285714285713e-07, "loss": 0.4365, "num_tokens": 477152.0, "reward": 0.22265625, "reward_std": 0.4065710976719856, "rewards/equation_reward_func/mean": 0.009765625, "rewards/equation_reward_func/std": 0.08216489106416702, "rewards/format_reward_func/mean": 0.212890625, "rewards/format_reward_func/std": 0.4071633070707321, "sampling/importance_sampling_ratio/max": 1.5262340903282166, "sampling/importance_sampling_ratio/mean": 1.0000044405460358, "sampling/importance_sampling_ratio/min": 0.6418062299489975, "sampling/sampling_logp_difference/max": 0.45973144471645355, "sampling/sampling_logp_difference/mean": 0.017983675003051758, "step": 4 }, { "clip_ratio/high_max": 1.5120941194860885e-05, "clip_ratio/high_mean": 1.5120941194860885e-05, "clip_ratio/low_mean": 1.1421976927926557e-05, "clip_ratio/low_min": 1.1421976927926557e-05, "clip_ratio/region_mean": 2.6542918122787443e-05, "completions/clipped_ratio": 0.00625, "completions/max_length": 943.8, "completions/max_terminated_length": 820.8, "completions/mean_length": 252.6390625, "completions/mean_terminated_length": 247.8078125, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "entropy": 0.9723760336637497, "epoch": 0.0016000948204338034, "frac_reward_zero_std": 0.075, "grad_norm": 1.6049115657806396, "kl": 0.0005472781637460381, "learning_rate": 1.7857142857142858e-07, "loss": 0.4774, "num_tokens": 736521.0, "reward": 0.23125, "reward_std": 0.4071147978305817, "rewards/equation_reward_func/mean": 0.0125, "rewards/equation_reward_func/std": 0.09785700291395187, "rewards/format_reward_func/mean": 0.21875, "rewards/format_reward_func/std": 0.41303261518478396, "sampling/importance_sampling_ratio/max": 1.3702354192733766, "sampling/importance_sampling_ratio/mean": 1.0000279784202575, "sampling/importance_sampling_ratio/min": 0.7041255116462708, "sampling/sampling_logp_difference/max": 0.38532485961914065, "sampling/sampling_logp_difference/mean": 0.01796877197921276, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 1.7957742076962153e-05, "clip_ratio/low_min": 1.7957742076962153e-05, "clip_ratio/region_mean": 1.7957742076962153e-05, "completions/clipped_ratio": 0.0078125, "completions/max_length": 991.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 253.9609375, "completions/mean_terminated_length": 248.11470794677734, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.9921977304750018, "epoch": 0.002133459760578405, "frac_reward_zero_std": 0.109375, "grad_norm": 1.5427881479263306, "kl": 0.000653341956826302, "learning_rate": 2.5e-07, "loss": 0.4591, "num_tokens": 944917.0, "reward": 0.232421875, "reward_std": 0.4018888622522354, "rewards/equation_reward_func/mean": 0.01171875, "rewards/equation_reward_func/std": 0.09338017366826534, "rewards/format_reward_func/mean": 0.220703125, "rewards/format_reward_func/std": 0.41563551872968674, "sampling/importance_sampling_ratio/max": 1.4093050062656403, "sampling/importance_sampling_ratio/mean": 0.9952466189861298, "sampling/importance_sampling_ratio/min": 0.5234628617769418, "sampling/sampling_logp_difference/max": 6.975502386689186, "sampling/sampling_logp_difference/mean": 0.08560637244954705, "step": 8 }, { "clip_ratio/high_max": 6.66453383423181e-06, "clip_ratio/high_mean": 6.66453383423181e-06, "clip_ratio/low_mean": 4.133189819791975e-05, "clip_ratio/low_min": 4.133189819791975e-05, "clip_ratio/region_mean": 4.799643203215156e-05, "completions/clipped_ratio": 0.0078125, "completions/max_length": 937.2, "completions/max_terminated_length": 888.2, "completions/mean_length": 258.7390625, "completions/mean_terminated_length": 252.81815185546876, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "entropy": 0.9827360986835427, "epoch": 0.0026668247007230056, "frac_reward_zero_std": 0.125, "grad_norm": 1.7612438201904297, "kl": 0.0005702357165217917, "learning_rate": 3.2142857142857145e-07, "loss": 0.5141, "num_tokens": 1208246.0, "reward": 0.2390625, "reward_std": 0.40106837153434755, "rewards/equation_reward_func/mean": 0.0125, "rewards/equation_reward_func/std": 0.09960551857948304, "rewards/format_reward_func/mean": 0.2265625, "rewards/format_reward_func/std": 0.41728968024253843, "sampling/importance_sampling_ratio/max": 1.38476619720459, "sampling/importance_sampling_ratio/mean": 1.0000717878341674, "sampling/importance_sampling_ratio/min": 0.644167011976242, "sampling/sampling_logp_difference/max": 0.4600466012954712, "sampling/sampling_logp_difference/mean": 0.018111751601099967, "step": 10 }, { "clip_ratio/high_max": 2.5259445667163367e-05, "clip_ratio/high_mean": 2.5259445667163367e-05, "clip_ratio/low_mean": 3.4389816492977035e-05, "clip_ratio/low_min": 3.4389816492977035e-05, "clip_ratio/region_mean": 5.96492621601404e-05, "completions/clipped_ratio": 0.009765625, "completions/max_length": 928.25, "completions/max_terminated_length": 866.75, "completions/mean_length": 254.19921875, "completions/mean_terminated_length": 246.56705474853516, "completions/min_length": 37.75, "completions/min_terminated_length": 37.75, "entropy": 0.9584754295647144, "epoch": 0.003200189640867607, "frac_reward_zero_std": 0.09375, "grad_norm": 1.6411950588226318, "kl": 0.0006625644515831178, "learning_rate": 3.928571428571428e-07, "loss": 0.5029, "num_tokens": 1416620.0, "reward": 0.244140625, "reward_std": 0.41240257024765015, "rewards/equation_reward_func/mean": 0.01171875, "rewards/equation_reward_func/std": 0.0878632478415966, "rewards/format_reward_func/mean": 0.232421875, "rewards/format_reward_func/std": 0.42017484456300735, "sampling/importance_sampling_ratio/max": 1.3969968855381012, "sampling/importance_sampling_ratio/mean": 0.9975202530622482, "sampling/importance_sampling_ratio/min": 0.5361489206557354, "sampling/sampling_logp_difference/max": 6.994814962148666, "sampling/sampling_logp_difference/mean": 0.05950223561376333, "step": 12 }, { "clip_ratio/high_max": 1.590006084168433e-05, "clip_ratio/high_mean": 1.590006084168433e-05, "clip_ratio/low_mean": 2.5911842789759652e-05, "clip_ratio/low_min": 2.5911842789759652e-05, "clip_ratio/region_mean": 4.181190363144399e-05, "completions/clipped_ratio": 0.0078125, "completions/max_length": 974.0, "completions/max_terminated_length": 921.8, "completions/mean_length": 247.2390625, "completions/mean_terminated_length": 241.17325134277343, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "entropy": 0.9551170787049664, "epoch": 0.003733554581012208, "frac_reward_zero_std": 0.075, "grad_norm": 1.3737132549285889, "kl": 0.0006883465761752126, "learning_rate": 4.6428571428571427e-07, "loss": 0.4942, "num_tokens": 1672549.0, "reward": 0.25, "reward_std": 0.42161002159118655, "rewards/equation_reward_func/mean": 0.0125, "rewards/equation_reward_func/std": 0.09785700291395187, "rewards/format_reward_func/mean": 0.2375, "rewards/format_reward_func/std": 0.42624163031578066, "sampling/importance_sampling_ratio/max": 1.3151276588439942, "sampling/importance_sampling_ratio/mean": 0.9999078154563904, "sampling/importance_sampling_ratio/min": 0.6646330237388611, "sampling/sampling_logp_difference/max": 0.4373103141784668, "sampling/sampling_logp_difference/mean": 0.01776142530143261, "step": 14 }, { "clip_ratio/high_max": 1.0442778956429618e-05, "clip_ratio/high_mean": 1.0442778956429618e-05, "clip_ratio/low_mean": 1.7649072914436045e-05, "clip_ratio/low_min": 1.7649072914436045e-05, "clip_ratio/region_mean": 2.8091851870865663e-05, "completions/clipped_ratio": 0.013671875, "completions/max_length": 933.5, "completions/max_terminated_length": 830.0, "completions/mean_length": 232.98046875, "completions/mean_terminated_length": 222.03210830688477, "completions/min_length": 39.5, "completions/min_terminated_length": 39.5, "entropy": 0.9428578350279067, "epoch": 0.00426691952115681, "frac_reward_zero_std": 0.0625, "grad_norm": 1.5488348007202148, "kl": 0.0009593554702102362, "learning_rate": 4.999935101463869e-07, "loss": 0.5028, "num_tokens": 1869755.0, "reward": 0.271484375, "reward_std": 0.437812015414238, "rewards/equation_reward_func/mean": 0.013671875, "rewards/equation_reward_func/std": 0.11329161562025547, "rewards/format_reward_func/mean": 0.2578125, "rewards/format_reward_func/std": 0.43795718252658844, "sampling/importance_sampling_ratio/max": 1.418111115694046, "sampling/importance_sampling_ratio/mean": 0.9950796961784363, "sampling/importance_sampling_ratio/min": 0.4989380836488372, "sampling/sampling_logp_difference/max": 7.342170834541321, "sampling/sampling_logp_difference/mean": 0.0878427573479712, "step": 16 }, { "clip_ratio/high_max": 5.304694503946747e-05, "clip_ratio/high_mean": 5.304694503946747e-05, "clip_ratio/low_mean": 0.0001491465482104104, "clip_ratio/low_min": 0.0001491465482104104, "clip_ratio/region_mean": 0.00020219349244143814, "completions/clipped_ratio": 0.0109375, "completions/max_length": 982.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 229.1265625, "completions/mean_terminated_length": 220.32821044921874, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "entropy": 0.9507015152937837, "epoch": 0.00480028446130141, "frac_reward_zero_std": 0.0625, "grad_norm": 1.5790953636169434, "kl": 0.0018293357906158133, "learning_rate": 4.999415933391384e-07, "loss": 0.5898, "num_tokens": 2114148.0, "reward": 0.3578125, "reward_std": 0.45415692329406737, "rewards/equation_reward_func/mean": 0.01875, "rewards/equation_reward_func/std": 0.13104374706745148, "rewards/format_reward_func/mean": 0.3390625, "rewards/format_reward_func/std": 0.47431493997573854, "sampling/importance_sampling_ratio/max": 1.5123102188110351, "sampling/importance_sampling_ratio/mean": 0.9999386310577393, "sampling/importance_sampling_ratio/min": 0.680599057674408, "sampling/sampling_logp_difference/max": 0.48615825176239014, "sampling/sampling_logp_difference/mean": 0.017719805985689164, "step": 18 }, { "clip_ratio/high_max": 6.4962061717071465e-06, "clip_ratio/high_mean": 6.4962061717071465e-06, "clip_ratio/low_mean": 0.00010160425558246465, "clip_ratio/low_min": 0.00010160425558246465, "clip_ratio/region_mean": 0.0001081004617541718, "completions/clipped_ratio": 0.009765625, "completions/max_length": 915.0, "completions/max_terminated_length": 774.5, "completions/mean_length": 208.271484375, "completions/mean_terminated_length": 200.28839874267578, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.9100811419387659, "epoch": 0.005333649401446011, "frac_reward_zero_std": 0.03125, "grad_norm": 1.8960355520248413, "kl": 0.0027993117918312135, "learning_rate": 4.998377705063407e-07, "loss": 0.6479, "num_tokens": 2298727.0, "reward": 0.400390625, "reward_std": 0.4889729246497154, "rewards/equation_reward_func/mean": 0.013671875, "rewards/equation_reward_func/std": 0.11329161562025547, "rewards/format_reward_func/mean": 0.38671875, "rewards/format_reward_func/std": 0.4860092028975487, "sampling/importance_sampling_ratio/max": 1.469222605228424, "sampling/importance_sampling_ratio/mean": 1.0001395344734192, "sampling/importance_sampling_ratio/min": 0.6491907387971878, "sampling/sampling_logp_difference/max": 0.4367278516292572, "sampling/sampling_logp_difference/mean": 0.017449268605560064, "step": 20 }, { "clip_ratio/high_max": 3.932892852592179e-05, "clip_ratio/high_mean": 3.932892852592179e-05, "clip_ratio/low_mean": 5.291779113273757e-05, "clip_ratio/low_min": 5.291779113273757e-05, "clip_ratio/region_mean": 9.224671965865936e-05, "completions/clipped_ratio": 0.0125, "completions/max_length": 1016.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 220.003125, "completions/mean_terminated_length": 209.95844421386718, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "entropy": 0.9095436150415076, "epoch": 0.005867014341590613, "frac_reward_zero_std": 0.0375, "grad_norm": 1.8503657579421997, "kl": 0.0036210449753626664, "learning_rate": 4.996820632091536e-07, "loss": 0.4627, "num_tokens": 2537465.0, "reward": 0.434375, "reward_std": 0.4760441243648529, "rewards/equation_reward_func/mean": 0.00625, "rewards/equation_reward_func/std": 0.06025671809911728, "rewards/format_reward_func/mean": 0.428125, "rewards/format_reward_func/std": 0.49461003541946413, "sampling/importance_sampling_ratio/max": 1.4342424154281617, "sampling/importance_sampling_ratio/mean": 0.9999365687370301, "sampling/importance_sampling_ratio/min": 0.7008239388465881, "sampling/sampling_logp_difference/max": 0.394360089302063, "sampling/sampling_logp_difference/mean": 0.017483803629875182, "step": 22 }, { "clip_ratio/high_max": 4.093984575269537e-05, "clip_ratio/high_mean": 4.093984575269537e-05, "clip_ratio/low_mean": 7.864602796164238e-06, "clip_ratio/low_min": 7.864602796164238e-06, "clip_ratio/region_mean": 4.8804448548859604e-05, "completions/clipped_ratio": 0.001953125, "completions/max_length": 891.5, "completions/max_terminated_length": 878.5, "completions/mean_length": 182.994140625, "completions/mean_terminated_length": 181.32217407226562, "completions/min_length": 39.25, "completions/min_terminated_length": 39.25, "entropy": 0.8905075606372621, "epoch": 0.006400379281735214, "frac_reward_zero_std": 0.0, "grad_norm": 2.2550694942474365, "kl": 0.004935654772756, "learning_rate": 4.994745037837194e-07, "loss": 0.5955, "num_tokens": 2709110.0, "reward": 0.517578125, "reward_std": 0.5198267549276352, "rewards/equation_reward_func/mean": 0.02734375, "rewards/equation_reward_func/std": 0.153958547860384, "rewards/format_reward_func/mean": 0.490234375, "rewards/format_reward_func/std": 0.5015200674533844, "sampling/importance_sampling_ratio/max": 1.4241470694541931, "sampling/importance_sampling_ratio/mean": 0.999960258603096, "sampling/importance_sampling_ratio/min": 0.7223343104124069, "sampling/sampling_logp_difference/max": 0.38423967361450195, "sampling/sampling_logp_difference/mean": 0.017527095042169094, "step": 24 }, { "clip_ratio/high_max": 0.00014086242501636862, "clip_ratio/high_mean": 0.00014086242501636862, "clip_ratio/low_mean": 0.0003290529796989479, "clip_ratio/low_min": 0.0003290529796989479, "clip_ratio/region_mean": 0.0004699154055237563, "completions/clipped_ratio": 0.0015625, "completions/max_length": 782.2, "completions/max_terminated_length": 706.6, "completions/mean_length": 158.4265625, "completions/mean_terminated_length": 157.09267883300782, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "entropy": 0.8476280462410715, "epoch": 0.006933744221879815, "frac_reward_zero_std": 0.0375, "grad_norm": 2.449361562728882, "kl": 0.010158799758452611, "learning_rate": 4.992151353344481e-07, "loss": 0.6381, "num_tokens": 2908303.0, "reward": 0.659375, "reward_std": 0.4784186124801636, "rewards/equation_reward_func/mean": 0.00625, "rewards/equation_reward_func/std": 0.06025671809911728, "rewards/format_reward_func/mean": 0.653125, "rewards/format_reward_func/std": 0.471921968460083, "sampling/importance_sampling_ratio/max": 1.4163753271102906, "sampling/importance_sampling_ratio/mean": 0.9999792098999023, "sampling/importance_sampling_ratio/min": 0.7149053812026978, "sampling/sampling_logp_difference/max": 0.3758361577987671, "sampling/sampling_logp_difference/mean": 0.0172608382999897, "step": 26 }, { "clip_ratio/high_max": 0.0001202653535680535, "clip_ratio/high_mean": 0.0001202653535680535, "clip_ratio/low_mean": 6.997568844882253e-05, "clip_ratio/low_min": 6.997568844882253e-05, "clip_ratio/region_mean": 0.00019024104444219524, "completions/clipped_ratio": 0.001953125, "completions/max_length": 805.75, "completions/max_terminated_length": 718.5, "completions/mean_length": 156.099609375, "completions/mean_terminated_length": 154.44060516357422, "completions/min_length": 40.75, "completions/min_terminated_length": 40.75, "entropy": 0.8275971532695823, "epoch": 0.007467109162024416, "frac_reward_zero_std": 0.078125, "grad_norm": 2.6489038467407227, "kl": 0.012742498100528287, "learning_rate": 4.989040117250646e-07, "loss": 0.6724, "num_tokens": 3066490.0, "reward": 0.77734375, "reward_std": 0.41671569645404816, "rewards/equation_reward_func/mean": 0.009765625, "rewards/equation_reward_func/std": 0.06909744255244732, "rewards/format_reward_func/mean": 0.767578125, "rewards/format_reward_func/std": 0.42265236377716064, "sampling/importance_sampling_ratio/max": 1.3687202334403992, "sampling/importance_sampling_ratio/mean": 0.9999553114175797, "sampling/importance_sampling_ratio/min": 0.695921778678894, "sampling/sampling_logp_difference/max": 0.3773607909679413, "sampling/sampling_logp_difference/mean": 0.016928753815591335, "step": 28 }, { "clip_ratio/high_max": 0.00041999312735343765, "clip_ratio/high_mean": 0.00041999312735343765, "clip_ratio/low_mean": 8.283444064242455e-05, "clip_ratio/low_min": 8.283444064242455e-05, "clip_ratio/region_mean": 0.0005028275679958622, "completions/clipped_ratio": 0.0046875, "completions/max_length": 856.0, "completions/max_terminated_length": 775.4, "completions/mean_length": 147.934375, "completions/mean_terminated_length": 144.22625732421875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.8315250639700227, "epoch": 0.008000474102169017, "frac_reward_zero_std": 0.1625, "grad_norm": 2.362121820449829, "kl": 0.016558959729283944, "learning_rate": 4.985411975674243e-07, "loss": 0.428, "num_tokens": 3258944.0, "reward": 0.790625, "reward_std": 0.3786823511123657, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.06748042851686478, "rewards/format_reward_func/mean": 0.7828125, "rewards/format_reward_func/std": 0.412441211938858, "sampling/importance_sampling_ratio/max": 1.446297574043274, "sampling/importance_sampling_ratio/mean": 0.9957321882247925, "sampling/importance_sampling_ratio/min": 0.5673706173897368, "sampling/sampling_logp_difference/max": 6.094879531860352, "sampling/sampling_logp_difference/mean": 0.07716870233416558, "step": 30 }, { "clip_ratio/high_max": 3.2706935598980635e-05, "clip_ratio/high_mean": 3.2706935598980635e-05, "clip_ratio/low_mean": 8.444509027564588e-05, "clip_ratio/low_min": 8.444509027564588e-05, "clip_ratio/region_mean": 0.00011715202587462652, "completions/clipped_ratio": 0.001953125, "completions/max_length": 732.5, "completions/max_terminated_length": 681.0, "completions/mean_length": 139.982421875, "completions/mean_terminated_length": 138.2721290588379, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.7911798564924134, "epoch": 0.00853383904231362, "frac_reward_zero_std": 0.1875, "grad_norm": 2.637077569961548, "kl": 0.022358925191737298, "learning_rate": 4.981267682080939e-07, "loss": 0.3828, "num_tokens": 3408743.0, "reward": 0.87890625, "reward_std": 0.3349350690841675, "rewards/equation_reward_func/mean": 0.01953125, "rewards/equation_reward_func/std": 0.13819488510489464, "rewards/format_reward_func/mean": 0.859375, "rewards/format_reward_func/std": 0.34379299730062485, "sampling/importance_sampling_ratio/max": 1.2771025598049164, "sampling/importance_sampling_ratio/mean": 1.0000105053186417, "sampling/importance_sampling_ratio/min": 0.7657925933599472, "sampling/sampling_logp_difference/max": 0.268058180809021, "sampling/sampling_logp_difference/mean": 0.01670322148129344, "step": 32 }, { "clip_ratio/high_max": 0.00010431134109644013, "clip_ratio/high_mean": 0.00010431134109644013, "clip_ratio/low_mean": 7.221281783939857e-05, "clip_ratio/low_min": 7.221281783939857e-05, "clip_ratio/region_mean": 0.0001765241589358387, "completions/clipped_ratio": 0.003125, "completions/max_length": 848.6, "completions/max_terminated_length": 773.0, "completions/mean_length": 133.2203125, "completions/mean_terminated_length": 130.4301315307617, "completions/min_length": 43.2, "completions/min_terminated_length": 43.2, "entropy": 0.7861367205364836, "epoch": 0.00906720398245822, "frac_reward_zero_std": 0.3, "grad_norm": 2.236886739730835, "kl": 0.038182342226112574, "learning_rate": 4.976608097127043e-07, "loss": 0.4325, "num_tokens": 3591660.0, "reward": 0.8640625, "reward_std": 0.2960824638605118, "rewards/equation_reward_func/mean": 0.009375, "rewards/equation_reward_func/std": 0.08515809774398804, "rewards/format_reward_func/mean": 0.8546875, "rewards/format_reward_func/std": 0.3516235172748566, "sampling/importance_sampling_ratio/max": 1.4304035663604737, "sampling/importance_sampling_ratio/mean": 0.9999574184417724, "sampling/importance_sampling_ratio/min": 0.699518883228302, "sampling/sampling_logp_difference/max": 0.4391578912734985, "sampling/sampling_logp_difference/mean": 0.01687545031309128, "step": 34 }, { "clip_ratio/high_max": 4.018020344018522e-05, "clip_ratio/high_mean": 4.018020344018522e-05, "clip_ratio/low_mean": 3.673622187408101e-05, "clip_ratio/low_min": 3.673622187408101e-05, "clip_ratio/region_mean": 7.691642531426623e-05, "completions/clipped_ratio": 0.009765625, "completions/max_length": 849.75, "completions/max_terminated_length": 737.0, "completions/mean_length": 134.04296875, "completions/mean_terminated_length": 125.25497436523438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.7617439021252923, "epoch": 0.00960056892260282, "frac_reward_zero_std": 0.359375, "grad_norm": 2.2578821182250977, "kl": 0.026364122996003263, "learning_rate": 4.97143418848077e-07, "loss": 0.6425, "num_tokens": 3738354.0, "reward": 0.908203125, "reward_std": 0.250680897384882, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0753208976238966, "rewards/format_reward_func/mean": 0.900390625, "rewards/format_reward_func/std": 0.29824410378932953, "sampling/importance_sampling_ratio/max": 1.3523357212543488, "sampling/importance_sampling_ratio/mean": 0.9997732192277908, "sampling/importance_sampling_ratio/min": 0.711165651679039, "sampling/sampling_logp_difference/max": 0.3560762405395508, "sampling/sampling_logp_difference/mean": 0.01586813572794199, "step": 36 }, { "clip_ratio/high_max": 3.3478166895090704e-05, "clip_ratio/high_mean": 3.3478166895090704e-05, "clip_ratio/low_mean": 6.741477773175575e-05, "clip_ratio/low_min": 6.741477773175575e-05, "clip_ratio/region_mean": 0.00010089294462684645, "completions/clipped_ratio": 0.0078125, "completions/max_length": 812.0, "completions/max_terminated_length": 692.6, "completions/mean_length": 125.8046875, "completions/mean_terminated_length": 118.735791015625, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "entropy": 0.7452815804216597, "epoch": 0.010133933862747422, "frac_reward_zero_std": 0.475, "grad_norm": 2.4362378120422363, "kl": 0.02234620086124374, "learning_rate": 4.965747030621286e-07, "loss": 0.6251, "num_tokens": 3916317.0, "reward": 0.9453125, "reward_std": 0.20868168473243714, "rewards/equation_reward_func/mean": 0.0171875, "rewards/equation_reward_func/std": 0.12823357731103896, "rewards/format_reward_func/mean": 0.928125, "rewards/format_reward_func/std": 0.2580002784729004, "sampling/importance_sampling_ratio/max": 1.3453300952911378, "sampling/importance_sampling_ratio/mean": 1.0000825643539428, "sampling/importance_sampling_ratio/min": 0.7632271409034729, "sampling/sampling_logp_difference/max": 0.32036001682281495, "sampling/sampling_logp_difference/mean": 0.016150952130556107, "step": 38 }, { "clip_ratio/high_max": 2.6781968901761706e-05, "clip_ratio/high_mean": 2.6781968901761706e-05, "clip_ratio/low_mean": 4.471394317483323e-05, "clip_ratio/low_min": 4.471394317483323e-05, "clip_ratio/region_mean": 7.149591207659493e-05, "completions/clipped_ratio": 0.005859375, "completions/max_length": 754.0, "completions/max_terminated_length": 551.5, "completions/mean_length": 110.4296875, "completions/mean_terminated_length": 105.04872703552246, "completions/min_length": 37.25, "completions/min_terminated_length": 37.25, "entropy": 0.7101358626451757, "epoch": 0.010667298802892022, "frac_reward_zero_std": 0.53125, "grad_norm": 2.222115993499756, "kl": 0.021090973996453814, "learning_rate": 4.959547804615562e-07, "loss": 0.3503, "num_tokens": 4050769.0, "reward": 0.966796875, "reward_std": 0.18099884688854218, "rewards/equation_reward_func/mean": 0.01953125, "rewards/equation_reward_func/std": 0.13819488510489464, "rewards/format_reward_func/mean": 0.947265625, "rewards/format_reward_func/std": 0.2233206108212471, "sampling/importance_sampling_ratio/max": 1.3199980556964874, "sampling/importance_sampling_ratio/mean": 1.0002519190311432, "sampling/importance_sampling_ratio/min": 0.7612035870552063, "sampling/sampling_logp_difference/max": 0.27983659505844116, "sampling/sampling_logp_difference/mean": 0.01587264542467892, "step": 40 }, { "clip_ratio/high_max": 7.222697846979524e-05, "clip_ratio/high_mean": 7.222697846979524e-05, "clip_ratio/low_mean": 2.2401433347517417e-05, "clip_ratio/low_min": 2.2401433347517417e-05, "clip_ratio/region_mean": 9.462841181731265e-05, "completions/clipped_ratio": 0.0015625, "completions/max_length": 740.2, "completions/max_terminated_length": 604.4, "completions/mean_length": 109.115625, "completions/mean_terminated_length": 107.67238006591796, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.6922806586242385, "epoch": 0.011200663743036625, "frac_reward_zero_std": 0.65, "grad_norm": 1.6084100008010864, "kl": 0.02318804946844466, "learning_rate": 4.952837797873106e-07, "loss": 0.2803, "num_tokens": 4218115.0, "reward": 0.9703125, "reward_std": 0.1344187319278717, "rewards/equation_reward_func/mean": 0.0140625, "rewards/equation_reward_func/std": 0.09021321386098861, "rewards/format_reward_func/mean": 0.95625, "rewards/format_reward_func/std": 0.20276073813438417, "sampling/importance_sampling_ratio/max": 1.33431236743927, "sampling/importance_sampling_ratio/mean": 0.9999107360839844, "sampling/importance_sampling_ratio/min": 0.6391365647315979, "sampling/sampling_logp_difference/max": 0.4481684923171997, "sampling/sampling_logp_difference/mean": 0.015980424545705318, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.2419878708509106e-05, "clip_ratio/low_min": 2.2419878708509106e-05, "clip_ratio/region_mean": 2.2419878708509106e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 667.5, "completions/max_terminated_length": 667.5, "completions/mean_length": 108.20703125, "completions/mean_terminated_length": 108.20703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.6913923049966494, "epoch": 0.011734028683181226, "frac_reward_zero_std": 0.75, "grad_norm": 1.4393494129180908, "kl": 0.03530516418524914, "learning_rate": 4.9456184038786e-07, "loss": 0.1187, "num_tokens": 4351717.0, "reward": 0.99609375, "reward_std": 0.0977855734527111, "rewards/equation_reward_func/mean": 0.01953125, "rewards/equation_reward_func/std": 0.13819488510489464, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.14740595407783985, "sampling/importance_sampling_ratio/max": 1.3048366606235504, "sampling/importance_sampling_ratio/mean": 1.0000732243061066, "sampling/importance_sampling_ratio/min": 0.722207173705101, "sampling/sampling_logp_difference/max": 0.3274679034948349, "sampling/sampling_logp_difference/mean": 0.016004684381186962, "step": 44 }, { "clip_ratio/high_max": 7.936507851506272e-06, "clip_ratio/high_mean": 7.936507851506272e-06, "clip_ratio/low_mean": 4.1295072555335035e-05, "clip_ratio/low_min": 4.1295072555335035e-05, "clip_ratio/region_mean": 4.9231580406841305e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 99.740625, "completions/mean_terminated_length": 99.740625, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "entropy": 0.6694079784469472, "epoch": 0.012267393623325826, "frac_reward_zero_std": 0.7875, "grad_norm": 1.2727700471878052, "kl": 0.026739204363871574, "learning_rate": 4.937891121902508e-07, "loss": 0.0974, "num_tokens": 4513111.0, "reward": 1.0015625, "reward_std": 0.07513009384274483, "rewards/equation_reward_func/mean": 0.0140625, "rewards/equation_reward_func/std": 0.11553467214107513, "rewards/format_reward_func/mean": 0.9875, "rewards/format_reward_func/std": 0.10831096172332763, "sampling/importance_sampling_ratio/max": 1.2640952825546266, "sampling/importance_sampling_ratio/mean": 0.9999603986740112, "sampling/importance_sampling_ratio/min": 0.6934172868728637, "sampling/sampling_logp_difference/max": 0.3729831695556641, "sampling/sampling_logp_difference/mean": 0.015973835811018945, "step": 46 }, { "clip_ratio/high_max": 1.332906766846362e-05, "clip_ratio/high_mean": 1.332906766846362e-05, "clip_ratio/low_mean": 4.297080482097549e-05, "clip_ratio/low_min": 4.297080482097549e-05, "clip_ratio/region_mean": 5.629987248943912e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 442.75, "completions/max_terminated_length": 442.75, "completions/mean_length": 94.974609375, "completions/mean_terminated_length": 94.974609375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.6421954420705637, "epoch": 0.012800758563470427, "frac_reward_zero_std": 0.734375, "grad_norm": 1.1685991287231445, "kl": 0.030390939183740154, "learning_rate": 4.929657556689726e-07, "loss": -0.0126, "num_tokens": 4639626.0, "reward": 1.01171875, "reward_std": 0.09818374924361706, "rewards/equation_reward_func/mean": 0.025390625, "rewards/equation_reward_func/std": 0.15643559210002422, "rewards/format_reward_func/mean": 0.986328125, "rewards/format_reward_func/std": 0.09803852252662182, "sampling/importance_sampling_ratio/max": 1.4218900501728058, "sampling/importance_sampling_ratio/mean": 0.9997434169054031, "sampling/importance_sampling_ratio/min": 0.7273384630680084, "sampling/sampling_logp_difference/max": 0.40461739897727966, "sampling/sampling_logp_difference/mean": 0.015656364848837256, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.3864070600312618e-05, "clip_ratio/low_min": 2.3864070600312618e-05, "clip_ratio/region_mean": 2.3864070600312618e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 515.4, "completions/max_terminated_length": 515.4, "completions/mean_length": 92.515625, "completions/mean_terminated_length": 92.515625, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "entropy": 0.6340925707999203, "epoch": 0.01333412350361503, "frac_reward_zero_std": 0.8375, "grad_norm": 1.1278184652328491, "kl": 0.03294617449217993, "learning_rate": 4.920919418126312e-07, "loss": 0.0304, "num_tokens": 4796420.0, "reward": 1.025, "reward_std": 0.06360324881970883, "rewards/equation_reward_func/mean": 0.0265625, "rewards/equation_reward_func/std": 0.15488989353179933, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.3638750076293946, "sampling/importance_sampling_ratio/mean": 1.0000147581100465, "sampling/importance_sampling_ratio/min": 0.681254529953003, "sampling/sampling_logp_difference/max": 0.42158195972442625, "sampling/sampling_logp_difference/mean": 0.015522122010588646, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.332843137594561e-05, "clip_ratio/low_min": 6.332843137594561e-05, "clip_ratio/region_mean": 6.332843137594561e-05, "completions/clipped_ratio": 0.001953125, "completions/max_length": 540.5, "completions/max_terminated_length": 419.5, "completions/mean_length": 97.3984375, "completions/mean_terminated_length": 95.6043758392334, "completions/min_length": 41.25, "completions/min_terminated_length": 41.25, "entropy": 0.6415038226793209, "epoch": 0.01386748844375963, "frac_reward_zero_std": 0.859375, "grad_norm": 0.9927432537078857, "kl": 0.03755232151080337, "learning_rate": 4.911678520884398e-07, "loss": 0.0805, "num_tokens": 4924560.0, "reward": 1.017578125, "reward_std": 0.053135840222239494, "rewards/equation_reward_func/mean": 0.01953125, "rewards/equation_reward_func/std": 0.11772368662059307, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.3248384892940521, "sampling/importance_sampling_ratio/mean": 0.999882698059082, "sampling/importance_sampling_ratio/min": 0.7175185084342957, "sampling/sampling_logp_difference/max": 0.3594819903373718, "sampling/sampling_logp_difference/mean": 0.015806726180016994, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.9849300012251155e-05, "clip_ratio/low_min": 2.9849300012251155e-05, "clip_ratio/region_mean": 2.9849300012251155e-05, "completions/clipped_ratio": 0.0046875, "completions/max_length": 777.0, "completions/max_terminated_length": 569.6, "completions/mean_length": 99.21875, "completions/mean_terminated_length": 94.85718536376953, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "entropy": 0.6518724225461483, "epoch": 0.014400853383904231, "frac_reward_zero_std": 0.75, "grad_norm": 1.5945581197738647, "kl": 0.03560998403312018, "learning_rate": 4.901936784045324e-07, "loss": 0.2677, "num_tokens": 5085996.0, "reward": 1.0203125, "reward_std": 0.09453916996717453, "rewards/equation_reward_func/mean": 0.0296875, "rewards/equation_reward_func/std": 0.16189142167568207, "rewards/format_reward_func/mean": 0.990625, "rewards/format_reward_func/std": 0.06075314879417419, "sampling/importance_sampling_ratio/max": 1.326232123374939, "sampling/importance_sampling_ratio/mean": 0.99991774559021, "sampling/importance_sampling_ratio/min": 0.7284057378768921, "sampling/sampling_logp_difference/max": 0.3198550701141357, "sampling/sampling_logp_difference/mean": 0.01613671872764826, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.3620558850881127e-05, "clip_ratio/low_min": 2.3620558850881127e-05, "clip_ratio/region_mean": 2.3620558850881127e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 462.5, "completions/max_terminated_length": 462.5, "completions/mean_length": 95.28515625, "completions/mean_terminated_length": 95.28515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.629227254125807, "epoch": 0.014934218324048832, "frac_reward_zero_std": 0.75, "grad_norm": 0.890274703502655, "kl": 0.03829095684017779, "learning_rate": 4.891696230701103e-07, "loss": 0.0697, "num_tokens": 5212894.0, "reward": 1.025390625, "reward_std": 0.09575232956558466, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1720482874661684, "rewards/format_reward_func/mean": 0.994140625, "rewards/format_reward_func/std": 0.05322381108999252, "sampling/importance_sampling_ratio/max": 1.358025699853897, "sampling/importance_sampling_ratio/mean": 0.9997666776180267, "sampling/importance_sampling_ratio/min": 0.7236018925905228, "sampling/sampling_logp_difference/max": 0.3506334722042084, "sampling/sampling_logp_difference/mean": 0.01580139296129346, "step": 56 }, { "clip_ratio/high_max": 4.6546614612452686e-05, "clip_ratio/high_mean": 4.6546614612452686e-05, "clip_ratio/low_mean": 5.537834416221206e-06, "clip_ratio/low_min": 5.537834416221206e-06, "clip_ratio/region_mean": 5.2084449028673894e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 534.8, "completions/max_terminated_length": 534.8, "completions/mean_length": 93.0078125, "completions/mean_terminated_length": 93.0078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.6247536558657885, "epoch": 0.015467583264193435, "frac_reward_zero_std": 0.775, "grad_norm": 1.8575509786605835, "kl": 0.04158723902785116, "learning_rate": 4.880958987534282e-07, "loss": 0.172, "num_tokens": 5370291.0, "reward": 1.0234375, "reward_std": 0.08843425288796425, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.16652367264032364, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.07793438732624054, "sampling/importance_sampling_ratio/max": 1.3377473592758178, "sampling/importance_sampling_ratio/mean": 1.00005099773407, "sampling/importance_sampling_ratio/min": 0.7097499012947083, "sampling/sampling_logp_difference/max": 0.3517502784729004, "sampling/sampling_logp_difference/mean": 0.015681001730263232, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.831049429614925e-05, "clip_ratio/low_min": 4.831049429614925e-05, "clip_ratio/region_mean": 4.831049429614925e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 305.5, "completions/max_terminated_length": 305.5, "completions/mean_length": 83.3046875, "completions/mean_terminated_length": 83.3046875, "completions/min_length": 36.5, "completions/min_terminated_length": 36.5, "entropy": 0.5815543766236968, "epoch": 0.016000948204338034, "frac_reward_zero_std": 0.75, "grad_norm": 1.0433164834976196, "kl": 0.14031794717690596, "learning_rate": 4.869727284376277e-07, "loss": 0.0137, "num_tokens": 5490911.0, "reward": 1.046875, "reward_std": 0.10034800879657269, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.20951511710882187, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3324269950389862, "sampling/importance_sampling_ratio/mean": 1.0000700801610947, "sampling/importance_sampling_ratio/min": 0.6867435872554779, "sampling/sampling_logp_difference/max": 0.3771655261516571, "sampling/sampling_logp_difference/mean": 0.015149830840528011, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.6009155489090415e-05, "clip_ratio/low_min": 2.6009155489090415e-05, "clip_ratio/region_mean": 2.6009155489090415e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 382.2, "completions/max_terminated_length": 382.2, "completions/mean_length": 84.815625, "completions/mean_terminated_length": 84.815625, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "entropy": 0.5775704857789807, "epoch": 0.016534313144482636, "frac_reward_zero_std": 0.7625, "grad_norm": 1.1085841655731201, "kl": 0.047807488305908113, "learning_rate": 4.858003453744314e-07, "loss": -0.0112, "num_tokens": 5642825.0, "reward": 1.04375, "reward_std": 0.09216970056295395, "rewards/equation_reward_func/mean": 0.04375, "rewards/equation_reward_func/std": 0.20302656292915344, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3588390588760375, "sampling/importance_sampling_ratio/mean": 1.0001762866973878, "sampling/importance_sampling_ratio/min": 0.7345640540122986, "sampling/sampling_logp_difference/max": 0.3220035552978516, "sampling/sampling_logp_difference/mean": 0.015222874283790589, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.360232767597254e-05, "clip_ratio/low_min": 7.360232767597254e-05, "clip_ratio/region_mean": 7.360232767597254e-05, "completions/clipped_ratio": 0.001953125, "completions/max_length": 608.75, "completions/max_terminated_length": 506.5, "completions/mean_length": 84.5234375, "completions/mean_terminated_length": 82.6848087310791, "completions/min_length": 39.5, "completions/min_terminated_length": 39.5, "entropy": 0.5398880841417445, "epoch": 0.01706767808462724, "frac_reward_zero_std": 0.8125, "grad_norm": 1.3397088050842285, "kl": 0.06067239720788267, "learning_rate": 4.845789930357016e-07, "loss": 0.1284, "num_tokens": 5764341.0, "reward": 1.02734375, "reward_std": 0.07312605250626802, "rewards/equation_reward_func/mean": 0.029296875, "rewards/equation_reward_func/std": 0.16634993068873882, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.4397042393684387, "sampling/importance_sampling_ratio/mean": 1.000122606754303, "sampling/importance_sampling_ratio/min": 0.7100281417369843, "sampling/sampling_logp_difference/max": 0.4158157557249069, "sampling/sampling_logp_difference/mean": 0.01495048706419766, "step": 64 }, { "clip_ratio/high_max": 5.6386516209588284e-05, "clip_ratio/high_mean": 5.6386516209588284e-05, "clip_ratio/low_mean": 7.478632485597498e-05, "clip_ratio/low_min": 7.478632485597498e-05, "clip_ratio/region_mean": 0.00013117284268244274, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 77.1109375, "completions/mean_terminated_length": 77.1109375, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "entropy": 0.5209556261284484, "epoch": 0.017601043024771838, "frac_reward_zero_std": 0.675, "grad_norm": 1.7486906051635742, "kl": 0.06048609020136711, "learning_rate": 4.833089250628786e-07, "loss": 0.0349, "num_tokens": 5911100.0, "reward": 1.0625, "reward_std": 0.13036474585533142, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.23395621329545974, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3723551034927368, "sampling/importance_sampling_ratio/mean": 0.9998387336730957, "sampling/importance_sampling_ratio/min": 0.7327517747879029, "sampling/sampling_logp_difference/max": 0.34424552917480467, "sampling/sampling_logp_difference/mean": 0.014390312135219574, "step": 66 }, { "clip_ratio/high_max": 9.397756204836899e-05, "clip_ratio/high_mean": 9.397756204836899e-05, "clip_ratio/low_mean": 2.2116065439250735e-05, "clip_ratio/low_min": 2.2116065439250735e-05, "clip_ratio/region_mean": 0.00011609362748761971, "completions/clipped_ratio": 0.0, "completions/max_length": 279.75, "completions/max_terminated_length": 279.75, "completions/mean_length": 76.70703125, "completions/mean_terminated_length": 76.70703125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.5340419264717234, "epoch": 0.01813440796491644, "frac_reward_zero_std": 0.6875, "grad_norm": 1.502469778060913, "kl": 0.06684589956421405, "learning_rate": 4.819904052143058e-07, "loss": -0.0125, "num_tokens": 6028710.0, "reward": 1.0546875, "reward_std": 0.12330006062984467, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.2276071086525917, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4370152056217194, "sampling/importance_sampling_ratio/mean": 1.0001281201839447, "sampling/importance_sampling_ratio/min": 0.6974522769451141, "sampling/sampling_logp_difference/max": 0.38355720043182373, "sampling/sampling_logp_difference/mean": 0.015159015310928226, "step": 68 }, { "clip_ratio/high_max": 2.5070196392738984e-05, "clip_ratio/high_mean": 2.5070196392738984e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.5070196392738984e-05, "completions/clipped_ratio": 0.0015625, "completions/max_length": 377.0, "completions/max_terminated_length": 243.8, "completions/mean_length": 76.10625, "completions/mean_terminated_length": 74.62754669189454, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "entropy": 0.5229987460705969, "epoch": 0.01866777290506104, "frac_reward_zero_std": 0.5875, "grad_norm": 2.1594698429107666, "kl": 0.07401664355873233, "learning_rate": 4.806237073104548e-07, "loss": 0.1009, "num_tokens": 6175042.0, "reward": 1.0875, "reward_std": 0.17360132932662964, "rewards/equation_reward_func/mean": 0.0890625, "rewards/equation_reward_func/std": 0.283771225810051, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.37129807472229, "sampling/importance_sampling_ratio/mean": 0.9997862100601196, "sampling/importance_sampling_ratio/min": 0.724765682220459, "sampling/sampling_logp_difference/max": 0.353516149520874, "sampling/sampling_logp_difference/mean": 0.014753091894090176, "step": 70 }, { "clip_ratio/high_max": 4.6763934531352585e-05, "clip_ratio/high_mean": 4.6763934531352585e-05, "clip_ratio/low_mean": 4.218439021820409e-05, "clip_ratio/low_min": 4.218439021820409e-05, "clip_ratio/region_mean": 8.894832474955668e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 200.75, "completions/max_terminated_length": 200.75, "completions/mean_length": 73.201171875, "completions/mean_terminated_length": 73.201171875, "completions/min_length": 36.5, "completions/min_terminated_length": 36.5, "entropy": 0.5062265004962683, "epoch": 0.01920113784520564, "frac_reward_zero_std": 0.671875, "grad_norm": 1.3941539525985718, "kl": 0.07746947697725975, "learning_rate": 4.792091151770602e-07, "loss": 0.0329, "num_tokens": 6290609.0, "reward": 1.08203125, "reward_std": 0.1396032553166151, "rewards/equation_reward_func/mean": 0.08203125, "rewards/equation_reward_func/std": 0.2689567133784294, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4342742264270782, "sampling/importance_sampling_ratio/mean": 0.9997692108154297, "sampling/importance_sampling_ratio/min": 0.7225751578807831, "sampling/sampling_logp_difference/max": 0.39524587988853455, "sampling/sampling_logp_difference/mean": 0.01487131672911346, "step": 72 }, { "clip_ratio/high_max": 2.581577832137959e-05, "clip_ratio/high_mean": 2.581577832137959e-05, "clip_ratio/low_mean": 4.858444865223848e-05, "clip_ratio/low_min": 4.858444865223848e-05, "clip_ratio/region_mean": 7.440022697361807e-05, "completions/clipped_ratio": 0.0015625, "completions/max_length": 423.6, "completions/max_terminated_length": 286.2, "completions/mean_length": 76.0296875, "completions/mean_terminated_length": 74.54901580810547, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "entropy": 0.5001937970519066, "epoch": 0.019734502785350244, "frac_reward_zero_std": 0.6875, "grad_norm": 1.5726782083511353, "kl": 0.07847756228875369, "learning_rate": 4.777469225861765e-07, "loss": 0.1762, "num_tokens": 6436932.0, "reward": 1.05625, "reward_std": 0.12709913849830629, "rewards/equation_reward_func/mean": 0.059375, "rewards/equation_reward_func/std": 0.2326520323753357, "rewards/format_reward_func/mean": 0.996875, "rewards/format_reward_func/std": 0.03535533845424652, "sampling/importance_sampling_ratio/max": 1.4298873901367188, "sampling/importance_sampling_ratio/mean": 0.999869453907013, "sampling/importance_sampling_ratio/min": 0.6843819975852966, "sampling/sampling_logp_difference/max": 0.4139400660991669, "sampling/sampling_logp_difference/mean": 0.014392230287194252, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.4366472239813043e-05, "clip_ratio/low_min": 2.4366472239813043e-05, "clip_ratio/region_mean": 2.4366472239813043e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 260.75, "completions/max_terminated_length": 260.75, "completions/mean_length": 75.982421875, "completions/mean_terminated_length": 75.982421875, "completions/min_length": 37.5, "completions/min_terminated_length": 37.5, "entropy": 0.4951359668953551, "epoch": 0.020267867725494843, "frac_reward_zero_std": 0.640625, "grad_norm": 1.4562492370605469, "kl": 0.09235726988926116, "learning_rate": 4.762374331951703e-07, "loss": 0.0057, "num_tokens": 6553947.0, "reward": 1.083984375, "reward_std": 0.15524747781455517, "rewards/equation_reward_func/mean": 0.083984375, "rewards/equation_reward_func/std": 0.27071787044405937, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4269471764564514, "sampling/importance_sampling_ratio/mean": 0.9999112039804459, "sampling/importance_sampling_ratio/min": 0.7220955342054367, "sampling/sampling_logp_difference/max": 0.3707742691040039, "sampling/sampling_logp_difference/mean": 0.014429742936044931, "step": 76 }, { "clip_ratio/high_max": 6.955442303377722e-05, "clip_ratio/high_mean": 6.955442303377722e-05, "clip_ratio/low_mean": 0.000101867288726175, "clip_ratio/low_min": 0.000101867288726175, "clip_ratio/region_mean": 0.00017142171175995222, "completions/clipped_ratio": 0.0015625, "completions/max_length": 387.8, "completions/max_terminated_length": 244.4, "completions/mean_length": 75.340625, "completions/mean_terminated_length": 73.85445404052734, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.5044450511535009, "epoch": 0.020801232665639446, "frac_reward_zero_std": 0.6, "grad_norm": 1.8438678979873657, "kl": 0.10730917653482822, "learning_rate": 4.7468096048365814e-07, "loss": 0.1229, "num_tokens": 6699805.0, "reward": 1.1078125, "reward_std": 0.1811378538608551, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.3108581602573395, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.399577260017395, "sampling/importance_sampling_ratio/mean": 0.9996798276901245, "sampling/importance_sampling_ratio/min": 0.7104148745536805, "sampling/sampling_logp_difference/max": 0.38080081939697263, "sampling/sampling_logp_difference/mean": 0.014719891734421254, "step": 78 }, { "clip_ratio/high_max": 1.6613502844443752e-05, "clip_ratio/high_mean": 1.6613502844443752e-05, "clip_ratio/low_mean": 2.8696050220686528e-05, "clip_ratio/low_min": 2.8696050220686528e-05, "clip_ratio/region_mean": 4.530955306513028e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 309.75, "completions/max_terminated_length": 309.75, "completions/mean_length": 74.994140625, "completions/mean_terminated_length": 74.994140625, "completions/min_length": 40.75, "completions/min_terminated_length": 40.75, "entropy": 0.482443464299043, "epoch": 0.021334597605784045, "frac_reward_zero_std": 0.546875, "grad_norm": 1.993070363998413, "kl": 0.1494190221807609, "learning_rate": 4.730778276884061e-07, "loss": 0.0166, "num_tokens": 6816402.0, "reward": 1.08984375, "reward_std": 0.18892474845051765, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29158225655555725, "rewards/format_reward_func/mean": 0.99609375, "rewards/format_reward_func/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 1.506881058216095, "sampling/importance_sampling_ratio/mean": 1.0001471191644669, "sampling/importance_sampling_ratio/min": 0.7001816779375076, "sampling/sampling_logp_difference/max": 0.41368168592453003, "sampling/sampling_logp_difference/mean": 0.01445120107382536, "step": 80 }, { "clip_ratio/high_max": 4.6539518128459655e-05, "clip_ratio/high_mean": 4.6539518128459655e-05, "clip_ratio/low_mean": 0.00014464352312239094, "clip_ratio/low_min": 0.00014464352312239094, "clip_ratio/region_mean": 0.0001911830412508506, "completions/clipped_ratio": 0.0015625, "completions/max_length": 378.2, "completions/max_terminated_length": 209.4, "completions/mean_length": 74.1296875, "completions/mean_terminated_length": 72.63872985839843, "completions/min_length": 31.4, "completions/min_terminated_length": 31.4, "entropy": 0.48503100499510765, "epoch": 0.021867962545928647, "frac_reward_zero_std": 0.525, "grad_norm": 1.7124360799789429, "kl": 0.1061913346638903, "learning_rate": 4.7142836773620227e-07, "loss": 0.1111, "num_tokens": 6961357.0, "reward": 1.10625, "reward_std": 0.19822016656398772, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.3043874204158783, "rewards/format_reward_func/mean": 0.996875, "rewards/format_reward_func/std": 0.03535533845424652, "sampling/importance_sampling_ratio/max": 1.3922384262084961, "sampling/importance_sampling_ratio/mean": 1.0000276207923888, "sampling/importance_sampling_ratio/min": 0.6917226910591125, "sampling/sampling_logp_difference/max": 0.390087628364563, "sampling/sampling_logp_difference/mean": 0.014520143903791905, "step": 82 }, { "clip_ratio/high_max": 1.9452225792014764e-05, "clip_ratio/high_mean": 1.9452225792014764e-05, "clip_ratio/low_mean": 1.3303533099436512e-05, "clip_ratio/low_min": 1.3303533099436512e-05, "clip_ratio/region_mean": 3.2755758891451275e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 285.75, "completions/max_terminated_length": 285.75, "completions/mean_length": 74.244140625, "completions/mean_terminated_length": 74.244140625, "completions/min_length": 42.25, "completions/min_terminated_length": 42.25, "entropy": 0.4775158947126733, "epoch": 0.02240132748607325, "frac_reward_zero_std": 0.65625, "grad_norm": 1.6565135717391968, "kl": 0.09978139868730472, "learning_rate": 4.6973292317471635e-07, "loss": 0.0196, "num_tokens": 7077450.0, "reward": 1.09375, "reward_std": 0.14795516058802605, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29067372530698776, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4083246290683746, "sampling/importance_sampling_ratio/mean": 1.0001163333654404, "sampling/importance_sampling_ratio/min": 0.686163455247879, "sampling/sampling_logp_difference/max": 0.40500450134277344, "sampling/sampling_logp_difference/mean": 0.014088437426835299, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.572498806338344e-05, "clip_ratio/low_min": 7.572498806338344e-05, "clip_ratio/region_mean": 7.572498806338344e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 285.6, "completions/max_terminated_length": 285.6, "completions/mean_length": 73.8625, "completions/mean_terminated_length": 73.8625, "completions/min_length": 38.8, "completions/min_terminated_length": 38.8, "entropy": 0.47260352762209046, "epoch": 0.02293469242621785, "frac_reward_zero_std": 0.5875, "grad_norm": 1.9618802070617676, "kl": 0.11411506514478889, "learning_rate": 4.679918461013627e-07, "loss": -0.0089, "num_tokens": 7222298.0, "reward": 1.096875, "reward_std": 0.1768060728907585, "rewards/equation_reward_func/mean": 0.0984375, "rewards/equation_reward_func/std": 0.2915918380022049, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.475810742378235, "sampling/importance_sampling_ratio/mean": 1.0003753423690795, "sampling/importance_sampling_ratio/min": 0.7039525270462036, "sampling/sampling_logp_difference/max": 0.4118189811706543, "sampling/sampling_logp_difference/mean": 0.013945561647415162, "step": 86 }, { "clip_ratio/high_max": 5.374506387549142e-05, "clip_ratio/high_mean": 5.374506387549142e-05, "clip_ratio/low_mean": 0.0001103352874957232, "clip_ratio/low_min": 0.0001103352874957232, "clip_ratio/region_mean": 0.0001640803513712146, "completions/clipped_ratio": 0.0, "completions/max_length": 231.75, "completions/max_terminated_length": 231.75, "completions/mean_length": 73.8515625, "completions/mean_terminated_length": 73.8515625, "completions/min_length": 40.25, "completions/min_terminated_length": 40.25, "entropy": 0.47108812775048947, "epoch": 0.02346805736636245, "frac_reward_zero_std": 0.609375, "grad_norm": 1.8041415214538574, "kl": 0.11021246168658966, "learning_rate": 4.6620549809017885e-07, "loss": 0.0579, "num_tokens": 7338526.0, "reward": 1.125, "reward_std": 0.17905130330473185, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.32874006032943726, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4289512038230896, "sampling/importance_sampling_ratio/mean": 0.9999499619007111, "sampling/importance_sampling_ratio/min": 0.6994078606367111, "sampling/sampling_logp_difference/max": 0.3902685344219208, "sampling/sampling_logp_difference/mean": 0.014488923363387585, "step": 88 }, { "clip_ratio/high_max": 0.00010049852426163852, "clip_ratio/high_mean": 0.00010049852426163852, "clip_ratio/low_mean": 9.273409604146663e-05, "clip_ratio/low_min": 9.273409604146663e-05, "clip_ratio/region_mean": 0.00019323262030310516, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 71.3265625, "completions/mean_terminated_length": 71.3265625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "entropy": 0.4535586033016443, "epoch": 0.024001422306507054, "frac_reward_zero_std": 0.5125, "grad_norm": 2.8394386768341064, "kl": 0.17492727172147068, "learning_rate": 4.643742501167366e-07, "loss": 0.0157, "num_tokens": 7481751.0, "reward": 1.13125, "reward_std": 0.21978105306625367, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.33758683800697326, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.410378885269165, "sampling/importance_sampling_ratio/mean": 1.0000622272491455, "sampling/importance_sampling_ratio/min": 0.716123902797699, "sampling/sampling_logp_difference/max": 0.4100630283355713, "sampling/sampling_logp_difference/mean": 0.013988791592419147, "step": 90 }, { "clip_ratio/high_max": 1.7898052141794728e-05, "clip_ratio/high_mean": 1.7898052141794728e-05, "clip_ratio/low_mean": 7.572902541141957e-05, "clip_ratio/low_min": 7.572902541141957e-05, "clip_ratio/region_mean": 9.36270775532143e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 70.310546875, "completions/mean_terminated_length": 70.310546875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.44293284188542104, "epoch": 0.024534787246651653, "frac_reward_zero_std": 0.75, "grad_norm": 1.4256576299667358, "kl": 0.12838838810825515, "learning_rate": 4.624984824811006e-07, "loss": 0.0425, "num_tokens": 7595910.0, "reward": 1.06640625, "reward_std": 0.10915425233542919, "rewards/equation_reward_func/mean": 0.06640625, "rewards/equation_reward_func/std": 0.2493438720703125, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3955857455730438, "sampling/importance_sampling_ratio/mean": 1.0001295506954193, "sampling/importance_sampling_ratio/min": 0.72032031416893, "sampling/sampling_logp_difference/max": 0.348740816116333, "sampling/sampling_logp_difference/mean": 0.013854713412001729, "step": 92 }, { "clip_ratio/high_max": 0.00012608158512092713, "clip_ratio/high_mean": 0.00012608158512092713, "clip_ratio/low_mean": 0.00012443958096102707, "clip_ratio/low_min": 0.00012443958096102707, "clip_ratio/region_mean": 0.0002505211660819542, "completions/clipped_ratio": 0.0, "completions/max_length": 197.2, "completions/max_terminated_length": 197.2, "completions/mean_length": 70.0375, "completions/mean_terminated_length": 70.0375, "completions/min_length": 37.6, "completions/min_terminated_length": 37.6, "entropy": 0.4572728584623999, "epoch": 0.025068152186796255, "frac_reward_zero_std": 0.625, "grad_norm": 1.6360570192337036, "kl": 0.1268385768764549, "learning_rate": 4.605785847288502e-07, "loss": 0.0076, "num_tokens": 7738518.0, "reward": 1.0828125, "reward_std": 0.15850431323051453, "rewards/equation_reward_func/mean": 0.084375, "rewards/equation_reward_func/std": 0.2756199300289154, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.3934306383132935, "sampling/importance_sampling_ratio/mean": 1.0001968026161194, "sampling/importance_sampling_ratio/min": 0.7068123102188111, "sampling/sampling_logp_difference/max": 0.36842348575592043, "sampling/sampling_logp_difference/mean": 0.013838604465126992, "step": 94 }, { "clip_ratio/high_max": 5.269061431236979e-05, "clip_ratio/high_mean": 5.269061431236979e-05, "clip_ratio/low_mean": 0.00017156386456917971, "clip_ratio/low_min": 0.00017156386456917971, "clip_ratio/region_mean": 0.0002242544788815495, "completions/clipped_ratio": 0.0, "completions/max_length": 241.25, "completions/max_terminated_length": 241.25, "completions/mean_length": 68.80859375, "completions/mean_terminated_length": 68.80859375, "completions/min_length": 36.25, "completions/min_terminated_length": 36.25, "entropy": 0.4216196187254455, "epoch": 0.025601517126940854, "frac_reward_zero_std": 0.6875, "grad_norm": 1.8514827489852905, "kl": 0.1491334587852988, "learning_rate": 4.5861495557018206e-07, "loss": 0.0113, "num_tokens": 7852020.0, "reward": 1.09765625, "reward_std": 0.14091254770755768, "rewards/equation_reward_func/mean": 0.099609375, "rewards/equation_reward_func/std": 0.2874883860349655, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.3814985752105713, "sampling/importance_sampling_ratio/mean": 1.0000580251216888, "sampling/importance_sampling_ratio/min": 0.6413320899009705, "sampling/sampling_logp_difference/max": 0.4490201771259308, "sampling/sampling_logp_difference/mean": 0.013374796835705638, "step": 96 }, { "clip_ratio/high_max": 0.00013577588672180555, "clip_ratio/high_mean": 0.00013577588672180555, "clip_ratio/low_mean": 0.00012991529203847877, "clip_ratio/low_min": 0.00012991529203847877, "clip_ratio/region_mean": 0.0002656911787602843, "completions/clipped_ratio": 0.0, "completions/max_length": 203.8, "completions/max_terminated_length": 203.8, "completions/mean_length": 67.6140625, "completions/mean_terminated_length": 67.6140625, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "entropy": 0.4183834062682258, "epoch": 0.026134882067085457, "frac_reward_zero_std": 0.5125, "grad_norm": 2.063455104827881, "kl": 0.16493767727580336, "learning_rate": 4.566080027971082e-07, "loss": 0.0319, "num_tokens": 7992917.0, "reward": 1.1265625, "reward_std": 0.21557478755712509, "rewards/equation_reward_func/mean": 0.1265625, "rewards/equation_reward_func/std": 0.3245864808559418, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4088326930999755, "sampling/importance_sampling_ratio/mean": 0.9999056816101074, "sampling/importance_sampling_ratio/min": 0.6855303049087524, "sampling/sampling_logp_difference/max": 0.4068246364593506, "sampling/sampling_logp_difference/mean": 0.01259845271706581, "step": 98 }, { "clip_ratio/high_max": 5.181487035264986e-05, "clip_ratio/high_mean": 5.181487035264986e-05, "clip_ratio/low_mean": 0.00016702881677904062, "clip_ratio/low_min": 0.00016702881677904062, "clip_ratio/region_mean": 0.00021884368713169047, "completions/clipped_ratio": 0.0, "completions/max_length": 149.5, "completions/max_terminated_length": 149.5, "completions/mean_length": 65.58984375, "completions/mean_terminated_length": 65.58984375, "completions/min_length": 39.25, "completions/min_terminated_length": 39.25, "entropy": 0.3843678143910236, "epoch": 0.02666824700723006, "frac_reward_zero_std": 0.578125, "grad_norm": 2.063755989074707, "kl": 0.1375926880993777, "learning_rate": 4.545581431987694e-07, "loss": -0.0061, "num_tokens": 8104539.0, "reward": 1.1484375, "reward_std": 0.18550293892621994, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.3524423688650131, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.477320522069931, "sampling/importance_sampling_ratio/mean": 1.0000247359275818, "sampling/importance_sampling_ratio/min": 0.7372096627950668, "sampling/sampling_logp_difference/max": 0.4059351086616516, "sampling/sampling_logp_difference/mean": 0.01236387575045228, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 9.137515250282983e-05, "clip_ratio/low_min": 9.137515250282983e-05, "clip_ratio/region_mean": 9.137515250282983e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 140.4, "completions/max_terminated_length": 140.4, "completions/mean_length": 65.0921875, "completions/mean_terminated_length": 65.0921875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.38236751272860503, "epoch": 0.02720161194737466, "frac_reward_zero_std": 0.625, "grad_norm": 1.9620100259780884, "kl": 0.15543709245199958, "learning_rate": 4.5246580247487933e-07, "loss": 0.0521, "num_tokens": 8243750.0, "reward": 1.115625, "reward_std": 0.16639700382947922, "rewards/equation_reward_func/mean": 0.115625, "rewards/equation_reward_func/std": 0.31528539061546323, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.41611750125885, "sampling/importance_sampling_ratio/mean": 0.9999123454093933, "sampling/importance_sampling_ratio/min": 0.720720911026001, "sampling/sampling_logp_difference/max": 0.3845190525054932, "sampling/sampling_logp_difference/mean": 0.012204485386610031, "step": 102 }, { "clip_ratio/high_max": 6.01250600690643e-05, "clip_ratio/high_mean": 6.01250600690643e-05, "clip_ratio/low_mean": 5.751092814736896e-05, "clip_ratio/low_min": 5.751092814736896e-05, "clip_ratio/region_mean": 0.00011763598821643327, "completions/clipped_ratio": 0.0, "completions/max_length": 281.25, "completions/max_terminated_length": 281.25, "completions/mean_length": 63.03125, "completions/mean_terminated_length": 63.03125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.35578649139238727, "epoch": 0.02773497688751926, "frac_reward_zero_std": 0.65625, "grad_norm": 1.6827569007873535, "kl": 0.17754504539900356, "learning_rate": 4.5033141514731786e-07, "loss": 0.0038, "num_tokens": 8354086.0, "reward": 1.142578125, "reward_std": 0.15537971258163452, "rewards/equation_reward_func/mean": 0.142578125, "rewards/equation_reward_func/std": 0.3468663841485977, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3807437717914581, "sampling/importance_sampling_ratio/mean": 1.0001117289066315, "sampling/importance_sampling_ratio/min": 0.685884028673172, "sampling/sampling_logp_difference/max": 0.41647469997406006, "sampling/sampling_logp_difference/mean": 0.011777903651818633, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.948756976674e-05, "clip_ratio/low_min": 5.948756976674e-05, "clip_ratio/region_mean": 5.948756976674e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 124.8, "completions/max_terminated_length": 124.8, "completions/mean_length": 62.121875, "completions/mean_terminated_length": 62.121875, "completions/min_length": 40.4, "completions/min_terminated_length": 40.4, "entropy": 0.3208046312340432, "epoch": 0.02826834182766386, "frac_reward_zero_std": 0.6625, "grad_norm": 1.7315077781677246, "kl": 0.24651829375781947, "learning_rate": 4.4815542446989373e-07, "loss": -0.0047, "num_tokens": 8491732.0, "reward": 1.1078125, "reward_std": 0.14840333461761473, "rewards/equation_reward_func/mean": 0.1078125, "rewards/equation_reward_func/std": 0.3044365972280502, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.445423173904419, "sampling/importance_sampling_ratio/mean": 0.99986732006073, "sampling/importance_sampling_ratio/min": 0.6828996181488037, "sampling/sampling_logp_difference/max": 0.4102823257446289, "sampling/sampling_logp_difference/mean": 0.011591934971511364, "step": 106 }, { "clip_ratio/high_max": 2.744839811283681e-05, "clip_ratio/high_mean": 2.744839811283681e-05, "clip_ratio/low_mean": 8.609110631773042e-05, "clip_ratio/low_min": 8.609110631773042e-05, "clip_ratio/region_mean": 0.00011353950443056722, "completions/clipped_ratio": 0.0, "completions/max_length": 118.75, "completions/max_terminated_length": 118.75, "completions/mean_length": 60.416015625, "completions/mean_terminated_length": 60.416015625, "completions/min_length": 40.5, "completions/min_terminated_length": 40.5, "entropy": 0.3034154197408093, "epoch": 0.028801706767808462, "frac_reward_zero_std": 0.59375, "grad_norm": 1.7046583890914917, "kl": 0.15775487199425697, "learning_rate": 4.4593828233629214e-07, "loss": 0.0037, "num_tokens": 8600737.0, "reward": 1.16796875, "reward_std": 0.18431037291884422, "rewards/equation_reward_func/mean": 0.16796875, "rewards/equation_reward_func/std": 0.3659972548484802, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.374863475561142, "sampling/importance_sampling_ratio/mean": 0.9999902546405792, "sampling/importance_sampling_ratio/min": 0.6491745263338089, "sampling/sampling_logp_difference/max": 0.4482293128967285, "sampling/sampling_logp_difference/mean": 0.010837621288374066, "step": 108 }, { "clip_ratio/high_max": 0.000122115258515502, "clip_ratio/high_mean": 0.000122115258515502, "clip_ratio/low_mean": 8.306467659874923e-05, "clip_ratio/low_min": 8.306467659874923e-05, "clip_ratio/region_mean": 0.00020517993511425125, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 59.4140625, "completions/mean_terminated_length": 59.4140625, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "entropy": 0.2971830885443423, "epoch": 0.029335071707953065, "frac_reward_zero_std": 0.6, "grad_norm": 2.707641363143921, "kl": 0.17208816783709657, "learning_rate": 4.4368044918622893e-07, "loss": -0.0084, "num_tokens": 8736090.0, "reward": 1.1875, "reward_std": 0.19090495258569717, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.37982410192489624, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3816960096359252, "sampling/importance_sampling_ratio/mean": 1.000107967853546, "sampling/importance_sampling_ratio/min": 0.6941247701644897, "sampling/sampling_logp_difference/max": 0.3912020683288574, "sampling/sampling_logp_difference/mean": 0.010582260973751544, "step": 110 }, { "clip_ratio/high_max": 3.142282690128518e-05, "clip_ratio/high_mean": 3.142282690128518e-05, "clip_ratio/low_mean": 6.160811042516596e-05, "clip_ratio/low_min": 6.160811042516596e-05, "clip_ratio/region_mean": 9.303093732645114e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 97.75, "completions/max_terminated_length": 97.75, "completions/mean_length": 58.693359375, "completions/mean_terminated_length": 58.693359375, "completions/min_length": 39.75, "completions/min_terminated_length": 39.75, "entropy": 0.2775607474355234, "epoch": 0.029868436648097664, "frac_reward_zero_std": 0.40625, "grad_norm": 1.8576958179473877, "kl": 0.16128988765800992, "learning_rate": 4.4138239390983e-07, "loss": 0.0245, "num_tokens": 8844005.0, "reward": 1.24609375, "reward_std": 0.2744579315185547, "rewards/equation_reward_func/mean": 0.24609375, "rewards/equation_reward_func/std": 0.416913166642189, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4538166522979736, "sampling/importance_sampling_ratio/mean": 0.999872162938118, "sampling/importance_sampling_ratio/min": 0.7110061496496201, "sampling/sampling_logp_difference/max": 0.39000406861305237, "sampling/sampling_logp_difference/mean": 0.010843092808499932, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 3.156565556613108e-05, "clip_ratio/low_min": 3.156565556613108e-05, "clip_ratio/region_mean": 3.156565556613108e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 144.4, "completions/max_terminated_length": 144.4, "completions/mean_length": 61.1703125, "completions/mean_terminated_length": 61.1703125, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "entropy": 0.28720159942491186, "epoch": 0.030401801588242267, "frac_reward_zero_std": 0.6625, "grad_norm": 1.5062352418899536, "kl": 0.19503807400663695, "learning_rate": 4.390445937502557e-07, "loss": -0.0092, "num_tokens": 8981146.0, "reward": 1.1140625, "reward_std": 0.1491337910294533, "rewards/equation_reward_func/mean": 0.1140625, "rewards/equation_reward_func/std": 0.31090696156024933, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5016798496246337, "sampling/importance_sampling_ratio/mean": 0.9999350905418396, "sampling/importance_sampling_ratio/min": 0.621754503250122, "sampling/sampling_logp_difference/max": 0.5074280261993408, "sampling/sampling_logp_difference/mean": 0.010939785093069077, "step": 114 }, { "clip_ratio/high_max": 2.9932950080061953e-05, "clip_ratio/high_mean": 2.9932950080061953e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9932950080061953e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 120.5, "completions/max_terminated_length": 120.5, "completions/mean_length": 60.384765625, "completions/mean_terminated_length": 60.384765625, "completions/min_length": 38.75, "completions/min_terminated_length": 38.75, "entropy": 0.2791112618934777, "epoch": 0.03093516652838687, "frac_reward_zero_std": 0.703125, "grad_norm": 1.997960090637207, "kl": 0.17415831098333, "learning_rate": 4.3666753420459023e-07, "loss": -0.0004, "num_tokens": 9090295.0, "reward": 1.125, "reward_std": 0.13138357549905777, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3110863082110882, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3373797833919525, "sampling/importance_sampling_ratio/mean": 1.0000529140233994, "sampling/importance_sampling_ratio/min": 0.7275194972753525, "sampling/sampling_logp_difference/max": 0.3365427851676941, "sampling/sampling_logp_difference/mean": 0.010237103793770075, "step": 116 }, { "clip_ratio/high_max": 5.673566031166249e-05, "clip_ratio/high_mean": 5.673566031166249e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.673566031166249e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 110.4, "completions/max_terminated_length": 110.4, "completions/mean_length": 60.403125, "completions/mean_terminated_length": 60.403125, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "entropy": 0.2734551280736923, "epoch": 0.03146853146853147, "frac_reward_zero_std": 0.7, "grad_norm": 1.4753539562225342, "kl": 0.16841050651338366, "learning_rate": 4.3425170892301764e-07, "loss": 0.0021, "num_tokens": 9226665.0, "reward": 1.1484375, "reward_std": 0.13497940748929976, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.348021799325943, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.469872283935547, "sampling/importance_sampling_ratio/mean": 1.0000021576881408, "sampling/importance_sampling_ratio/min": 0.6969752192497254, "sampling/sampling_logp_difference/max": 0.43164453506469724, "sampling/sampling_logp_difference/mean": 0.010616865567862987, "step": 118 }, { "clip_ratio/high_max": 6.0689839301630855e-05, "clip_ratio/high_mean": 6.0689839301630855e-05, "clip_ratio/low_mean": 5.96018443401489e-05, "clip_ratio/low_min": 5.96018443401489e-05, "clip_ratio/region_mean": 0.00012029168364177976, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 60.349609375, "completions/mean_terminated_length": 60.349609375, "completions/min_length": 41.75, "completions/min_terminated_length": 41.75, "entropy": 0.27133370677216184, "epoch": 0.03200189640867607, "frac_reward_zero_std": 0.6875, "grad_norm": 1.87366783618927, "kl": 0.18455991433519456, "learning_rate": 4.3179761960630357e-07, "loss": 0.0114, "num_tokens": 9335604.0, "reward": 1.119140625, "reward_std": 0.13578890450298786, "rewards/equation_reward_func/mean": 0.119140625, "rewards/equation_reward_func/std": 0.3225974068045616, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3893374800682068, "sampling/importance_sampling_ratio/mean": 1.0000585615634918, "sampling/importance_sampling_ratio/min": 0.6728315651416779, "sampling/sampling_logp_difference/max": 0.39994753897190094, "sampling/sampling_logp_difference/mean": 0.010147883323952556, "step": 120 }, { "clip_ratio/high_max": 2.9056252161454824e-05, "clip_ratio/high_mean": 2.9056252161454824e-05, "clip_ratio/low_mean": 2.9178339496461882e-05, "clip_ratio/low_min": 2.9178339496461882e-05, "clip_ratio/region_mean": 5.82345916579167e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 101.2, "completions/max_terminated_length": 101.2, "completions/mean_length": 59.4765625, "completions/mean_terminated_length": 59.4765625, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "entropy": 0.2625521798100736, "epoch": 0.03253526134882067, "frac_reward_zero_std": 0.625, "grad_norm": 1.85478937625885, "kl": 0.23499730358728105, "learning_rate": 4.293057759016063e-07, "loss": -0.0261, "num_tokens": 9471245.0, "reward": 1.1546875, "reward_std": 0.17738013863563537, "rewards/equation_reward_func/mean": 0.1546875, "rewards/equation_reward_func/std": 0.3517531335353851, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.410482120513916, "sampling/importance_sampling_ratio/mean": 0.9999771833419799, "sampling/importance_sampling_ratio/min": 0.6693259477615356, "sampling/sampling_logp_difference/max": 0.4148847222328186, "sampling/sampling_logp_difference/mean": 0.0104746263474226, "step": 122 }, { "clip_ratio/high_max": 3.0458089895546436e-05, "clip_ratio/high_mean": 3.0458089895546436e-05, "clip_ratio/low_mean": 6.091617979109287e-05, "clip_ratio/low_min": 6.091617979109287e-05, "clip_ratio/region_mean": 9.137426968663931e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 124.5, "completions/max_terminated_length": 124.5, "completions/mean_length": 59.26171875, "completions/mean_terminated_length": 59.26171875, "completions/min_length": 39.25, "completions/min_terminated_length": 39.25, "entropy": 0.26121031689561075, "epoch": 0.03306862628896527, "frac_reward_zero_std": 0.59375, "grad_norm": 1.7555099725723267, "kl": 0.4429074769011802, "learning_rate": 4.2677669529663686e-07, "loss": 0.0123, "num_tokens": 9579579.0, "reward": 1.17578125, "reward_std": 0.18740076199173927, "rewards/equation_reward_func/mean": 0.17578125, "rewards/equation_reward_func/std": 0.3811173066496849, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3602531552314758, "sampling/importance_sampling_ratio/mean": 1.0000222623348236, "sampling/importance_sampling_ratio/min": 0.6435240805149078, "sampling/sampling_logp_difference/max": 0.44846171140670776, "sampling/sampling_logp_difference/mean": 0.0106544173322618, "step": 124 }, { "clip_ratio/high_max": 5.8355657125098835e-05, "clip_ratio/high_mean": 5.8355657125098835e-05, "clip_ratio/low_mean": 0.00019639414515242807, "clip_ratio/low_min": 0.00019639414515242807, "clip_ratio/region_mean": 0.00025474980227752693, "completions/clipped_ratio": 0.0, "completions/max_length": 111.6, "completions/max_terminated_length": 111.6, "completions/mean_length": 60.3484375, "completions/mean_terminated_length": 60.3484375, "completions/min_length": 38.2, "completions/min_terminated_length": 38.2, "entropy": 0.26713117129272884, "epoch": 0.033601991229109875, "frac_reward_zero_std": 0.525, "grad_norm": 1.6895452737808228, "kl": 0.4376677656546235, "learning_rate": 4.2421090301219077e-07, "loss": 0.0083, "num_tokens": 9715810.0, "reward": 1.2203125, "reward_std": 0.2141014516353607, "rewards/equation_reward_func/mean": 0.2203125, "rewards/equation_reward_func/std": 0.4053365707397461, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2952686071395874, "sampling/importance_sampling_ratio/mean": 0.9998293399810791, "sampling/importance_sampling_ratio/min": 0.6135809898376465, "sampling/sampling_logp_difference/max": 0.4893936634063721, "sampling/sampling_logp_difference/mean": 0.010468007065355778, "step": 126 }, { "clip_ratio/high_max": 0.00012416387390759256, "clip_ratio/high_mean": 0.00012416387390759256, "clip_ratio/low_mean": 8.608136300204528e-05, "clip_ratio/low_min": 8.608136300204528e-05, "clip_ratio/region_mean": 0.00021024523690963784, "completions/clipped_ratio": 0.0, "completions/max_length": 94.5, "completions/max_terminated_length": 94.5, "completions/mean_length": 60.478515625, "completions/mean_terminated_length": 60.478515625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.2529927268624306, "epoch": 0.03413535616925448, "frac_reward_zero_std": 0.609375, "grad_norm": 1.7231546640396118, "kl": 0.16552753146323892, "learning_rate": 4.216089318930741e-07, "loss": 0.0073, "num_tokens": 9825095.0, "reward": 1.1640625, "reward_std": 0.17536746710538864, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.362117238342762, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.350740671157837, "sampling/importance_sampling_ratio/mean": 0.9999200105667114, "sampling/importance_sampling_ratio/min": 0.6580379158258438, "sampling/sampling_logp_difference/max": 0.4415377974510193, "sampling/sampling_logp_difference/mean": 0.010004544630646706, "step": 128 }, { "clip_ratio/high_max": 3.019323533711334e-05, "clip_ratio/high_mean": 3.019323533711334e-05, "clip_ratio/low_mean": 8.618442775008993e-05, "clip_ratio/low_min": 8.618442775008993e-05, "clip_ratio/region_mean": 0.00011637766308720327, "completions/clipped_ratio": 0.0, "completions/max_length": 112.4, "completions/max_terminated_length": 112.4, "completions/mean_length": 59.5171875, "completions/mean_terminated_length": 59.5171875, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "entropy": 0.24692470632079574, "epoch": 0.03466872110939907, "frac_reward_zero_std": 0.6, "grad_norm": 1.3249354362487793, "kl": 0.19976856864781845, "learning_rate": 4.189713222974466e-07, "loss": 0.0107, "num_tokens": 9960458.0, "reward": 1.2140625, "reward_std": 0.1782768577337265, "rewards/equation_reward_func/mean": 0.2140625, "rewards/equation_reward_func/std": 0.40370296835899355, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.428548312187195, "sampling/importance_sampling_ratio/mean": 1.0000859141349792, "sampling/importance_sampling_ratio/min": 0.6825728058815003, "sampling/sampling_logp_difference/max": 0.4194270372390747, "sampling/sampling_logp_difference/mean": 0.01015628632158041, "step": 130 }, { "clip_ratio/high_max": 8.364082249398861e-05, "clip_ratio/high_mean": 8.364082249398861e-05, "clip_ratio/low_mean": 9.027158699205352e-05, "clip_ratio/low_min": 9.027158699205352e-05, "clip_ratio/region_mean": 0.00017391240948604213, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 60.43359375, "completions/mean_terminated_length": 60.43359375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.24369203195803696, "epoch": 0.035202086049543675, "frac_reward_zero_std": 0.703125, "grad_norm": 1.4998974800109863, "kl": 0.20761416272984612, "learning_rate": 4.162986219846037e-07, "loss": 0.0154, "num_tokens": 10069736.0, "reward": 1.115234375, "reward_std": 0.1339472383260727, "rewards/equation_reward_func/mean": 0.115234375, "rewards/equation_reward_func/std": 0.3130350150167942, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3388360440731049, "sampling/importance_sampling_ratio/mean": 0.9999191462993622, "sampling/importance_sampling_ratio/min": 0.6552015244960785, "sampling/sampling_logp_difference/max": 0.4390140175819397, "sampling/sampling_logp_difference/mean": 0.010385974310338497, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.6, "completions/max_terminated_length": 101.6, "completions/mean_length": 61.409375, "completions/mean_terminated_length": 61.409375, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "entropy": 0.25017345593207413, "epoch": 0.03573545098968828, "frac_reward_zero_std": 0.6625, "grad_norm": 1.6827465295791626, "kl": 0.22101046589927542, "learning_rate": 4.135913860012219e-07, "loss": -0.004, "num_tokens": 10206742.0, "reward": 1.153125, "reward_std": 0.1472960114479065, "rewards/equation_reward_func/mean": 0.153125, "rewards/equation_reward_func/std": 0.34688435196876527, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4853968143463134, "sampling/importance_sampling_ratio/mean": 0.9999613642692566, "sampling/importance_sampling_ratio/min": 0.6845607280731201, "sampling/sampling_logp_difference/max": 0.45167109966278074, "sampling/sampling_logp_difference/mean": 0.009881124645471574, "step": 134 }, { "clip_ratio/high_max": 0.00015235561942164268, "clip_ratio/high_mean": 0.00015235561942164268, "clip_ratio/low_mean": 5.88913195921729e-05, "clip_ratio/low_min": 5.88913195921729e-05, "clip_ratio/region_mean": 0.0002112469390138156, "completions/clipped_ratio": 0.0, "completions/max_length": 100.5, "completions/max_terminated_length": 100.5, "completions/mean_length": 59.689453125, "completions/mean_terminated_length": 59.689453125, "completions/min_length": 39.75, "completions/min_terminated_length": 39.75, "entropy": 0.23396999802854326, "epoch": 0.03626881592983288, "frac_reward_zero_std": 0.625, "grad_norm": 1.4149812459945679, "kl": 0.16530047320864266, "learning_rate": 4.10850176566091e-07, "loss": -0.0105, "num_tokens": 10315343.0, "reward": 1.216796875, "reward_std": 0.16978508606553078, "rewards/equation_reward_func/mean": 0.216796875, "rewards/equation_reward_func/std": 0.412067174911499, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4376217424869537, "sampling/importance_sampling_ratio/mean": 1.0002870708703995, "sampling/importance_sampling_ratio/min": 0.6420085281133652, "sampling/sampling_logp_difference/max": 0.4656393527984619, "sampling/sampling_logp_difference/mean": 0.009524585912004113, "step": 136 }, { "clip_ratio/high_max": 8.085408707201068e-05, "clip_ratio/high_mean": 8.085408707201068e-05, "clip_ratio/low_mean": 3.019323533711334e-05, "clip_ratio/low_min": 3.019323533711334e-05, "clip_ratio/region_mean": 0.00011104732240912401, "completions/clipped_ratio": 0.0, "completions/max_length": 98.6, "completions/max_terminated_length": 98.6, "completions/mean_length": 59.9953125, "completions/mean_terminated_length": 59.9953125, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2394670732319355, "epoch": 0.03680218086997748, "frac_reward_zero_std": 0.55, "grad_norm": 2.4222986698150635, "kl": 0.1840939605091181, "learning_rate": 4.080755629533566e-07, "loss": -0.0078, "num_tokens": 10451172.0, "reward": 1.234375, "reward_std": 0.20026658475399017, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42457646131515503, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4464009523391723, "sampling/importance_sampling_ratio/mean": 0.999961793422699, "sampling/importance_sampling_ratio/min": 0.6856006741523742, "sampling/sampling_logp_difference/max": 0.46341880559921267, "sampling/sampling_logp_difference/mean": 0.009751284867525101, "step": 138 }, { "clip_ratio/high_max": 6.0532112709350055e-05, "clip_ratio/high_mean": 6.0532112709350055e-05, "clip_ratio/low_mean": 2.7777779097151426e-05, "clip_ratio/low_min": 2.7777779097151426e-05, "clip_ratio/region_mean": 8.830989180650149e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 93.25, "completions/max_terminated_length": 93.25, "completions/mean_length": 59.587890625, "completions/mean_terminated_length": 59.587890625, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.23158407314783996, "epoch": 0.03733554581012208, "frac_reward_zero_std": 0.578125, "grad_norm": 1.9072867631912231, "kl": 0.2105928630464607, "learning_rate": 4.052681213742971e-07, "loss": 0.0236, "num_tokens": 10559761.0, "reward": 1.189453125, "reward_std": 0.190362598747015, "rewards/equation_reward_func/mean": 0.189453125, "rewards/equation_reward_func/std": 0.386874295771122, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4299006462097168, "sampling/importance_sampling_ratio/mean": 0.9998154938220978, "sampling/importance_sampling_ratio/min": 0.6671769767999649, "sampling/sampling_logp_difference/max": 0.4305380582809448, "sampling/sampling_logp_difference/mean": 0.009733326034620404, "step": 140 }, { "clip_ratio/high_max": 6.023795059364703e-05, "clip_ratio/high_mean": 6.023795059364703e-05, "clip_ratio/low_mean": 0.0001483339195450147, "clip_ratio/low_min": 0.0001483339195450147, "clip_ratio/region_mean": 0.00020857187013866173, "completions/clipped_ratio": 0.0, "completions/max_length": 122.6, "completions/max_terminated_length": 122.6, "completions/mean_length": 61.15, "completions/mean_terminated_length": 61.15, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "entropy": 0.24258363609098726, "epoch": 0.03786891075026668, "frac_reward_zero_std": 0.6125, "grad_norm": 2.362435817718506, "kl": 0.16479992866516113, "learning_rate": 4.024284348576611e-07, "loss": 0.0282, "num_tokens": 10696769.0, "reward": 1.1671875, "reward_std": 0.1713821828365326, "rewards/equation_reward_func/mean": 0.1671875, "rewards/equation_reward_func/std": 0.3714756488800049, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.461859893798828, "sampling/importance_sampling_ratio/mean": 0.9997500777244568, "sampling/importance_sampling_ratio/min": 0.6186473846435547, "sampling/sampling_logp_difference/max": 0.48368375301361083, "sampling/sampling_logp_difference/mean": 0.010271353088319302, "step": 142 }, { "clip_ratio/high_max": 6.264390928360324e-05, "clip_ratio/high_mean": 6.264390928360324e-05, "clip_ratio/low_mean": 3.1141008043454756e-05, "clip_ratio/low_min": 3.1141008043454756e-05, "clip_ratio/region_mean": 9.378491732705798e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 126.75, "completions/max_terminated_length": 126.75, "completions/mean_length": 60.69140625, "completions/mean_terminated_length": 60.69140625, "completions/min_length": 41.5, "completions/min_terminated_length": 41.5, "entropy": 0.23982544740041098, "epoch": 0.03840227569041128, "frac_reward_zero_std": 0.625, "grad_norm": 1.6397101879119873, "kl": 0.2571847657155659, "learning_rate": 3.9955709312858744e-07, "loss": -0.019, "num_tokens": 10805979.0, "reward": 1.16796875, "reward_std": 0.1678729709237814, "rewards/equation_reward_func/mean": 0.16796875, "rewards/equation_reward_func/std": 0.36610323190689087, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5372987389564514, "sampling/importance_sampling_ratio/mean": 1.00014628469944, "sampling/importance_sampling_ratio/min": 0.700438067317009, "sampling/sampling_logp_difference/max": 0.42949178814888, "sampling/sampling_logp_difference/mean": 0.010411194059997797, "step": 144 }, { "clip_ratio/high_max": 0.00015315896598622203, "clip_ratio/high_mean": 0.00015315896598622203, "clip_ratio/low_mean": 8.936385145514375e-05, "clip_ratio/low_min": 8.936385145514375e-05, "clip_ratio/region_mean": 0.0002425228174413658, "completions/clipped_ratio": 0.0, "completions/max_length": 112.8, "completions/max_terminated_length": 112.8, "completions/mean_length": 59.1109375, "completions/mean_terminated_length": 59.1109375, "completions/min_length": 44.4, "completions/min_terminated_length": 44.4, "entropy": 0.22110586861769357, "epoch": 0.038935640630555886, "frac_reward_zero_std": 0.5625, "grad_norm": 1.8405417203903198, "kl": 0.17161257580543557, "learning_rate": 3.9665469248613616e-07, "loss": 0.0063, "num_tokens": 10941242.0, "reward": 1.2171875, "reward_std": 0.1969999998807907, "rewards/equation_reward_func/mean": 0.2171875, "rewards/equation_reward_func/std": 0.3969175547361374, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4033096790313722, "sampling/importance_sampling_ratio/mean": 0.9997965216636657, "sampling/importance_sampling_ratio/min": 0.6852907180786133, "sampling/sampling_logp_difference/max": 0.40577986240386965, "sampling/sampling_logp_difference/mean": 0.009403046779334545, "step": 146 }, { "clip_ratio/high_max": 9.230344827907781e-05, "clip_ratio/high_mean": 9.230344827907781e-05, "clip_ratio/low_mean": 0.0001770234893127862, "clip_ratio/low_min": 0.0001770234893127862, "clip_ratio/region_mean": 0.00026932693759186403, "completions/clipped_ratio": 0.0, "completions/max_length": 130.5, "completions/max_terminated_length": 130.5, "completions/mean_length": 60.326171875, "completions/mean_terminated_length": 60.326171875, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.2170955626707938, "epoch": 0.03946900557070049, "frac_reward_zero_std": 0.65625, "grad_norm": 2.0791473388671875, "kl": 0.2331375734259685, "learning_rate": 3.9372183567945314e-07, "loss": -0.0017, "num_tokens": 11050353.0, "reward": 1.17578125, "reward_std": 0.15564491972327232, "rewards/equation_reward_func/mean": 0.17578125, "rewards/equation_reward_func/std": 0.3815586641430855, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.457154631614685, "sampling/importance_sampling_ratio/mean": 0.9996602088212967, "sampling/importance_sampling_ratio/min": 0.6626079231500626, "sampling/sampling_logp_difference/max": 0.4430467188358307, "sampling/sampling_logp_difference/mean": 0.00953103438951075, "step": 148 }, { "clip_ratio/high_max": 5.9412569195653e-05, "clip_ratio/high_mean": 5.9412569195653e-05, "clip_ratio/low_mean": 8.849175517550773e-05, "clip_ratio/low_min": 8.849175517550773e-05, "clip_ratio/region_mean": 0.00014790432437116074, "completions/clipped_ratio": 0.0, "completions/max_length": 105.2, "completions/max_terminated_length": 105.2, "completions/mean_length": 59.6828125, "completions/mean_terminated_length": 59.6828125, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "entropy": 0.205387931317091, "epoch": 0.040002370510845084, "frac_reward_zero_std": 0.6625, "grad_norm": 1.5700286626815796, "kl": 0.20885711359894937, "learning_rate": 3.907591317825956e-07, "loss": -0.0339, "num_tokens": 11186182.0, "reward": 1.165625, "reward_std": 0.15976039171218873, "rewards/equation_reward_func/mean": 0.165625, "rewards/equation_reward_func/std": 0.36154150664806367, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.351095199584961, "sampling/importance_sampling_ratio/mean": 1.0000422239303588, "sampling/importance_sampling_ratio/min": 0.6123627126216888, "sampling/sampling_logp_difference/max": 0.500025749206543, "sampling/sampling_logp_difference/mean": 0.009313166700303554, "step": 150 }, { "clip_ratio/high_max": 5.5955558006341256e-05, "clip_ratio/high_mean": 5.5955558006341256e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.5955558006341256e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 104.25, "completions/max_terminated_length": 104.25, "completions/mean_length": 61.07421875, "completions/mean_terminated_length": 61.07421875, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.21371133335762554, "epoch": 0.040535735450989686, "frac_reward_zero_std": 0.5625, "grad_norm": 2.1190476417541504, "kl": 0.18383360576505461, "learning_rate": 3.877671960680443e-07, "loss": 0.0615, "num_tokens": 11295564.0, "reward": 1.236328125, "reward_std": 0.20469434186816216, "rewards/equation_reward_func/mean": 0.23828125, "rewards/equation_reward_func/std": 0.42655108124017715, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.3952956795692444, "sampling/importance_sampling_ratio/mean": 1.000019371509552, "sampling/importance_sampling_ratio/min": 0.665789470076561, "sampling/sampling_logp_difference/max": 0.41187337040901184, "sampling/sampling_logp_difference/mean": 0.009665116667747498, "step": 152 }, { "clip_ratio/high_max": 2.419667079165164e-05, "clip_ratio/high_mean": 2.419667079165164e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.419667079165164e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 110.2, "completions/max_terminated_length": 110.2, "completions/mean_length": 60.2625, "completions/mean_terminated_length": 60.2625, "completions/min_length": 44.6, "completions/min_terminated_length": 44.6, "entropy": 0.20480736293312576, "epoch": 0.04106910039113429, "frac_reward_zero_std": 0.6125, "grad_norm": 1.605876088142395, "kl": 0.28210241585556006, "learning_rate": 3.847466498789282e-07, "loss": -0.0177, "num_tokens": 11431756.0, "reward": 1.18125, "reward_std": 0.1809024393558502, "rewards/equation_reward_func/mean": 0.18125, "rewards/equation_reward_func/std": 0.38422803282737733, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3802667617797852, "sampling/importance_sampling_ratio/mean": 1.0000742316246032, "sampling/importance_sampling_ratio/min": 0.6787930727005005, "sampling/sampling_logp_difference/max": 0.4008303165435791, "sampling/sampling_logp_difference/mean": 0.009053303487598896, "step": 154 }, { "clip_ratio/high_max": 0.00015077821121344136, "clip_ratio/high_mean": 0.00015077821121344136, "clip_ratio/low_mean": 0.00019887392732521726, "clip_ratio/low_min": 0.00019887392732521726, "clip_ratio/region_mean": 0.0003496521385386586, "completions/clipped_ratio": 0.0, "completions/max_length": 94.25, "completions/max_terminated_length": 94.25, "completions/mean_length": 60.216796875, "completions/mean_terminated_length": 60.216796875, "completions/min_length": 46.5, "completions/min_terminated_length": 46.5, "entropy": 0.20356002615557778, "epoch": 0.04160246533127889, "frac_reward_zero_std": 0.578125, "grad_norm": 2.0241599082946777, "kl": 0.1666457034750945, "learning_rate": 3.816981204999882e-07, "loss": 0.0029, "num_tokens": 11540515.0, "reward": 1.18359375, "reward_std": 0.181291151791811, "rewards/equation_reward_func/mean": 0.18359375, "rewards/equation_reward_func/std": 0.3818978816270828, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3811768591403961, "sampling/importance_sampling_ratio/mean": 1.000122681260109, "sampling/importance_sampling_ratio/min": 0.670781597495079, "sampling/sampling_logp_difference/max": 0.4184178411960602, "sampling/sampling_logp_difference/mean": 0.009315791074186563, "step": 156 }, { "clip_ratio/high_max": 0.00014583735578020828, "clip_ratio/high_mean": 0.00014583735578020828, "clip_ratio/low_mean": 5.7038796108423005e-05, "clip_ratio/low_min": 5.7038796108423005e-05, "clip_ratio/region_mean": 0.00020287615188863128, "completions/clipped_ratio": 0.0, "completions/max_length": 99.6, "completions/max_terminated_length": 99.6, "completions/mean_length": 60.6765625, "completions/mean_terminated_length": 60.6765625, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.20100580217937628, "epoch": 0.042135830271423494, "frac_reward_zero_std": 0.5375, "grad_norm": 1.9981679916381836, "kl": 0.1713719450765186, "learning_rate": 3.786222410273078e-07, "loss": -0.0039, "num_tokens": 11676940.0, "reward": 1.25625, "reward_std": 0.20426265597343446, "rewards/equation_reward_func/mean": 0.25625, "rewards/equation_reward_func/std": 0.43777836561203004, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4604878664016723, "sampling/importance_sampling_ratio/mean": 1.0001580238342285, "sampling/importance_sampling_ratio/min": 0.6269764065742492, "sampling/sampling_logp_difference/max": 0.4673612594604492, "sampling/sampling_logp_difference/mean": 0.00927397571504116, "step": 158 }, { "clip_ratio/high_max": 0.0001139156504197874, "clip_ratio/high_mean": 0.0001139156504197874, "clip_ratio/low_mean": 0.00014253722232145569, "clip_ratio/low_min": 0.00014253722232145569, "clip_ratio/region_mean": 0.0002564528662737252, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 60.744140625, "completions/mean_terminated_length": 60.744140625, "completions/min_length": 44.75, "completions/min_terminated_length": 44.75, "entropy": 0.19869527727779415, "epoch": 0.04266919521156809, "frac_reward_zero_std": 0.71875, "grad_norm": 2.0738189220428467, "kl": 0.176669475208554, "learning_rate": 3.755196502368361e-07, "loss": -0.0125, "num_tokens": 11786193.0, "reward": 1.197265625, "reward_std": 0.12500434182584286, "rewards/equation_reward_func/mean": 0.197265625, "rewards/equation_reward_func/std": 0.3843386322259903, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.432737112045288, "sampling/importance_sampling_ratio/mean": 1.0001157820224762, "sampling/importance_sampling_ratio/min": 0.6635591238737106, "sampling/sampling_logp_difference/max": 0.4434420168399811, "sampling/sampling_logp_difference/mean": 0.009436382446438074, "step": 160 }, { "clip_ratio/high_max": 2.7667110164960224e-05, "clip_ratio/high_mean": 2.7667110164960224e-05, "clip_ratio/low_mean": 0.00011843939622243245, "clip_ratio/low_min": 0.00011843939622243245, "clip_ratio/region_mean": 0.00014610650638739267, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 61.0359375, "completions/mean_terminated_length": 61.0359375, "completions/min_length": 46.8, "completions/min_terminated_length": 46.8, "entropy": 0.19139085011556745, "epoch": 0.04320256015171269, "frac_reward_zero_std": 0.6, "grad_norm": 1.9594528675079346, "kl": 0.43444608545137775, "learning_rate": 3.723909924517314e-07, "loss": 0.0169, "num_tokens": 11922920.0, "reward": 1.2171875, "reward_std": 0.1784880429506302, "rewards/equation_reward_func/mean": 0.2171875, "rewards/equation_reward_func/std": 0.40534732937812806, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4049390316009522, "sampling/importance_sampling_ratio/mean": 0.9999004483222962, "sampling/importance_sampling_ratio/min": 0.6934358954429627, "sampling/sampling_logp_difference/max": 0.3868489623069763, "sampling/sampling_logp_difference/mean": 0.00837649628520012, "step": 162 }, { "clip_ratio/high_max": 0.0002380031865969714, "clip_ratio/high_mean": 0.0002380031865969714, "clip_ratio/low_mean": 6.095839651404983e-05, "clip_ratio/low_min": 6.095839651404983e-05, "clip_ratio/region_mean": 0.00029896158311102126, "completions/clipped_ratio": 0.0, "completions/max_length": 107.75, "completions/max_terminated_length": 107.75, "completions/mean_length": 62.31640625, "completions/mean_terminated_length": 62.31640625, "completions/min_length": 46.75, "completions/min_terminated_length": 46.75, "entropy": 0.20228930976655748, "epoch": 0.043735925091857294, "frac_reward_zero_std": 0.546875, "grad_norm": 1.6721035242080688, "kl": 0.19043642468750477, "learning_rate": 3.692369174085534e-07, "loss": -0.0166, "num_tokens": 12032994.0, "reward": 1.21875, "reward_std": 0.21416642516851425, "rewards/equation_reward_func/mean": 0.220703125, "rewards/equation_reward_func/std": 0.4142964780330658, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.5082706809043884, "sampling/importance_sampling_ratio/mean": 1.0001254230737686, "sampling/importance_sampling_ratio/min": 0.6698237657546997, "sampling/sampling_logp_difference/max": 0.41855400800704956, "sampling/sampling_logp_difference/mean": 0.008872574893757701, "step": 164 }, { "clip_ratio/high_max": 5.388532008510083e-05, "clip_ratio/high_mean": 5.388532008510083e-05, "clip_ratio/low_mean": 3.2603026031413014e-05, "clip_ratio/low_min": 3.2603026031413014e-05, "clip_ratio/region_mean": 8.648834611651384e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 62.121875, "completions/mean_terminated_length": 62.121875, "completions/min_length": 47.6, "completions/min_terminated_length": 47.6, "entropy": 0.19981019705947903, "epoch": 0.0442692900320019, "frac_reward_zero_std": 0.6125, "grad_norm": 1.7159110307693481, "kl": 0.1863187122055226, "learning_rate": 3.6605808012233004e-07, "loss": 0.0034, "num_tokens": 12170424.0, "reward": 1.196875, "reward_std": 0.1654445081949234, "rewards/equation_reward_func/mean": 0.196875, "rewards/equation_reward_func/std": 0.38471853733062744, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4101471185684205, "sampling/importance_sampling_ratio/mean": 1.0000666379928589, "sampling/importance_sampling_ratio/min": 0.6134101748466492, "sampling/sampling_logp_difference/max": 0.4950767278671265, "sampling/sampling_logp_difference/mean": 0.00904884897172451, "step": 166 }, { "clip_ratio/high_max": 0.00013415459640479335, "clip_ratio/high_mean": 0.00013415459640479335, "clip_ratio/low_mean": 0.00010965501779638644, "clip_ratio/low_min": 0.00010965501779638644, "clip_ratio/region_mean": 0.00024380961420117982, "completions/clipped_ratio": 0.0, "completions/max_length": 97.75, "completions/max_terminated_length": 97.75, "completions/mean_length": 63.865234375, "completions/mean_terminated_length": 63.865234375, "completions/min_length": 47.75, "completions/min_terminated_length": 47.75, "entropy": 0.2031761605499519, "epoch": 0.0448026549721465, "frac_reward_zero_std": 0.625, "grad_norm": 1.7952263355255127, "kl": 0.21836262770618, "learning_rate": 3.628551407505292e-07, "loss": -0.0184, "num_tokens": 12281475.0, "reward": 1.16015625, "reward_std": 0.1633341796696186, "rewards/equation_reward_func/mean": 0.16015625, "rewards/equation_reward_func/std": 0.35781755298376083, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3953973948955536, "sampling/importance_sampling_ratio/mean": 0.9999680519104004, "sampling/importance_sampling_ratio/min": 0.6324247270822525, "sampling/sampling_logp_difference/max": 0.45941513776779175, "sampling/sampling_logp_difference/mean": 0.009319797158241272, "step": 168 }, { "clip_ratio/high_max": 2.9178339496461882e-05, "clip_ratio/high_mean": 2.9178339496461882e-05, "clip_ratio/low_mean": 0.00014260712002093592, "clip_ratio/low_min": 0.00014260712002093592, "clip_ratio/region_mean": 0.0001717854595173978, "completions/clipped_ratio": 0.0, "completions/max_length": 113.8, "completions/max_terminated_length": 113.8, "completions/mean_length": 63.65625, "completions/mean_terminated_length": 63.65625, "completions/min_length": 47.8, "completions/min_terminated_length": 47.8, "entropy": 0.20811213459819555, "epoch": 0.0453360199122911, "frac_reward_zero_std": 0.675, "grad_norm": 2.225015878677368, "kl": 0.19611967872414324, "learning_rate": 3.5962876445596224e-07, "loss": -0.0131, "num_tokens": 12419935.0, "reward": 1.1921875, "reward_std": 0.14945077449083327, "rewards/equation_reward_func/mean": 0.1921875, "rewards/equation_reward_func/std": 0.3853117167949677, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.387257742881775, "sampling/importance_sampling_ratio/mean": 0.9999917268753051, "sampling/importance_sampling_ratio/min": 0.5835487842559814, "sampling/sampling_logp_difference/max": 0.5435838222503662, "sampling/sampling_logp_difference/mean": 0.009055312536656857, "step": 170 }, { "clip_ratio/high_max": 8.06462372161655e-05, "clip_ratio/high_mean": 8.06462372161655e-05, "clip_ratio/low_mean": 0.0001336433407333162, "clip_ratio/low_min": 0.0001336433407333162, "clip_ratio/region_mean": 0.0002142895779494817, "completions/clipped_ratio": 0.0, "completions/max_length": 88.25, "completions/max_terminated_length": 88.25, "completions/mean_length": 63.41015625, "completions/mean_terminated_length": 63.41015625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.20984113444056776, "epoch": 0.0458693848524357, "frac_reward_zero_std": 0.703125, "grad_norm": 1.7931643724441528, "kl": 0.19685209444206622, "learning_rate": 3.563796212686475e-07, "loss": 0.0266, "num_tokens": 12530401.0, "reward": 1.24609375, "reward_std": 0.13821592181921005, "rewards/equation_reward_func/mean": 0.248046875, "rewards/equation_reward_func/std": 0.4120710641145706, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.4153652489185333, "sampling/importance_sampling_ratio/mean": 0.9998707175254822, "sampling/importance_sampling_ratio/min": 0.6463374644517899, "sampling/sampling_logp_difference/max": 0.469277024269104, "sampling/sampling_logp_difference/mean": 0.008903665002435446, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.504197421638916e-05, "clip_ratio/low_min": 5.504197421638916e-05, "clip_ratio/region_mean": 5.504197421638916e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 134.4, "completions/max_terminated_length": 134.4, "completions/mean_length": 64.921875, "completions/mean_terminated_length": 64.921875, "completions/min_length": 49.2, "completions/min_terminated_length": 49.2, "entropy": 0.2091738460585475, "epoch": 0.0464027497925803, "frac_reward_zero_std": 0.7125, "grad_norm": 1.9955060482025146, "kl": 0.1718884548689756, "learning_rate": 3.531083859466635e-07, "loss": 0.0061, "num_tokens": 12669807.0, "reward": 1.1671875, "reward_std": 0.13030132576823233, "rewards/equation_reward_func/mean": 0.1671875, "rewards/equation_reward_func/std": 0.3570630460977554, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.402362012863159, "sampling/importance_sampling_ratio/mean": 1.0002094507217407, "sampling/importance_sampling_ratio/min": 0.6561053633689881, "sampling/sampling_logp_difference/max": 0.42577319145202636, "sampling/sampling_logp_difference/mean": 0.00882110558450222, "step": 174 }, { "clip_ratio/high_max": 0.0001310070607966433, "clip_ratio/high_mean": 0.0001310070607966433, "clip_ratio/low_mean": 5.238043538863874e-05, "clip_ratio/low_min": 5.238043538863874e-05, "clip_ratio/region_mean": 0.00018338749618528204, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 64.634765625, "completions/mean_terminated_length": 64.634765625, "completions/min_length": 48.5, "completions/min_terminated_length": 48.5, "entropy": 0.20925955733077395, "epoch": 0.0469361147327249, "frac_reward_zero_std": 0.59375, "grad_norm": 2.0089259147644043, "kl": 0.1909264405258, "learning_rate": 3.498157378360204e-07, "loss": -0.0027, "num_tokens": 12781172.0, "reward": 1.20703125, "reward_std": 0.19167311489582062, "rewards/equation_reward_func/mean": 0.20703125, "rewards/equation_reward_func/std": 0.39632514119148254, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.417098730802536, "sampling/importance_sampling_ratio/mean": 1.0000827610492706, "sampling/importance_sampling_ratio/min": 0.6654897183179855, "sampling/sampling_logp_difference/max": 0.41814956068992615, "sampling/sampling_logp_difference/mean": 0.009116811444982886, "step": 176 }, { "clip_ratio/high_max": 8.042784207241816e-05, "clip_ratio/high_mean": 8.042784207241816e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.042784207241816e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 118.2, "completions/max_terminated_length": 118.2, "completions/mean_length": 63.940625, "completions/mean_terminated_length": 63.940625, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.20552092480162779, "epoch": 0.047469479672869505, "frac_reward_zero_std": 0.7, "grad_norm": 1.5263612270355225, "kl": 0.1846906136100491, "learning_rate": 3.465023607295784e-07, "loss": 0.0105, "num_tokens": 12919646.0, "reward": 1.19375, "reward_std": 0.14402883723378182, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3936863362789154, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.444833016395569, "sampling/importance_sampling_ratio/mean": 1.0000781059265136, "sampling/importance_sampling_ratio/min": 0.6021921098232269, "sampling/sampling_logp_difference/max": 0.5145103931427002, "sampling/sampling_logp_difference/mean": 0.008983203582465649, "step": 178 }, { "clip_ratio/high_max": 8.750663578717245e-05, "clip_ratio/high_mean": 8.750663578717245e-05, "clip_ratio/low_mean": 0.0001153587234309978, "clip_ratio/low_min": 0.0001153587234309978, "clip_ratio/region_mean": 0.00020286535921817025, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 63.517578125, "completions/mean_terminated_length": 63.517578125, "completions/min_length": 43.25, "completions/min_terminated_length": 43.25, "entropy": 0.1993725932099753, "epoch": 0.04800284461301411, "frac_reward_zero_std": 0.6875, "grad_norm": 1.5091460943222046, "kl": 0.23696451106419167, "learning_rate": 3.4316894272504225e-07, "loss": -0.0056, "num_tokens": 13030303.0, "reward": 1.126953125, "reward_std": 0.14064733497798443, "rewards/equation_reward_func/mean": 0.126953125, "rewards/equation_reward_func/std": 0.3309350237250328, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4980274438858032, "sampling/importance_sampling_ratio/mean": 0.999913901090622, "sampling/importance_sampling_ratio/min": 0.6534235626459122, "sampling/sampling_logp_difference/max": 0.45179370045661926, "sampling/sampling_logp_difference/mean": 0.009124602656811476, "step": 180 }, { "clip_ratio/high_max": 5.85379053114189e-05, "clip_ratio/high_mean": 5.85379053114189e-05, "clip_ratio/low_mean": 0.00011267932455262376, "clip_ratio/low_min": 0.00011267932455262376, "clip_ratio/region_mean": 0.00017121722986404266, "completions/clipped_ratio": 0.0, "completions/max_length": 108.2, "completions/max_terminated_length": 108.2, "completions/mean_length": 63.775, "completions/mean_terminated_length": 63.775, "completions/min_length": 48.8, "completions/min_terminated_length": 48.8, "entropy": 0.19764780367000234, "epoch": 0.0485362095531587, "frac_reward_zero_std": 0.7, "grad_norm": 1.9451539516448975, "kl": 0.20599548052996397, "learning_rate": 3.398161760820628e-07, "loss": -0.0127, "num_tokens": 13168831.0, "reward": 1.2, "reward_std": 0.1417677104473114, "rewards/equation_reward_func/mean": 0.2, "rewards/equation_reward_func/std": 0.40060241222381593, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3801430463790894, "sampling/importance_sampling_ratio/mean": 0.9999285578727722, "sampling/importance_sampling_ratio/min": 0.6406774878501892, "sampling/sampling_logp_difference/max": 0.45495412349700926, "sampling/sampling_logp_difference/mean": 0.009095152094960212, "step": 182 }, { "clip_ratio/high_max": 0.00011568861858298381, "clip_ratio/high_mean": 0.00011568861858298381, "clip_ratio/low_mean": 0.0001075267306684206, "clip_ratio/low_min": 0.0001075267306684206, "clip_ratio/region_mean": 0.0002232153492514044, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 63.171875, "completions/mean_terminated_length": 63.171875, "completions/min_length": 47.25, "completions/min_terminated_length": 47.25, "entropy": 0.1908335169363353, "epoch": 0.049069574493303306, "frac_reward_zero_std": 0.609375, "grad_norm": 1.8466938734054565, "kl": 0.21848639054223895, "learning_rate": 3.364447570784731e-07, "loss": 0.0195, "num_tokens": 13279375.0, "reward": 1.21875, "reward_std": 0.17504537850618362, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.4053068608045578, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4124400615692139, "sampling/importance_sampling_ratio/mean": 0.9999132603406906, "sampling/importance_sampling_ratio/min": 0.6521744579076767, "sampling/sampling_logp_difference/max": 0.4437905550003052, "sampling/sampling_logp_difference/mean": 0.008945783600211143, "step": 184 }, { "clip_ratio/high_max": 8.261863614380773e-05, "clip_ratio/high_mean": 8.261863614380773e-05, "clip_ratio/low_mean": 7.651985086138464e-05, "clip_ratio/low_min": 7.651985086138464e-05, "clip_ratio/region_mean": 0.00015913848700519238, "completions/clipped_ratio": 0.0, "completions/max_length": 103.8, "completions/max_terminated_length": 103.8, "completions/mean_length": 63.2953125, "completions/mean_terminated_length": 63.2953125, "completions/min_length": 48.4, "completions/min_terminated_length": 48.4, "entropy": 0.18987937147418657, "epoch": 0.04960293943344791, "frac_reward_zero_std": 0.5625, "grad_norm": 2.354302167892456, "kl": 0.18163209206735095, "learning_rate": 3.3305538586569116e-07, "loss": 0.0166, "num_tokens": 13417476.0, "reward": 1.2671875, "reward_std": 0.19243125021457672, "rewards/equation_reward_func/mean": 0.2671875, "rewards/equation_reward_func/std": 0.44210425615310667, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4941126823425293, "sampling/importance_sampling_ratio/mean": 1.0001677989959716, "sampling/importance_sampling_ratio/min": 0.6339724779129028, "sampling/sampling_logp_difference/max": 0.5065191984176636, "sampling/sampling_logp_difference/mean": 0.00851092506200075, "step": 186 }, { "clip_ratio/high_max": 5.184006133478963e-05, "clip_ratio/high_mean": 5.184006133478963e-05, "clip_ratio/low_mean": 2.650551323313266e-05, "clip_ratio/low_min": 2.650551323313266e-05, "clip_ratio/region_mean": 7.834557456792229e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 108.75, "completions/max_terminated_length": 108.75, "completions/mean_length": 63.244140625, "completions/mean_terminated_length": 63.244140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18945501444654334, "epoch": 0.05013630437359251, "frac_reward_zero_std": 0.71875, "grad_norm": 1.751578688621521, "kl": 0.18739058687869045, "learning_rate": 3.296487663233168e-07, "loss": -0.013, "num_tokens": 13527929.0, "reward": 1.205078125, "reward_std": 0.12756678089499474, "rewards/equation_reward_func/mean": 0.205078125, "rewards/equation_reward_func/std": 0.379383884370327, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4296348094940186, "sampling/importance_sampling_ratio/mean": 0.9996595829725266, "sampling/importance_sampling_ratio/min": 0.6262573152780533, "sampling/sampling_logp_difference/max": 0.46863260865211487, "sampling/sampling_logp_difference/mean": 0.008397250552661717, "step": 188 }, { "clip_ratio/high_max": 8.775523423941599e-05, "clip_ratio/high_mean": 8.775523423941599e-05, "clip_ratio/low_mean": 5.117018655356434e-05, "clip_ratio/low_min": 5.117018655356434e-05, "clip_ratio/region_mean": 0.00013892542079298032, "completions/clipped_ratio": 0.0, "completions/max_length": 92.4, "completions/max_terminated_length": 92.4, "completions/mean_length": 62.7578125, "completions/mean_terminated_length": 62.7578125, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.17811263197412094, "epoch": 0.05066966931373711, "frac_reward_zero_std": 0.675, "grad_norm": 1.7318817377090454, "kl": 0.24012729215125242, "learning_rate": 3.2622560591295606e-07, "loss": -0.0077, "num_tokens": 13665670.0, "reward": 1.1953125, "reward_std": 0.14308778196573257, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.39625563621521, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4409423351287842, "sampling/importance_sampling_ratio/mean": 0.9997446298599243, "sampling/importance_sampling_ratio/min": 0.5831532835960388, "sampling/sampling_logp_difference/max": 0.5436709642410278, "sampling/sampling_logp_difference/mean": 0.008637924678623676, "step": 190 }, { "clip_ratio/high_max": 0.00019298659107233916, "clip_ratio/high_mean": 0.00019298659107233916, "clip_ratio/low_mean": 5.212489567283127e-05, "clip_ratio/low_min": 5.212489567283127e-05, "clip_ratio/region_mean": 0.00024511148674517043, "completions/clipped_ratio": 0.0, "completions/max_length": 104.5, "completions/max_terminated_length": 104.5, "completions/mean_length": 63.19921875, "completions/mean_terminated_length": 63.19921875, "completions/min_length": 47.25, "completions/min_terminated_length": 47.25, "entropy": 0.180551425450378, "epoch": 0.05120303425388171, "frac_reward_zero_std": 0.671875, "grad_norm": 1.8166390657424927, "kl": 0.21641908177278107, "learning_rate": 3.227866155313002e-07, "loss": 0.0047, "num_tokens": 13776220.0, "reward": 1.208984375, "reward_std": 0.14131195098161697, "rewards/equation_reward_func/mean": 0.208984375, "rewards/equation_reward_func/std": 0.3998575359582901, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4574789702892303, "sampling/importance_sampling_ratio/mean": 1.0001034140586853, "sampling/importance_sampling_ratio/min": 0.6247721761465073, "sampling/sampling_logp_difference/max": 0.47462034225463867, "sampling/sampling_logp_difference/mean": 0.00849132495932281, "step": 192 }, { "clip_ratio/high_max": 0.0001094494218705222, "clip_ratio/high_mean": 0.0001094494218705222, "clip_ratio/low_mean": 2.0729685072890585e-05, "clip_ratio/low_min": 2.0729685072890585e-05, "clip_ratio/region_mean": 0.0001301791069434128, "completions/clipped_ratio": 0.0, "completions/max_length": 99.4, "completions/max_terminated_length": 99.4, "completions/mean_length": 62.9640625, "completions/mean_terminated_length": 62.9640625, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.18412012358506522, "epoch": 0.05173639919402631, "frac_reward_zero_std": 0.7375, "grad_norm": 1.6384707689285278, "kl": 0.19922550576221612, "learning_rate": 3.1933250936249213e-07, "loss": 0.0141, "num_tokens": 13913909.0, "reward": 1.246875, "reward_std": 0.11394569501280785, "rewards/equation_reward_func/mean": 0.246875, "rewards/equation_reward_func/std": 0.42095857858657837, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3506606578826905, "sampling/importance_sampling_ratio/mean": 0.9998487830162048, "sampling/importance_sampling_ratio/min": 0.6323154926300049, "sampling/sampling_logp_difference/max": 0.46338059902191164, "sampling/sampling_logp_difference/mean": 0.008664014749228954, "step": 194 }, { "clip_ratio/high_max": 8.412238740776148e-05, "clip_ratio/high_mean": 8.412238740776148e-05, "clip_ratio/low_mean": 2.7557320815200608e-05, "clip_ratio/low_min": 2.7557320815200608e-05, "clip_ratio/region_mean": 0.00011167970822296209, "completions/clipped_ratio": 0.0, "completions/max_length": 102.5, "completions/max_terminated_length": 102.5, "completions/mean_length": 63.1953125, "completions/mean_terminated_length": 63.1953125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18057077559125093, "epoch": 0.052269764134170914, "frac_reward_zero_std": 0.671875, "grad_norm": 1.701501488685608, "kl": 0.310817190963361, "learning_rate": 3.158640047298098e-07, "loss": -0.0023, "num_tokens": 14024297.0, "reward": 1.24609375, "reward_std": 0.1529458425939083, "rewards/equation_reward_func/mean": 0.24609375, "rewards/equation_reward_func/std": 0.4260425418615341, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.43620565533638, "sampling/importance_sampling_ratio/mean": 0.9999806135892868, "sampling/importance_sampling_ratio/min": 0.6063458770513535, "sampling/sampling_logp_difference/max": 0.5083138346672058, "sampling/sampling_logp_difference/mean": 0.00880246888846159, "step": 196 }, { "clip_ratio/high_max": 0.000140754438082998, "clip_ratio/high_mean": 0.000140754438082998, "clip_ratio/low_mean": 8.577961448786987e-05, "clip_ratio/low_min": 8.577961448786987e-05, "clip_ratio/region_mean": 0.00022653405257086788, "completions/clipped_ratio": 0.0, "completions/max_length": 100.6, "completions/max_terminated_length": 100.6, "completions/mean_length": 64.1171875, "completions/mean_terminated_length": 64.1171875, "completions/min_length": 48.2, "completions/min_terminated_length": 48.2, "entropy": 0.1831229431554675, "epoch": 0.052803129074315516, "frac_reward_zero_std": 0.6875, "grad_norm": 1.097544550895691, "kl": 0.2240865388367739, "learning_rate": 3.123818219466981e-07, "loss": 0.0186, "num_tokens": 14162780.0, "reward": 1.2359375, "reward_std": 0.13914016783237457, "rewards/equation_reward_func/mean": 0.2359375, "rewards/equation_reward_func/std": 0.4192229866981506, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4297068119049072, "sampling/importance_sampling_ratio/mean": 0.9998314619064331, "sampling/importance_sampling_ratio/min": 0.6185783624649048, "sampling/sampling_logp_difference/max": 0.48047289848327634, "sampling/sampling_logp_difference/mean": 0.00881122201681137, "step": 198 }, { "clip_ratio/high_max": 0.0001034611551504996, "clip_ratio/high_mean": 0.0001034611551504996, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001034611551504996, "completions/clipped_ratio": 0.0, "completions/max_length": 106.25, "completions/max_terminated_length": 106.25, "completions/mean_length": 63.58203125, "completions/mean_terminated_length": 63.58203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18562056750266087, "epoch": 0.05333649401446012, "frac_reward_zero_std": 0.671875, "grad_norm": 1.6609328985214233, "kl": 0.22386336792260408, "learning_rate": 3.088866841671789e-07, "loss": -0.0202, "num_tokens": 14273174.0, "reward": 1.236328125, "reward_std": 0.14702780172228813, "rewards/equation_reward_func/mean": 0.236328125, "rewards/equation_reward_func/std": 0.4226124584674835, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4070797264575958, "sampling/importance_sampling_ratio/mean": 0.9999641180038452, "sampling/importance_sampling_ratio/min": 0.6580639630556107, "sampling/sampling_logp_difference/max": 0.43316417932510376, "sampling/sampling_logp_difference/mean": 0.008555968524888158, "step": 200 }, { "clip_ratio/high_max": 2.53446875528122e-05, "clip_ratio/high_mean": 2.53446875528122e-05, "clip_ratio/low_mean": 8.104020930154042e-05, "clip_ratio/low_min": 8.104020930154042e-05, "clip_ratio/region_mean": 0.00010638489685435261, "completions/clipped_ratio": 0.0, "completions/max_length": 111.8, "completions/max_terminated_length": 111.8, "completions/mean_length": 64.84375, "completions/mean_terminated_length": 64.84375, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.1889410357301434, "epoch": 0.053869858954604714, "frac_reward_zero_std": 0.6875, "grad_norm": 1.9926122426986694, "kl": 0.2871894968363146, "learning_rate": 3.0537931723567253e-07, "loss": -0.012, "num_tokens": 14412194.0, "reward": 1.253125, "reward_std": 0.14187310189008712, "rewards/equation_reward_func/mean": 0.253125, "rewards/equation_reward_func/std": 0.424527508020401, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3976219177246094, "sampling/importance_sampling_ratio/mean": 0.9997592449188233, "sampling/importance_sampling_ratio/min": 0.6429756283760071, "sampling/sampling_logp_difference/max": 0.44319958686828614, "sampling/sampling_logp_difference/mean": 0.008395052142441272, "step": 202 }, { "clip_ratio/high_max": 0.00016149873327877786, "clip_ratio/high_mean": 0.00016149873327877786, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016149873327877786, "completions/clipped_ratio": 0.0, "completions/max_length": 114.5, "completions/max_terminated_length": 114.5, "completions/mean_length": 66.111328125, "completions/mean_terminated_length": 66.111328125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19355154554877016, "epoch": 0.05440322389474932, "frac_reward_zero_std": 0.828125, "grad_norm": 1.423039197921753, "kl": 0.18596438468537396, "learning_rate": 3.01860449536259e-07, "loss": 0.0166, "num_tokens": 14524251.0, "reward": 1.146484375, "reward_std": 0.07640802673995495, "rewards/equation_reward_func/mean": 0.146484375, "rewards/equation_reward_func/std": 0.32832180336117744, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4592007994651794, "sampling/importance_sampling_ratio/mean": 0.9998245090246201, "sampling/importance_sampling_ratio/min": 0.646577998995781, "sampling/sampling_logp_difference/max": 0.443134605884552, "sampling/sampling_logp_difference/mean": 0.008427762775681913, "step": 204 }, { "clip_ratio/high_max": 5.668933994861113e-05, "clip_ratio/high_mean": 5.668933994861113e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.668933994861113e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 101.2, "completions/max_terminated_length": 101.2, "completions/mean_length": 65.328125, "completions/mean_terminated_length": 65.328125, "completions/min_length": 47.6, "completions/min_terminated_length": 47.6, "entropy": 0.193049149794711, "epoch": 0.05493658883489392, "frac_reward_zero_std": 0.675, "grad_norm": 1.9184489250183105, "kl": 0.1909414147440758, "learning_rate": 2.983308118414131e-07, "loss": -0.0227, "num_tokens": 14663757.0, "reward": 1.2296875, "reward_std": 0.15586024522781372, "rewards/equation_reward_func/mean": 0.2296875, "rewards/equation_reward_func/std": 0.4137424826622009, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5901880025863648, "sampling/importance_sampling_ratio/mean": 1.0000939846038819, "sampling/importance_sampling_ratio/min": 0.6207010388374329, "sampling/sampling_logp_difference/max": 0.5368104219436646, "sampling/sampling_logp_difference/mean": 0.008375569432973861, "step": 206 }, { "clip_ratio/high_max": 0.00013453266324682368, "clip_ratio/high_mean": 0.00013453266324682368, "clip_ratio/low_mean": 9.905318600229091e-05, "clip_ratio/low_min": 9.905318600229091e-05, "clip_ratio/region_mean": 0.0002335858492491146, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 64.328125, "completions/mean_terminated_length": 64.328125, "completions/min_length": 47.25, "completions/min_terminated_length": 47.25, "entropy": 0.18625920695356196, "epoch": 0.05546995377503852, "frac_reward_zero_std": 0.65625, "grad_norm": 1.8009364604949951, "kl": 0.18786600982356402, "learning_rate": 2.9479113716024275e-07, "loss": 0.0239, "num_tokens": 14774741.0, "reward": 1.2421875, "reward_std": 0.15222752653062344, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4227939695119858, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4697144031524658, "sampling/importance_sampling_ratio/mean": 0.9999816715717316, "sampling/importance_sampling_ratio/min": 0.6553923785686493, "sampling/sampling_logp_difference/max": 0.48270073533058167, "sampling/sampling_logp_difference/mean": 0.008194109657779336, "step": 208 }, { "clip_ratio/high_max": 8.17424103540058e-05, "clip_ratio/high_mean": 8.17424103540058e-05, "clip_ratio/low_mean": 7.822393672540784e-05, "clip_ratio/low_min": 7.822393672540784e-05, "clip_ratio/region_mean": 0.00015996634707941362, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 64.759375, "completions/mean_terminated_length": 64.759375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18833257868472073, "epoch": 0.056003318715183124, "frac_reward_zero_std": 0.625, "grad_norm": 2.0888671875, "kl": 0.20068206457007262, "learning_rate": 2.912421605862632e-07, "loss": -0.0214, "num_tokens": 14913499.0, "reward": 1.3203125, "reward_std": 0.1623930275440216, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.45186705589294435, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5435975790023804, "sampling/importance_sampling_ratio/mean": 0.9999268412590027, "sampling/importance_sampling_ratio/min": 0.681738805770874, "sampling/sampling_logp_difference/max": 0.44818577766418455, "sampling/sampling_logp_difference/mean": 0.008317346125841141, "step": 210 }, { "clip_ratio/high_max": 7.937684131320566e-05, "clip_ratio/high_mean": 7.937684131320566e-05, "clip_ratio/low_mean": 2.81151604010827e-05, "clip_ratio/low_min": 2.81151604010827e-05, "clip_ratio/region_mean": 0.00010749200171428836, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 66.74609375, "completions/mean_terminated_length": 66.74609375, "completions/min_length": 47.25, "completions/min_terminated_length": 47.25, "entropy": 0.18747672935326895, "epoch": 0.05653668365532772, "frac_reward_zero_std": 0.6875, "grad_norm": 1.72495436668396, "kl": 0.19299201729396978, "learning_rate": 2.8768461914473794e-07, "loss": 0.0241, "num_tokens": 15026073.0, "reward": 1.185546875, "reward_std": 0.14512556791305542, "rewards/equation_reward_func/mean": 0.185546875, "rewards/equation_reward_func/std": 0.3886159062385559, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.549866259098053, "sampling/importance_sampling_ratio/mean": 1.0001529604196548, "sampling/importance_sampling_ratio/min": 0.59317347407341, "sampling/sampling_logp_difference/max": 0.5247410833835602, "sampling/sampling_logp_difference/mean": 0.008259737165644765, "step": 212 }, { "clip_ratio/high_max": 0.00013496437006526522, "clip_ratio/high_mean": 0.00013496437006526522, "clip_ratio/low_mean": 8.351036780772524e-05, "clip_ratio/low_min": 8.351036780772524e-05, "clip_ratio/region_mean": 0.00021847473787299046, "completions/clipped_ratio": 0.0, "completions/max_length": 89.4, "completions/max_terminated_length": 89.4, "completions/mean_length": 64.7140625, "completions/mean_terminated_length": 64.7140625, "completions/min_length": 46.8, "completions/min_terminated_length": 46.8, "entropy": 0.1833285784555806, "epoch": 0.05707004859547232, "frac_reward_zero_std": 0.575, "grad_norm": 2.1709375381469727, "kl": 0.17600014288392332, "learning_rate": 2.8411925163961926e-07, "loss": -0.0206, "num_tokens": 15165138.0, "reward": 1.23125, "reward_std": 0.18506672978401184, "rewards/equation_reward_func/mean": 0.23125, "rewards/equation_reward_func/std": 0.4160960495471954, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5050809144973756, "sampling/importance_sampling_ratio/mean": 0.9999413013458252, "sampling/importance_sampling_ratio/min": 0.6202431082725525, "sampling/sampling_logp_difference/max": 0.49312261343002317, "sampling/sampling_logp_difference/mean": 0.00787862464785576, "step": 214 }, { "clip_ratio/high_max": 0.00010878545355000015, "clip_ratio/high_mean": 0.00010878545355000015, "clip_ratio/low_mean": 8.11065919050533e-05, "clip_ratio/low_min": 8.11065919050533e-05, "clip_ratio/region_mean": 0.00018989204545505345, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 65.4765625, "completions/mean_terminated_length": 65.4765625, "completions/min_length": 46.75, "completions/min_terminated_length": 46.75, "entropy": 0.18642145136578214, "epoch": 0.057603413535616925, "frac_reward_zero_std": 0.65625, "grad_norm": 1.9843711853027344, "kl": 0.1750187755872806, "learning_rate": 2.8054679850011825e-07, "loss": 0.0211, "num_tokens": 15276750.0, "reward": 1.2109375, "reward_std": 0.15879588015377522, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4009798914194107, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6093161702156067, "sampling/importance_sampling_ratio/mean": 0.999867856502533, "sampling/importance_sampling_ratio/min": 0.652244046330452, "sampling/sampling_logp_difference/max": 0.5390398502349854, "sampling/sampling_logp_difference/mean": 0.008385407272726297, "step": 216 }, { "clip_ratio/high_max": 5.7748814773124955e-05, "clip_ratio/high_mean": 5.7748814773124955e-05, "clip_ratio/low_mean": 2.7667110164960224e-05, "clip_ratio/low_min": 2.7667110164960224e-05, "clip_ratio/region_mean": 8.541592493808518e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 94.8, "completions/max_terminated_length": 94.8, "completions/mean_length": 64.15625, "completions/mean_terminated_length": 64.15625, "completions/min_length": 47.4, "completions/min_terminated_length": 47.4, "entropy": 0.1881969994865358, "epoch": 0.05813677847576153, "frac_reward_zero_std": 0.675, "grad_norm": 2.102374315261841, "kl": 0.16769350371840927, "learning_rate": 2.769680016269385e-07, "loss": 0.019, "num_tokens": 15415322.0, "reward": 1.246875, "reward_std": 0.1474482908844948, "rewards/equation_reward_func/mean": 0.246875, "rewards/equation_reward_func/std": 0.42402565479278564, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4707128047943114, "sampling/importance_sampling_ratio/mean": 1.0000540256500243, "sampling/importance_sampling_ratio/min": 0.6912026286125184, "sampling/sampling_logp_difference/max": 0.4338069438934326, "sampling/sampling_logp_difference/mean": 0.007925937138497829, "step": 218 }, { "clip_ratio/high_max": 0.00015820709813851863, "clip_ratio/high_mean": 0.00015820709813851863, "clip_ratio/low_mean": 7.490155338827107e-05, "clip_ratio/low_min": 7.490155338827107e-05, "clip_ratio/region_mean": 0.0002331086515267897, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 65.103515625, "completions/mean_terminated_length": 65.103515625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.17939016398870283, "epoch": 0.05867014341590613, "frac_reward_zero_std": 0.625, "grad_norm": 1.8401851654052734, "kl": 0.17055363782371083, "learning_rate": 2.7338360423820327e-07, "loss": 0.0121, "num_tokens": 15526935.0, "reward": 1.26953125, "reward_std": 0.16787051782011986, "rewards/equation_reward_func/mean": 0.26953125, "rewards/equation_reward_func/std": 0.43650104850530624, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5646073818206787, "sampling/importance_sampling_ratio/mean": 0.9998454600572586, "sampling/importance_sampling_ratio/min": 0.5650992169976234, "sampling/sampling_logp_difference/max": 0.6013510823249817, "sampling/sampling_logp_difference/mean": 0.007840978098101914, "step": 220 }, { "clip_ratio/high_max": 2.6009155489090415e-05, "clip_ratio/high_mean": 2.6009155489090415e-05, "clip_ratio/low_mean": 3.156565556613108e-05, "clip_ratio/low_min": 3.156565556613108e-05, "clip_ratio/region_mean": 5.757481105522149e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 92.8, "completions/max_terminated_length": 92.8, "completions/mean_length": 64.4203125, "completions/mean_terminated_length": 64.4203125, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.17514930318834054, "epoch": 0.059203508356050726, "frac_reward_zero_std": 0.6875, "grad_norm": 1.8907098770141602, "kl": 0.49691931406656903, "learning_rate": 2.6979435071510956e-07, "loss": 0.0058, "num_tokens": 15665884.0, "reward": 1.203125, "reward_std": 0.14554866254329682, "rewards/equation_reward_func/mean": 0.2046875, "rewards/equation_reward_func/std": 0.40206090807914735, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.3840144157409668, "sampling/importance_sampling_ratio/mean": 1.0000867247581482, "sampling/importance_sampling_ratio/min": 0.6481404185295105, "sampling/sampling_logp_difference/max": 0.46214933395385743, "sampling/sampling_logp_difference/mean": 0.007646582275629044, "step": 222 }, { "clip_ratio/high_max": 5.28094630377988e-05, "clip_ratio/high_mean": 5.28094630377988e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.28094630377988e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 106.5, "completions/max_terminated_length": 106.5, "completions/mean_length": 64.080078125, "completions/mean_terminated_length": 64.080078125, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.1798876695231431, "epoch": 0.05973687329619533, "frac_reward_zero_std": 0.609375, "grad_norm": 2.35087251663208, "kl": 0.1781889583605031, "learning_rate": 2.662009864473406e-07, "loss": 0.0186, "num_tokens": 15776853.0, "reward": 1.294921875, "reward_std": 0.16938887536525726, "rewards/equation_reward_func/mean": 0.294921875, "rewards/equation_reward_func/std": 0.4551277905702591, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5079967379570007, "sampling/importance_sampling_ratio/mean": 0.9999506622552872, "sampling/importance_sampling_ratio/min": 0.6650903224945068, "sampling/sampling_logp_difference/max": 0.47136190533638, "sampling/sampling_logp_difference/mean": 0.008146809646859765, "step": 224 }, { "clip_ratio/high_max": 2.8344669974305565e-05, "clip_ratio/high_mean": 2.8344669974305565e-05, "clip_ratio/low_mean": 0.00010814796001391692, "clip_ratio/low_min": 0.00010814796001391692, "clip_ratio/region_mean": 0.0001364926299882225, "completions/clipped_ratio": 0.0, "completions/max_length": 97.8, "completions/max_terminated_length": 97.8, "completions/mean_length": 64.3671875, "completions/mean_terminated_length": 64.3671875, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.17947699833247396, "epoch": 0.06027023823633993, "frac_reward_zero_std": 0.5625, "grad_norm": 1.542752742767334, "kl": 0.21120414825984174, "learning_rate": 2.626042576782687e-07, "loss": -0.003, "num_tokens": 15915776.0, "reward": 1.2375, "reward_std": 0.1995237112045288, "rewards/equation_reward_func/mean": 0.2375, "rewards/equation_reward_func/std": 0.41402295231819153, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4337264776229859, "sampling/importance_sampling_ratio/mean": 0.9998421669006348, "sampling/importance_sampling_ratio/min": 0.6376795053482056, "sampling/sampling_logp_difference/max": 0.45897268056869506, "sampling/sampling_logp_difference/mean": 0.007988696359097958, "step": 226 }, { "clip_ratio/high_max": 0.00010427991381018526, "clip_ratio/high_mean": 0.00010427991381018526, "clip_ratio/low_mean": 5.5908518131925826e-05, "clip_ratio/low_min": 5.5908518131925826e-05, "clip_ratio/region_mean": 0.00016018843194211109, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 64.43359375, "completions/mean_terminated_length": 64.43359375, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.17611877459825742, "epoch": 0.06080360317648453, "frac_reward_zero_std": 0.703125, "grad_norm": 1.759642481803894, "kl": 0.16036804304975602, "learning_rate": 2.590049113499809e-07, "loss": 0.0053, "num_tokens": 16026950.0, "reward": 1.2734375, "reward_std": 0.13933486305177212, "rewards/equation_reward_func/mean": 0.2734375, "rewards/equation_reward_func/std": 0.4260459691286087, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4359349310398102, "sampling/importance_sampling_ratio/mean": 1.0001140534877777, "sampling/importance_sampling_ratio/min": 0.6872338503599167, "sampling/sampling_logp_difference/max": 0.3922656178474426, "sampling/sampling_logp_difference/mean": 0.007596448762342334, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00016883576528117474, "clip_ratio/low_min": 0.00016883576528117474, "clip_ratio/region_mean": 0.00016883576528117474, "completions/clipped_ratio": 0.0, "completions/max_length": 111.6, "completions/max_terminated_length": 111.6, "completions/mean_length": 62.809375, "completions/mean_terminated_length": 62.809375, "completions/min_length": 46.2, "completions/min_terminated_length": 46.2, "entropy": 0.17455151811656025, "epoch": 0.061336968116629136, "frac_reward_zero_std": 0.7875, "grad_norm": 1.4685519933700562, "kl": 0.1685524294152856, "learning_rate": 2.5540369494815966e-07, "loss": -0.0137, "num_tokens": 16164580.0, "reward": 1.2296875, "reward_std": 0.09558307379484177, "rewards/equation_reward_func/mean": 0.2296875, "rewards/equation_reward_func/std": 0.4103892266750336, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4958541631698608, "sampling/importance_sampling_ratio/mean": 1.000268292427063, "sampling/importance_sampling_ratio/min": 0.6612611413002014, "sampling/sampling_logp_difference/max": 0.4455454468727112, "sampling/sampling_logp_difference/mean": 0.00748553154990077, "step": 230 }, { "clip_ratio/high_max": 8.20280935537691e-05, "clip_ratio/high_mean": 8.20280935537691e-05, "clip_ratio/low_mean": 2.7889334079292086e-05, "clip_ratio/low_min": 2.7889334079292086e-05, "clip_ratio/region_mean": 0.00010991742763306118, "completions/clipped_ratio": 0.0, "completions/max_length": 101.75, "completions/max_terminated_length": 101.75, "completions/mean_length": 64.02734375, "completions/mean_terminated_length": 64.02734375, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.17249081330373883, "epoch": 0.06187033305677374, "frac_reward_zero_std": 0.671875, "grad_norm": 1.2454813718795776, "kl": 0.1742195769523581, "learning_rate": 2.5180135634685064e-07, "loss": 0.0106, "num_tokens": 16275562.0, "reward": 1.2578125, "reward_std": 0.13789577968418598, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.4305393770337105, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3367740213871002, "sampling/importance_sampling_ratio/mean": 0.9998167902231216, "sampling/importance_sampling_ratio/min": 0.6072127670049667, "sampling/sampling_logp_difference/max": 0.5013927519321442, "sampling/sampling_logp_difference/mean": 0.0074725772719830275, "step": 232 }, { "clip_ratio/high_max": 0.00011424851577935947, "clip_ratio/high_mean": 0.00011424851577935947, "clip_ratio/low_mean": 5.298180783736623e-05, "clip_ratio/low_min": 5.298180783736623e-05, "clip_ratio/region_mean": 0.0001672303236167257, "completions/clipped_ratio": 0.0, "completions/max_length": 100.4, "completions/max_terminated_length": 100.4, "completions/mean_length": 64.2234375, "completions/mean_terminated_length": 64.2234375, "completions/min_length": 46.2, "completions/min_terminated_length": 46.2, "entropy": 0.16395368996179766, "epoch": 0.062403697996918334, "frac_reward_zero_std": 0.625, "grad_norm": 2.2200074195861816, "kl": 0.25329555353770655, "learning_rate": 2.4819864365314934e-07, "loss": -0.0136, "num_tokens": 16414665.0, "reward": 1.2453125, "reward_std": 0.1626042127609253, "rewards/equation_reward_func/mean": 0.2453125, "rewards/equation_reward_func/std": 0.4238908767700195, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3636396884918214, "sampling/importance_sampling_ratio/mean": 0.9999714016914367, "sampling/importance_sampling_ratio/min": 0.6560260057449341, "sampling/sampling_logp_difference/max": 0.43016657829284666, "sampling/sampling_logp_difference/mean": 0.007287522312253714, "step": 234 }, { "clip_ratio/high_max": 7.901271520596411e-05, "clip_ratio/high_mean": 7.901271520596411e-05, "clip_ratio/low_mean": 7.790034255271571e-05, "clip_ratio/low_min": 7.790034255271571e-05, "clip_ratio/region_mean": 0.00015691305775867982, "completions/clipped_ratio": 0.0, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 63.24609375, "completions/mean_terminated_length": 63.24609375, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.17434011270395583, "epoch": 0.06293706293706294, "frac_reward_zero_std": 0.703125, "grad_norm": 2.1644797325134277, "kl": 0.173382216029697, "learning_rate": 2.445963050518403e-07, "loss": 0.0258, "num_tokens": 16525031.0, "reward": 1.271484375, "reward_std": 0.13795070722699165, "rewards/equation_reward_func/mean": 0.271484375, "rewards/equation_reward_func/std": 0.4451773837208748, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3942452371120453, "sampling/importance_sampling_ratio/mean": 1.000148966908455, "sampling/importance_sampling_ratio/min": 0.6400409936904907, "sampling/sampling_logp_difference/max": 0.46636348962783813, "sampling/sampling_logp_difference/mean": 0.0074723472353070974, "step": 236 }, { "clip_ratio/high_max": 5.536585457674745e-05, "clip_ratio/high_mean": 5.536585457674745e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.536585457674745e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 95.8, "completions/max_terminated_length": 95.8, "completions/mean_length": 63.6671875, "completions/mean_terminated_length": 63.6671875, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.16411014687683848, "epoch": 0.06347042787720754, "frac_reward_zero_std": 0.6875, "grad_norm": 2.1260673999786377, "kl": 0.17642005714070466, "learning_rate": 2.4099508865001914e-07, "loss": -0.0054, "num_tokens": 16663586.0, "reward": 1.296875, "reward_std": 0.1395635187625885, "rewards/equation_reward_func/mean": 0.296875, "rewards/equation_reward_func/std": 0.45453664660453796, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5177248477935792, "sampling/importance_sampling_ratio/mean": 0.9999238848686218, "sampling/importance_sampling_ratio/min": 0.6211411714553833, "sampling/sampling_logp_difference/max": 0.5416148662567138, "sampling/sampling_logp_difference/mean": 0.0073494515381753445, "step": 238 }, { "clip_ratio/high_max": 0.00011504948876487713, "clip_ratio/high_mean": 0.00011504948876487713, "clip_ratio/low_mean": 3.086419827822182e-05, "clip_ratio/low_min": 3.086419827822182e-05, "clip_ratio/region_mean": 0.00014591368704309894, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 63.0859375, "completions/mean_terminated_length": 63.0859375, "completions/min_length": 46.75, "completions/min_terminated_length": 46.75, "entropy": 0.16794319543987513, "epoch": 0.06400379281735213, "frac_reward_zero_std": 0.71875, "grad_norm": 1.759703278541565, "kl": 0.3639255567557282, "learning_rate": 2.3739574232173134e-07, "loss": 0.0205, "num_tokens": 16774038.0, "reward": 1.279296875, "reward_std": 0.12191150058060884, "rewards/equation_reward_func/mean": 0.279296875, "rewards/equation_reward_func/std": 0.44548200070858, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4261211156845093, "sampling/importance_sampling_ratio/mean": 1.0000389963388443, "sampling/importance_sampling_ratio/min": 0.6970338821411133, "sampling/sampling_logp_difference/max": 0.41733336448669434, "sampling/sampling_logp_difference/mean": 0.007256430690176785, "step": 240 }, { "clip_ratio/high_max": 5.274084914061758e-05, "clip_ratio/high_mean": 5.274084914061758e-05, "clip_ratio/low_mean": 3.006253003453215e-05, "clip_ratio/low_min": 3.006253003453215e-05, "clip_ratio/region_mean": 8.280337917514973e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 62.5796875, "completions/mean_terminated_length": 62.5796875, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.16305220685899258, "epoch": 0.06453715775749674, "frac_reward_zero_std": 0.6625, "grad_norm": 2.328376531600952, "kl": 0.182965694874939, "learning_rate": 2.3379901355265936e-07, "loss": 0.0124, "num_tokens": 16911697.0, "reward": 1.315625, "reward_std": 0.1500299260020256, "rewards/equation_reward_func/mean": 0.315625, "rewards/equation_reward_func/std": 0.4523154258728027, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4065765619277955, "sampling/importance_sampling_ratio/mean": 0.9999871253967285, "sampling/importance_sampling_ratio/min": 0.6944168090820313, "sampling/sampling_logp_difference/max": 0.4040639281272888, "sampling/sampling_logp_difference/mean": 0.007436616346240044, "step": 242 }, { "clip_ratio/high_max": 8.113653651283433e-05, "clip_ratio/high_mean": 8.113653651283433e-05, "clip_ratio/low_mean": 2.8460836296694146e-05, "clip_ratio/low_min": 2.8460836296694146e-05, "clip_ratio/region_mean": 0.00010959737280952848, "completions/clipped_ratio": 0.0, "completions/max_length": 98.5, "completions/max_terminated_length": 98.5, "completions/mean_length": 63.607421875, "completions/mean_terminated_length": 63.607421875, "completions/min_length": 44.25, "completions/min_terminated_length": 44.25, "entropy": 0.16438170320664844, "epoch": 0.06507052269764134, "frac_reward_zero_std": 0.828125, "grad_norm": 15.456812858581543, "kl": 2.056027880869806, "learning_rate": 2.3020564928489041e-07, "loss": -0.0086, "num_tokens": 17022480.0, "reward": 1.169921875, "reward_std": 0.07352104876190424, "rewards/equation_reward_func/mean": 0.169921875, "rewards/equation_reward_func/std": 0.37413112819194794, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5000926554203033, "sampling/importance_sampling_ratio/mean": 1.000256508588791, "sampling/importance_sampling_ratio/min": 0.6484794020652771, "sampling/sampling_logp_difference/max": 0.44839775562286377, "sampling/sampling_logp_difference/mean": 0.006827926612459123, "step": 244 }, { "clip_ratio/high_max": 0.00015663525846321136, "clip_ratio/high_mean": 0.00015663525846321136, "clip_ratio/low_mean": 5.522443098016083e-05, "clip_ratio/low_min": 5.522443098016083e-05, "clip_ratio/region_mean": 0.0002118596894433722, "completions/clipped_ratio": 0.0, "completions/max_length": 90.8, "completions/max_terminated_length": 90.8, "completions/mean_length": 61.8484375, "completions/mean_terminated_length": 61.8484375, "completions/min_length": 44.4, "completions/min_terminated_length": 44.4, "entropy": 0.16090325576563677, "epoch": 0.06560388763778595, "frac_reward_zero_std": 0.6125, "grad_norm": 1.667910099029541, "kl": 0.23218239264355767, "learning_rate": 2.2661639576179676e-07, "loss": 0.0076, "num_tokens": 17159647.0, "reward": 1.25625, "reward_std": 0.1683431163430214, "rewards/equation_reward_func/mean": 0.25625, "rewards/equation_reward_func/std": 0.4315088868141174, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4762542724609375, "sampling/importance_sampling_ratio/mean": 1.0004387855529786, "sampling/importance_sampling_ratio/min": 0.6808830142021179, "sampling/sampling_logp_difference/max": 0.4440726637840271, "sampling/sampling_logp_difference/mean": 0.007348168082535267, "step": 246 }, { "clip_ratio/high_max": 8.930452491363717e-05, "clip_ratio/high_mean": 8.930452491363717e-05, "clip_ratio/low_mean": 2.8001791280176905e-05, "clip_ratio/low_min": 2.8001791280176905e-05, "clip_ratio/region_mean": 0.00011730631619381408, "completions/clipped_ratio": 0.0, "completions/max_length": 91.75, "completions/max_terminated_length": 91.75, "completions/mean_length": 61.177734375, "completions/mean_terminated_length": 61.177734375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15887406411477262, "epoch": 0.06613725257793054, "frac_reward_zero_std": 0.6875, "grad_norm": 2.06657338142395, "kl": 0.19579486372984117, "learning_rate": 2.2303199837306153e-07, "loss": 0.0141, "num_tokens": 17269098.0, "reward": 1.302734375, "reward_std": 0.13664141483604908, "rewards/equation_reward_func/mean": 0.302734375, "rewards/equation_reward_func/std": 0.4480810612440109, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.653996467590332, "sampling/importance_sampling_ratio/mean": 1.0000506192445755, "sampling/importance_sampling_ratio/min": 0.6548631638288498, "sampling/sampling_logp_difference/max": 0.5027787685394287, "sampling/sampling_logp_difference/mean": 0.007499830215238035, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.6607066198873024e-05, "clip_ratio/low_min": 2.6607066198873024e-05, "clip_ratio/region_mean": 2.6607066198873024e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 62.3125, "completions/mean_terminated_length": 62.3125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1612296857767635, "epoch": 0.06667061751807514, "frac_reward_zero_std": 0.75, "grad_norm": 1.6572133302688599, "kl": 0.17362945407835972, "learning_rate": 2.194532014998817e-07, "loss": -0.0035, "num_tokens": 17406770.0, "reward": 1.24375, "reward_std": 0.12109648138284683, "rewards/equation_reward_func/mean": 0.24375, "rewards/equation_reward_func/std": 0.42204188704490664, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4614347696304322, "sampling/importance_sampling_ratio/mean": 0.9998581647872925, "sampling/importance_sampling_ratio/min": 0.6137632668018341, "sampling/sampling_logp_difference/max": 0.5247549295425415, "sampling/sampling_logp_difference/mean": 0.0070387150160968305, "step": 250 }, { "clip_ratio/high_max": 0.00010662981205516391, "clip_ratio/high_mean": 0.00010662981205516391, "clip_ratio/low_mean": 8.831652424608667e-05, "clip_ratio/low_min": 8.831652424608667e-05, "clip_ratio/region_mean": 0.0001949463363012506, "completions/clipped_ratio": 0.0, "completions/max_length": 94.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 60.849609375, "completions/mean_terminated_length": 60.849609375, "completions/min_length": 46.75, "completions/min_terminated_length": 46.75, "entropy": 0.16233359753257698, "epoch": 0.06720398245821975, "frac_reward_zero_std": 0.6875, "grad_norm": 1.7682164907455444, "kl": 0.21149040117031997, "learning_rate": 2.1588074836038071e-07, "loss": -0.0099, "num_tokens": 17515941.0, "reward": 1.2265625, "reward_std": 0.13611222803592682, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4145585522055626, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.327787846326828, "sampling/importance_sampling_ratio/mean": 0.9999480247497559, "sampling/importance_sampling_ratio/min": 0.6244406849145889, "sampling/sampling_logp_difference/max": 0.47243717312812805, "sampling/sampling_logp_difference/mean": 0.007292501046322286, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 88.2, "completions/max_terminated_length": 88.2, "completions/mean_length": 61.1390625, "completions/mean_terminated_length": 61.1390625, "completions/min_length": 46.8, "completions/min_terminated_length": 46.8, "entropy": 0.15377678411702314, "epoch": 0.06773734739836434, "frac_reward_zero_std": 0.775, "grad_norm": 1.7278259992599487, "kl": 0.2115213862206373, "learning_rate": 2.1231538085526204e-07, "loss": 0.016, "num_tokens": 17652614.0, "reward": 1.2421875, "reward_std": 0.10368099659681321, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4283242881298065, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5154411792755127, "sampling/importance_sampling_ratio/mean": 0.9999118685722351, "sampling/importance_sampling_ratio/min": 0.5803701639175415, "sampling/sampling_logp_difference/max": 0.5891647100448608, "sampling/sampling_logp_difference/mean": 0.007327141705900431, "step": 254 }, { "clip_ratio/high_max": 2.7667110164960224e-05, "clip_ratio/high_mean": 2.7667110164960224e-05, "clip_ratio/low_mean": 8.62706801854074e-05, "clip_ratio/low_min": 8.62706801854074e-05, "clip_ratio/region_mean": 0.00011393779035036762, "completions/clipped_ratio": 0.0, "completions/max_length": 87.25, "completions/max_terminated_length": 87.25, "completions/mean_length": 61.6171875, "completions/mean_terminated_length": 61.6171875, "completions/min_length": 44.25, "completions/min_terminated_length": 44.25, "entropy": 0.15958832333692247, "epoch": 0.06827071233850895, "frac_reward_zero_std": 0.765625, "grad_norm": 1.8996710777282715, "kl": 0.2802976945208179, "learning_rate": 2.0875783941373686e-07, "loss": -0.0091, "num_tokens": 17762338.0, "reward": 1.19921875, "reward_std": 0.09620788879692554, "rewards/equation_reward_func/mean": 0.201171875, "rewards/equation_reward_func/std": 0.3905909135937691, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.5269966423511505, "sampling/importance_sampling_ratio/mean": 1.0001296997070312, "sampling/importance_sampling_ratio/min": 0.6524465084075928, "sampling/sampling_logp_difference/max": 0.47164058685302734, "sampling/sampling_logp_difference/mean": 0.00734924350399524, "step": 256 }, { "clip_ratio/high_max": 5.62179031678372e-05, "clip_ratio/high_mean": 5.62179031678372e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.62179031678372e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 61.4296875, "completions/mean_terminated_length": 61.4296875, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "entropy": 0.16017217851347393, "epoch": 0.06880407727865355, "frac_reward_zero_std": 0.7125, "grad_norm": 1.555677056312561, "kl": 0.17322223778400156, "learning_rate": 2.052088628397572e-07, "loss": -0.0136, "num_tokens": 17899237.0, "reward": 1.2328125, "reward_std": 0.1271430805325508, "rewards/equation_reward_func/mean": 0.2328125, "rewards/equation_reward_func/std": 0.4192458033561707, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.550822639465332, "sampling/importance_sampling_ratio/mean": 0.9997948050498963, "sampling/importance_sampling_ratio/min": 0.5896112442016601, "sampling/sampling_logp_difference/max": 0.5305134296417237, "sampling/sampling_logp_difference/mean": 0.007488936278969049, "step": 258 }, { "clip_ratio/high_max": 5.9088365459400746e-05, "clip_ratio/high_mean": 5.9088365459400746e-05, "clip_ratio/low_mean": 5.8607078648896684e-05, "clip_ratio/low_min": 5.8607078648896684e-05, "clip_ratio/region_mean": 0.00011769544410829742, "completions/clipped_ratio": 0.0, "completions/max_length": 94.25, "completions/max_terminated_length": 94.25, "completions/mean_length": 61.767578125, "completions/mean_terminated_length": 61.767578125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16376644310851893, "epoch": 0.06933744221879815, "frac_reward_zero_std": 0.859375, "grad_norm": 1.2167359590530396, "kl": 0.18065945664420724, "learning_rate": 2.0166918815858688e-07, "loss": 0.0079, "num_tokens": 18009086.0, "reward": 1.185546875, "reward_std": 0.05569827510043979, "rewards/equation_reward_func/mean": 0.185546875, "rewards/equation_reward_func/std": 0.3832038938999176, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4295075833797455, "sampling/importance_sampling_ratio/mean": 1.000031366944313, "sampling/importance_sampling_ratio/min": 0.6006131768226624, "sampling/sampling_logp_difference/max": 0.5110743939876556, "sampling/sampling_logp_difference/mean": 0.007330463966354728, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 91.6, "completions/max_terminated_length": 91.6, "completions/mean_length": 61.678125, "completions/mean_terminated_length": 61.678125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1666737573945688, "epoch": 0.06987080715894275, "frac_reward_zero_std": 0.7625, "grad_norm": 1.7746992111206055, "kl": 0.1870659707734982, "learning_rate": 1.9813955046374102e-07, "loss": 0.0099, "num_tokens": 18146056.0, "reward": 1.190625, "reward_std": 0.10489567779004574, "rewards/equation_reward_func/mean": 0.190625, "rewards/equation_reward_func/std": 0.38583163022994993, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6973002910614015, "sampling/importance_sampling_ratio/mean": 0.9999050259590149, "sampling/importance_sampling_ratio/min": 0.6285896301269531, "sampling/sampling_logp_difference/max": 0.5488207340240479, "sampling/sampling_logp_difference/mean": 0.007785079348832369, "step": 262 }, { "clip_ratio/high_max": 2.7667110164960224e-05, "clip_ratio/high_mean": 2.7667110164960224e-05, "clip_ratio/low_mean": 3.245067394648989e-05, "clip_ratio/low_min": 3.245067394648989e-05, "clip_ratio/region_mean": 6.0117784111450114e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 62.26953125, "completions/mean_terminated_length": 62.26953125, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.17135223927390245, "epoch": 0.07040417209908735, "frac_reward_zero_std": 0.765625, "grad_norm": 1.656660795211792, "kl": 0.17799003432608312, "learning_rate": 1.946206827643275e-07, "loss": 0.0059, "num_tokens": 18256042.0, "reward": 1.173828125, "reward_std": 0.09594268072396517, "rewards/equation_reward_func/mean": 0.17578125, "rewards/equation_reward_func/std": 0.3475184980779886, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.672349601984024, "sampling/importance_sampling_ratio/mean": 1.000132530927658, "sampling/importance_sampling_ratio/min": 0.5941258743405342, "sampling/sampling_logp_difference/max": 0.622923344373703, "sampling/sampling_logp_difference/mean": 0.008001693873666227, "step": 264 }, { "clip_ratio/high_max": 5.570618345195221e-05, "clip_ratio/high_mean": 5.570618345195221e-05, "clip_ratio/low_mean": 8.285678470403784e-05, "clip_ratio/low_min": 8.285678470403784e-05, "clip_ratio/region_mean": 0.00013856296815599004, "completions/clipped_ratio": 0.0, "completions/max_length": 86.4, "completions/max_terminated_length": 86.4, "completions/mean_length": 61.1328125, "completions/mean_terminated_length": 61.1328125, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.16705449670553207, "epoch": 0.07093753703923196, "frac_reward_zero_std": 0.7125, "grad_norm": 2.1985723972320557, "kl": 0.17178592634283835, "learning_rate": 1.9111331583282103e-07, "loss": -0.0157, "num_tokens": 18392647.0, "reward": 1.275, "reward_std": 0.12919401228427888, "rewards/equation_reward_func/mean": 0.275, "rewards/equation_reward_func/std": 0.4404537439346313, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6002644777297974, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.6167470455169678, "sampling/sampling_logp_difference/max": 0.55384202003479, "sampling/sampling_logp_difference/mean": 0.00788359558209777, "step": 266 }, { "clip_ratio/high_max": 0.0001113139878725633, "clip_ratio/high_mean": 0.0001113139878725633, "clip_ratio/low_mean": 3.0458089895546436e-05, "clip_ratio/low_min": 3.0458089895546436e-05, "clip_ratio/region_mean": 0.00014177207776810974, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 61.9140625, "completions/mean_terminated_length": 61.9140625, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.1660270510862271, "epoch": 0.07147090197937656, "frac_reward_zero_std": 0.6875, "grad_norm": 2.1252636909484863, "kl": 0.21718645483876267, "learning_rate": 1.8761817805330195e-07, "loss": 0.0157, "num_tokens": 18502299.0, "reward": 1.26171875, "reward_std": 0.1343441903591156, "rewards/equation_reward_func/mean": 0.26171875, "rewards/equation_reward_func/std": 0.44087420403957367, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507978081703186, "sampling/importance_sampling_ratio/mean": 0.9999596327543259, "sampling/importance_sampling_ratio/min": 0.5853188037872314, "sampling/sampling_logp_difference/max": 0.5520137846469879, "sampling/sampling_logp_difference/mean": 0.007540238089859486, "step": 268 }, { "clip_ratio/high_max": 8.638186651902895e-05, "clip_ratio/high_mean": 8.638186651902895e-05, "clip_ratio/low_mean": 2.7889334079292086e-05, "clip_ratio/low_min": 2.7889334079292086e-05, "clip_ratio/region_mean": 0.00011427120059832103, "completions/clipped_ratio": 0.0, "completions/max_length": 87.6, "completions/max_terminated_length": 87.6, "completions/mean_length": 62.5921875, "completions/mean_terminated_length": 62.5921875, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.16667411419459516, "epoch": 0.07200426691952115, "frac_reward_zero_std": 0.7625, "grad_norm": 2.6209089756011963, "kl": 0.22719267962707412, "learning_rate": 1.8413599527019018e-07, "loss": 0.0082, "num_tokens": 18640158.0, "reward": 1.2390625, "reward_std": 0.10194959491491318, "rewards/equation_reward_func/mean": 0.2390625, "rewards/equation_reward_func/std": 0.4210586488246918, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6122398376464844, "sampling/importance_sampling_ratio/mean": 0.9997969031333923, "sampling/importance_sampling_ratio/min": 0.6130451917648315, "sampling/sampling_logp_difference/max": 0.5467406749725342, "sampling/sampling_logp_difference/mean": 0.007544608414173126, "step": 270 }, { "clip_ratio/high_max": 5.5620260860046576e-05, "clip_ratio/high_mean": 5.5620260860046576e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.5620260860046576e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 88.5, "completions/max_terminated_length": 88.5, "completions/mean_length": 61.634765625, "completions/mean_terminated_length": 61.634765625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1673782431624002, "epoch": 0.07253763185966576, "frac_reward_zero_std": 0.765625, "grad_norm": 2.0303914546966553, "kl": 0.15923971832833356, "learning_rate": 1.806674906375079e-07, "loss": -0.0079, "num_tokens": 18749803.0, "reward": 1.2421875, "reward_std": 0.10448616743087769, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.42203710973262787, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.478906661272049, "sampling/importance_sampling_ratio/mean": 1.0000398606061935, "sampling/importance_sampling_ratio/min": 0.6365063786506653, "sampling/sampling_logp_difference/max": 0.5071949064731598, "sampling/sampling_logp_difference/mean": 0.007742989109829068, "step": 272 }, { "clip_ratio/high_max": 0.0001486778136394504, "clip_ratio/high_mean": 0.0001486778136394504, "clip_ratio/low_mean": 8.276374137494713e-05, "clip_ratio/low_min": 8.276374137494713e-05, "clip_ratio/region_mean": 0.00023144155501439754, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 61.978125, "completions/mean_terminated_length": 61.978125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16645302452767888, "epoch": 0.07307099679981036, "frac_reward_zero_std": 0.7375, "grad_norm": 1.8097690343856812, "kl": 0.17136518062195844, "learning_rate": 1.7721338446869976e-07, "loss": -0.0275, "num_tokens": 18886997.0, "reward": 1.290625, "reward_std": 0.10532157719135285, "rewards/equation_reward_func/mean": 0.290625, "rewards/equation_reward_func/std": 0.45344066619873047, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4187933683395386, "sampling/importance_sampling_ratio/mean": 0.9998487591743469, "sampling/importance_sampling_ratio/min": 0.6455894589424134, "sampling/sampling_logp_difference/max": 0.4439103126525879, "sampling/sampling_logp_difference/mean": 0.008036257605999709, "step": 274 }, { "clip_ratio/high_max": 8.5113440743751e-05, "clip_ratio/high_mean": 8.5113440743751e-05, "clip_ratio/low_mean": 8.823373031595515e-05, "clip_ratio/low_min": 8.823373031595515e-05, "clip_ratio/region_mean": 0.00017334717105970613, "completions/clipped_ratio": 0.0, "completions/max_length": 103.25, "completions/max_terminated_length": 103.25, "completions/mean_length": 63.16015625, "completions/mean_terminated_length": 63.16015625, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.16968232341524628, "epoch": 0.07360436173995497, "frac_reward_zero_std": 0.65625, "grad_norm": 1.3696211576461792, "kl": 0.17073426545701093, "learning_rate": 1.7377439408704392e-07, "loss": 0.0226, "num_tokens": 18997431.0, "reward": 1.244140625, "reward_std": 0.1505793947726488, "rewards/equation_reward_func/mean": 0.244140625, "rewards/equation_reward_func/std": 0.40324973315000534, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4994879066944122, "sampling/importance_sampling_ratio/mean": 0.9999091327190399, "sampling/importance_sampling_ratio/min": 0.5597525835037231, "sampling/sampling_logp_difference/max": 0.5873761177062988, "sampling/sampling_logp_difference/mean": 0.007468943134881556, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.4, "completions/max_terminated_length": 89.4, "completions/mean_length": 63.1515625, "completions/mean_terminated_length": 63.1515625, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.16834329503277937, "epoch": 0.07413772668009956, "frac_reward_zero_std": 0.725, "grad_norm": 1.8784897327423096, "kl": 0.3823344835287167, "learning_rate": 1.7035123367668323e-07, "loss": 0.0042, "num_tokens": 19135384.0, "reward": 1.2640625, "reward_std": 0.12319546788930893, "rewards/equation_reward_func/mean": 0.2640625, "rewards/equation_reward_func/std": 0.4213733494281769, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6087879896163941, "sampling/importance_sampling_ratio/mean": 1.0002206802368163, "sampling/importance_sampling_ratio/min": 0.667132580280304, "sampling/sampling_logp_difference/max": 0.5475925087928772, "sampling/sampling_logp_difference/mean": 0.007364885974675417, "step": 278 }, { "clip_ratio/high_max": 2.81151604010827e-05, "clip_ratio/high_mean": 2.81151604010827e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.81151604010827e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 63.357421875, "completions/mean_terminated_length": 63.357421875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1599074963790675, "epoch": 0.07467109162024416, "frac_reward_zero_std": 0.671875, "grad_norm": 1.7391357421875, "kl": 0.1496015904057357, "learning_rate": 1.6694461413430893e-07, "loss": -0.0126, "num_tokens": 19246143.0, "reward": 1.353515625, "reward_std": 0.1361870914697647, "rewards/equation_reward_func/mean": 0.353515625, "rewards/equation_reward_func/std": 0.47788967192173004, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5863225162029266, "sampling/importance_sampling_ratio/mean": 1.0001775622367859, "sampling/importance_sampling_ratio/min": 0.6293168216943741, "sampling/sampling_logp_difference/max": 0.4962163418531418, "sampling/sampling_logp_difference/mean": 0.007331064320169389, "step": 280 }, { "clip_ratio/high_max": 0.0001992941445981463, "clip_ratio/high_mean": 0.0001992941445981463, "clip_ratio/low_mean": 0.00011343831849646651, "clip_ratio/low_min": 0.00011343831849646651, "clip_ratio/region_mean": 0.0003127324630946128, "completions/clipped_ratio": 0.0, "completions/max_length": 91.8, "completions/max_terminated_length": 91.8, "completions/mean_length": 62.5421875, "completions/mean_terminated_length": 62.5421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16647056821319792, "epoch": 0.07520445656038877, "frac_reward_zero_std": 0.6875, "grad_norm": 2.177482843399048, "kl": 0.3770016178799172, "learning_rate": 1.6355524292152684e-07, "loss": 0.0148, "num_tokens": 19383842.0, "reward": 1.3203125, "reward_std": 0.1463983103632927, "rewards/equation_reward_func/mean": 0.3203125, "rewards/equation_reward_func/std": 0.4657664060592651, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6361550569534302, "sampling/importance_sampling_ratio/mean": 1.0002413749694825, "sampling/importance_sampling_ratio/min": 0.6656458258628846, "sampling/sampling_logp_difference/max": 0.5299483299255371, "sampling/sampling_logp_difference/mean": 0.008117585256695748, "step": 282 }, { "clip_ratio/high_max": 0.00010767569862461339, "clip_ratio/high_mean": 0.00010767569862461339, "clip_ratio/low_mean": 2.64047315188994e-05, "clip_ratio/low_min": 2.64047315188994e-05, "clip_ratio/region_mean": 0.00013408043014351279, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 62.701171875, "completions/mean_terminated_length": 62.701171875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16929591581639317, "epoch": 0.07573782150053336, "frac_reward_zero_std": 0.796875, "grad_norm": 2.4693572521209717, "kl": 0.16570124619950852, "learning_rate": 1.6018382391793722e-07, "loss": -0.0132, "num_tokens": 19493993.0, "reward": 1.169921875, "reward_std": 0.09172647446393967, "rewards/equation_reward_func/mean": 0.169921875, "rewards/equation_reward_func/std": 0.3734985291957855, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5421675145626068, "sampling/importance_sampling_ratio/mean": 0.9996543824672699, "sampling/importance_sampling_ratio/min": 0.655137300491333, "sampling/sampling_logp_difference/max": 0.512511134147644, "sampling/sampling_logp_difference/mean": 0.007928695646114647, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.3864070600312618e-05, "clip_ratio/low_min": 2.3864070600312618e-05, "clip_ratio/region_mean": 2.3864070600312618e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 63.0734375, "completions/mean_terminated_length": 63.0734375, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.1685334078243209, "epoch": 0.07627118644067797, "frac_reward_zero_std": 0.725, "grad_norm": 1.6882261037826538, "kl": 0.3469633719262977, "learning_rate": 1.5683105727495778e-07, "loss": 0.0133, "num_tokens": 19632008.0, "reward": 1.2515625, "reward_std": 0.11725681126117707, "rewards/equation_reward_func/mean": 0.253125, "rewards/equation_reward_func/std": 0.4268356800079346, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.504924750328064, "sampling/importance_sampling_ratio/mean": 1.0003291487693786, "sampling/importance_sampling_ratio/min": 0.6604488730430603, "sampling/sampling_logp_difference/max": 0.44870591163635254, "sampling/sampling_logp_difference/mean": 0.007683346141129732, "step": 286 }, { "clip_ratio/high_max": 0.00011052796132086466, "clip_ratio/high_mean": 0.00011052796132086466, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011052796132086466, "completions/clipped_ratio": 0.0, "completions/max_length": 95.5, "completions/max_terminated_length": 95.5, "completions/mean_length": 63.81640625, "completions/mean_terminated_length": 63.81640625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.16416497435420752, "epoch": 0.07680455138082257, "frac_reward_zero_std": 0.765625, "grad_norm": 1.407440185546875, "kl": 0.1843227494197587, "learning_rate": 1.5349763927042168e-07, "loss": -0.0032, "num_tokens": 19742906.0, "reward": 1.201171875, "reward_std": 0.10251103527843952, "rewards/equation_reward_func/mean": 0.201171875, "rewards/equation_reward_func/std": 0.35500316321849823, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4987813234329224, "sampling/importance_sampling_ratio/mean": 1.0000157356262207, "sampling/importance_sampling_ratio/min": 0.5743847489356995, "sampling/sampling_logp_difference/max": 0.6061298549175262, "sampling/sampling_logp_difference/mean": 0.007707677781581879, "step": 288 }, { "clip_ratio/high_max": 3.128128203873833e-05, "clip_ratio/high_mean": 3.128128203873833e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.128128203873833e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 91.4, "completions/max_terminated_length": 91.4, "completions/mean_length": 63.1234375, "completions/mean_terminated_length": 63.1234375, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.16190135618671775, "epoch": 0.07733791632096716, "frac_reward_zero_std": 0.725, "grad_norm": 1.6126437187194824, "kl": 0.16573938147889244, "learning_rate": 1.501842621639796e-07, "loss": 0.0116, "num_tokens": 19881081.0, "reward": 1.26875, "reward_std": 0.1140530526638031, "rewards/equation_reward_func/mean": 0.26875, "rewards/equation_reward_func/std": 0.44139830470085145, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5084954500198364, "sampling/importance_sampling_ratio/mean": 1.0000431418418885, "sampling/importance_sampling_ratio/min": 0.5070626914501191, "sampling/sampling_logp_difference/max": 0.6949543237686158, "sampling/sampling_logp_difference/mean": 0.007862045057117939, "step": 290 }, { "clip_ratio/high_max": 5.570618345195221e-05, "clip_ratio/high_mean": 5.570618345195221e-05, "clip_ratio/low_mean": 8.352650749859297e-05, "clip_ratio/low_min": 8.352650749859297e-05, "clip_ratio/region_mean": 0.00013923269095054516, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 62.107421875, "completions/mean_terminated_length": 62.107421875, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.16269824695256022, "epoch": 0.07787128126111177, "frac_reward_zero_std": 0.671875, "grad_norm": 1.905688762664795, "kl": 0.1797306988802221, "learning_rate": 1.4689161405333652e-07, "loss": 0.0075, "num_tokens": 19990928.0, "reward": 1.3046875, "reward_std": 0.1441395953297615, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.4588534012436867, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5021350979804993, "sampling/importance_sampling_ratio/mean": 0.9997458308935165, "sampling/importance_sampling_ratio/min": 0.6408550441265106, "sampling/sampling_logp_difference/max": 0.47652292251586914, "sampling/sampling_logp_difference/mean": 0.007676503853872418, "step": 292 }, { "clip_ratio/high_max": 8.435645011357135e-05, "clip_ratio/high_mean": 8.435645011357135e-05, "clip_ratio/low_mean": 2.9301451933052805e-05, "clip_ratio/low_min": 2.9301451933052805e-05, "clip_ratio/region_mean": 0.00011365790204662416, "completions/clipped_ratio": 0.0, "completions/max_length": 109.8, "completions/max_terminated_length": 109.8, "completions/mean_length": 62.9421875, "completions/mean_terminated_length": 62.9421875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1621687257041534, "epoch": 0.07840464620125637, "frac_reward_zero_std": 0.725, "grad_norm": 2.040407419204712, "kl": 0.3422270532593959, "learning_rate": 1.4362037873135255e-07, "loss": 0.0297, "num_tokens": 20128931.0, "reward": 1.23125, "reward_std": 0.1267770729959011, "rewards/equation_reward_func/mean": 0.23125, "rewards/equation_reward_func/std": 0.40610227584838865, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4460718631744385, "sampling/importance_sampling_ratio/mean": 1.0001681447029114, "sampling/importance_sampling_ratio/min": 0.608282995223999, "sampling/sampling_logp_difference/max": 0.5198527336120605, "sampling/sampling_logp_difference/mean": 0.00785816153511405, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.8356678992923764e-05, "clip_ratio/low_min": 5.8356678992923764e-05, "clip_ratio/region_mean": 5.8356678992923764e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.75, "completions/max_terminated_length": 89.75, "completions/mean_length": 63.248046875, "completions/mean_terminated_length": 63.248046875, "completions/min_length": 46.25, "completions/min_terminated_length": 46.25, "entropy": 0.15600042904002798, "epoch": 0.07893801114140098, "frac_reward_zero_std": 0.734375, "grad_norm": 2.2707901000976562, "kl": 0.19476062970028984, "learning_rate": 1.403712355440378e-07, "loss": -0.0012, "num_tokens": 20239610.0, "reward": 1.232421875, "reward_std": 0.11093903332948685, "rewards/equation_reward_func/mean": 0.232421875, "rewards/equation_reward_func/std": 0.416695311665535, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5503148436546326, "sampling/importance_sampling_ratio/mean": 0.9996602535247803, "sampling/importance_sampling_ratio/min": 0.6442981511354446, "sampling/sampling_logp_difference/max": 0.48464709520339966, "sampling/sampling_logp_difference/mean": 0.007375328801572323, "step": 296 }, { "clip_ratio/high_max": 8.59696382475603e-05, "clip_ratio/high_mean": 8.59696382475603e-05, "clip_ratio/low_mean": 0.00011250976401950336, "clip_ratio/low_min": 0.00011250976401950336, "clip_ratio/region_mean": 0.00019847940226706365, "completions/clipped_ratio": 0.0, "completions/max_length": 87.6, "completions/max_terminated_length": 87.6, "completions/mean_length": 60.4296875, "completions/mean_terminated_length": 60.4296875, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.1527714607719746, "epoch": 0.07947137608154557, "frac_reward_zero_std": 0.7875, "grad_norm": 2.1192171573638916, "kl": 0.1836801348771486, "learning_rate": 1.371448592494707e-07, "loss": 0.012, "num_tokens": 20375717.0, "reward": 1.2015625, "reward_std": 0.09289761185646057, "rewards/equation_reward_func/mean": 0.2015625, "rewards/equation_reward_func/std": 0.39929420948028566, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5626222133636474, "sampling/importance_sampling_ratio/mean": 1.0000908970832825, "sampling/importance_sampling_ratio/min": 0.576184606552124, "sampling/sampling_logp_difference/max": 0.5587828874588012, "sampling/sampling_logp_difference/mean": 0.007543122302740813, "step": 298 }, { "clip_ratio/high_max": 2.942561210754017e-05, "clip_ratio/high_mean": 2.942561210754017e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.942561210754017e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 95.25, "completions/max_terminated_length": 95.25, "completions/mean_length": 61.8984375, "completions/mean_terminated_length": 61.8984375, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.15187999274995592, "epoch": 0.08000474102169017, "frac_reward_zero_std": 0.6875, "grad_norm": 2.1531012058258057, "kl": 0.15510166208777162, "learning_rate": 1.3394191987766996e-07, "loss": -0.0051, "num_tokens": 20485465.0, "reward": 1.28125, "reward_std": 0.14091254770755768, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.45049871504306793, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4555107653141022, "sampling/importance_sampling_ratio/mean": 1.0001604110002518, "sampling/importance_sampling_ratio/min": 0.6326434463262558, "sampling/sampling_logp_difference/max": 0.46094879508018494, "sampling/sampling_logp_difference/mean": 0.007268833229318261, "step": 300 }, { "clip_ratio/high_max": 3.006253003453215e-05, "clip_ratio/high_mean": 3.006253003453215e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.006253003453215e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.6, "completions/max_terminated_length": 89.6, "completions/mean_length": 61.825, "completions/mean_terminated_length": 61.825, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "entropy": 0.15210040182703072, "epoch": 0.08053810596183478, "frac_reward_zero_std": 0.775, "grad_norm": 1.0212541818618774, "kl": 0.578163127363142, "learning_rate": 1.3076308259144652e-07, "loss": -0.0107, "num_tokens": 20622833.0, "reward": 1.1984375, "reward_std": 0.10026311352849007, "rewards/equation_reward_func/mean": 0.1984375, "rewards/equation_reward_func/std": 0.3982939958572388, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4686452150344849, "sampling/importance_sampling_ratio/mean": 1.0000053763389587, "sampling/importance_sampling_ratio/min": 0.6036676645278931, "sampling/sampling_logp_difference/max": 0.535408329963684, "sampling/sampling_logp_difference/mean": 0.007250169012695551, "step": 302 }, { "clip_ratio/high_max": 5.493884964380413e-05, "clip_ratio/high_mean": 5.493884964380413e-05, "clip_ratio/low_mean": 0.00012273332686163485, "clip_ratio/low_min": 0.00012273332686163485, "clip_ratio/region_mean": 0.00017767217650543898, "completions/clipped_ratio": 0.0, "completions/max_length": 98.5, "completions/max_terminated_length": 98.5, "completions/mean_length": 61.677734375, "completions/mean_terminated_length": 61.677734375, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.15151018287158674, "epoch": 0.08107147090197937, "frac_reward_zero_std": 0.796875, "grad_norm": 1.2295188903808594, "kl": 0.1704214598155684, "learning_rate": 1.2760900754826858e-07, "loss": 0.0219, "num_tokens": 20732620.0, "reward": 1.234375, "reward_std": 0.08548389002680779, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42354996502399445, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4188537895679474, "sampling/importance_sampling_ratio/mean": 0.99995157122612, "sampling/importance_sampling_ratio/min": 0.6260853409767151, "sampling/sampling_logp_difference/max": 0.46892082691192627, "sampling/sampling_logp_difference/mean": 0.007041251054033637, "step": 304 }, { "clip_ratio/high_max": 0.0001164733031247225, "clip_ratio/high_mean": 0.0001164733031247225, "clip_ratio/low_mean": 5.8733366636766325e-05, "clip_ratio/low_min": 5.8733366636766325e-05, "clip_ratio/region_mean": 0.00017520666976148883, "completions/clipped_ratio": 0.0, "completions/max_length": 85.2, "completions/max_terminated_length": 85.2, "completions/mean_length": 60.353125, "completions/mean_terminated_length": 60.353125, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.1476845426691903, "epoch": 0.08160483584212398, "frac_reward_zero_std": 0.75, "grad_norm": 2.542125940322876, "kl": 0.20440695693509447, "learning_rate": 1.2448034976316394e-07, "loss": -0.0176, "num_tokens": 20868806.0, "reward": 1.3265625, "reward_std": 0.10547287315130234, "rewards/equation_reward_func/mean": 0.3265625, "rewards/equation_reward_func/std": 0.47015486359596254, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.558743405342102, "sampling/importance_sampling_ratio/mean": 1.000117290019989, "sampling/importance_sampling_ratio/min": 0.5977307558059692, "sampling/sampling_logp_difference/max": 0.605611515045166, "sampling/sampling_logp_difference/mean": 0.0069610177539289, "step": 306 }, { "clip_ratio/high_max": 2.9056252161454824e-05, "clip_ratio/high_mean": 2.9056252161454824e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9056252161454824e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 93.75, "completions/max_terminated_length": 93.75, "completions/mean_length": 60.859375, "completions/mean_terminated_length": 60.859375, "completions/min_length": 46.75, "completions/min_terminated_length": 46.75, "entropy": 0.14723145051134956, "epoch": 0.08213820078226858, "frac_reward_zero_std": 0.71875, "grad_norm": 2.1533043384552, "kl": 0.17001093111725318, "learning_rate": 1.213777589726922e-07, "loss": 0.017, "num_tokens": 20978078.0, "reward": 1.30078125, "reward_std": 0.1159328892827034, "rewards/equation_reward_func/mean": 0.30078125, "rewards/equation_reward_func/std": 0.45463699847459793, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4889400601387024, "sampling/importance_sampling_ratio/mean": 0.9999859035015106, "sampling/importance_sampling_ratio/min": 0.698627382516861, "sampling/sampling_logp_difference/max": 0.4138526916503906, "sampling/sampling_logp_difference/mean": 0.006993932765908539, "step": 308 }, { "clip_ratio/high_max": 2.723311707894835e-05, "clip_ratio/high_mean": 2.723311707894835e-05, "clip_ratio/low_mean": 2.8935186694272692e-05, "clip_ratio/low_min": 2.8935186694272692e-05, "clip_ratio/region_mean": 5.616830377322104e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 107.2, "completions/max_terminated_length": 107.2, "completions/mean_length": 60.8421875, "completions/mean_terminated_length": 60.8421875, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.14791247719484898, "epoch": 0.08267156572241317, "frac_reward_zero_std": 0.7625, "grad_norm": 1.409075379371643, "kl": 0.1939486001825167, "learning_rate": 1.183018795000118e-07, "loss": -0.0018, "num_tokens": 21114705.0, "reward": 1.259375, "reward_std": 0.10626165643334388, "rewards/equation_reward_func/mean": 0.259375, "rewards/equation_reward_func/std": 0.4264992535114288, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4488518476486205, "sampling/importance_sampling_ratio/mean": 0.999934720993042, "sampling/importance_sampling_ratio/min": 0.5747387528419494, "sampling/sampling_logp_difference/max": 0.5576710939407349, "sampling/sampling_logp_difference/mean": 0.0070027833804488186, "step": 310 }, { "clip_ratio/high_max": 5.623123918970426e-05, "clip_ratio/high_mean": 5.623123918970426e-05, "clip_ratio/low_mean": 5.942461818146209e-05, "clip_ratio/low_min": 5.942461818146209e-05, "clip_ratio/region_mean": 0.00011565585737116635, "completions/clipped_ratio": 0.0, "completions/max_length": 83.75, "completions/max_terminated_length": 83.75, "completions/mean_length": 59.505859375, "completions/mean_terminated_length": 59.505859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.13861441234540609, "epoch": 0.08320493066255778, "frac_reward_zero_std": 0.703125, "grad_norm": 3.323216676712036, "kl": 0.17116504271204272, "learning_rate": 1.1525335012107188e-07, "loss": 0.0039, "num_tokens": 21223356.0, "reward": 1.251953125, "reward_std": 0.1293503325432539, "rewards/equation_reward_func/mean": 0.251953125, "rewards/equation_reward_func/std": 0.43190427869558334, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5722920894622803, "sampling/importance_sampling_ratio/mean": 1.000055804848671, "sampling/importance_sampling_ratio/min": 0.6373433768749237, "sampling/sampling_logp_difference/max": 0.4705616235733032, "sampling/sampling_logp_difference/mean": 0.006334169302135706, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.813699565103484e-05, "clip_ratio/low_min": 8.813699565103484e-05, "clip_ratio/region_mean": 8.813699565103484e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.2, "completions/max_terminated_length": 89.2, "completions/mean_length": 60.003125, "completions/mean_terminated_length": 60.003125, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.14592335435251394, "epoch": 0.08373829560270238, "frac_reward_zero_std": 0.675, "grad_norm": 1.9989326000213623, "kl": 0.45487243961542845, "learning_rate": 1.1223280393195566e-07, "loss": -0.0168, "num_tokens": 21359374.0, "reward": 1.3015625, "reward_std": 0.1363004580140114, "rewards/equation_reward_func/mean": 0.3015625, "rewards/equation_reward_func/std": 0.42825528383255007, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3855737686157226, "sampling/importance_sampling_ratio/mean": 0.9998773813247681, "sampling/importance_sampling_ratio/min": 0.6165928721427918, "sampling/sampling_logp_difference/max": 0.5128003716468811, "sampling/sampling_logp_difference/mean": 0.007020724471658468, "step": 314 }, { "clip_ratio/high_max": 5.600358256035381e-05, "clip_ratio/high_mean": 5.600358256035381e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.600358256035381e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 98.25, "completions/max_terminated_length": 98.25, "completions/mean_length": 59.7421875, "completions/mean_terminated_length": 59.7421875, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.1445285806225406, "epoch": 0.08427166054284699, "frac_reward_zero_std": 0.828125, "grad_norm": 1.935788631439209, "kl": 0.18509919392979807, "learning_rate": 1.0924086821740436e-07, "loss": 0.0196, "num_tokens": 21468074.0, "reward": 1.216796875, "reward_std": 0.07640802673995495, "rewards/equation_reward_func/mean": 0.216796875, "rewards/equation_reward_func/std": 0.3967781960964203, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4880629479885101, "sampling/importance_sampling_ratio/mean": 1.0003069341182709, "sampling/importance_sampling_ratio/min": 0.6373428702354431, "sampling/sampling_logp_difference/max": 0.4659912586212158, "sampling/sampling_logp_difference/mean": 0.0066119476687163115, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 92.4, "completions/max_terminated_length": 92.4, "completions/mean_length": 59.784375, "completions/mean_terminated_length": 59.784375, "completions/min_length": 44.4, "completions/min_terminated_length": 44.4, "entropy": 0.14941889968597227, "epoch": 0.08480502548299158, "frac_reward_zero_std": 0.825, "grad_norm": 2.1098833084106445, "kl": 0.20412971172481775, "learning_rate": 1.0627816432054689e-07, "loss": -0.0197, "num_tokens": 21603808.0, "reward": 1.31875, "reward_std": 0.07527982704341411, "rewards/equation_reward_func/mean": 0.31875, "rewards/equation_reward_func/std": 0.4637947142124176, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4394292831420898, "sampling/importance_sampling_ratio/mean": 1.000177812576294, "sampling/importance_sampling_ratio/min": 0.6724377512931824, "sampling/sampling_logp_difference/max": 0.45770553350448606, "sampling/sampling_logp_difference/mean": 0.006965827383100987, "step": 318 }, { "clip_ratio/high_max": 5.526170328569909e-05, "clip_ratio/high_mean": 5.526170328569909e-05, "clip_ratio/low_mean": 6.433342868048284e-05, "clip_ratio/low_min": 6.433342868048284e-05, "clip_ratio/region_mean": 0.00011959513196618193, "completions/clipped_ratio": 0.0, "completions/max_length": 89.5, "completions/max_terminated_length": 89.5, "completions/mean_length": 59.29296875, "completions/mean_terminated_length": 59.29296875, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.1440402068094247, "epoch": 0.08533839042313618, "frac_reward_zero_std": 0.75, "grad_norm": 1.7009668350219727, "kl": 0.17148414802634054, "learning_rate": 1.0334530751386386e-07, "loss": 0.0017, "num_tokens": 21712230.0, "reward": 1.28125, "reward_std": 0.10632783640176058, "rewards/equation_reward_func/mean": 0.28125, "rewards/equation_reward_func/std": 0.4459381252527237, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3765664100646973, "sampling/importance_sampling_ratio/mean": 0.9998722523450851, "sampling/importance_sampling_ratio/min": 0.599141463637352, "sampling/sampling_logp_difference/max": 0.5374931991100311, "sampling/sampling_logp_difference/mean": 0.006750167929567397, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 3.156565556613108e-05, "clip_ratio/low_min": 3.156565556613108e-05, "clip_ratio/region_mean": 3.156565556613108e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 59.3625, "completions/mean_terminated_length": 59.3625, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.14133981661871076, "epoch": 0.08587175536328079, "frac_reward_zero_std": 0.8, "grad_norm": 1.234773874282837, "kl": 0.17927683300028244, "learning_rate": 1.0044290687141255e-07, "loss": 0.0055, "num_tokens": 21847894.0, "reward": 1.2328125, "reward_std": 0.09321458786725997, "rewards/equation_reward_func/mean": 0.2328125, "rewards/equation_reward_func/std": 0.4038575947284698, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5336121320724487, "sampling/importance_sampling_ratio/mean": 0.9999105572700501, "sampling/importance_sampling_ratio/min": 0.6529236197471618, "sampling/sampling_logp_difference/max": 0.47073605060577395, "sampling/sampling_logp_difference/mean": 0.0066753306426107885, "step": 322 }, { "clip_ratio/high_max": 5.4904515208262536e-05, "clip_ratio/high_mean": 5.4904515208262536e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.4904515208262536e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 60.2578125, "completions/mean_terminated_length": 60.2578125, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.14529094912318719, "epoch": 0.08640512030342538, "frac_reward_zero_std": 0.78125, "grad_norm": 2.4741289615631104, "kl": 0.17329017124656174, "learning_rate": 9.757156514233892e-08, "loss": -0.0049, "num_tokens": 21956858.0, "reward": 1.193359375, "reward_std": 0.10658863373100758, "rewards/equation_reward_func/mean": 0.193359375, "rewards/equation_reward_func/std": 0.3953615799546242, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5873429775238037, "sampling/importance_sampling_ratio/mean": 0.9998360723257065, "sampling/importance_sampling_ratio/min": 0.6423820406198502, "sampling/sampling_logp_difference/max": 0.4654676914215088, "sampling/sampling_logp_difference/mean": 0.006776340422220528, "step": 324 }, { "clip_ratio/high_max": 2.8577959811728862e-05, "clip_ratio/high_mean": 2.8577959811728862e-05, "clip_ratio/low_mean": 2.6009155489090415e-05, "clip_ratio/low_min": 2.6009155489090415e-05, "clip_ratio/region_mean": 5.458711530081928e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 82.4, "completions/max_terminated_length": 82.4, "completions/mean_length": 59.3765625, "completions/mean_terminated_length": 59.3765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.14065386318705148, "epoch": 0.08693848524357, "frac_reward_zero_std": 0.725, "grad_norm": 1.811319351196289, "kl": 0.1825854441461464, "learning_rate": 9.473187862570289e-08, "loss": -0.0008, "num_tokens": 22092323.0, "reward": 1.31875, "reward_std": 0.12203969210386276, "rewards/equation_reward_func/mean": 0.31875, "rewards/equation_reward_func/std": 0.45874998569488523, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4195994138717651, "sampling/importance_sampling_ratio/mean": 1.0000884771347045, "sampling/importance_sampling_ratio/min": 0.6057180881500244, "sampling/sampling_logp_difference/max": 0.5045634984970093, "sampling/sampling_logp_difference/mean": 0.0062735193409025666, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.9677114475311504e-05, "clip_ratio/low_min": 2.9677114475311504e-05, "clip_ratio/region_mean": 2.9677114475311504e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 60.17578125, "completions/mean_terminated_length": 60.17578125, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.14347862773057488, "epoch": 0.08747185018371459, "frac_reward_zero_std": 0.640625, "grad_norm": 2.2519924640655518, "kl": 0.1730220969248977, "learning_rate": 9.192443704664344e-08, "loss": 0.0096, "num_tokens": 22201157.0, "reward": 1.240234375, "reward_std": 0.15887073799967766, "rewards/equation_reward_func/mean": 0.240234375, "rewards/equation_reward_func/std": 0.4248482957482338, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4630204439163208, "sampling/importance_sampling_ratio/mean": 1.0001176595687866, "sampling/importance_sampling_ratio/min": 0.6251243576407433, "sampling/sampling_logp_difference/max": 0.5141621828079224, "sampling/sampling_logp_difference/mean": 0.006838137283921242, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 3.245067394648989e-05, "clip_ratio/low_min": 3.245067394648989e-05, "clip_ratio/region_mean": 3.245067394648989e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 86.4, "completions/max_terminated_length": 86.4, "completions/mean_length": 59.7578125, "completions/mean_terminated_length": 59.7578125, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.14106454860625994, "epoch": 0.0880052151238592, "frac_reward_zero_std": 0.8, "grad_norm": 1.736958384513855, "kl": 0.20853251073923376, "learning_rate": 8.914982343390895e-08, "loss": -0.0077, "num_tokens": 22336818.0, "reward": 1.325, "reward_std": 0.0893733486533165, "rewards/equation_reward_func/mean": 0.325, "rewards/equation_reward_func/std": 0.4464641273021698, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7095434188842773, "sampling/importance_sampling_ratio/mean": 1.000044572353363, "sampling/importance_sampling_ratio/min": 0.6005661249160766, "sampling/sampling_logp_difference/max": 0.588768720626831, "sampling/sampling_logp_difference/mean": 0.006634648703038692, "step": 330 }, { "clip_ratio/high_max": 5.7644081405467456e-05, "clip_ratio/high_mean": 5.7644081405467456e-05, "clip_ratio/low_mean": 6.091617979109287e-05, "clip_ratio/low_min": 6.091617979109287e-05, "clip_ratio/region_mean": 0.00011856026119656033, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 60.38671875, "completions/mean_terminated_length": 60.38671875, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.14060988815294373, "epoch": 0.0885385800640038, "frac_reward_zero_std": 0.8125, "grad_norm": 1.4968165159225464, "kl": 0.47234819425890845, "learning_rate": 8.640861399877805e-08, "loss": 0.015, "num_tokens": 22445872.0, "reward": 1.216796875, "reward_std": 0.0776611645705998, "rewards/equation_reward_func/mean": 0.216796875, "rewards/equation_reward_func/std": 0.38788647949695587, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4249710142612457, "sampling/importance_sampling_ratio/mean": 1.0002595782279968, "sampling/importance_sampling_ratio/min": 0.6887055933475494, "sampling/sampling_logp_difference/max": 0.3900427222251892, "sampling/sampling_logp_difference/mean": 0.0066901418613269925, "step": 332 }, { "clip_ratio/high_max": 5.760905171175384e-05, "clip_ratio/high_mean": 5.760905171175384e-05, "clip_ratio/low_mean": 5.8641977375373244e-05, "clip_ratio/low_min": 5.8641977375373244e-05, "clip_ratio/region_mean": 0.00011625102908712708, "completions/clipped_ratio": 0.0, "completions/max_length": 90.2, "completions/max_terminated_length": 90.2, "completions/mean_length": 60.89375, "completions/mean_terminated_length": 60.89375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.14231606174467337, "epoch": 0.08907194500414839, "frac_reward_zero_std": 0.825, "grad_norm": 1.4557781219482422, "kl": 0.2778156142578357, "learning_rate": 8.370137801539634e-08, "loss": -0.0105, "num_tokens": 22582580.0, "reward": 1.2296875, "reward_std": 0.07301771529018879, "rewards/equation_reward_func/mean": 0.2296875, "rewards/equation_reward_func/std": 0.419759476184845, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5608803749084472, "sampling/importance_sampling_ratio/mean": 0.9999013304710388, "sampling/importance_sampling_ratio/min": 0.5894364297389985, "sampling/sampling_logp_difference/max": 0.5750768661499024, "sampling/sampling_logp_difference/mean": 0.00656352136284113, "step": 334 }, { "clip_ratio/high_max": 2.562525656281246e-05, "clip_ratio/high_mean": 2.562525656281246e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.562525656281246e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 94.75, "completions/max_terminated_length": 94.75, "completions/mean_length": 60.1796875, "completions/mean_terminated_length": 60.1796875, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.14301741568164694, "epoch": 0.089605309944293, "frac_reward_zero_std": 0.765625, "grad_norm": 1.9819399118423462, "kl": 0.3808240231850909, "learning_rate": 8.102867770255337e-08, "loss": 0.0166, "num_tokens": 22691512.0, "reward": 1.2109375, "reward_std": 0.10363120585680008, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4072655364871025, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5495450794696808, "sampling/importance_sampling_ratio/mean": 1.0000189244747162, "sampling/importance_sampling_ratio/min": 0.609138548374176, "sampling/sampling_logp_difference/max": 0.5363466143608093, "sampling/sampling_logp_difference/mean": 0.0067337562795728445, "step": 336 }, { "clip_ratio/high_max": 8.52074913887514e-05, "clip_ratio/high_mean": 8.52074913887514e-05, "clip_ratio/low_mean": 3.3710895675337975e-05, "clip_ratio/low_min": 3.3710895675337975e-05, "clip_ratio/region_mean": 0.00011891838706408937, "completions/clipped_ratio": 0.0, "completions/max_length": 86.8, "completions/max_terminated_length": 86.8, "completions/mean_length": 60.11875, "completions/mean_terminated_length": 60.11875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.13841548225738937, "epoch": 0.0901386748844376, "frac_reward_zero_std": 0.675, "grad_norm": 2.3782083988189697, "kl": 0.1706045995362931, "learning_rate": 7.839106810692589e-08, "loss": 0.013, "num_tokens": 22827700.0, "reward": 1.3, "reward_std": 0.14287561625242234, "rewards/equation_reward_func/mean": 0.3, "rewards/equation_reward_func/std": 0.4522272884845734, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5887601613998412, "sampling/importance_sampling_ratio/mean": 1.0001999974250793, "sampling/importance_sampling_ratio/min": 0.5935817837715149, "sampling/sampling_logp_difference/max": 0.6079166412353516, "sampling/sampling_logp_difference/mean": 0.006239292211830616, "step": 338 }, { "clip_ratio/high_max": 0.00014842554487081038, "clip_ratio/high_mean": 0.00014842554487081038, "clip_ratio/low_mean": 2.6607066198873024e-05, "clip_ratio/low_min": 2.6607066198873024e-05, "clip_ratio/region_mean": 0.0001750326110696834, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 60.140625, "completions/mean_terminated_length": 60.140625, "completions/min_length": 44.25, "completions/min_terminated_length": 44.25, "entropy": 0.14084081113752392, "epoch": 0.0906720398245822, "frac_reward_zero_std": 0.765625, "grad_norm": 2.1535048484802246, "kl": 0.23395326868113545, "learning_rate": 7.57890969878093e-08, "loss": 0.0089, "num_tokens": 22936452.0, "reward": 1.265625, "reward_std": 0.1029821140691638, "rewards/equation_reward_func/mean": 0.265625, "rewards/equation_reward_func/std": 0.4226730987429619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7359807193279266, "sampling/importance_sampling_ratio/mean": 0.999912291765213, "sampling/importance_sampling_ratio/min": 0.6219378262758255, "sampling/sampling_logp_difference/max": 0.5780813097953796, "sampling/sampling_logp_difference/mean": 0.006775699555873871, "step": 340 }, { "clip_ratio/high_max": 6.118572006622951e-05, "clip_ratio/high_mean": 6.118572006622951e-05, "clip_ratio/low_mean": 5.777028208184573e-05, "clip_ratio/low_min": 5.777028208184573e-05, "clip_ratio/region_mean": 0.00011895600214807523, "completions/clipped_ratio": 0.0, "completions/max_length": 91.8, "completions/max_terminated_length": 91.8, "completions/mean_length": 61.284375, "completions/mean_terminated_length": 61.284375, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.14334746326009432, "epoch": 0.0912054047647268, "frac_reward_zero_std": 0.725, "grad_norm": 2.3153364658355713, "kl": 0.2527553900662396, "learning_rate": 7.322330470336313e-08, "loss": 0.0225, "num_tokens": 23073514.0, "reward": 1.2765625, "reward_std": 0.12541010826826096, "rewards/equation_reward_func/mean": 0.2765625, "rewards/equation_reward_func/std": 0.4400101602077484, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.661290431022644, "sampling/importance_sampling_ratio/mean": 1.0000014305114746, "sampling/importance_sampling_ratio/min": 0.6191738069057464, "sampling/sampling_logp_difference/max": 0.651321291923523, "sampling/sampling_logp_difference/mean": 0.006695216335356235, "step": 342 }, { "clip_ratio/high_max": 2.7667110164960224e-05, "clip_ratio/high_mean": 2.7667110164960224e-05, "clip_ratio/low_mean": 3.059226502146986e-05, "clip_ratio/low_min": 3.059226502146986e-05, "clip_ratio/region_mean": 5.8259375186430086e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 86.75, "completions/max_terminated_length": 86.75, "completions/mean_length": 60.3359375, "completions/mean_terminated_length": 60.3359375, "completions/min_length": 44.75, "completions/min_terminated_length": 44.75, "entropy": 0.1344318784152468, "epoch": 0.0917387697048714, "frac_reward_zero_std": 0.6875, "grad_norm": 2.7678236961364746, "kl": 0.1856951889478498, "learning_rate": 7.069422409839363e-08, "loss": -0.0289, "num_tokens": 23182582.0, "reward": 1.275390625, "reward_std": 0.1380849089473486, "rewards/equation_reward_func/mean": 0.275390625, "rewards/equation_reward_func/std": 0.44815048575401306, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.542555809020996, "sampling/importance_sampling_ratio/mean": 0.999896451830864, "sampling/importance_sampling_ratio/min": 0.6530320793390274, "sampling/sampling_logp_difference/max": 0.48765629529953003, "sampling/sampling_logp_difference/mean": 0.006277261069044471, "step": 344 }, { "clip_ratio/high_max": 2.723311707894835e-05, "clip_ratio/high_mean": 2.723311707894835e-05, "clip_ratio/low_mean": 3.404139554024571e-05, "clip_ratio/low_min": 3.404139554024571e-05, "clip_ratio/region_mean": 6.127451261919405e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 85.6, "completions/max_terminated_length": 85.6, "completions/mean_length": 61.8234375, "completions/mean_terminated_length": 61.8234375, "completions/min_length": 46.6, "completions/min_terminated_length": 46.6, "entropy": 0.13985799961826867, "epoch": 0.092272134645016, "frac_reward_zero_std": 0.7375, "grad_norm": 0.9720630645751953, "kl": 0.22493495900804797, "learning_rate": 6.820238039369647e-08, "loss": -0.0043, "num_tokens": 23320109.0, "reward": 1.315625, "reward_std": 0.1159956470131874, "rewards/equation_reward_func/mean": 0.315625, "rewards/equation_reward_func/std": 0.4645606458187103, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5440977096557618, "sampling/importance_sampling_ratio/mean": 1.000130307674408, "sampling/importance_sampling_ratio/min": 0.6300942063331604, "sampling/sampling_logp_difference/max": 0.4807961702346802, "sampling/sampling_logp_difference/mean": 0.006234641931951046, "step": 346 }, { "clip_ratio/high_max": 2.8344669974305565e-05, "clip_ratio/high_mean": 2.8344669974305565e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.8344669974305565e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.5, "completions/max_terminated_length": 87.5, "completions/mean_length": 59.89453125, "completions/mean_terminated_length": 59.89453125, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.13524813008391195, "epoch": 0.0928054995851606, "frac_reward_zero_std": 0.8125, "grad_norm": 2.3985605239868164, "kl": 0.20756302717038327, "learning_rate": 6.574829107698238e-08, "loss": 0.0049, "num_tokens": 23428743.0, "reward": 1.27734375, "reward_std": 0.07227109000086784, "rewards/equation_reward_func/mean": 0.27734375, "rewards/equation_reward_func/std": 0.4337798282504082, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3906760811805725, "sampling/importance_sampling_ratio/mean": 1.0000679641962051, "sampling/importance_sampling_ratio/min": 0.6559338718652725, "sampling/sampling_logp_difference/max": 0.4229520559310913, "sampling/sampling_logp_difference/mean": 0.0060104799922555685, "step": 348 }, { "clip_ratio/high_max": 0.00021877222914352186, "clip_ratio/high_mean": 0.00021877222914352186, "clip_ratio/low_mean": 8.814395631715242e-05, "clip_ratio/low_min": 8.814395631715242e-05, "clip_ratio/region_mean": 0.0003069161854606743, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 60.2625, "completions/mean_terminated_length": 60.2625, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.14050283810744682, "epoch": 0.09333886452530521, "frac_reward_zero_std": 0.7125, "grad_norm": 1.9780207872390747, "kl": 0.18246198921567863, "learning_rate": 6.333246579540971e-08, "loss": 0.0158, "num_tokens": 23564991.0, "reward": 1.24375, "reward_std": 0.1271421045064926, "rewards/equation_reward_func/mean": 0.24375, "rewards/equation_reward_func/std": 0.4238588511943817, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5685606718063354, "sampling/importance_sampling_ratio/mean": 1.0000767946243285, "sampling/importance_sampling_ratio/min": 0.6233599960803986, "sampling/sampling_logp_difference/max": 0.5295800685882568, "sampling/sampling_logp_difference/mean": 0.0067291872575879095, "step": 350 }, { "clip_ratio/high_max": 2.9178339496461882e-05, "clip_ratio/high_mean": 2.9178339496461882e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9178339496461882e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 98.25, "completions/max_terminated_length": 98.25, "completions/mean_length": 60.69140625, "completions/mean_terminated_length": 60.69140625, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.13691650869117844, "epoch": 0.0938722294654498, "frac_reward_zero_std": 0.75, "grad_norm": 2.524827241897583, "kl": 0.20603164611384273, "learning_rate": 6.095540624974435e-08, "loss": -0.0058, "num_tokens": 23674057.0, "reward": 1.275390625, "reward_std": 0.11283685639500618, "rewards/equation_reward_func/mean": 0.275390625, "rewards/equation_reward_func/std": 0.44712164252996445, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.510300099849701, "sampling/importance_sampling_ratio/mean": 1.0000484585762024, "sampling/importance_sampling_ratio/min": 0.6747458577156067, "sampling/sampling_logp_difference/max": 0.4387631416320801, "sampling/sampling_logp_difference/mean": 0.006464021629653871, "step": 352 }, { "clip_ratio/high_max": 2.7889334079292086e-05, "clip_ratio/high_mean": 2.7889334079292086e-05, "clip_ratio/low_mean": 2.81151604010827e-05, "clip_ratio/low_min": 2.81151604010827e-05, "clip_ratio/region_mean": 5.6004494480374786e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.4, "completions/max_terminated_length": 89.4, "completions/mean_length": 61.0421875, "completions/mean_terminated_length": 61.0421875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.13619560142979026, "epoch": 0.0944055944055944, "frac_reward_zero_std": 0.825, "grad_norm": 1.7357251644134521, "kl": 0.1626316506622566, "learning_rate": 5.861760609017002e-08, "loss": 0.0142, "num_tokens": 23810892.0, "reward": 1.23125, "reward_std": 0.07575064934790135, "rewards/equation_reward_func/mean": 0.2328125, "rewards/equation_reward_func/std": 0.4170250236988068, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.5823601722717284, "sampling/importance_sampling_ratio/mean": 1.0001402020454406, "sampling/importance_sampling_ratio/min": 0.634768670797348, "sampling/sampling_logp_difference/max": 0.6596877336502075, "sampling/sampling_logp_difference/mean": 0.006154829170554876, "step": 354 }, { "clip_ratio/high_max": 3.019323533711334e-05, "clip_ratio/high_mean": 3.019323533711334e-05, "clip_ratio/low_mean": 0.00010919483287984299, "clip_ratio/low_min": 0.00010919483287984299, "clip_ratio/region_mean": 0.00013938806821695634, "completions/clipped_ratio": 0.0, "completions/max_length": 87.25, "completions/max_terminated_length": 87.25, "completions/mean_length": 61.1640625, "completions/mean_terminated_length": 61.1640625, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.13299460543526542, "epoch": 0.09493895934573901, "frac_reward_zero_std": 0.671875, "grad_norm": 2.1723873615264893, "kl": 0.2727597154573434, "learning_rate": 5.63195508137711e-08, "loss": -0.0017, "num_tokens": 23920400.0, "reward": 1.234375, "reward_std": 0.14670202136039734, "rewards/equation_reward_func/mean": 0.236328125, "rewards/equation_reward_func/std": 0.4165647551417351, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.5103906393051147, "sampling/importance_sampling_ratio/mean": 0.999917134642601, "sampling/importance_sampling_ratio/min": 0.6228740364313126, "sampling/sampling_logp_difference/max": 0.5028542280197144, "sampling/sampling_logp_difference/mean": 0.006233160383999348, "step": 356 }, { "clip_ratio/high_max": 0.0001172048925784313, "clip_ratio/high_mean": 0.0001172048925784313, "clip_ratio/low_mean": 2.8229447909527356e-05, "clip_ratio/low_min": 2.8229447909527356e-05, "clip_ratio/region_mean": 0.00014543434048795866, "completions/clipped_ratio": 0.0, "completions/max_length": 87.2, "completions/max_terminated_length": 87.2, "completions/mean_length": 60.034375, "completions/mean_terminated_length": 60.034375, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.1342512322589755, "epoch": 0.0954723242858836, "frac_reward_zero_std": 0.6875, "grad_norm": 2.6087586879730225, "kl": 1.6735540447342727, "learning_rate": 5.4061717663707843e-08, "loss": 0.0135, "num_tokens": 24056326.0, "reward": 1.35625, "reward_std": 0.1434987172484398, "rewards/equation_reward_func/mean": 0.35625, "rewards/equation_reward_func/std": 0.46249967217445376, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4962616205215453, "sampling/importance_sampling_ratio/mean": 1.0000120759010316, "sampling/importance_sampling_ratio/min": 0.6342434465885163, "sampling/sampling_logp_difference/max": 0.4717667579650879, "sampling/sampling_logp_difference/mean": 0.006138017866760492, "step": 358 }, { "clip_ratio/high_max": 5.811250432290965e-05, "clip_ratio/high_mean": 5.811250432290965e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.811250432290965e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 60.90234375, "completions/mean_terminated_length": 60.90234375, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.13294768162692586, "epoch": 0.09600568922602822, "frac_reward_zero_std": 0.71875, "grad_norm": 2.2287750244140625, "kl": 0.19377786764461133, "learning_rate": 5.1844575530106265e-08, "loss": 0.0044, "num_tokens": 24165764.0, "reward": 1.23828125, "reward_std": 0.11961427144706249, "rewards/equation_reward_func/mean": 0.23828125, "rewards/equation_reward_func/std": 0.4218999445438385, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4444665312767029, "sampling/importance_sampling_ratio/mean": 1.0001286715269089, "sampling/importance_sampling_ratio/min": 0.6475261896848679, "sampling/sampling_logp_difference/max": 0.4546026587486267, "sampling/sampling_logp_difference/mean": 0.00657831015996635, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.8, "completions/max_terminated_length": 82.8, "completions/mean_length": 60.3234375, "completions/mean_terminated_length": 60.3234375, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.13049771041712827, "epoch": 0.09653905416617281, "frac_reward_zero_std": 0.8, "grad_norm": 1.5479376316070557, "kl": 0.20203281049099234, "learning_rate": 4.9668584852682134e-08, "loss": -0.0018, "num_tokens": 24302059.0, "reward": 1.2359375, "reward_std": 0.0845904678106308, "rewards/equation_reward_func/mean": 0.2359375, "rewards/equation_reward_func/std": 0.4178708136081696, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4102025032043457, "sampling/importance_sampling_ratio/mean": 1.0001339077949525, "sampling/importance_sampling_ratio/min": 0.6150591492652893, "sampling/sampling_logp_difference/max": 0.48610634803771974, "sampling/sampling_logp_difference/mean": 0.006109198927879334, "step": 362 }, { "clip_ratio/high_max": 8.598310847042335e-05, "clip_ratio/high_mean": 8.598310847042335e-05, "clip_ratio/low_mean": 2.8815123692361845e-05, "clip_ratio/low_min": 2.8815123692361845e-05, "clip_ratio/region_mean": 0.00011479823216278519, "completions/clipped_ratio": 0.0, "completions/max_length": 93.75, "completions/max_terminated_length": 93.75, "completions/mean_length": 59.55859375, "completions/mean_terminated_length": 59.55859375, "completions/min_length": 44.75, "completions/min_terminated_length": 44.75, "entropy": 0.13303544356798133, "epoch": 0.0970724191063174, "frac_reward_zero_std": 0.765625, "grad_norm": 2.0344302654266357, "kl": 0.16839452631150684, "learning_rate": 4.753419752512072e-08, "loss": 0.0003, "num_tokens": 24410433.0, "reward": 1.29296875, "reward_std": 0.09482250921428204, "rewards/equation_reward_func/mean": 0.29296875, "rewards/equation_reward_func/std": 0.44832389056682587, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.509868860244751, "sampling/importance_sampling_ratio/mean": 1.0001548826694489, "sampling/importance_sampling_ratio/min": 0.624009408056736, "sampling/sampling_logp_difference/max": 0.5735228061676025, "sampling/sampling_logp_difference/mean": 0.006263232557103038, "step": 364 }, { "clip_ratio/high_max": 5.636335279430366e-05, "clip_ratio/high_mean": 5.636335279430366e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.636335279430366e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 85.6, "completions/max_terminated_length": 85.6, "completions/mean_length": 58.8328125, "completions/mean_terminated_length": 58.8328125, "completions/min_length": 44.6, "completions/min_terminated_length": 44.6, "entropy": 0.12483744090422988, "epoch": 0.09760578404646202, "frac_reward_zero_std": 0.825, "grad_norm": 1.439304232597351, "kl": 0.1845027159175111, "learning_rate": 4.5441856801230525e-08, "loss": 0.0028, "num_tokens": 24545454.0, "reward": 1.240625, "reward_std": 0.07758941166102887, "rewards/equation_reward_func/mean": 0.240625, "rewards/equation_reward_func/std": 0.40785287618637084, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.383548951148987, "sampling/importance_sampling_ratio/mean": 0.9999644875526428, "sampling/importance_sampling_ratio/min": 0.6619933009147644, "sampling/sampling_logp_difference/max": 0.43538556098937986, "sampling/sampling_logp_difference/mean": 0.005884497798979283, "step": 366 }, { "clip_ratio/high_max": 8.09853793018394e-05, "clip_ratio/high_mean": 8.09853793018394e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.09853793018394e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 60.212890625, "completions/mean_terminated_length": 60.212890625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.12786115509354407, "epoch": 0.09813914898660661, "frac_reward_zero_std": 0.75, "grad_norm": 2.2670207023620605, "kl": 0.33000727577341926, "learning_rate": 4.3391997202891825e-08, "loss": 0.0063, "num_tokens": 24654435.0, "reward": 1.345703125, "reward_std": 0.10403061285614967, "rewards/equation_reward_func/mean": 0.345703125, "rewards/equation_reward_func/std": 0.47676315903663635, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4116013646125793, "sampling/importance_sampling_ratio/mean": 1.0000161081552505, "sampling/importance_sampling_ratio/min": 0.6367608904838562, "sampling/sampling_logp_difference/max": 0.452808678150177, "sampling/sampling_logp_difference/mean": 0.0059723727172240615, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.6, "completions/max_terminated_length": 85.6, "completions/mean_length": 59.58125, "completions/mean_terminated_length": 59.58125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.13071501762088802, "epoch": 0.09867251392675122, "frac_reward_zero_std": 0.725, "grad_norm": 1.800971269607544, "kl": 0.1956717123070525, "learning_rate": 4.1385044429817966e-08, "loss": -0.0221, "num_tokens": 24790191.0, "reward": 1.290625, "reward_std": 0.11699815690517426, "rewards/equation_reward_func/mean": 0.290625, "rewards/equation_reward_func/std": 0.44812433123588563, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4319786310195923, "sampling/importance_sampling_ratio/mean": 0.9999115228652954, "sampling/importance_sampling_ratio/min": 0.6537201523780822, "sampling/sampling_logp_difference/max": 0.4426090717315674, "sampling/sampling_logp_difference/mean": 0.00641742879524827, "step": 370 }, { "clip_ratio/high_max": 5.533509991235203e-05, "clip_ratio/high_mean": 5.533509991235203e-05, "clip_ratio/low_mean": 3.006253003453215e-05, "clip_ratio/low_min": 3.006253003453215e-05, "clip_ratio/region_mean": 8.539762994688418e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 98.5, "completions/max_terminated_length": 98.5, "completions/mean_length": 60.853515625, "completions/mean_terminated_length": 60.853515625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.13347570882696244, "epoch": 0.09920587886689582, "frac_reward_zero_std": 0.6875, "grad_norm": 2.672558307647705, "kl": 0.30756443785503507, "learning_rate": 3.942141527114978e-08, "loss": 0.0278, "num_tokens": 24899628.0, "reward": 1.16796875, "reward_std": 0.13066281378269196, "rewards/equation_reward_func/mean": 0.169921875, "rewards/equation_reward_func/std": 0.34059055522084236, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.4259434342384338, "sampling/importance_sampling_ratio/mean": 0.9997378587722778, "sampling/importance_sampling_ratio/min": 0.6358720809221268, "sampling/sampling_logp_difference/max": 0.4534604251384735, "sampling/sampling_logp_difference/mean": 0.006418961100280285, "step": 372 }, { "clip_ratio/high_max": 0.0001481837793512063, "clip_ratio/high_mean": 0.0001481837793512063, "clip_ratio/low_mean": 5.8321309754521484e-05, "clip_ratio/low_min": 5.8321309754521484e-05, "clip_ratio/region_mean": 0.00020650508910572776, "completions/clipped_ratio": 0.0, "completions/max_length": 108.2, "completions/max_terminated_length": 108.2, "completions/mean_length": 59.9046875, "completions/mean_terminated_length": 59.9046875, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1344662292653488, "epoch": 0.09973924380704041, "frac_reward_zero_std": 0.7125, "grad_norm": 2.474351644515991, "kl": 0.22649422003370193, "learning_rate": 3.7501517518899486e-08, "loss": 0.0359, "num_tokens": 25035247.0, "reward": 1.2953125, "reward_std": 0.1346124067902565, "rewards/equation_reward_func/mean": 0.2953125, "rewards/equation_reward_func/std": 0.4506261646747589, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4547551631927491, "sampling/importance_sampling_ratio/mean": 0.9998876333236695, "sampling/importance_sampling_ratio/min": 0.6309379935264587, "sampling/sampling_logp_difference/max": 0.48324047327041625, "sampling/sampling_logp_difference/mean": 0.006273848097771406, "step": 374 }, { "clip_ratio/high_max": 2.5070196392738984e-05, "clip_ratio/high_mean": 2.5070196392738984e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.5070196392738984e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 94.5, "completions/max_terminated_length": 94.5, "completions/mean_length": 59.451171875, "completions/mean_terminated_length": 59.451171875, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.1281018871296611, "epoch": 0.10027260874718502, "frac_reward_zero_std": 0.78125, "grad_norm": 1.2001689672470093, "kl": 0.19659941834915015, "learning_rate": 3.562574988326342e-08, "loss": -0.0164, "num_tokens": 25143686.0, "reward": 1.37890625, "reward_std": 0.09896067157387733, "rewards/equation_reward_func/mean": 0.37890625, "rewards/equation_reward_func/std": 0.4521568939089775, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.338096261024475, "sampling/importance_sampling_ratio/mean": 1.000062733888626, "sampling/importance_sampling_ratio/min": 0.6538477838039398, "sampling/sampling_logp_difference/max": 0.4262542948126793, "sampling/sampling_logp_difference/mean": 0.006024778354912996, "step": 376 }, { "clip_ratio/high_max": 5.427417636383325e-05, "clip_ratio/high_mean": 5.427417636383325e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.427417636383325e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.4, "completions/max_terminated_length": 87.4, "completions/mean_length": 59.9546875, "completions/mean_terminated_length": 59.9546875, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.1291940630827513, "epoch": 0.10080597368732962, "frac_reward_zero_std": 0.7125, "grad_norm": 1.9153673648834229, "kl": 0.17186383297666907, "learning_rate": 3.379450190982114e-08, "loss": 0.0224, "num_tokens": 25279577.0, "reward": 1.328125, "reward_std": 0.11868463754653931, "rewards/equation_reward_func/mean": 0.328125, "rewards/equation_reward_func/std": 0.4537951111793518, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.491814875602722, "sampling/importance_sampling_ratio/mean": 1.000006091594696, "sampling/importance_sampling_ratio/min": 0.7007228970527649, "sampling/sampling_logp_difference/max": 0.4378682851791382, "sampling/sampling_logp_difference/mean": 0.005993215274065733, "step": 378 }, { "clip_ratio/high_max": 3.0325085390359163e-05, "clip_ratio/high_mean": 3.0325085390359163e-05, "clip_ratio/low_mean": 2.64047315188994e-05, "clip_ratio/low_min": 2.64047315188994e-05, "clip_ratio/region_mean": 5.6729816909258567e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 85.5, "completions/max_terminated_length": 85.5, "completions/mean_length": 60.068359375, "completions/mean_terminated_length": 60.068359375, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.13051971663824385, "epoch": 0.10133933862747423, "frac_reward_zero_std": 0.71875, "grad_norm": 1.5062041282653809, "kl": 0.20231979987066653, "learning_rate": 3.2008153898637255e-08, "loss": -0.0174, "num_tokens": 25388420.0, "reward": 1.26953125, "reward_std": 0.11823134683072567, "rewards/equation_reward_func/mean": 0.26953125, "rewards/equation_reward_func/std": 0.4418637976050377, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5400249660015106, "sampling/importance_sampling_ratio/mean": 1.0000593960285187, "sampling/importance_sampling_ratio/min": 0.6541474461555481, "sampling/sampling_logp_difference/max": 0.47688713669776917, "sampling/sampling_logp_difference/mean": 0.0062780900625512, "step": 380 }, { "clip_ratio/high_max": 2.2843567421659827e-05, "clip_ratio/high_mean": 2.2843567421659827e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.2843567421659827e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 91.8, "completions/max_terminated_length": 91.8, "completions/mean_length": 60.6875, "completions/mean_terminated_length": 60.6875, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.13094672177814776, "epoch": 0.10187270356761882, "frac_reward_zero_std": 0.825, "grad_norm": 1.8069969415664673, "kl": 0.21800594099072945, "learning_rate": 3.026707682528365e-08, "loss": 0.0009, "num_tokens": 25524996.0, "reward": 1.2984375, "reward_std": 0.07143859006464481, "rewards/equation_reward_func/mean": 0.2984375, "rewards/equation_reward_func/std": 0.4384769380092621, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6111286163330079, "sampling/importance_sampling_ratio/mean": 1.0002249121665954, "sampling/importance_sampling_ratio/min": 0.6192312419414521, "sampling/sampling_logp_difference/max": 0.5502667665481568, "sampling/sampling_logp_difference/mean": 0.006101610139012337, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 60.498046875, "completions/mean_terminated_length": 60.498046875, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.12430162588134408, "epoch": 0.10240606850776342, "frac_reward_zero_std": 0.765625, "grad_norm": 1.4515024423599243, "kl": 0.2106134378247791, "learning_rate": 2.8571632263797745e-08, "loss": -0.008, "num_tokens": 25634179.0, "reward": 1.24609375, "reward_std": 0.09765137732028961, "rewards/equation_reward_func/mean": 0.24609375, "rewards/equation_reward_func/std": 0.41374894976615906, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4164173603057861, "sampling/importance_sampling_ratio/mean": 1.0000277161598206, "sampling/importance_sampling_ratio/min": 0.6115260124206543, "sampling/sampling_logp_difference/max": 0.4920421242713928, "sampling/sampling_logp_difference/mean": 0.005590940592810512, "step": 384 }, { "clip_ratio/high_max": 2.8460836296694146e-05, "clip_ratio/high_mean": 2.8460836296694146e-05, "clip_ratio/low_mean": 2.9301451933052805e-05, "clip_ratio/low_min": 2.9301451933052805e-05, "clip_ratio/region_mean": 5.776228822974695e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 83.4, "completions/max_terminated_length": 83.4, "completions/mean_length": 59.2953125, "completions/mean_terminated_length": 59.2953125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12854439449600047, "epoch": 0.10293943344790803, "frac_reward_zero_std": 0.7625, "grad_norm": 1.5940061807632446, "kl": 0.20719057373288605, "learning_rate": 2.6922172311593884e-08, "loss": -0.0099, "num_tokens": 25769480.0, "reward": 1.2875, "reward_std": 0.10442289412021637, "rewards/equation_reward_func/mean": 0.2875, "rewards/equation_reward_func/std": 0.4411031901836395, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7096091270446778, "sampling/importance_sampling_ratio/mean": 0.9999329447746277, "sampling/importance_sampling_ratio/min": 0.575983053445816, "sampling/sampling_logp_difference/max": 0.6358009219169617, "sampling/sampling_logp_difference/mean": 0.006519278418272734, "step": 386 }, { "clip_ratio/high_max": 8.058342468252199e-05, "clip_ratio/high_mean": 8.058342468252199e-05, "clip_ratio/low_mean": 5.742740338771707e-05, "clip_ratio/low_min": 5.742740338771707e-05, "clip_ratio/region_mean": 0.00013801082807023905, "completions/clipped_ratio": 0.001953125, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 60.958984375, "completions/mean_terminated_length": 60.89130115509033, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.13359902307598126, "epoch": 0.10347279838805262, "frac_reward_zero_std": 0.75, "grad_norm": 1.6044938564300537, "kl": 0.1787527039543622, "learning_rate": 2.5319039516341844e-08, "loss": 0.0075, "num_tokens": 25878787.0, "reward": 1.33203125, "reward_std": 0.10712223639711738, "rewards/equation_reward_func/mean": 0.33203125, "rewards/equation_reward_func/std": 0.4671602323651314, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.381981521844864, "sampling/importance_sampling_ratio/mean": 0.9991913884878159, "sampling/importance_sampling_ratio/min": 0.4863002896312365, "sampling/sampling_logp_difference/max": 7.171817094087601, "sampling/sampling_logp_difference/mean": 0.023529342142865062, "step": 388 }, { "clip_ratio/high_max": 5.62179031678372e-05, "clip_ratio/high_mean": 5.62179031678372e-05, "clip_ratio/low_mean": 2.7340332356592018e-05, "clip_ratio/low_min": 2.7340332356592018e-05, "clip_ratio/region_mean": 8.355823552442921e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 83.8, "completions/max_terminated_length": 83.8, "completions/mean_length": 61.2859375, "completions/mean_terminated_length": 61.2859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.13023308012634516, "epoch": 0.10400616332819723, "frac_reward_zero_std": 0.7875, "grad_norm": 1.41175377368927, "kl": 0.17217528820037842, "learning_rate": 2.3762566804829742e-08, "loss": -0.0094, "num_tokens": 26015954.0, "reward": 1.1796875, "reward_std": 0.09648019522428512, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.36328538954257966, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4666733026504517, "sampling/importance_sampling_ratio/mean": 1.0001488089561463, "sampling/importance_sampling_ratio/min": 0.6108168601989746, "sampling/sampling_logp_difference/max": 0.494760537147522, "sampling/sampling_logp_difference/mean": 0.006309473235160112, "step": 390 }, { "clip_ratio/high_max": 2.9178339496461882e-05, "clip_ratio/high_mean": 2.9178339496461882e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9178339496461882e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 83.25, "completions/max_terminated_length": 83.25, "completions/mean_length": 60.005859375, "completions/mean_terminated_length": 60.005859375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.12703627803259426, "epoch": 0.10453952826834183, "frac_reward_zero_std": 0.8125, "grad_norm": 1.7670952081680298, "kl": 0.180382230060382, "learning_rate": 2.2253077413823458e-08, "loss": 0.0082, "num_tokens": 26124885.0, "reward": 1.27734375, "reward_std": 0.08679318241775036, "rewards/equation_reward_func/mean": 0.27734375, "rewards/equation_reward_func/std": 0.44570842385292053, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4617320597171783, "sampling/importance_sampling_ratio/mean": 1.000129833817482, "sampling/importance_sampling_ratio/min": 0.6844279915094376, "sampling/sampling_logp_difference/max": 0.4297443926334381, "sampling/sampling_logp_difference/mean": 0.00584987853653729, "step": 392 }, { "clip_ratio/high_max": 8.385900420964592e-05, "clip_ratio/high_mean": 8.385900420964592e-05, "clip_ratio/low_mean": 6.171876965608034e-05, "clip_ratio/low_min": 6.171876965608034e-05, "clip_ratio/region_mean": 0.00014557777386572625, "completions/clipped_ratio": 0.0, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 59.821875, "completions/mean_terminated_length": 59.821875, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "entropy": 0.13188083899310893, "epoch": 0.10507289320848642, "frac_reward_zero_std": 0.75, "grad_norm": 1.563563346862793, "kl": 0.22656734474003315, "learning_rate": 2.0790884822939836e-08, "loss": 0.0058, "num_tokens": 26260675.0, "reward": 1.3046875, "reward_std": 0.10884230881929398, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.44983998537063596, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4970514059066773, "sampling/importance_sampling_ratio/mean": 1.0001416683197022, "sampling/importance_sampling_ratio/min": 0.647747814655304, "sampling/sampling_logp_difference/max": 0.47006185054779054, "sampling/sampling_logp_difference/mean": 0.006314558535814285, "step": 394 }, { "clip_ratio/high_max": 3.019323533711334e-05, "clip_ratio/high_mean": 3.019323533711334e-05, "clip_ratio/low_mean": 3.086419827822182e-05, "clip_ratio/low_min": 3.086419827822182e-05, "clip_ratio/region_mean": 6.105743361533516e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 88.5, "completions/max_terminated_length": 88.5, "completions/mean_length": 61.119140625, "completions/mean_terminated_length": 61.119140625, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.13032677345391777, "epoch": 0.10560625814863103, "frac_reward_zero_std": 0.75, "grad_norm": 1.7888156175613403, "kl": 0.17278889597704014, "learning_rate": 1.9376292689545158e-08, "loss": 0.003, "num_tokens": 26370296.0, "reward": 1.267578125, "reward_std": 0.114868875592947, "rewards/equation_reward_func/mean": 0.267578125, "rewards/equation_reward_func/std": 0.43912947177886963, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.42270827293396, "sampling/importance_sampling_ratio/mean": 1.0000508278608322, "sampling/importance_sampling_ratio/min": 0.6108743250370026, "sampling/sampling_logp_difference/max": 0.5003768801689148, "sampling/sampling_logp_difference/mean": 0.005960810463875532, "step": 396 }, { "clip_ratio/high_max": 2.7557320815200608e-05, "clip_ratio/high_mean": 2.7557320815200608e-05, "clip_ratio/low_mean": 6.246288669192129e-05, "clip_ratio/low_min": 6.246288669192129e-05, "clip_ratio/region_mean": 9.00202075071219e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 82.8, "completions/max_terminated_length": 82.8, "completions/mean_length": 59.7328125, "completions/mean_terminated_length": 59.7328125, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.12474298689307438, "epoch": 0.10613962308877563, "frac_reward_zero_std": 0.65, "grad_norm": 2.092935085296631, "kl": 0.1740477404350208, "learning_rate": 1.800959478569422e-08, "loss": -0.0145, "num_tokens": 26506157.0, "reward": 1.359375, "reward_std": 0.15470800101757048, "rewards/equation_reward_func/mean": 0.3609375, "rewards/equation_reward_func/std": 0.46775019764900205, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.5833498954772949, "sampling/importance_sampling_ratio/mean": 1.0000155568122864, "sampling/importance_sampling_ratio/min": 0.647001314163208, "sampling/sampling_logp_difference/max": 0.5482145309448242, "sampling/sampling_logp_difference/mean": 0.005933112371712923, "step": 398 }, { "clip_ratio/high_max": 5.6182988272565934e-05, "clip_ratio/high_mean": 5.6182988272565934e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.6182988272565934e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 86.25, "completions/max_terminated_length": 86.25, "completions/mean_length": 59.6015625, "completions/mean_terminated_length": 59.6015625, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.12827710833193529, "epoch": 0.10667298802892024, "frac_reward_zero_std": 0.75, "grad_norm": 2.054353713989258, "kl": 0.21830679341736767, "learning_rate": 1.6691074937121407e-08, "loss": 0.0178, "num_tokens": 26614657.0, "reward": 1.3671875, "reward_std": 0.1105396319180727, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.48276757448911667, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5129340887069702, "sampling/importance_sampling_ratio/mean": 0.9998404383659363, "sampling/importance_sampling_ratio/min": 0.5237895771861076, "sampling/sampling_logp_difference/max": 0.6509159058332443, "sampling/sampling_logp_difference/mean": 0.0061527101788669825, "step": 400 }, { "clip_ratio/high_max": 3.006253003453215e-05, "clip_ratio/high_mean": 3.006253003453215e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.006253003453215e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.8, "completions/max_terminated_length": 89.8, "completions/mean_length": 59.90625, "completions/mean_terminated_length": 59.90625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.12815166813217932, "epoch": 0.10720635296906483, "frac_reward_zero_std": 0.775, "grad_norm": 1.1940408945083618, "kl": 0.19283324449012676, "learning_rate": 1.5421006964298377e-08, "loss": 0.0099, "num_tokens": 26750741.0, "reward": 1.31875, "reward_std": 0.09479626119136811, "rewards/equation_reward_func/mean": 0.31875, "rewards/equation_reward_func/std": 0.45598559379577636, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4404696464538573, "sampling/importance_sampling_ratio/mean": 1.000059998035431, "sampling/importance_sampling_ratio/min": 0.6066517412662507, "sampling/sampling_logp_difference/max": 0.5222442984580994, "sampling/sampling_logp_difference/mean": 0.005891676619648933, "step": 402 }, { "clip_ratio/high_max": 8.207944564573053e-05, "clip_ratio/high_mean": 8.207944564573053e-05, "clip_ratio/low_mean": 5.8277620054367515e-05, "clip_ratio/low_min": 5.8277620054367515e-05, "clip_ratio/region_mean": 0.00014035706570009806, "completions/clipped_ratio": 0.0, "completions/max_length": 88.5, "completions/max_terminated_length": 88.5, "completions/mean_length": 60.41015625, "completions/mean_terminated_length": 60.41015625, "completions/min_length": 43.25, "completions/min_terminated_length": 43.25, "entropy": 0.13083325497185191, "epoch": 0.10773971790920943, "frac_reward_zero_std": 0.71875, "grad_norm": 1.954267144203186, "kl": 0.18376081669703126, "learning_rate": 1.4199654625568575e-08, "loss": -0.0013, "num_tokens": 26859759.0, "reward": 1.24609375, "reward_std": 0.12553353421390057, "rewards/equation_reward_func/mean": 0.24609375, "rewards/equation_reward_func/std": 0.4291451722383499, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7187917530536652, "sampling/importance_sampling_ratio/mean": 1.0002452433109283, "sampling/importance_sampling_ratio/min": 0.6705833300948143, "sampling/sampling_logp_difference/max": 0.6571616530418396, "sampling/sampling_logp_difference/mean": 0.006430137436836958, "step": 404 }, { "clip_ratio/high_max": 0.00011105913694740998, "clip_ratio/high_mean": 0.00011105913694740998, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011105913694740998, "completions/clipped_ratio": 0.0, "completions/max_length": 92.8, "completions/max_terminated_length": 92.8, "completions/mean_length": 60.046875, "completions/mean_terminated_length": 60.046875, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.129937127387772, "epoch": 0.10827308284935404, "frac_reward_zero_std": 0.7875, "grad_norm": 1.6967568397521973, "kl": 0.1976651683346265, "learning_rate": 1.302727156237224e-08, "loss": 0.0084, "num_tokens": 26995837.0, "reward": 1.2796875, "reward_std": 0.09447673708200455, "rewards/equation_reward_func/mean": 0.2796875, "rewards/equation_reward_func/std": 0.43781574368476867, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.57857403755188, "sampling/importance_sampling_ratio/mean": 1.0000683069229126, "sampling/importance_sampling_ratio/min": 0.6933550000190735, "sampling/sampling_logp_difference/max": 0.4925155997276306, "sampling/sampling_logp_difference/mean": 0.005957444757223129, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.5593575881276694e-05, "clip_ratio/low_min": 5.5593575881276694e-05, "clip_ratio/region_mean": 5.5593575881276694e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 96.75, "completions/max_terminated_length": 96.75, "completions/mean_length": 59.9453125, "completions/mean_terminated_length": 59.9453125, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.1296540542712642, "epoch": 0.10880644778949863, "frac_reward_zero_std": 0.828125, "grad_norm": 1.7109380960464478, "kl": 0.4134930967767205, "learning_rate": 1.1904101246571874e-08, "loss": 0.0015, "num_tokens": 27104745.0, "reward": 1.205078125, "reward_std": 0.0721381213515997, "rewards/equation_reward_func/mean": 0.205078125, "rewards/equation_reward_func/std": 0.4002479314804077, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.393905222415924, "sampling/importance_sampling_ratio/mean": 0.9998906701803207, "sampling/importance_sampling_ratio/min": 0.6444456428289413, "sampling/sampling_logp_difference/max": 0.44058823585510254, "sampling/sampling_logp_difference/mean": 0.0058896406553685665, "step": 408 }, { "clip_ratio/high_max": 0.00011761447886884626, "clip_ratio/high_mean": 0.00011761447886884626, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011761447886884626, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 59.815625, "completions/mean_terminated_length": 59.815625, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.12636753683909774, "epoch": 0.10933981272964324, "frac_reward_zero_std": 0.775, "grad_norm": 1.4596238136291504, "kl": 0.18164162089427313, "learning_rate": 1.0830376929889612e-08, "loss": 0.0137, "num_tokens": 27240651.0, "reward": 1.246875, "reward_std": 0.10047527849674225, "rewards/equation_reward_func/mean": 0.246875, "rewards/equation_reward_func/std": 0.4285436153411865, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3769176006317139, "sampling/importance_sampling_ratio/mean": 1.0001584887504578, "sampling/importance_sampling_ratio/min": 0.6679477572441102, "sampling/sampling_logp_difference/max": 0.4059238076210022, "sampling/sampling_logp_difference/mean": 0.006127269193530083, "step": 410 }, { "clip_ratio/high_max": 2.9677114475311504e-05, "clip_ratio/high_mean": 2.9677114475311504e-05, "clip_ratio/low_mean": 3.128128203873833e-05, "clip_ratio/low_min": 3.128128203873833e-05, "clip_ratio/region_mean": 6.095839651404983e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 83.75, "completions/max_terminated_length": 83.75, "completions/mean_length": 60.115234375, "completions/mean_terminated_length": 60.115234375, "completions/min_length": 44.5, "completions/min_terminated_length": 44.5, "entropy": 0.12989878452693424, "epoch": 0.10987317766978784, "frac_reward_zero_std": 0.78125, "grad_norm": 2.66369366645813, "kl": 0.2909034272241924, "learning_rate": 9.806321595467598e-09, "loss": 0.0042, "num_tokens": 27349566.0, "reward": 1.19921875, "reward_std": 0.09922465216368437, "rewards/equation_reward_func/mean": 0.19921875, "rewards/equation_reward_func/std": 0.40035977959632874, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.484683334827423, "sampling/importance_sampling_ratio/mean": 0.9997598975896835, "sampling/importance_sampling_ratio/min": 0.6004936695098877, "sampling/sampling_logp_difference/max": 0.5137730538845062, "sampling/sampling_logp_difference/mean": 0.006483575445599854, "step": 412 }, { "clip_ratio/high_max": 5.5383790620706146e-05, "clip_ratio/high_mean": 5.5383790620706146e-05, "clip_ratio/low_mean": 2.6205451124244268e-05, "clip_ratio/low_min": 2.6205451124244268e-05, "clip_ratio/region_mean": 8.158924174495041e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.8, "completions/max_terminated_length": 87.8, "completions/mean_length": 59.9640625, "completions/mean_terminated_length": 59.9640625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12692222128518754, "epoch": 0.11040654260993243, "frac_reward_zero_std": 0.7875, "grad_norm": 1.8278348445892334, "kl": 0.1834631760397719, "learning_rate": 8.832147911560173e-09, "loss": 0.012, "num_tokens": 27485631.0, "reward": 1.2484375, "reward_std": 0.09652668498456478, "rewards/equation_reward_func/mean": 0.2484375, "rewards/equation_reward_func/std": 0.41859459280967715, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4881003856658936, "sampling/importance_sampling_ratio/mean": 1.0000845789909363, "sampling/importance_sampling_ratio/min": 0.5320762634277344, "sampling/sampling_logp_difference/max": 0.6725892066955567, "sampling/sampling_logp_difference/mean": 0.006414630822837353, "step": 414 }, { "clip_ratio/high_max": 2.9301451933052805e-05, "clip_ratio/high_mean": 2.9301451933052805e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9301451933052805e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 89.75, "completions/max_terminated_length": 89.75, "completions/mean_length": 59.572265625, "completions/mean_terminated_length": 59.572265625, "completions/min_length": 45.5, "completions/min_terminated_length": 45.5, "entropy": 0.12699386068723267, "epoch": 0.11093990755007704, "frac_reward_zero_std": 0.6875, "grad_norm": 1.3803774118423462, "kl": 0.17113885055813524, "learning_rate": 7.908058187368726e-09, "loss": -0.004, "num_tokens": 27594132.0, "reward": 1.341796875, "reward_std": 0.135197926312685, "rewards/equation_reward_func/mean": 0.341796875, "rewards/equation_reward_func/std": 0.44888336956501007, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.548683077096939, "sampling/importance_sampling_ratio/mean": 1.0001097917556763, "sampling/importance_sampling_ratio/min": 0.6486568599939346, "sampling/sampling_logp_difference/max": 0.490677073597908, "sampling/sampling_logp_difference/mean": 0.00620193884242326, "step": 416 }, { "clip_ratio/high_max": 8.671806426718831e-05, "clip_ratio/high_mean": 8.671806426718831e-05, "clip_ratio/low_mean": 2.9550826487441857e-05, "clip_ratio/low_min": 2.9550826487441857e-05, "clip_ratio/region_mean": 0.00011626889075463016, "completions/clipped_ratio": 0.0, "completions/max_length": 89.2, "completions/max_terminated_length": 89.2, "completions/mean_length": 59.503125, "completions/mean_terminated_length": 59.503125, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.12677614427068168, "epoch": 0.11147327249022164, "frac_reward_zero_std": 0.7625, "grad_norm": 1.7999166250228882, "kl": 0.18055716576054692, "learning_rate": 7.0342443310273665e-09, "loss": 0.0006, "num_tokens": 27729862.0, "reward": 1.2546875, "reward_std": 0.10395108982920646, "rewards/equation_reward_func/mean": 0.2546875, "rewards/equation_reward_func/std": 0.41556936502456665, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.51474609375, "sampling/importance_sampling_ratio/mean": 0.99989253282547, "sampling/importance_sampling_ratio/min": 0.5786081969738006, "sampling/sampling_logp_difference/max": 0.5525192737579345, "sampling/sampling_logp_difference/mean": 0.0061160212382674215, "step": 418 }, { "clip_ratio/high_max": 3.059226502146986e-05, "clip_ratio/high_mean": 3.059226502146986e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.059226502146986e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.75, "completions/max_terminated_length": 87.75, "completions/mean_length": 60.623046875, "completions/mean_terminated_length": 60.623046875, "completions/min_length": 44.75, "completions/min_terminated_length": 44.75, "entropy": 0.12454396336235934, "epoch": 0.11200663743036625, "frac_reward_zero_std": 0.8125, "grad_norm": 1.402348279953003, "kl": 0.250829657436245, "learning_rate": 6.210887809749099e-09, "loss": -0.0004, "num_tokens": 27839189.0, "reward": 1.279296875, "reward_std": 0.08049003407359123, "rewards/equation_reward_func/mean": 0.279296875, "rewards/equation_reward_func/std": 0.44842614233493805, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3625956773757935, "sampling/importance_sampling_ratio/mean": 0.9998741149902344, "sampling/importance_sampling_ratio/min": 0.602161779999733, "sampling/sampling_logp_difference/max": 0.5080580413341522, "sampling/sampling_logp_difference/mean": 0.005774849443696439, "step": 420 }, { "clip_ratio/high_max": 2.9932950080061953e-05, "clip_ratio/high_mean": 2.9932950080061953e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.9932950080061953e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 86.2, "completions/max_terminated_length": 86.2, "completions/mean_length": 59.459375, "completions/mean_terminated_length": 59.459375, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "entropy": 0.12641834441779387, "epoch": 0.11254000237051084, "frac_reward_zero_std": 0.7625, "grad_norm": 1.3371871709823608, "kl": 0.1961807892140415, "learning_rate": 5.4381596121399476e-09, "loss": 0.0255, "num_tokens": 27974843.0, "reward": 1.215625, "reward_std": 0.10626165680587292, "rewards/equation_reward_func/mean": 0.215625, "rewards/equation_reward_func/std": 0.38566107749938966, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5341267585754395, "sampling/importance_sampling_ratio/mean": 0.9999158501625061, "sampling/importance_sampling_ratio/min": 0.616521668434143, "sampling/sampling_logp_difference/max": 0.5541383266448975, "sampling/sampling_logp_difference/mean": 0.006367453467100858, "step": 422 }, { "clip_ratio/high_max": 0.00011839923617016111, "clip_ratio/high_mean": 0.00011839923617016111, "clip_ratio/low_mean": 5.95742975646216e-05, "clip_ratio/low_min": 5.95742975646216e-05, "clip_ratio/region_mean": 0.00017797353373478271, "completions/clipped_ratio": 0.0, "completions/max_length": 86.25, "completions/max_terminated_length": 86.25, "completions/mean_length": 59.552734375, "completions/mean_terminated_length": 59.552734375, "completions/min_length": 45.25, "completions/min_terminated_length": 45.25, "entropy": 0.12887385627254844, "epoch": 0.11307336731065544, "frac_reward_zero_std": 0.765625, "grad_norm": 2.158684730529785, "kl": 0.17370186477071708, "learning_rate": 4.716220212689332e-09, "loss": -0.0008, "num_tokens": 28083462.0, "reward": 1.21875, "reward_std": 0.10592720285058022, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.39608463644981384, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4083328247070312, "sampling/importance_sampling_ratio/mean": 1.0004501640796661, "sampling/importance_sampling_ratio/min": 0.5882390588521957, "sampling/sampling_logp_difference/max": 0.5338348597288132, "sampling/sampling_logp_difference/mean": 0.006216718233190477, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.6, "completions/max_terminated_length": 84.6, "completions/mean_length": 60.4171875, "completions/mean_terminated_length": 60.4171875, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.12304476369172335, "epoch": 0.11360673225080005, "frac_reward_zero_std": 0.85, "grad_norm": 1.4505035877227783, "kl": 0.19276829208764765, "learning_rate": 4.045219538443778e-09, "loss": -0.0024, "num_tokens": 28219929.0, "reward": 1.240625, "reward_std": 0.06439104676246643, "rewards/equation_reward_func/mean": 0.240625, "rewards/equation_reward_func/std": 0.416792094707489, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.523363995552063, "sampling/importance_sampling_ratio/mean": 1.0002870082855224, "sampling/importance_sampling_ratio/min": 0.6567166924476624, "sampling/sampling_logp_difference/max": 0.478407096862793, "sampling/sampling_logp_difference/mean": 0.0057648224756121635, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.75, "completions/max_terminated_length": 97.75, "completions/mean_length": 59.931640625, "completions/mean_terminated_length": 59.931640625, "completions/min_length": 46.5, "completions/min_terminated_length": 46.5, "entropy": 0.12755089284231266, "epoch": 0.11414009719094464, "frac_reward_zero_std": 0.84375, "grad_norm": 1.1358726024627686, "kl": 0.1791373600458933, "learning_rate": 3.4252969378714134e-09, "loss": 0.0064, "num_tokens": 28328678.0, "reward": 1.302734375, "reward_std": 0.06405018735677004, "rewards/equation_reward_func/mean": 0.302734375, "rewards/equation_reward_func/std": 0.45350871980190277, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4579938054084778, "sampling/importance_sampling_ratio/mean": 1.0000010430812836, "sampling/importance_sampling_ratio/min": 0.6091656982898712, "sampling/sampling_logp_difference/max": 0.5073221325874329, "sampling/sampling_logp_difference/mean": 0.005905604222789407, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.4, "completions/max_terminated_length": 84.4, "completions/mean_length": 58.9484375, "completions/mean_terminated_length": 58.9484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.12297437562503749, "epoch": 0.11467346213108925, "frac_reward_zero_std": 0.825, "grad_norm": 1.4636846780776978, "kl": 0.16625024543868172, "learning_rate": 2.856581151922943e-09, "loss": 0.0011, "num_tokens": 28463925.0, "reward": 1.21875, "reward_std": 0.07848456650972366, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.4077851951122284, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4145832538604737, "sampling/importance_sampling_ratio/mean": 1.0000919818878173, "sampling/importance_sampling_ratio/min": 0.6459633827209472, "sampling/sampling_logp_difference/max": 0.4377876043319702, "sampling/sampling_logp_difference/mean": 0.006151107605546713, "step": 430 }, { "clip_ratio/high_max": 5.4865567815593546e-05, "clip_ratio/high_mean": 5.4865567815593546e-05, "clip_ratio/low_mean": 8.203125131937365e-05, "clip_ratio/low_min": 8.203125131937365e-05, "clip_ratio/region_mean": 0.0001368968191349672, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 59.76171875, "completions/mean_terminated_length": 59.76171875, "completions/min_length": 45.75, "completions/min_terminated_length": 45.75, "entropy": 0.1291431699258586, "epoch": 0.11520682707123385, "frac_reward_zero_std": 0.75, "grad_norm": 1.7867857217788696, "kl": 0.18384755309671164, "learning_rate": 2.339190287295678e-09, "loss": 0.0328, "num_tokens": 28572611.0, "reward": 1.39453125, "reward_std": 0.11198312044143677, "rewards/equation_reward_func/mean": 0.39453125, "rewards/equation_reward_func/std": 0.4736030250787735, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5973372161388397, "sampling/importance_sampling_ratio/mean": 1.0002234280109406, "sampling/importance_sampling_ratio/min": 0.6337156742811203, "sampling/sampling_logp_difference/max": 0.5154725015163422, "sampling/sampling_logp_difference/mean": 0.00589129189029336, "step": 432 }, { "clip_ratio/high_max": 0.00020248690594194664, "clip_ratio/high_mean": 0.00020248690594194664, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020248690594194664, "completions/clipped_ratio": 0.0, "completions/max_length": 88.8, "completions/max_terminated_length": 88.8, "completions/mean_length": 58.353125, "completions/mean_terminated_length": 58.353125, "completions/min_length": 44.4, "completions/min_terminated_length": 44.4, "entropy": 0.12384788459166884, "epoch": 0.11574019201137845, "frac_reward_zero_std": 0.8, "grad_norm": 1.4415379762649536, "kl": 0.17681175889447331, "learning_rate": 1.8732317919060715e-09, "loss": -0.0043, "num_tokens": 28707261.0, "reward": 1.234375, "reward_std": 0.08595742657780647, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.4129136800765991, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4196146726608276, "sampling/importance_sampling_ratio/mean": 0.9996507644653321, "sampling/importance_sampling_ratio/min": 0.6250426173210144, "sampling/sampling_logp_difference/max": 0.4739797115325928, "sampling/sampling_logp_difference/mean": 0.006236276216804981, "step": 434 }, { "clip_ratio/high_max": 5.791964293974969e-05, "clip_ratio/high_mean": 5.791964293974969e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.791964293974969e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.75, "completions/max_terminated_length": 87.75, "completions/mean_length": 60.08203125, "completions/mean_terminated_length": 60.08203125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.12579693888417548, "epoch": 0.11627355695152305, "frac_reward_zero_std": 0.75, "grad_norm": 2.367534637451172, "kl": 0.1685813944786787, "learning_rate": 1.4588024325756788e-09, "loss": -0.0001, "num_tokens": 28816199.0, "reward": 1.271484375, "reward_std": 0.1057380810379982, "rewards/equation_reward_func/mean": 0.271484375, "rewards/equation_reward_func/std": 0.4411757215857506, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5122828781604767, "sampling/importance_sampling_ratio/mean": 0.9997886717319489, "sampling/importance_sampling_ratio/min": 0.6094181165099144, "sampling/sampling_logp_difference/max": 0.5114473104476929, "sampling/sampling_logp_difference/mean": 0.005729502416215837, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.9056252161454824e-05, "clip_ratio/low_min": 2.9056252161454824e-05, "clip_ratio/region_mean": 2.9056252161454824e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 59.6578125, "completions/mean_terminated_length": 59.6578125, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "entropy": 0.13212950905371043, "epoch": 0.11680692189166765, "frac_reward_zero_std": 0.7625, "grad_norm": 1.959563970565796, "kl": 0.17209685536929303, "learning_rate": 1.0959882749354277e-09, "loss": 0.0054, "num_tokens": 28951860.0, "reward": 1.2515625, "reward_std": 0.09601093530654907, "rewards/equation_reward_func/mean": 0.2515625, "rewards/equation_reward_func/std": 0.43250252604484557, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4054103374481202, "sampling/importance_sampling_ratio/mean": 0.9999048829078674, "sampling/importance_sampling_ratio/min": 0.6655448198318481, "sampling/sampling_logp_difference/max": 0.4201939105987549, "sampling/sampling_logp_difference/mean": 0.00609291410073638, "step": 438 }, { "clip_ratio/high_max": 3.1709790669588576e-05, "clip_ratio/high_mean": 3.1709790669588576e-05, "clip_ratio/low_mean": 5.050505084606508e-05, "clip_ratio/low_min": 5.050505084606508e-05, "clip_ratio/region_mean": 8.221484151565366e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 60.220703125, "completions/mean_terminated_length": 60.220703125, "completions/min_length": 46.25, "completions/min_terminated_length": 46.25, "entropy": 0.12923449138179421, "epoch": 0.11734028683181226, "frac_reward_zero_std": 0.8125, "grad_norm": 1.25634765625, "kl": 0.1776046895215081, "learning_rate": 7.848646655519986e-10, "loss": 0.004, "num_tokens": 29060861.0, "reward": 1.27734375, "reward_std": 0.07825092412531376, "rewards/equation_reward_func/mean": 0.27734375, "rewards/equation_reward_func/std": 0.4412510097026825, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.517403781414032, "sampling/importance_sampling_ratio/mean": 1.0000708848237991, "sampling/importance_sampling_ratio/min": 0.4693167731165886, "sampling/sampling_logp_difference/max": 0.7854912877082825, "sampling/sampling_logp_difference/mean": 0.006504939286969602, "step": 440 }, { "clip_ratio/high_max": 3.3227005688887504e-05, "clip_ratio/high_mean": 3.3227005688887504e-05, "clip_ratio/low_mean": 8.616402596493976e-05, "clip_ratio/low_min": 8.616402596493976e-05, "clip_ratio/region_mean": 0.00011939103165382726, "completions/clipped_ratio": 0.0, "completions/max_length": 84.4, "completions/max_terminated_length": 84.4, "completions/mean_length": 59.7703125, "completions/mean_terminated_length": 59.7703125, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "entropy": 0.13163903050331605, "epoch": 0.11787365177195686, "frac_reward_zero_std": 0.75, "grad_norm": 1.7981462478637695, "kl": 0.15415573001114857, "learning_rate": 5.254962162804799e-10, "loss": 0.0118, "num_tokens": 29196698.0, "reward": 1.221875, "reward_std": 0.10815931893885136, "rewards/equation_reward_func/mean": 0.221875, "rewards/equation_reward_func/std": 0.40737531781196595, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5566086292266845, "sampling/importance_sampling_ratio/mean": 0.9999939799308777, "sampling/importance_sampling_ratio/min": 0.623484593629837, "sampling/sampling_logp_difference/max": 0.5733773708343506, "sampling/sampling_logp_difference/mean": 0.006244783475995064, "step": 442 }, { "clip_ratio/high_max": 3.1855248380452394e-05, "clip_ratio/high_mean": 3.1855248380452394e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.1855248380452394e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 59.208984375, "completions/mean_terminated_length": 59.208984375, "completions/min_length": 44.25, "completions/min_terminated_length": 44.25, "entropy": 0.1293699970572359, "epoch": 0.11840701671210145, "frac_reward_zero_std": 0.6875, "grad_norm": 2.1694672107696533, "kl": 0.1785511065584918, "learning_rate": 3.1793679084632375e-10, "loss": 0.0041, "num_tokens": 29304925.0, "reward": 1.24609375, "reward_std": 0.1355236954987049, "rewards/equation_reward_func/mean": 0.248046875, "rewards/equation_reward_func/std": 0.42666202783584595, "rewards/format_reward_func/mean": 0.998046875, "rewards/format_reward_func/std": 0.022097086533904076, "sampling/importance_sampling_ratio/max": 1.386458545923233, "sampling/importance_sampling_ratio/mean": 1.0000255703926086, "sampling/importance_sampling_ratio/min": 0.6110378354787827, "sampling/sampling_logp_difference/max": 0.49522897601127625, "sampling/sampling_logp_difference/mean": 0.0065278050024062395, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.520605206282603e-05, "clip_ratio/low_min": 6.520605206282603e-05, "clip_ratio/region_mean": 6.520605206282603e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 99.4, "completions/max_terminated_length": 99.4, "completions/mean_length": 60.3828125, "completions/mean_terminated_length": 60.3828125, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "entropy": 0.1300935610714886, "epoch": 0.11894038165224606, "frac_reward_zero_std": 0.8, "grad_norm": 1.5143063068389893, "kl": 0.18572996779241496, "learning_rate": 1.6222949365926608e-10, "loss": -0.004, "num_tokens": 29441314.0, "reward": 1.240625, "reward_std": 0.0825395405292511, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4116243064403534, "rewards/format_reward_func/mean": 0.9984375, "rewards/format_reward_func/std": 0.01767766922712326, "sampling/importance_sampling_ratio/max": 1.4786861419677735, "sampling/importance_sampling_ratio/mean": 0.9999108791351319, "sampling/importance_sampling_ratio/min": 0.654578697681427, "sampling/sampling_logp_difference/max": 0.4569679141044617, "sampling/sampling_logp_difference/mean": 0.006356936413794756, "step": 446 }, { "clip_ratio/high_max": 2.81151604010827e-05, "clip_ratio/high_mean": 2.81151604010827e-05, "clip_ratio/low_mean": 5.6779987062327564e-05, "clip_ratio/low_min": 5.6779987062327564e-05, "clip_ratio/region_mean": 8.489514746341026e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 60.103515625, "completions/mean_terminated_length": 60.103515625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.12911512040429646, "epoch": 0.11947374659239066, "frac_reward_zero_std": 0.765625, "grad_norm": 1.4116688966751099, "kl": 0.25092864046908087, "learning_rate": 5.84066608615985e-11, "loss": 0.0143, "num_tokens": 29550215.0, "reward": 1.2578125, "reward_std": 0.09311503916978836, "rewards/equation_reward_func/mean": 0.2578125, "rewards/equation_reward_func/std": 0.4087715372443199, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4625143110752106, "sampling/importance_sampling_ratio/mean": 1.000143975019455, "sampling/importance_sampling_ratio/min": 0.6613075584173203, "sampling/sampling_logp_difference/max": 0.4247802793979645, "sampling/sampling_logp_difference/mean": 0.0061110410606488585, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 59.5296875, "completions/mean_terminated_length": 59.5296875, "completions/min_length": 45.8, "completions/min_terminated_length": 45.8, "entropy": 0.12594678785858882, "epoch": 0.12000711153253527, "frac_reward_zero_std": 0.8, "grad_norm": 1.6385879516601562, "kl": 0.1821495109858612, "learning_rate": 6.489853613067531e-12, "loss": -0.0081, "num_tokens": 29685874.0, "reward": 1.265625, "reward_std": 0.09052813947200775, "rewards/equation_reward_func/mean": 0.265625, "rewards/equation_reward_func/std": 0.3982453763484955, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5007583141326903, "sampling/importance_sampling_ratio/mean": 0.9997750282287597, "sampling/importance_sampling_ratio/min": 0.604513943195343, "sampling/sampling_logp_difference/max": 0.5145397424697876, "sampling/sampling_logp_difference/mean": 0.006283565051853656, "step": 450 }, { "epoch": 0.12000711153253527, "step": 450, "total_flos": 0.0, "train_loss": 0.05667058430531243, "train_runtime": 10874.0114, "train_samples_per_second": 11.918, "train_steps_per_second": 0.041 } ], "logging_steps": 2, "max_steps": 450, "num_input_tokens_seen": 29685874, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }