{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07142857142857142, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 429.265625, "completions/mean_terminated_length": 429.265625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16631191410124302, "epoch": 0.0007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 15.064172744750977, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 106993.0, "reward": 0.3062499761581421, "reward_std": 0.44636982679367065, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.279951810836792, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 1, "step_time": 58.5344306009938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 318.9375, "completions/mean_terminated_length": 318.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21383497677743435, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 7.98560905456543, "learning_rate": 5.555555555555555e-08, "loss": -0.0, "num_tokens": 215429.0, "reward": 0.1718749850988388, "reward_std": 0.3545480966567993, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.2745848298072815, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 2, "step_time": 88.76631601905683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 473.1875, "completions/mean_terminated_length": 448.19049072265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18973157368600368, "epoch": 0.002142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 4.975986957550049, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "num_tokens": 328561.0, "reward": 0.26093751192092896, "reward_std": 0.421775221824646, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2847827076911926, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 3, "step_time": 165.18509875505697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 572.875, "completions/mean_terminated_length": 525.290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.16460688412189484, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 15.808328628540039, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "num_tokens": 457417.0, "reward": 0.3304687440395355, "reward_std": 0.4703609347343445, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.29620200395584106, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 4, "step_time": 144.26409805112053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 487.84375, "completions/mean_terminated_length": 487.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.1548436339944601, "epoch": 0.0035714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 4.682651996612549, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "num_tokens": 581367.0, "reward": 0.26640623807907104, "reward_std": 0.43771177530288696, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.27265870571136475, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 5, "step_time": 108.35025227611186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 504.1875, "completions/mean_terminated_length": 504.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.15498177334666252, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 19.60171890258789, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "num_tokens": 697603.0, "reward": 0.15312498807907104, "reward_std": 0.3424786627292633, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 6, "step_time": 101.33141891699051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 490.046875, "completions/mean_terminated_length": 490.046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16023985855281353, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 3.83193302154541, "learning_rate": 3.333333333333333e-07, "loss": -0.0, "num_tokens": 815102.0, "reward": 0.21953123807907104, "reward_std": 0.4048923850059509, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.27265870571136475, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 7, "step_time": 91.50709407392424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 304.890625, "completions/mean_terminated_length": 304.890625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.24000362865626812, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 2.5482072830200195, "learning_rate": 3.888888888888889e-07, "loss": -0.0, "num_tokens": 898271.0, "reward": 0.37343746423721313, "reward_std": 0.4891658127307892, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.2630521357059479, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 8, "step_time": 39.65540377004072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 562.890625, "completions/mean_terminated_length": 539.3175048828125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16113737598061562, "epoch": 0.0064285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 4.5696187019348145, "learning_rate": 4.444444444444444e-07, "loss": -0.0, "num_tokens": 1037680.0, "reward": 0.3976562023162842, "reward_std": 0.48798495531082153, "rewards/format_reward/mean": 0.3828125, "rewards/format_reward/std": 0.21347814798355103, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 9, "step_time": 141.6887187999091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 482.828125, "completions/mean_terminated_length": 457.9841613769531, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.13189083151519299, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 2.855482578277588, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 1177501.0, "reward": 0.2890625, "reward_std": 0.44351306557655334, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.2592533528804779, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 10, "step_time": 166.28699472307926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 475.71875, "completions/mean_terminated_length": 475.71875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1734474077820778, "epoch": 0.007857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 7.398857593536377, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "num_tokens": 1285235.0, "reward": 0.3296874761581421, "reward_std": 0.46408456563949585, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.29839184880256653, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 11, "step_time": 84.22253790113609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 421.1612854003906, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.1755824889987707, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 7.043524265289307, "learning_rate": 6.111111111111112e-07, "loss": -0.0, "num_tokens": 1402707.0, "reward": 0.3609374761581421, "reward_std": 0.4861958622932434, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.31140682101249695, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 12, "step_time": 147.3180411880021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 390.375, "completions/mean_terminated_length": 390.375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.18040168471634388, "epoch": 0.009285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 10.297653198242188, "learning_rate": 6.666666666666666e-07, "loss": -0.0, "num_tokens": 1499635.0, "reward": 0.34296876192092896, "reward_std": 0.4763341546058655, "rewards/format_reward/mean": 0.4609375, "rewards/format_reward/std": 0.37059250473976135, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 13, "step_time": 114.5199738269439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 537.03125, "completions/mean_terminated_length": 488.2903137207031, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17947025410830975, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 6.441692352294922, "learning_rate": 7.222222222222221e-07, "loss": -0.0, "num_tokens": 1617149.0, "reward": 0.1929687261581421, "reward_std": 0.3684875965118408, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.28510910272598267, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 14, "step_time": 151.2594413299812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1614.0, "completions/max_terminated_length": 1614.0, "completions/mean_length": 591.671875, "completions/mean_terminated_length": 591.671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14635740965604782, "epoch": 0.010714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 8.28996467590332, "learning_rate": 7.777777777777778e-07, "loss": -0.0, "num_tokens": 1737472.0, "reward": 0.43437498807907104, "reward_std": 0.49728426337242126, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.22712838649749756, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 15, "step_time": 89.8077824919601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 394.375, "completions/mean_terminated_length": 394.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.14997188560664654, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 15.825170516967773, "learning_rate": 8.333333333333333e-07, "loss": -0.0, "num_tokens": 1841536.0, "reward": 0.44218748807907104, "reward_std": 0.5008895993232727, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.24346621334552765, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 16, "step_time": 76.37220047204755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 405.4375, "completions/mean_terminated_length": 379.3651123046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1900953222066164, "epoch": 0.012142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 4.9255475997924805, "learning_rate": 8.888888888888888e-07, "loss": 0.0, "num_tokens": 1953596.0, "reward": 0.29296875, "reward_std": 0.444888710975647, "rewards/format_reward/mean": 0.4296875, "rewards/format_reward/std": 0.26528194546699524, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 17, "step_time": 201.1936074459809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 503.375, "completions/mean_terminated_length": 503.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.16892211325466633, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.125, "grad_norm": 1.7092769145965576, "learning_rate": 9.444444444444444e-07, "loss": 0.0, "num_tokens": 2075684.0, "reward": 0.39374998211860657, "reward_std": 0.4857901334762573, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.2182178944349289, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 18, "step_time": 85.88162007700885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 405.21875, "completions/mean_terminated_length": 405.21875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14752909820526838, "epoch": 0.013571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 5.623274803161621, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 2192466.0, "reward": 0.3187499940395355, "reward_std": 0.45114490389823914, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2357022762298584, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 19, "step_time": 92.52301802794682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 510.96875, "completions/mean_terminated_length": 461.3870849609375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.19505906477570534, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 7.067420482635498, "learning_rate": 9.999776148326214e-07, "loss": 0.0, "num_tokens": 2310224.0, "reward": 0.17656250298023224, "reward_std": 0.33117976784706116, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.23517554998397827, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 20, "step_time": 171.9173401860171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 405.171875, "completions/mean_terminated_length": 405.171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20128028839826584, "epoch": 0.015, "frac_reward_zero_std": 0.0, "grad_norm": 15.80737590789795, "learning_rate": 9.999104613348689e-07, "loss": -0.0, "num_tokens": 2409627.0, "reward": 0.32343748211860657, "reward_std": 0.4627358913421631, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.33592742681503296, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 21, "step_time": 53.99907385505503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 482.375, "completions/mean_terminated_length": 457.5238342285156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.16458906698971987, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.125, "grad_norm": 4.145486831665039, "learning_rate": 9.997985455197113e-07, "loss": -0.0, "num_tokens": 2518611.0, "reward": 0.35078126192092896, "reward_std": 0.46423619985580444, "rewards/format_reward/mean": 0.5390625, "rewards/format_reward/std": 0.3249503970146179, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 22, "step_time": 158.2987147619715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 269.3125, "completions/mean_terminated_length": 269.3125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21560950204730034, "epoch": 0.016428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 27.24405288696289, "learning_rate": 9.996418774081656e-07, "loss": -0.0, "num_tokens": 2592975.0, "reward": 0.41484373807907104, "reward_std": 0.4857231378555298, "rewards/format_reward/mean": 0.7109375, "rewards/format_reward/std": 0.2928335666656494, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 23, "step_time": 46.8515038289479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 252.953125, "completions/mean_terminated_length": 252.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.19199753180146217, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 17.405986785888672, "learning_rate": 9.994404710283998e-07, "loss": 0.0, "num_tokens": 2687908.0, "reward": 0.3343749940395355, "reward_std": 0.4513537883758545, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.3726780116558075, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 24, "step_time": 82.87710704799974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2559709567576647, "epoch": 0.017857142857142856, "frac_reward_zero_std": 0.375, "grad_norm": 8.622386932373047, "learning_rate": 9.991943444144756e-07, "loss": -0.0, "num_tokens": 2777256.0, "reward": 0.29296875, "reward_std": 0.4255591630935669, "rewards/format_reward/mean": 0.7421875, "rewards/format_reward/std": 0.2816080152988434, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 25, "step_time": 100.27635278215166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 82.921875, "completions/mean_terminated_length": 82.921875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.24555067718029022, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.125, "grad_norm": 17.675294876098633, "learning_rate": 9.989035196047348e-07, "loss": 0.0, "num_tokens": 2852203.0, "reward": 0.4906250238418579, "reward_std": 0.49432292580604553, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.265398770570755, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 26, "step_time": 71.82276537799044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 41.0625, "completions/mean_terminated_length": 41.0625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.23947478830814362, "epoch": 0.019285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 27.987302780151367, "learning_rate": 9.98568022639826e-07, "loss": -0.0, "num_tokens": 2928439.0, "reward": 0.63671875, "reward_std": 0.5023154616355896, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.20275264978408813, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 27, "step_time": 26.626900972041767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 58.015625, "completions/mean_terminated_length": 58.015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18591530248522758, "epoch": 0.02, "frac_reward_zero_std": 0.25, "grad_norm": 33.29207992553711, "learning_rate": 9.981878835603716e-07, "loss": 0.0, "num_tokens": 2998840.0, "reward": 0.4515625238418579, "reward_std": 0.48989540338516235, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.20351573824882507, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 28, "step_time": 29.762270515959244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 34.6875, "completions/mean_terminated_length": 34.6875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15785411559045315, "epoch": 0.020714285714285713, "frac_reward_zero_std": 0.375, "grad_norm": 21.5506534576416, "learning_rate": 9.977631364042794e-07, "loss": -0.0, "num_tokens": 3069804.0, "reward": 0.55078125, "reward_std": 0.502493143081665, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 29, "step_time": 27.969350750092417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 26.53125, "completions/mean_terminated_length": 26.53125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1635878887027502, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 17.759307861328125, "learning_rate": 9.972938192036944e-07, "loss": 0.0, "num_tokens": 3150486.0, "reward": 0.653124988079071, "reward_std": 0.49359992146492004, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.19669894874095917, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 30, "step_time": 32.165516318927985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 16.28125, "completions/mean_terminated_length": 16.28125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.11400723084807396, "epoch": 0.02214285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 7.3703789710998535, "learning_rate": 9.967799739815924e-07, "loss": 0.0, "num_tokens": 3208048.0, "reward": 0.4437500238418579, "reward_std": 0.4787135720252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 31, "step_time": 6.480815099028405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 19.875, "completions/mean_terminated_length": 19.875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.23133844323456287, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.25, "grad_norm": 14.804015159606934, "learning_rate": 9.96221646748019e-07, "loss": 0.0, "num_tokens": 3307336.0, "reward": 0.21484375, "reward_std": 0.33814147114753723, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.25479042530059814, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 32, "step_time": 15.765849456947763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 12.140625, "completions/mean_terminated_length": 12.140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11367706023156643, "epoch": 0.023571428571428573, "frac_reward_zero_std": 0.375, "grad_norm": 25.840938568115234, "learning_rate": 9.956188874959686e-07, "loss": 0.0, "num_tokens": 3388185.0, "reward": 0.36328125, "reward_std": 0.44670239090919495, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.10652101784944534, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 33, "step_time": 3.9221740990760736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 14.109375, "completions/mean_terminated_length": 14.109375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12458794936537743, "epoch": 0.024285714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 14.921568870544434, "learning_rate": 9.949717501969079e-07, "loss": 0.0, "num_tokens": 3454872.0, "reward": 0.5679687857627869, "reward_std": 0.5037452578544617, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.46875, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 34, "step_time": 5.632602730009239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 15.34375, "completions/mean_terminated_length": 15.34375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.11258962377905846, "epoch": 0.025, "frac_reward_zero_std": 0.75, "grad_norm": 6.26678466796875, "learning_rate": 9.942802927959442e-07, "loss": 0.0, "num_tokens": 3534958.0, "reward": 0.5375000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 35, "step_time": 9.37483271205565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.328125, "completions/mean_terminated_length": 12.328125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.13397582434117794, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 6.364090442657471, "learning_rate": 9.93544577206636e-07, "loss": 0.0, "num_tokens": 3597171.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 36, "step_time": 3.0314099779934622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 13.078125, "completions/mean_terminated_length": 13.078125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.11545340903103352, "epoch": 0.02642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 11.611798286437988, "learning_rate": 9.927646693054495e-07, "loss": 0.0, "num_tokens": 3696696.0, "reward": 0.3656250238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 37, "step_time": 8.566134120046627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.484375, "completions/mean_terminated_length": 12.484375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.10600519925355911, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 7.415499210357666, "learning_rate": 9.919406389258606e-07, "loss": 0.0, "num_tokens": 3774983.0, "reward": 0.4437500238418579, "reward_std": 0.4787135720252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 38, "step_time": 3.5948679719585925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.390625, "completions/mean_terminated_length": 12.390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.11269045062363148, "epoch": 0.027857142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 7.707824230194092, "learning_rate": 9.910725598521012e-07, "loss": -0.0, "num_tokens": 3879416.0, "reward": 0.7093750238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 39, "step_time": 6.437070619082078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 14.71875, "completions/mean_terminated_length": 14.71875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.1085408478975296, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 11.853015899658203, "learning_rate": 9.901605098125526e-07, "loss": 0.0, "num_tokens": 3949438.0, "reward": 0.7085937857627869, "reward_std": 0.49115630984306335, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 40, "step_time": 6.119476022082381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 19.484375, "completions/mean_terminated_length": 19.484375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09150838013738394, "epoch": 0.029285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.892045704727863e-07, "loss": 0.0, "num_tokens": 4034149.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 41, "step_time": 15.748524485970847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 16.890625, "completions/mean_terminated_length": 16.890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09939474705606699, "epoch": 0.03, "frac_reward_zero_std": 0.75, "grad_norm": 7.690636157989502, "learning_rate": 9.882048274282505e-07, "loss": 0.0, "num_tokens": 4101726.0, "reward": 0.39531251788139343, "reward_std": 0.4616841673851013, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 42, "step_time": 9.772893431887496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 61.640625, "completions/mean_terminated_length": 30.111112594604492, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.11599440686404705, "epoch": 0.030714285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 14.23160457611084, "learning_rate": 9.871613701966066e-07, "loss": 0.0, "num_tokens": 4184607.0, "reward": 0.5335937738418579, "reward_std": 0.5037994384765625, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18483558297157288, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 43, "step_time": 144.26547849614872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 90.21875, "completions/mean_terminated_length": 27.064516067504883, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09353543492034078, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 19.40498924255371, "learning_rate": 9.86074292209714e-07, "loss": 0.0, "num_tokens": 4253973.0, "reward": 0.581250011920929, "reward_std": 0.5070533752441406, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 44, "step_time": 122.23275292402832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 17.015625, "completions/mean_terminated_length": 17.015625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.07891270238906145, "epoch": 0.03214285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 6.091732501983643, "learning_rate": 9.849436908052636e-07, "loss": 0.0, "num_tokens": 4362886.0, "reward": 0.69140625, "reward_std": 0.49646008014678955, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 45, "step_time": 16.195965035120025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 31.546875, "completions/mean_terminated_length": 31.546875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.1003469587303698, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 13.104848861694336, "learning_rate": 9.837696672180618e-07, "loss": 0.0, "num_tokens": 4433921.0, "reward": 0.7718750238418579, "reward_std": 0.4732423424720764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.671875, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 46, "step_time": 26.342010580934584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 21.015625, "completions/mean_terminated_length": 21.015625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09486840199679136, "epoch": 0.03357142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 5.788740158081055, "learning_rate": 9.825523265709665e-07, "loss": -0.0, "num_tokens": 4514394.0, "reward": 0.8460937738418579, "reward_std": 0.44179511070251465, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.18483558297157288, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 47, "step_time": 6.866083464003168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 18.265625, "completions/mean_terminated_length": 18.265625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.07957334164530039, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.375, "grad_norm": 11.120720863342285, "learning_rate": 9.812917778654747e-07, "loss": 0.0, "num_tokens": 4614995.0, "reward": 0.7523437738418579, "reward_std": 0.48439374566078186, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.16194961965084076, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 48, "step_time": 13.319389674987178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 13.765625, "completions/mean_terminated_length": 13.765625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.05717065744102001, "epoch": 0.035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.799881339719614e-07, "loss": 0.0, "num_tokens": 4694860.0, "reward": 0.3500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 49, "step_time": 6.507926742138807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 88.109375, "completions/mean_terminated_length": 24.887096405029297, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09659092174842954, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.625, "grad_norm": 14.933631896972656, "learning_rate": 9.786415116195732e-07, "loss": 0.0, "num_tokens": 4779483.0, "reward": 0.706250011920929, "reward_std": 0.49597588181495667, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 50, "step_time": 125.25686850992497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 30.5625, "completions/mean_terminated_length": 30.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.09759852662682533, "epoch": 0.03642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 15.208141326904297, "learning_rate": 9.772520313857775e-07, "loss": -0.0, "num_tokens": 4884719.0, "reward": 0.3812500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 51, "step_time": 21.342308732913807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 51.109375, "completions/mean_terminated_length": 19.41269874572754, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.05151955410838127, "epoch": 0.037142857142857144, "frac_reward_zero_std": 0.5, "grad_norm": 10.268356323242188, "learning_rate": 9.758198176855646e-07, "loss": 0.0, "num_tokens": 4979958.0, "reward": 0.6609375476837158, "reward_std": 0.5019382238388062, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 52, "step_time": 175.891883687058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.06619986798614264, "epoch": 0.03785714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 19.621379852294922, "learning_rate": 9.74344998760308e-07, "loss": -0.0, "num_tokens": 5049110.0, "reward": 0.5523437857627869, "reward_std": 0.5009062886238098, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 53, "step_time": 4.150718963937834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03762774192728102, "epoch": 0.03857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 20.882497787475586, "learning_rate": 9.72827706666282e-07, "loss": 0.0, "num_tokens": 5112462.0, "reward": 0.7093750238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 54, "step_time": 3.5128560769953765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03650554618798196, "epoch": 0.039285714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 15.54995059967041, "learning_rate": 9.712680772628363e-07, "loss": -0.0, "num_tokens": 5210902.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 55, "step_time": 4.476348334981594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.04692800110206008, "epoch": 0.04, "frac_reward_zero_std": 0.875, "grad_norm": 11.067854881286621, "learning_rate": 9.696662502002318e-07, "loss": 0.0, "num_tokens": 5293086.0, "reward": 0.4125000238418579, "reward_std": 0.4671765863895416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 56, "step_time": 4.003949869889766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 25.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 13.1875, "completions/mean_terminated_length": 13.1875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0425227633677423, "epoch": 0.04071428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 10.757762908935547, "learning_rate": 9.680223689071362e-07, "loss": 0.0, "num_tokens": 5387882.0, "reward": 0.48906251788139343, "reward_std": 0.4931650757789612, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 57, "step_time": 4.320090122986585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 14.53125, "completions/mean_terminated_length": 14.53125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.024435568135231733, "epoch": 0.041428571428571426, "frac_reward_zero_std": 0.875, "grad_norm": 12.87435245513916, "learning_rate": 9.663365805777814e-07, "loss": 0.0, "num_tokens": 5488564.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 58, "step_time": 9.629043828055728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010381318046711385, "epoch": 0.04214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 11.77026081085205, "learning_rate": 9.646090361587827e-07, "loss": 0.0, "num_tokens": 5551852.0, "reward": 1.021875023841858, "reward_std": 0.2704896926879883, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.921875, "rewards/mcq_exact_match_reward/std": 0.27048972249031067, "step": 59, "step_time": 3.6506365661043674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 24.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 13.859375, "completions/mean_terminated_length": 13.859375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.024835634976625443, "epoch": 0.04285714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 13.699368476867676, "learning_rate": 9.628398903356239e-07, "loss": 0.0, "num_tokens": 5633435.0, "reward": 0.3109375238418579, "reward_std": 0.42168113589286804, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 60, "step_time": 5.84596428705845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011352454603184015, "epoch": 0.04357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 24.677173614501953, "learning_rate": 9.610293015188067e-07, "loss": -0.0, "num_tokens": 5714739.0, "reward": 0.8031250238418579, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 61, "step_time": 5.298729537054896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01830141741083935, "epoch": 0.04428571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.59177431829666e-07, "loss": 0.0, "num_tokens": 5774291.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 62, "step_time": 3.0780068120220676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.014526999875670299, "epoch": 0.045, "frac_reward_zero_std": 0.625, "grad_norm": 16.54155731201172, "learning_rate": 9.572844470858537e-07, "loss": -0.0, "num_tokens": 5850667.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 63, "step_time": 4.529202252917457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02446594147477299, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 27.907894134521484, "learning_rate": 9.55350516786491e-07, "loss": -0.0, "num_tokens": 5921579.0, "reward": 0.6156250238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 64, "step_time": 3.6849751479458064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.027877062326297164, "epoch": 0.04642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 29.490297317504883, "learning_rate": 9.533758140969912e-07, "loss": 0.0, "num_tokens": 6020115.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 65, "step_time": 6.300093016005121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016057265631388873, "epoch": 0.047142857142857146, "frac_reward_zero_std": 0.75, "grad_norm": 18.212966918945312, "learning_rate": 9.513605158335562e-07, "loss": 0.0, "num_tokens": 6123723.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 66, "step_time": 4.889689573086798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.008832770312437788, "epoch": 0.047857142857142855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.493048024473411e-07, "loss": 0.0, "num_tokens": 6190811.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 67, "step_time": 3.4412925080396235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01757131062913686, "epoch": 0.04857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 16.572786331176758, "learning_rate": 9.47208858008299e-07, "loss": 0.0, "num_tokens": 6293939.0, "reward": 0.5531250238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 68, "step_time": 5.069954339065589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010598326101899147, "epoch": 0.04928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.450728701886983e-07, "loss": 0.0, "num_tokens": 6360843.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 69, "step_time": 3.045385909965262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03748940723016858, "epoch": 0.05, "frac_reward_zero_std": 0.625, "grad_norm": 23.385046005249023, "learning_rate": 9.428970302463184e-07, "loss": -0.0, "num_tokens": 6435987.0, "reward": 0.5375000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 70, "step_time": 4.197044083906803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011510996264405549, "epoch": 0.05071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 50.076045989990234, "learning_rate": 9.406815330073244e-07, "loss": 0.0, "num_tokens": 6503731.0, "reward": 0.3812500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 71, "step_time": 4.2282736750203185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.031134499120526016, "epoch": 0.05142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.384265768488224e-07, "loss": 0.0, "num_tokens": 6611515.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 72, "step_time": 5.246204287977889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.026873694732785225, "epoch": 0.052142857142857144, "frac_reward_zero_std": 0.625, "grad_norm": 22.986095428466797, "learning_rate": 9.36132363681097e-07, "loss": 0.0, "num_tokens": 6719259.0, "reward": 0.5843750238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 73, "step_time": 5.210786890995223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00942779217439238, "epoch": 0.05285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 11.122662544250488, "learning_rate": 9.337990989295304e-07, "loss": 0.0, "num_tokens": 6794507.0, "reward": 0.7718750238418579, "reward_std": 0.4732423424720764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.671875, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 74, "step_time": 3.724674280034378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01156144640117418, "epoch": 0.05357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 6.401269912719727, "learning_rate": 9.314269915162114e-07, "loss": -0.0, "num_tokens": 6863011.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 75, "step_time": 3.4731274269870482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01271917184931226, "epoch": 0.054285714285714284, "frac_reward_zero_std": 0.75, "grad_norm": 14.784469604492188, "learning_rate": 9.290162538412255e-07, "loss": -0.0, "num_tokens": 6941691.0, "reward": 0.6781250238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 76, "step_time": 3.031989069073461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.015625, "completions/mean_terminated_length": 13.015625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010391183634055778, "epoch": 0.055, "frac_reward_zero_std": 0.875, "grad_norm": 4.5144572257995605, "learning_rate": 9.265671017636382e-07, "loss": 0.0, "num_tokens": 7022972.0, "reward": 1.084375023841858, "reward_std": 0.125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.984375, "rewards/mcq_exact_match_reward/std": 0.125, "step": 77, "step_time": 3.7917707841843367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0105912602157332, "epoch": 0.055714285714285716, "frac_reward_zero_std": 0.875, "grad_norm": 20.127111434936523, "learning_rate": 9.240797545821666e-07, "loss": 0.0, "num_tokens": 7095780.0, "reward": 0.6312500238418579, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 78, "step_time": 3.8805220928625204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03518920624628663, "epoch": 0.056428571428571425, "frac_reward_zero_std": 0.75, "grad_norm": 13.436991691589355, "learning_rate": 9.215544350155422e-07, "loss": 0.0, "num_tokens": 7178476.0, "reward": 0.3343750238418579, "reward_std": 0.42695629596710205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 79, "step_time": 3.431473600969184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016191222646739334, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 5.861770153045654, "learning_rate": 9.189913691825699e-07, "loss": -0.0, "num_tokens": 7261244.0, "reward": 0.6156250238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 80, "step_time": 3.8818058649194427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.029405928449705243, "epoch": 0.05785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 9.366527557373047, "learning_rate": 9.163907865818806e-07, "loss": -0.0, "num_tokens": 7359540.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 81, "step_time": 5.645089631958399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01334192231297493, "epoch": 0.05857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.137529200713809e-07, "loss": 0.0, "num_tokens": 7448924.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 82, "step_time": 5.150634653924499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02549466968048364, "epoch": 0.05928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 19.77663803100586, "learning_rate": 9.11078005847405e-07, "loss": 0.0, "num_tokens": 7518348.0, "reward": 0.5062500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 83, "step_time": 3.094080194074195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.003772719392145518, "epoch": 0.06, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.083662834235629e-07, "loss": 0.0, "num_tokens": 7595364.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 84, "step_time": 3.666810050024651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.004507055869908072, "epoch": 0.060714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.056179956092961e-07, "loss": 0.0, "num_tokens": 7691668.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 85, "step_time": 4.592653869010974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.006571650264959317, "epoch": 0.06142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.028333884881356e-07, "loss": 0.0, "num_tokens": 7802836.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 86, "step_time": 5.15318116301205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007576553849503398, "epoch": 0.062142857142857146, "frac_reward_zero_std": 0.875, "grad_norm": 6.100246906280518, "learning_rate": 9.000127113956672e-07, "loss": -0.0, "num_tokens": 7873540.0, "reward": 0.7093750238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 87, "step_time": 3.563799056049902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03412088053300977, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 13.682866096496582, "learning_rate": 8.971562168972064e-07, "loss": -0.0, "num_tokens": 7968516.0, "reward": 0.5375000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 88, "step_time": 4.252024297020398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011378830298781395, "epoch": 0.06357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 40.427642822265625, "learning_rate": 8.942641607651828e-07, "loss": -0.0, "num_tokens": 8050940.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 89, "step_time": 3.7895749480230734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.004507646634010598, "epoch": 0.06428571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.91336801956239e-07, "loss": 0.0, "num_tokens": 8118580.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 90, "step_time": 4.1695034088916145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0008988323461380787, "epoch": 0.065, "frac_reward_zero_std": 0.875, "grad_norm": 6.886898040771484, "learning_rate": 8.883744025880427e-07, "loss": 0.0, "num_tokens": 8209372.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 91, "step_time": 4.953228909056634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010154272997169755, "epoch": 0.06571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.853772279158165e-07, "loss": 0.0, "num_tokens": 8290516.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 92, "step_time": 4.79224861896364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.028330315952189267, "epoch": 0.06642857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 13.889434814453125, "learning_rate": 8.823455463085873e-07, "loss": 0.0, "num_tokens": 8369148.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 93, "step_time": 5.352049615990836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02251565270125866, "epoch": 0.06714285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 15.911195755004883, "learning_rate": 8.792796292251559e-07, "loss": 0.0, "num_tokens": 8447308.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 94, "step_time": 3.4189756700070575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010006657859776169, "epoch": 0.06785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.761797511897906e-07, "loss": 0.0, "num_tokens": 8552556.0, "reward": 0.3500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 95, "step_time": 5.176017292018514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019635747539723525, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 4.355508327484131, "learning_rate": 8.730461897676463e-07, "loss": 0.0, "num_tokens": 8632988.0, "reward": 0.9593750238418579, "reward_std": 0.3503824472427368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.859375, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 96, "step_time": 3.6474721890990622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016831232234835625, "epoch": 0.06928571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.698792255399103e-07, "loss": 0.0, "num_tokens": 8693324.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 97, "step_time": 2.8727327780216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010678260587155819, "epoch": 0.07, "frac_reward_zero_std": 0.75, "grad_norm": 14.87441635131836, "learning_rate": 8.666791420786803e-07, "loss": 0.0, "num_tokens": 8784164.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 98, "step_time": 5.402249445091002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007853476068703458, "epoch": 0.07071428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 4.313704490661621, "learning_rate": 8.634462259215718e-07, "loss": 0.0, "num_tokens": 8888620.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 99, "step_time": 5.071004737867042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.023348471499048173, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 12.054193496704102, "learning_rate": 8.601807665460619e-07, "loss": 0.0, "num_tokens": 8994956.0, "reward": 0.5531250238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 100, "step_time": 4.593129857035819 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 8994956, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }