{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.14285714285714285, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 508.0, "completions/mean_terminated_length": 458.32257080078125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14663860481232405, "epoch": 0.0007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 11.91739273071289, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 148816.0, "reward": 0.27421873807907104, "reward_std": 0.4313132166862488, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.22146137058734894, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 1, "step_time": 171.41765936795855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 377.046875, "completions/mean_terminated_length": 377.046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20175037905573845, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 14.119359016418457, "learning_rate": 5.555555555555555e-08, "loss": -0.0, "num_tokens": 255907.0, "reward": 0.53125, "reward_std": 0.5093957781791687, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.2745848298072815, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 2, "step_time": 83.64522138307802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 660.625, "completions/mean_terminated_length": 638.6032104492188, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.14103460405021906, "epoch": 0.002142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.2874624729156494, "learning_rate": 1.111111111111111e-07, "loss": -0.0, "num_tokens": 381059.0, "reward": 0.43281248211860657, "reward_std": 0.4954730272293091, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.25539806485176086, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 3, "step_time": 131.4170093961293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 451.96875, "completions/mean_terminated_length": 400.4838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.19842500798404217, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 10.316102027893066, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 483425.0, "reward": 0.24921873211860657, "reward_std": 0.4258970022201538, "rewards/format_reward/mean": 0.3046875, "rewards/format_reward/std": 0.2615155577659607, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 4, "step_time": 132.2972059249878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 623.953125, "completions/mean_terminated_length": 623.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1606526281684637, "epoch": 0.0035714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 2.722296714782715, "learning_rate": 2.222222222222222e-07, "loss": -0.0, "num_tokens": 604470.0, "reward": 0.39531248807907104, "reward_std": 0.4883336126804352, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.2592533528804779, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 5, "step_time": 119.59757148602512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 375.61907958984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1858495082706213, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 10.736406326293945, "learning_rate": 2.7777777777777776e-07, "loss": -0.0, "num_tokens": 713742.0, "reward": 0.33281245827674866, "reward_std": 0.4670252799987793, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.2741328477859497, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 6, "step_time": 122.30180106399348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 467.265625, "completions/mean_terminated_length": 467.265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16867963038384914, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 5.460102558135986, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 813639.0, "reward": 0.36796873807907104, "reward_std": 0.4780389070510864, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.31090864539146423, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 7, "step_time": 73.72938018315472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 508.4375, "completions/mean_terminated_length": 484.0000305175781, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14686184097081423, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 8.620323181152344, "learning_rate": 3.888888888888889e-07, "loss": -0.0, "num_tokens": 956587.0, "reward": 0.25312498211860657, "reward_std": 0.42565304040908813, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 8, "step_time": 147.8402461669757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 412.25, "completions/mean_terminated_length": 386.2857360839844, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18391940742731094, "epoch": 0.0064285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 9.683939933776855, "learning_rate": 4.444444444444444e-07, "loss": -0.0, "num_tokens": 1075259.0, "reward": 0.3140624761581421, "reward_std": 0.4561282992362976, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.2847827076911926, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 9, "step_time": 108.42918385588564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 521.125, "completions/mean_terminated_length": 496.888916015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.17283021286129951, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.3954830169677734, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 1193907.0, "reward": 0.16171874105930328, "reward_std": 0.3368554413318634, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.23974503576755524, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 10, "step_time": 118.48989919497399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 442.40625, "completions/mean_terminated_length": 442.40625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.19687055423855782, "epoch": 0.007857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 3.561274528503418, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "num_tokens": 1288341.0, "reward": 0.30000001192092896, "reward_std": 0.44818857312202454, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 11, "step_time": 55.07885626098141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 442.171875, "completions/mean_terminated_length": 416.68255615234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.22921719774603844, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 10.506412506103516, "learning_rate": 6.111111111111112e-07, "loss": 0.0, "num_tokens": 1388512.0, "reward": 0.3687499761581421, "reward_std": 0.48094648122787476, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.33184191584587097, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 12, "step_time": 119.49877157399897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 471.09375, "completions/mean_terminated_length": 420.2257995605469, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14649338461458683, "epoch": 0.009285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 16.722030639648438, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 1511798.0, "reward": 0.22499997913837433, "reward_std": 0.40029749274253845, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 13, "step_time": 160.80015432706568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 409.171875, "completions/mean_terminated_length": 356.3064270019531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14898105338215828, "epoch": 0.01, "frac_reward_zero_std": 0.125, "grad_norm": 38.745059967041016, "learning_rate": 7.222222222222221e-07, "loss": -0.0, "num_tokens": 1651729.0, "reward": 0.2789062261581421, "reward_std": 0.4511716961860657, "rewards/format_reward/mean": 0.2890625, "rewards/format_reward/std": 0.2789533734321594, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 14, "step_time": 193.12235332495766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 466.265625, "completions/mean_terminated_length": 466.265625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.19658867083489895, "epoch": 0.010714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 34.59446334838867, "learning_rate": 7.777777777777778e-07, "loss": 0.0, "num_tokens": 1746826.0, "reward": 0.23281247913837433, "reward_std": 0.4091433882713318, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.30496877431869507, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 15, "step_time": 68.50137870694743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 417.140625, "completions/mean_terminated_length": 364.5322570800781, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15556670725345612, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.25, "grad_norm": 6.721803188323975, "learning_rate": 8.333333333333333e-07, "loss": -0.0, "num_tokens": 1870859.0, "reward": 0.34062498807907104, "reward_std": 0.4673358201980591, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.3149704039096832, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 16, "step_time": 159.5833295909688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 444.390625, "completions/mean_terminated_length": 392.6612854003906, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18389248382300138, "epoch": 0.012142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 9.25290298461914, "learning_rate": 8.888888888888888e-07, "loss": 0.0, "num_tokens": 1987036.0, "reward": 0.16093748807907104, "reward_std": 0.34462639689445496, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.301698237657547, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 17, "step_time": 163.66844313696492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 497.984375, "completions/mean_terminated_length": 447.9838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2151591945439577, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 49.3928108215332, "learning_rate": 9.444444444444444e-07, "loss": -0.0, "num_tokens": 2084659.0, "reward": 0.32343748211860657, "reward_std": 0.4590334892272949, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.29839184880256653, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 18, "step_time": 127.23080269095954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 358.28125, "completions/mean_terminated_length": 358.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16461750492453575, "epoch": 0.013571428571428571, "frac_reward_zero_std": 0.125, "grad_norm": 33.24488830566406, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2189413.0, "reward": 0.21406248211860657, "reward_std": 0.3865258991718292, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2221602201461792, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 19, "step_time": 81.95023821806535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 443.28125, "completions/mean_terminated_length": 417.8095397949219, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16434035263955593, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 8.118695259094238, "learning_rate": 9.999776148326214e-07, "loss": -0.0, "num_tokens": 2326511.0, "reward": 0.42656248807907104, "reward_std": 0.48713353276252747, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.1985812783241272, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 20, "step_time": 171.49558448110474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 286.140625, "completions/mean_terminated_length": 286.140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2154129333794117, "epoch": 0.015, "frac_reward_zero_std": 0.125, "grad_norm": 16.506973266601562, "learning_rate": 9.999104613348689e-07, "loss": -0.0, "num_tokens": 2431592.0, "reward": 0.33203125, "reward_std": 0.45828935503959656, "rewards/format_reward/mean": 0.5078125, "rewards/format_reward/std": 0.18881812691688538, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 21, "step_time": 102.2051269490039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 351.015625, "completions/mean_terminated_length": 351.015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2384468913078308, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.125, "grad_norm": 8.052404403686523, "learning_rate": 9.997985455197113e-07, "loss": -0.0, "num_tokens": 2517985.0, "reward": 0.20859375596046448, "reward_std": 0.37886154651641846, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.28770697116851807, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 22, "step_time": 47.53120892500738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 493.734375, "completions/mean_terminated_length": 493.734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15667208284139633, "epoch": 0.016428571428571428, "frac_reward_zero_std": 0.125, "grad_norm": 4.182269096374512, "learning_rate": 9.996418774081656e-07, "loss": 0.0, "num_tokens": 2643640.0, "reward": 0.2679687440395355, "reward_std": 0.41770032048225403, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.22699186205863953, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 23, "step_time": 136.94159113999922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2605742085725069, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 2.6181070804595947, "learning_rate": 9.994404710283998e-07, "loss": 0.0, "num_tokens": 2743904.0, "reward": 0.08046875149011612, "reward_std": 0.17719532549381256, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.24384792149066925, "rewards/mcq_exact_match_reward/mean": 0.03125, "rewards/mcq_exact_match_reward/std": 0.17536810040473938, "step": 24, "step_time": 67.75085840100655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 167.71875, "completions/mean_terminated_length": 167.71875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2089465633034706, "epoch": 0.017857142857142856, "frac_reward_zero_std": 0.125, "grad_norm": 21.222145080566406, "learning_rate": 9.991943444144756e-07, "loss": -0.0, "num_tokens": 2839630.0, "reward": 0.3820312023162842, "reward_std": 0.4708458185195923, "rewards/format_reward/mean": 0.5390625, "rewards/format_reward/std": 0.18483558297157288, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 25, "step_time": 46.71443971898407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25668232701718807, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.25, "grad_norm": 17.222293853759766, "learning_rate": 9.989035196047348e-07, "loss": -0.0, "num_tokens": 2927590.0, "reward": 0.16249999403953552, "reward_std": 0.3169797956943512, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.109375, "rewards/mcq_exact_match_reward/std": 0.3145764470100403, "step": 26, "step_time": 41.34394903801149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 324.53125, "completions/mean_terminated_length": 297.17462158203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18609545193612576, "epoch": 0.019285714285714285, "frac_reward_zero_std": 0.25, "grad_norm": 17.366985321044922, "learning_rate": 9.98568022639826e-07, "loss": 0.0, "num_tokens": 3043752.0, "reward": 0.28359371423721313, "reward_std": 0.431318998336792, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.1406387835741043, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 27, "step_time": 145.95341787295183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 215.234375, "completions/mean_terminated_length": 215.234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2353730145841837, "epoch": 0.02, "frac_reward_zero_std": 0.25, "grad_norm": 17.270282745361328, "learning_rate": 9.981878835603716e-07, "loss": 0.0, "num_tokens": 3131783.0, "reward": 0.27578121423721313, "reward_std": 0.4189927279949188, "rewards/format_reward/mean": 0.5703125, "rewards/format_reward/std": 0.1751912236213684, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 28, "step_time": 48.64892271097051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 214.15625, "completions/mean_terminated_length": 214.15625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2909251060336828, "epoch": 0.020714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 14.325923919677734, "learning_rate": 9.977631364042794e-07, "loss": -0.0, "num_tokens": 3226177.0, "reward": 0.4117187261581421, "reward_std": 0.4837634861469269, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.1649840772151947, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 29, "step_time": 55.44644622900523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 281.2698669433594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2629696223884821, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 10.076370239257812, "learning_rate": 9.972938192036944e-07, "loss": 0.0, "num_tokens": 3343833.0, "reward": 0.27421873807907104, "reward_std": 0.4146132171154022, "rewards/format_reward/mean": 0.5546875, "rewards/format_reward/std": 0.26899561285972595, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 30, "step_time": 177.31874076800887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 236.171875, "completions/mean_terminated_length": 236.171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2660892754793167, "epoch": 0.02214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 11.033559799194336, "learning_rate": 9.967799739815924e-07, "loss": 0.0, "num_tokens": 3407684.0, "reward": 0.4117187261581421, "reward_std": 0.4798099994659424, "rewards/format_reward/mean": 0.6796875, "rewards/format_reward/std": 0.30035942792892456, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 31, "step_time": 19.77746521908557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 267.140625, "completions/mean_terminated_length": 238.87303161621094, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2127502802759409, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 21.78681182861328, "learning_rate": 9.96221646748019e-07, "loss": -0.0, "num_tokens": 3501853.0, "reward": 0.390625, "reward_std": 0.47525057196617126, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 32, "step_time": 118.76398004795192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 195.171875, "completions/mean_terminated_length": 195.171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.26218850910663605, "epoch": 0.023571428571428573, "frac_reward_zero_std": 0.125, "grad_norm": 13.382572174072266, "learning_rate": 9.956188874959686e-07, "loss": 0.0, "num_tokens": 3603568.0, "reward": 0.19062499701976776, "reward_std": 0.3306888937950134, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 33, "step_time": 57.80779201700352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 203.953125, "completions/mean_terminated_length": 203.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21508901193737984, "epoch": 0.024285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 18.717557907104492, "learning_rate": 9.949717501969079e-07, "loss": 0.0, "num_tokens": 3688533.0, "reward": 0.5679687261581421, "reward_std": 0.5023252964019775, "rewards/format_reward/mean": 0.6796875, "rewards/format_reward/std": 0.27265870571136475, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 34, "step_time": 64.80360024399124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 254.78125, "completions/mean_terminated_length": 254.78125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.27256614714860916, "epoch": 0.025, "frac_reward_zero_std": 0.0, "grad_norm": 14.232938766479492, "learning_rate": 9.942802927959442e-07, "loss": -0.0, "num_tokens": 3775567.0, "reward": 0.38124996423721313, "reward_std": 0.46908387541770935, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.3149704039096832, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 35, "step_time": 69.50821864098543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 279.328125, "completions/mean_terminated_length": 279.328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21617292240262032, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.125, "grad_norm": 14.788715362548828, "learning_rate": 9.93544577206636e-07, "loss": 0.0, "num_tokens": 3873788.0, "reward": 0.24140623211860657, "reward_std": 0.38684260845184326, "rewards/format_reward/mean": 0.6953125, "rewards/format_reward/std": 0.2762732207775116, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 36, "step_time": 112.71978642407339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 204.046875, "completions/mean_terminated_length": 204.046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2291890811175108, "epoch": 0.02642857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 22.162996292114258, "learning_rate": 9.927646693054495e-07, "loss": 0.0, "num_tokens": 3949719.0, "reward": 0.43906253576278687, "reward_std": 0.4866037666797638, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.2777281701564789, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 37, "step_time": 62.183381506067235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 92.578125, "completions/mean_terminated_length": 92.578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21844332106411457, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.25, "grad_norm": 21.4798641204834, "learning_rate": 9.919406389258606e-07, "loss": -0.0, "num_tokens": 4028188.0, "reward": 0.44453126192092896, "reward_std": 0.486411988735199, "rewards/format_reward/mean": 0.6953125, "rewards/format_reward/std": 0.2615155577659607, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 38, "step_time": 44.90790424309671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2506138999015093, "epoch": 0.027857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 27.89171028137207, "learning_rate": 9.910725598521012e-07, "loss": -0.0, "num_tokens": 4097708.0, "reward": 0.4664062261581421, "reward_std": 0.4868961274623871, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.19012710452079773, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 39, "step_time": 39.88727585604647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 136.984375, "completions/mean_terminated_length": 136.984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20815920643508434, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.125, "grad_norm": 22.442346572875977, "learning_rate": 9.901605098125526e-07, "loss": -0.0, "num_tokens": 4190579.0, "reward": 0.38749998807907104, "reward_std": 0.4662412703037262, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 40, "step_time": 62.169976764998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 59.859375, "completions/mean_terminated_length": 59.859375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2938700430095196, "epoch": 0.029285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 11.49561882019043, "learning_rate": 9.892045704727863e-07, "loss": -0.0, "num_tokens": 4283034.0, "reward": 0.16562500596046448, "reward_std": 0.2750000059604645, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.2182178944349289, "rewards/mcq_exact_match_reward/mean": 0.078125, "rewards/mcq_exact_match_reward/std": 0.27048972249031067, "step": 41, "step_time": 32.6582014990272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 75.765625, "completions/mean_terminated_length": 75.765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25041171722114086, "epoch": 0.03, "frac_reward_zero_std": 0.25, "grad_norm": 30.742996215820312, "learning_rate": 9.882048274282505e-07, "loss": 0.0, "num_tokens": 4361843.0, "reward": 0.57421875, "reward_std": 0.5030062198638916, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.20275264978408813, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 42, "step_time": 47.323365143092815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 47.390625, "completions/mean_terminated_length": 47.390625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18909681774675846, "epoch": 0.030714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 22.67137908935547, "learning_rate": 9.871613701966066e-07, "loss": 0.0, "num_tokens": 4457780.0, "reward": 0.7250000238418579, "reward_std": 0.485504150390625, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.233588308095932, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 43, "step_time": 47.4528916090494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 83.21875, "completions/mean_terminated_length": 52.0317497253418, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.17218941450119019, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 12.08895492553711, "learning_rate": 9.86074292209714e-07, "loss": 0.0, "num_tokens": 4527074.0, "reward": 0.628125011920929, "reward_std": 0.50661301612854, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 44, "step_time": 107.07343659299659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18100928142666817, "epoch": 0.03214285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 13.066922187805176, "learning_rate": 9.849436908052636e-07, "loss": 0.0, "num_tokens": 4608754.0, "reward": 0.38203126192092896, "reward_std": 0.465761661529541, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.26246222853660583, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 45, "step_time": 30.311806608980987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 67.34375, "completions/mean_terminated_length": 67.34375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.19344223476946354, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 8.720047950744629, "learning_rate": 9.837696672180618e-07, "loss": 0.0, "num_tokens": 4691800.0, "reward": 0.328125, "reward_std": 0.4157489538192749, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 46, "step_time": 44.673023908922914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 20.140625, "completions/mean_terminated_length": 20.140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07546021463349462, "epoch": 0.03357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 9.505743980407715, "learning_rate": 9.825523265709665e-07, "loss": -0.0, "num_tokens": 4783617.0, "reward": 0.71875, "reward_std": 0.49629583954811096, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 47, "step_time": 28.177909465972334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 20.59375, "completions/mean_terminated_length": 20.59375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.10830738116055727, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 14.707826614379883, "learning_rate": 9.812917778654747e-07, "loss": 0.0, "num_tokens": 4861247.0, "reward": 0.43125003576278687, "reward_std": 0.4888843894004822, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 48, "step_time": 10.893696008017287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 66.125, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11854543350636959, "epoch": 0.035, "frac_reward_zero_std": 0.375, "grad_norm": 15.387039184570312, "learning_rate": 9.799881339719614e-07, "loss": 0.0, "num_tokens": 4968215.0, "reward": 0.5382812023162842, "reward_std": 0.504876434803009, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.24688033759593964, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 49, "step_time": 154.59990453493083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 77.125, "completions/mean_terminated_length": 77.125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.09524603839963675, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 10.865472793579102, "learning_rate": 9.786415116195732e-07, "loss": 0.0, "num_tokens": 5049023.0, "reward": 0.44218751788139343, "reward_std": 0.47993209958076477, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 50, "step_time": 96.41551529400749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 14.734375, "completions/mean_terminated_length": 14.734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.09114427305758, "epoch": 0.03642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 37.82780838012695, "learning_rate": 9.772520313857775e-07, "loss": 0.0, "num_tokens": 5134110.0, "reward": 0.5960937738418579, "reward_std": 0.5080545544624329, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.13524486124515533, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 51, "step_time": 4.045462172012776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 13.75, "completions/mean_terminated_length": 13.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.10188040044158697, "epoch": 0.037142857142857144, "frac_reward_zero_std": 0.375, "grad_norm": 15.1320161819458, "learning_rate": 9.758198176855646e-07, "loss": 0.0, "num_tokens": 5203342.0, "reward": 0.5992187857627869, "reward_std": 0.5032033920288086, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 52, "step_time": 3.415803858079016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 15.71875, "completions/mean_terminated_length": 15.71875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.08301922678947449, "epoch": 0.03785714285714286, "frac_reward_zero_std": 0.375, "grad_norm": 10.11668586730957, "learning_rate": 9.74344998760308e-07, "loss": -0.0, "num_tokens": 5300012.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 53, "step_time": 12.412089038116392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.07525204867124557, "epoch": 0.03857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 14.29720401763916, "learning_rate": 9.72827706666282e-07, "loss": -0.0, "num_tokens": 5393076.0, "reward": 0.5835937857627869, "reward_std": 0.504507839679718, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 54, "step_time": 4.425214122980833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.05632809503003955, "epoch": 0.039285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 15.864079475402832, "learning_rate": 9.712680772628363e-07, "loss": 0.0, "num_tokens": 5464420.0, "reward": 0.4281250238418579, "reward_std": 0.4732423424720764, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 55, "step_time": 3.023675933131017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 14.203125, "completions/mean_terminated_length": 14.203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.06479678908362985, "epoch": 0.04, "frac_reward_zero_std": 0.5, "grad_norm": 18.5594539642334, "learning_rate": 9.696662502002318e-07, "loss": -0.0, "num_tokens": 5535681.0, "reward": 0.5062500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 56, "step_time": 6.536966418905649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 14.734375, "completions/mean_terminated_length": 14.734375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.11025835108011961, "epoch": 0.04071428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 10.760197639465332, "learning_rate": 9.680223689071362e-07, "loss": 0.0, "num_tokens": 5615464.0, "reward": 0.39531251788139343, "reward_std": 0.4615982174873352, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 57, "step_time": 6.5071131670847535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 53.28125, "completions/mean_terminated_length": 21.619049072265625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.060628769686445594, "epoch": 0.041428571428571426, "frac_reward_zero_std": 0.625, "grad_norm": 7.574483871459961, "learning_rate": 9.663365805777814e-07, "loss": 0.0, "num_tokens": 5694530.0, "reward": 0.37812501192092896, "reward_std": 0.45546722412109375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 58, "step_time": 121.47771771694534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 18.5, "completions/mean_terminated_length": 18.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.04920098069123924, "epoch": 0.04214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 3.911975622177124, "learning_rate": 9.646090361587827e-07, "loss": -0.0, "num_tokens": 5767930.0, "reward": 0.6937500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 59, "step_time": 10.687484149995726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 19.34375, "completions/mean_terminated_length": 19.34375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.07043586298823357, "epoch": 0.04285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 19.942119598388672, "learning_rate": 9.628398903356239e-07, "loss": 0.0, "num_tokens": 5856232.0, "reward": 0.612500011920929, "reward_std": 0.5023753046989441, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.12198751419782639, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 60, "step_time": 12.981740556890145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.043852697126567364, "epoch": 0.04357142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 19.423234939575195, "learning_rate": 9.610293015188067e-07, "loss": 0.0, "num_tokens": 5911984.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 61, "step_time": 2.5916157929459587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0222997268429026, "epoch": 0.04428571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 12.171859741210938, "learning_rate": 9.59177431829666e-07, "loss": 0.0, "num_tokens": 5999016.0, "reward": 0.8656250238418579, "reward_std": 0.42695629596710205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.765625, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 62, "step_time": 4.844649164064322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 29.203125, "completions/mean_terminated_length": 29.203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.06459272187203169, "epoch": 0.045, "frac_reward_zero_std": 0.875, "grad_norm": 11.733697891235352, "learning_rate": 9.572844470858537e-07, "loss": 0.0, "num_tokens": 6067573.0, "reward": 0.6312500238418579, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 63, "step_time": 33.15641678700922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0357388777192682, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 16.319589614868164, "learning_rate": 9.55350516786491e-07, "loss": 0.0, "num_tokens": 6119109.0, "reward": 0.7093750238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 64, "step_time": 2.4223899890785106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.017213885730598122, "epoch": 0.04642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 11.899584770202637, "learning_rate": 9.533758140969912e-07, "loss": 0.0, "num_tokens": 6189037.0, "reward": 0.8812500238418579, "reward_std": 0.4166666567325592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.78125, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 65, "step_time": 2.9868784029968083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 19.328125, "completions/mean_terminated_length": 19.328125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.042392246425151825, "epoch": 0.047142857142857146, "frac_reward_zero_std": 0.625, "grad_norm": 23.965177536010742, "learning_rate": 9.513605158335562e-07, "loss": -0.0, "num_tokens": 6279914.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 66, "step_time": 31.981172394065652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0195480928523466, "epoch": 0.047857142857142855, "frac_reward_zero_std": 0.875, "grad_norm": 7.419790744781494, "learning_rate": 9.493048024473411e-07, "loss": 0.0, "num_tokens": 6366018.0, "reward": 0.6312500238418579, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 67, "step_time": 4.065540662908461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03548452723771334, "epoch": 0.04857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 13.068404197692871, "learning_rate": 9.47208858008299e-07, "loss": 0.0, "num_tokens": 6458322.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 68, "step_time": 4.998500798013993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03951962455175817, "epoch": 0.04928571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 17.25354766845703, "learning_rate": 9.450728701886983e-07, "loss": -0.0, "num_tokens": 6536578.0, "reward": 0.7359374761581421, "reward_std": 0.4903407096862793, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 69, "step_time": 3.523755496018566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.024328175000846386, "epoch": 0.05, "frac_reward_zero_std": 0.625, "grad_norm": 16.914518356323242, "learning_rate": 9.428970302463184e-07, "loss": 0.0, "num_tokens": 6607098.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 70, "step_time": 3.8031876169261523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.04464244609698653, "epoch": 0.05071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 23.606903076171875, "learning_rate": 9.406815330073244e-07, "loss": 0.0, "num_tokens": 6687066.0, "reward": 0.3656250238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 71, "step_time": 4.856895222037565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010779005533549935, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 5.3981804847717285, "learning_rate": 9.384265768488224e-07, "loss": -0.0, "num_tokens": 6763770.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 72, "step_time": 3.7144640430342406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.027491965098306537, "epoch": 0.052142857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 9.469144821166992, "learning_rate": 9.36132363681097e-07, "loss": 0.0, "num_tokens": 6836978.0, "reward": 0.6312500238418579, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 73, "step_time": 3.3146990051609464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00936318637104705, "epoch": 0.05285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 23.384286880493164, "learning_rate": 9.337990989295304e-07, "loss": 0.0, "num_tokens": 6908506.0, "reward": 0.9125000238418579, "reward_std": 0.39339789748191833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.8125, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 74, "step_time": 3.9301627399399877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.015574420220218599, "epoch": 0.05357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.314269915162114e-07, "loss": 0.0, "num_tokens": 6982170.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 75, "step_time": 4.287910231039859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.012644122180063277, "epoch": 0.054285714285714284, "frac_reward_zero_std": 0.875, "grad_norm": 12.279651641845703, "learning_rate": 9.290162538412255e-07, "loss": -0.0, "num_tokens": 7076114.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 76, "step_time": 6.48582560592331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.043263796949759126, "epoch": 0.055, "frac_reward_zero_std": 0.875, "grad_norm": 4.609958648681641, "learning_rate": 9.265671017636382e-07, "loss": 0.0, "num_tokens": 7180538.0, "reward": 0.5843750238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 77, "step_time": 6.952459908090532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02222421654732898, "epoch": 0.055714285714285716, "frac_reward_zero_std": 0.75, "grad_norm": 8.704394340515137, "learning_rate": 9.240797545821666e-07, "loss": -0.0, "num_tokens": 7261706.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 78, "step_time": 3.9689354329602793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02659488166682422, "epoch": 0.056428571428571425, "frac_reward_zero_std": 0.75, "grad_norm": 16.201833724975586, "learning_rate": 9.215544350155422e-07, "loss": 0.0, "num_tokens": 7335362.0, "reward": 0.4437500238418579, "reward_std": 0.4787135720252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 79, "step_time": 3.920655517023988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010249783445033245, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 5.3185906410217285, "learning_rate": 9.189913691825699e-07, "loss": 0.0, "num_tokens": 7403978.0, "reward": 0.6156250238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 80, "step_time": 3.4547118460177444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009724527830258012, "epoch": 0.05785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 14.341253280639648, "learning_rate": 9.163907865818806e-07, "loss": -0.0, "num_tokens": 7468082.0, "reward": 0.6937500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 81, "step_time": 2.828022911970038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.021473060944117606, "epoch": 0.05857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.137529200713809e-07, "loss": 0.0, "num_tokens": 7558546.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 82, "step_time": 5.028618018957786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0026043455000035465, "epoch": 0.05928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.11078005847405e-07, "loss": 0.0, "num_tokens": 7625834.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 83, "step_time": 3.222042798937764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.022887271596118808, "epoch": 0.06, "frac_reward_zero_std": 0.75, "grad_norm": 11.373490333557129, "learning_rate": 9.083662834235629e-07, "loss": 0.0, "num_tokens": 7684242.0, "reward": 0.5218750238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 84, "step_time": 2.5565447990666144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01285930466838181, "epoch": 0.060714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 4.25327205657959, "learning_rate": 9.056179956092961e-07, "loss": -0.0, "num_tokens": 7786874.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 85, "step_time": 7.1175413559540175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 13.453125, "completions/mean_terminated_length": 13.453125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.04580933507531881, "epoch": 0.06142857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 29.327259063720703, "learning_rate": 9.028333884881356e-07, "loss": -0.0, "num_tokens": 7870471.0, "reward": 0.7984374761581421, "reward_std": 0.46819213032722473, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 86, "step_time": 4.2775687840767205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0068673123605549335, "epoch": 0.062142857142857146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.000127113956672e-07, "loss": 0.0, "num_tokens": 7966495.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 87, "step_time": 4.236654673120938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.006318100189673714, "epoch": 0.06285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.971562168972064e-07, "loss": 0.0, "num_tokens": 8055943.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 88, "step_time": 7.718285386974458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0103716982412152, "epoch": 0.06357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 17.37955665588379, "learning_rate": 8.942641607651828e-07, "loss": -0.0, "num_tokens": 8159495.0, "reward": 0.6625000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 89, "step_time": 4.891622756083962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019456376787275076, "epoch": 0.06428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 39.22451400756836, "learning_rate": 8.91336801956239e-07, "loss": 0.0, "num_tokens": 8238415.0, "reward": 0.6625000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 90, "step_time": 3.36728476092685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02106736705172807, "epoch": 0.065, "frac_reward_zero_std": 0.625, "grad_norm": 42.52458953857422, "learning_rate": 8.883744025880427e-07, "loss": 0.0, "num_tokens": 8310247.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 91, "step_time": 2.880418001965154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00979969812760828, "epoch": 0.06571428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 14.711299896240234, "learning_rate": 8.853772279158165e-07, "loss": 0.0, "num_tokens": 8387591.0, "reward": 0.9125000238418579, "reward_std": 0.39339789748191833, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.8125, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 92, "step_time": 5.434788639016915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.002452172411722131, "epoch": 0.06642857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.823455463085873e-07, "loss": 0.0, "num_tokens": 8481191.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 93, "step_time": 4.279522054013796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.024404613999649882, "epoch": 0.06714285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 49.90077209472656, "learning_rate": 8.792796292251559e-07, "loss": -0.0, "num_tokens": 8552663.0, "reward": 0.5218750238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 94, "step_time": 2.9446488179964945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 15.171875, "completions/mean_terminated_length": 15.171875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.015569199862511596, "epoch": 0.06785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.761797511897906e-07, "loss": 0.0, "num_tokens": 8643242.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 95, "step_time": 12.633399382932112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01959893899038434, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 35.42424011230469, "learning_rate": 8.730461897676463e-07, "loss": 0.0, "num_tokens": 8715210.0, "reward": 0.6312500238418579, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 96, "step_time": 5.476754333998542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 15.5, "completions/mean_terminated_length": 15.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019250252342317253, "epoch": 0.06928571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.698792255399103e-07, "loss": 0.0, "num_tokens": 8802986.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 97, "step_time": 10.231531332945451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.014249574625864625, "epoch": 0.07, "frac_reward_zero_std": 0.875, "grad_norm": 26.534008026123047, "learning_rate": 8.666791420786803e-07, "loss": -0.0, "num_tokens": 8887802.0, "reward": 0.4125000238418579, "reward_std": 0.4671765863895416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 98, "step_time": 4.172440590918995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007108512945706025, "epoch": 0.07071428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 7.561801910400391, "learning_rate": 8.634462259215718e-07, "loss": 0.0, "num_tokens": 8959266.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 99, "step_time": 3.1397473260294646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.012571636820212007, "epoch": 0.07142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.601807665460619e-07, "loss": 0.0, "num_tokens": 9056234.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 100, "step_time": 7.120428283000365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011042158876080066, "epoch": 0.07214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 20.18372917175293, "learning_rate": 8.568830563435694e-07, "loss": 0.0, "num_tokens": 9131162.0, "reward": 0.9281250238418579, "reward_std": 0.38025420904159546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.828125, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 101, "step_time": 3.5142359259189107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.018726955458987504, "epoch": 0.07285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 22.103269577026367, "learning_rate": 8.535533905932737e-07, "loss": 0.0, "num_tokens": 9213466.0, "reward": 0.5375000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 102, "step_time": 6.039401607995387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.002115752373356372, "epoch": 0.07357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.501920674356754e-07, "loss": 0.0, "num_tokens": 9300234.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 103, "step_time": 6.536880514177028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.023664406850002706, "epoch": 0.07428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 15.414724349975586, "learning_rate": 8.467993878459003e-07, "loss": 0.0, "num_tokens": 9408458.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 104, "step_time": 5.406510353961494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011180884641362354, "epoch": 0.075, "frac_reward_zero_std": 0.75, "grad_norm": 16.74150276184082, "learning_rate": 8.433756556067505e-07, "loss": 0.0, "num_tokens": 9522706.0, "reward": 0.7562500238418579, "reward_std": 0.4787135720252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 105, "step_time": 6.74972949095536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.022577963944058865, "epoch": 0.07571428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 66.94438934326172, "learning_rate": 8.399211772815029e-07, "loss": -0.0, "num_tokens": 9605874.0, "reward": 0.5062500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 106, "step_time": 5.258289148041513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.025772404042072594, "epoch": 0.07642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 16.906095504760742, "learning_rate": 8.364362621864594e-07, "loss": -0.0, "num_tokens": 9674906.0, "reward": 0.7875000238418579, "reward_std": 0.4671765863895416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 107, "step_time": 3.7881211650092155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009171675716061145, "epoch": 0.07714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.32921222363251e-07, "loss": 0.0, "num_tokens": 9753194.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 108, "step_time": 3.6917542329756543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007731896243058145, "epoch": 0.07785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 21.57496452331543, "learning_rate": 8.293763725508969e-07, "loss": -0.0, "num_tokens": 9832178.0, "reward": 0.3656250238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 109, "step_time": 3.5990826380439103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00748697979724966, "epoch": 0.07857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 15.718206405639648, "learning_rate": 8.258020301576223e-07, "loss": 0.0, "num_tokens": 9929978.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 110, "step_time": 5.109096220054198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007316715433262289, "epoch": 0.07928571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 20.581790924072266, "learning_rate": 8.221985152324384e-07, "loss": -0.0, "num_tokens": 9994906.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 111, "step_time": 3.6931803559418768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0037167306727496907, "epoch": 0.08, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.185661504364844e-07, "loss": 0.0, "num_tokens": 10068282.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 112, "step_time": 3.9287309639621526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007747493364149705, "epoch": 0.08071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 5.363574504852295, "learning_rate": 8.149052610141355e-07, "loss": 0.0, "num_tokens": 10147770.0, "reward": 0.8656250238418579, "reward_std": 0.42695629596710205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.765625, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 113, "step_time": 4.5494631649926305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010833295003976673, "epoch": 0.08142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.112161747638821e-07, "loss": 0.0, "num_tokens": 10226570.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 114, "step_time": 4.508432603033725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007666514196898788, "epoch": 0.08214285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.074992220089768e-07, "loss": 0.0, "num_tokens": 10295866.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 115, "step_time": 3.0693239220418036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.022078550304286182, "epoch": 0.08285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 21.685462951660156, "learning_rate": 8.037547355678576e-07, "loss": 0.0, "num_tokens": 10365162.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 116, "step_time": 3.2104193790000863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007084596261847764, "epoch": 0.08357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 20.003612518310547, "learning_rate": 7.999830507243477e-07, "loss": -0.0, "num_tokens": 10440450.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 117, "step_time": 4.254953389870934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.018427874660119414, "epoch": 0.08428571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 39.15703582763672, "learning_rate": 7.961845051976332e-07, "loss": 0.0, "num_tokens": 10527178.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 118, "step_time": 6.791951371007599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 31.65625, "completions/mean_terminated_length": 31.65625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01668112922925502, "epoch": 0.085, "frac_reward_zero_std": 0.75, "grad_norm": 17.928234100341797, "learning_rate": 7.923594391120236e-07, "loss": 0.0, "num_tokens": 10618972.0, "reward": 0.6937500238418579, "reward_std": 0.49501484632492065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 119, "step_time": 82.62583560397616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.010110943403560668, "epoch": 0.08571428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.88508194966497e-07, "loss": 0.0, "num_tokens": 10692188.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 120, "step_time": 4.424513722071424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009336426679510623, "epoch": 0.08642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.84631117604033e-07, "loss": 0.0, "num_tokens": 10782300.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 121, "step_time": 5.603050055215135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.021290896052960306, "epoch": 0.08714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 18.509204864501953, "learning_rate": 7.80728554180734e-07, "loss": 0.0, "num_tokens": 10883556.0, "reward": 0.6156250238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 122, "step_time": 6.613117044093087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0029609822304337285, "epoch": 0.08785714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.768008541347421e-07, "loss": 0.0, "num_tokens": 10960532.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 1.0, "rewards/mcq_exact_match_reward/std": 0.0, "step": 123, "step_time": 3.7264017250272445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00125310622388497, "epoch": 0.08857142857142856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.728483691549491e-07, "loss": 0.0, "num_tokens": 11053804.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 124, "step_time": 4.139133257966023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.004454300171346404, "epoch": 0.08928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 5.340372562408447, "learning_rate": 7.688714531495059e-07, "loss": 0.0, "num_tokens": 11132692.0, "reward": 0.7093750238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 125, "step_time": 5.186251167091541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019100001431070268, "epoch": 0.09, "frac_reward_zero_std": 0.75, "grad_norm": 22.68722915649414, "learning_rate": 7.648704622141347e-07, "loss": 0.0, "num_tokens": 11205652.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 126, "step_time": 4.047589352878276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.014429281174670905, "epoch": 0.09071428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 23.554597854614258, "learning_rate": 7.608457546002422e-07, "loss": 0.0, "num_tokens": 11305180.0, "reward": 0.4906250238418579, "reward_std": 0.4917473793029785, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 127, "step_time": 6.020642440940719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01491769595304504, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 17.746049880981445, "learning_rate": 7.56797690682843e-07, "loss": 0.0, "num_tokens": 11379044.0, "reward": 0.8031250238418579, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 128, "step_time": 4.16096327296691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 14.5, "completions/mean_terminated_length": 14.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03012668783776462, "epoch": 0.09214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 18.020849227905273, "learning_rate": 7.527266329282905e-07, "loss": 0.0, "num_tokens": 11445012.0, "reward": 0.7562500238418579, "reward_std": 0.4787135720252991, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 129, "step_time": 7.198684796865564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.020660966634750366, "epoch": 0.09285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 12.086788177490234, "learning_rate": 7.486329458618215e-07, "loss": 0.0, "num_tokens": 11537204.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 130, "step_time": 4.54672675288748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0055168012622743845, "epoch": 0.09357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.445169960349166e-07, "loss": 0.0, "num_tokens": 11631108.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 131, "step_time": 4.3755096909590065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009784101857803762, "epoch": 0.09428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.403791519924793e-07, "loss": 0.0, "num_tokens": 11703972.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 132, "step_time": 3.261271098861471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.008812747604679316, "epoch": 0.095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.362197842398354e-07, "loss": 0.0, "num_tokens": 11767676.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 133, "step_time": 3.028584598971065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 13.484375, "completions/mean_terminated_length": 13.484375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.027414096985012293, "epoch": 0.09571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.320392652095583e-07, "loss": 0.0, "num_tokens": 11849683.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 134, "step_time": 3.9485261590452865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 16.5625, "completions/mean_terminated_length": 16.5625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03702096623601392, "epoch": 0.09642857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 18.03097152709961, "learning_rate": 7.278379692281208e-07, "loss": 0.0, "num_tokens": 11930367.0, "reward": 0.6781250238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 135, "step_time": 16.50799851596821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 20.203125, "completions/mean_terminated_length": 20.203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016940920031629503, "epoch": 0.09714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.236162724823778e-07, "loss": 0.0, "num_tokens": 11996020.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 136, "step_time": 20.552247615996748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 26.609375, "completions/mean_terminated_length": 26.609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.07297810423187912, "epoch": 0.09785714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 18.540674209594727, "learning_rate": 7.193745529858826e-07, "loss": 0.0, "num_tokens": 12075187.0, "reward": 0.550000011920929, "reward_std": 0.5048966407775879, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 137, "step_time": 24.535814730101265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016214959672652185, "epoch": 0.09857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.151131905450385e-07, "loss": 0.0, "num_tokens": 12165891.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 138, "step_time": 4.597669165057596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 14.203125, "completions/mean_terminated_length": 14.203125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.04903080174699426, "epoch": 0.09928571428571428, "frac_reward_zero_std": 0.625, "grad_norm": 34.56901931762695, "learning_rate": 7.10832566725092e-07, "loss": 0.0, "num_tokens": 12231904.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 139, "step_time": 6.9713660419220105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0055509630183223635, "epoch": 0.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.065330648159655e-07, "loss": 0.0, "num_tokens": 12303128.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 140, "step_time": 3.502849594980944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01160202972823754, "epoch": 0.10071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 21.754417419433594, "learning_rate": 7.022150697979384e-07, "loss": -0.0, "num_tokens": 12391456.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 141, "step_time": 4.62359821901191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 24.484375, "completions/mean_terminated_length": 24.484375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0657657328993082, "epoch": 0.10142857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 40.662166595458984, "learning_rate": 6.978789683071759e-07, "loss": -0.0, "num_tokens": 12475671.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 142, "step_time": 17.122637529973872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 56.3125, "completions/mean_terminated_length": 24.698413848876953, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.02439082640921697, "epoch": 0.10214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 4.452528953552246, "learning_rate": 6.935251486011086e-07, "loss": 0.0, "num_tokens": 12552059.0, "reward": 0.5984375476837158, "reward_std": 0.5056795477867126, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 143, "step_time": 121.78867189295124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 17.328125, "completions/mean_terminated_length": 17.328125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.042446967447176576, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 21.53173828125, "learning_rate": 6.891540005236674e-07, "loss": -0.0, "num_tokens": 12635208.0, "reward": 0.6617187857627869, "reward_std": 0.5009310245513916, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 144, "step_time": 14.861859488883056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009935209178365767, "epoch": 0.10357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.847659154703785e-07, "loss": 0.0, "num_tokens": 12719712.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 145, "step_time": 5.540668106987141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 84.890625, "completions/mean_terminated_length": 84.890625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.11740031838417053, "epoch": 0.10428571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 8.078269004821777, "learning_rate": 6.803612863533149e-07, "loss": 0.0, "num_tokens": 12805409.0, "reward": 0.8656250238418579, "reward_std": 0.42695629596710205, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.765625, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 146, "step_time": 31.698010974912904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 61.859375, "completions/mean_terminated_length": 30.333335876464844, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.045849967980757356, "epoch": 0.105, "frac_reward_zero_std": 0.875, "grad_norm": 4.473923206329346, "learning_rate": 6.759405075659165e-07, "loss": 0.0, "num_tokens": 12893416.0, "reward": 0.846875011920929, "reward_std": 0.44220542907714844, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 147, "step_time": 150.5390612690244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 46.71875, "completions/mean_terminated_length": 46.71875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.16094551188871264, "epoch": 0.10571428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 6.536345958709717, "learning_rate": 6.715039749476763e-07, "loss": 0.0, "num_tokens": 12942342.0, "reward": 0.6453125476837158, "reward_std": 0.5036153793334961, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 148, "step_time": 26.220934735029005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 15.984375, "completions/mean_terminated_length": 15.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.03024544403888285, "epoch": 0.10642857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 2.6902215480804443, "learning_rate": 6.670520857486949e-07, "loss": -0.0, "num_tokens": 13035949.0, "reward": 0.5992187857627869, "reward_std": 0.5032033920288086, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 149, "step_time": 11.029159978032112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 33.375, "completions/mean_terminated_length": 33.375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.06737930839881301, "epoch": 0.10714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 2.0915071964263916, "learning_rate": 6.625852385941118e-07, "loss": 0.0, "num_tokens": 13118629.0, "reward": 0.8804687857627869, "reward_std": 0.41629672050476074, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.78125, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 150, "step_time": 37.43122749996837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 23.0625, "completions/mean_terminated_length": 23.0625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.10096252337098122, "epoch": 0.10785714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 14.689016342163086, "learning_rate": 6.58103833448412e-07, "loss": 0.0, "num_tokens": 13209761.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 151, "step_time": 7.120647723902948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 45.453125, "completions/mean_terminated_length": 45.453125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.09933191817253828, "epoch": 0.10857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 13.227392196655273, "learning_rate": 6.536082715796124e-07, "loss": 0.0, "num_tokens": 13282486.0, "reward": 0.7546875476837158, "reward_std": 0.48104703426361084, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 152, "step_time": 24.34091979288496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 102.734375, "completions/mean_terminated_length": 71.85714721679688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.10967912850901484, "epoch": 0.10928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 18.153820037841797, "learning_rate": 6.490989555233327e-07, "loss": -0.0, "num_tokens": 13371077.0, "reward": 0.753125011920929, "reward_std": 0.48336413502693176, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 153, "step_time": 129.29081673000474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.038476365618407726, "epoch": 0.11, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.445762890467517e-07, "loss": 0.0, "num_tokens": 13481541.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 154, "step_time": 7.0913085790816694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 13.625, "completions/mean_terminated_length": 13.625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.03220844024326652, "epoch": 0.11071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 6.658990383148193, "learning_rate": 6.400406771124535e-07, "loss": 0.0, "num_tokens": 13553333.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 155, "step_time": 5.22538055095356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 14.875, "completions/mean_terminated_length": 14.875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.04669280140660703, "epoch": 0.11142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 18.39362335205078, "learning_rate": 6.354925258421675e-07, "loss": 0.0, "num_tokens": 13637053.0, "reward": 0.8796875476837158, "reward_std": 0.41602033376693726, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.78125, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 156, "step_time": 8.888459036010318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.022164398804306984, "epoch": 0.11214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 18.90540313720703, "learning_rate": 6.309322424804033e-07, "loss": 0.0, "num_tokens": 13729052.0, "reward": 0.5218750238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 157, "step_time": 3.9538759248680435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.028873756295070052, "epoch": 0.11285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 7.955380916595459, "learning_rate": 6.263602353579866e-07, "loss": -0.0, "num_tokens": 13799450.0, "reward": 0.9593750238418579, "reward_std": 0.3503824472427368, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.859375, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 158, "step_time": 4.26783420908032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.00713545671897009, "epoch": 0.11357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.217769138554959e-07, "loss": 0.0, "num_tokens": 13871953.0, "reward": 1.100000023841858, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 1.0, "rewards/mcq_exact_match_reward/std": 0.0, "step": 159, "step_time": 2.712456647946965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.013005010900087655, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 9.743955612182617, "learning_rate": 6.171826883666074e-07, "loss": 0.0, "num_tokens": 13973689.0, "reward": 0.5843750238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 160, "step_time": 6.412190584058408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 13.015625, "completions/mean_terminated_length": 13.015625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02211562287993729, "epoch": 0.115, "frac_reward_zero_std": 0.875, "grad_norm": 10.129695892333984, "learning_rate": 6.12577970261347e-07, "loss": -0.0, "num_tokens": 14055802.0, "reward": 0.7242187857627869, "reward_std": 0.487379789352417, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 161, "step_time": 4.479429450002499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019997738883830607, "epoch": 0.11571428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 11.165828704833984, "learning_rate": 6.079631718492568e-07, "loss": -0.0, "num_tokens": 14166330.0, "reward": 0.6156250238418579, "reward_std": 0.5037065148353577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 162, "step_time": 7.445554191886913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.027288892189972103, "epoch": 0.11642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 12.685976028442383, "learning_rate": 6.033387063424764e-07, "loss": 0.0, "num_tokens": 14258536.0, "reward": 0.7875000238418579, "reward_std": 0.4671765863895416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 163, "step_time": 7.0269312839373015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 19.15625, "completions/mean_terminated_length": 19.15625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.03532049781642854, "epoch": 0.11714285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 13.565665245056152, "learning_rate": 5.987049878187436e-07, "loss": -0.0, "num_tokens": 14344858.0, "reward": 0.7710937857627869, "reward_std": 0.47440895438194275, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.671875, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 164, "step_time": 21.54071432899218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.016357870190404356, "epoch": 0.11785714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 16.555530548095703, "learning_rate": 5.940624311843168e-07, "loss": 0.0, "num_tokens": 14429170.0, "reward": 0.8031250238418579, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 165, "step_time": 4.7839336470351554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.026364905992522836, "epoch": 0.11857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 10.869507789611816, "learning_rate": 5.894114521368258e-07, "loss": -0.0, "num_tokens": 14490984.0, "reward": 0.9437500238418579, "reward_std": 0.36596253514289856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 166, "step_time": 3.4966191770508885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 17.46875, "completions/mean_terminated_length": 17.46875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.013556473655626178, "epoch": 0.11928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 1.16694974899292, "learning_rate": 5.847524671280483e-07, "loss": 0.0, "num_tokens": 14553334.0, "reward": 0.6148437857627869, "reward_std": 0.5029815435409546, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 167, "step_time": 12.092128862044774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.013614867959404364, "epoch": 0.12, "frac_reward_zero_std": 0.75, "grad_norm": 15.871785163879395, "learning_rate": 5.800858933266212e-07, "loss": 0.0, "num_tokens": 14642670.0, "reward": 0.9437500238418579, "reward_std": 0.36596253514289856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 168, "step_time": 4.104719978873618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.01443118933821097, "epoch": 0.12071428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.75412148580687e-07, "loss": 0.0, "num_tokens": 14710006.0, "reward": 0.3500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 169, "step_time": 3.3311328379204497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019871412543579936, "epoch": 0.12142857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 24.622060775756836, "learning_rate": 5.707316513804792e-07, "loss": 0.0, "num_tokens": 14791358.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 170, "step_time": 6.276687276025768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.007149734243284911, "epoch": 0.12214285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.660448208208513e-07, "loss": 0.0, "num_tokens": 14861150.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 171, "step_time": 3.8035881727701053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 15.703125, "completions/mean_terminated_length": 15.703125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.015912733739241958, "epoch": 0.12285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 19.102737426757812, "learning_rate": 5.613520765637489e-07, "loss": -0.0, "num_tokens": 14942923.0, "reward": 0.9437500238418579, "reward_std": 0.36596253514289856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 172, "step_time": 8.8402068670257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.953125, "completions/mean_terminated_length": 12.953125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.021281153662130237, "epoch": 0.12357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 6.939657688140869, "learning_rate": 5.56653838800635e-07, "loss": -0.0, "num_tokens": 15030208.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 173, "step_time": 4.201213694992475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 25.578125, "completions/mean_terminated_length": 25.578125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.045478251413442194, "epoch": 0.12428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 1.5842310190200806, "learning_rate": 5.519505282148643e-07, "loss": -0.0, "num_tokens": 15113453.0, "reward": 1.0499999523162842, "reward_std": 0.22730302810668945, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.1510545015335083, "rewards/mcq_exact_match_reward/mean": 0.953125, "rewards/mcq_exact_match_reward/std": 0.21304203569889069, "step": 174, "step_time": 28.228189032059163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.01783720776438713, "epoch": 0.125, "frac_reward_zero_std": 0.875, "grad_norm": 21.070640563964844, "learning_rate": 5.472425659440156e-07, "loss": -0.0, "num_tokens": 15178379.0, "reward": 0.9906250238418579, "reward_std": 0.3145764470100403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.890625, "rewards/mcq_exact_match_reward/std": 0.3145764470100403, "step": 175, "step_time": 3.1726534390472807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02504683134611696, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 20.849273681640625, "learning_rate": 5.425303735421828e-07, "loss": 0.0, "num_tokens": 15256923.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 176, "step_time": 5.022981332032941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.01540215959539637, "epoch": 0.12642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.378143729422284e-07, "loss": 0.0, "num_tokens": 15365457.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 177, "step_time": 5.918834337033331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.008651553944218904, "epoch": 0.12714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 7.7155351638793945, "learning_rate": 5.330949864180033e-07, "loss": -0.0, "num_tokens": 15443911.0, "reward": 1.0992188453674316, "reward_std": 0.006250003352761269, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 1.0, "rewards/mcq_exact_match_reward/std": 0.0, "step": 178, "step_time": 4.0626735190162435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 17.5625, "completions/mean_terminated_length": 17.5625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.0304659788380377, "epoch": 0.12785714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 9.790291786193848, "learning_rate": 5.28372636546537e-07, "loss": 0.0, "num_tokens": 15519371.0, "reward": 0.23906251788139343, "reward_std": 0.35124140977859497, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 179, "step_time": 11.25525397103047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.01876044215168804, "epoch": 0.12857142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 8.506415367126465, "learning_rate": 5.236477461701985e-07, "loss": 0.0, "num_tokens": 15607905.0, "reward": 0.8343750238418579, "reward_std": 0.44515693187713623, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 180, "step_time": 4.205256605986506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.90625, "completions/mean_terminated_length": 12.90625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.029853320098482072, "epoch": 0.12928571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 9.465096473693848, "learning_rate": 5.189207383588352e-07, "loss": 0.0, "num_tokens": 15706555.0, "reward": 0.6781250238418579, "reward_std": 0.49776285886764526, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 181, "step_time": 7.094486114161555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.90625, "completions/mean_terminated_length": 12.90625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.010658063692972064, "epoch": 0.13, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.141920363718916e-07, "loss": 0.0, "num_tokens": 15784853.0, "reward": 0.9750000238418579, "reward_std": 0.3333333432674408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 182, "step_time": 3.412773276970256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0029293073748704046, "epoch": 0.13071428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.094620636205095e-07, "loss": 0.0, "num_tokens": 15856085.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 183, "step_time": 2.87448824493913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.015074294526129961, "epoch": 0.13142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.047312436296158e-07, "loss": 0.0, "num_tokens": 15911093.0, "reward": 0.4750000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 184, "step_time": 2.8168845549225807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.009781015396583825, "epoch": 0.13214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 15982949.0, "reward": 0.7250000238418579, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 185, "step_time": 3.729889392852783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.015625, "completions/mean_terminated_length": 13.015625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.023459993302822113, "epoch": 0.13285714285714287, "frac_reward_zero_std": 0.875, "grad_norm": 5.2294769287109375, "learning_rate": 4.952687563703841e-07, "loss": 0.0, "num_tokens": 16079918.0, "reward": 0.5984375476837158, "reward_std": 0.5056795477867126, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 186, "step_time": 5.194811234949157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 13.015625, "completions/mean_terminated_length": 13.015625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.005734891252359375, "epoch": 0.13357142857142856, "frac_reward_zero_std": 0.75, "grad_norm": 14.381458282470703, "learning_rate": 4.905379363794906e-07, "loss": -0.0, "num_tokens": 16171215.0, "reward": 0.6765625476837158, "reward_std": 0.49975937604904175, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 187, "step_time": 4.580101553059649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.96875, "completions/mean_terminated_length": 12.96875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.0026500039821257815, "epoch": 0.13428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.858079636281084e-07, "loss": 0.0, "num_tokens": 16262173.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 188, "step_time": 5.5456535129924305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 18.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 13.09375, "completions/mean_terminated_length": 13.09375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.011145705298986286, "epoch": 0.135, "frac_reward_zero_std": 0.875, "grad_norm": 15.990577697753906, "learning_rate": 4.810792616411649e-07, "loss": 0.0, "num_tokens": 16320587.0, "reward": 0.971875011920929, "reward_std": 0.34201493859291077, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 189, "step_time": 2.8638349280226976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.02099756433744915, "epoch": 0.1357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 18.941129684448242, "learning_rate": 4.763522538298017e-07, "loss": -0.0, "num_tokens": 16402163.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 190, "step_time": 4.850232388009317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 13.03125, "completions/mean_terminated_length": 13.03125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.00562438897031825, "epoch": 0.13642857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.7162736345346296e-07, "loss": 0.0, "num_tokens": 16483949.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 191, "step_time": 3.6737312379991636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.010754016373539343, "epoch": 0.13714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6690501358199655e-07, "loss": 0.0, "num_tokens": 16578652.0, "reward": 0.3500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 192, "step_time": 7.894912258896511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.011054494883865118, "epoch": 0.13785714285714284, "frac_reward_zero_std": 0.875, "grad_norm": 21.005765914916992, "learning_rate": 4.621856270577718e-07, "loss": -0.0, "num_tokens": 16646828.0, "reward": 0.7406250238418579, "reward_std": 0.4836103618144989, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 193, "step_time": 2.648993079084903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.00612640570034273, "epoch": 0.13857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 23.35145378112793, "learning_rate": 4.5746962645781723e-07, "loss": 0.0, "num_tokens": 16719476.0, "reward": 0.8031250238418579, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 194, "step_time": 3.3733782949857414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.01801892218645662, "epoch": 0.1392857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5275743405598437e-07, "loss": 0.0, "num_tokens": 16806395.0, "reward": 0.8500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 195, "step_time": 7.701647555048112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.020205819397233427, "epoch": 0.14, "frac_reward_zero_std": 0.875, "grad_norm": 32.03440475463867, "learning_rate": 4.480494717851358e-07, "loss": 0.0, "num_tokens": 16883851.0, "reward": 0.6468750238418579, "reward_std": 0.501733124256134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 196, "step_time": 3.548996380006429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.019142711884342134, "epoch": 0.1407142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 20.11782455444336, "learning_rate": 4.433461611993651e-07, "loss": -0.0, "num_tokens": 16950395.0, "reward": 0.8187500238418579, "reward_std": 0.4531635046005249, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 197, "step_time": 3.6711106749717146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 13.0, "completions/mean_terminated_length": 13.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.0016078357584774494, "epoch": 0.14142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.3864792343625115e-07, "loss": 0.0, "num_tokens": 17058795.0, "reward": 0.6000000238418579, "reward_std": 0.5039526224136353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 198, "step_time": 7.064584863022901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.018027375219389796, "epoch": 0.14214285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 44.21977996826172, "learning_rate": 4.3395517917914894e-07, "loss": 0.0, "num_tokens": 17140074.0, "reward": 0.6625000238418579, "reward_std": 0.5, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 199, "step_time": 4.233198134053964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.984375, "completions/mean_terminated_length": 12.984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.02401040424592793, "epoch": 0.14285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2926834861952077e-07, "loss": 0.0, "num_tokens": 17224545.0, "reward": 0.3500000238418579, "reward_std": 0.4364357888698578, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 200, "step_time": 4.817044052877463 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 17224545, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }