{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03571428571428571, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 508.0, "completions/mean_terminated_length": 458.32257080078125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14663860481232405, "epoch": 0.0007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 11.91739273071289, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 148816.0, "reward": 0.27421873807907104, "reward_std": 0.4313132166862488, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.22146137058734894, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 1, "step_time": 171.41765936795855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 377.046875, "completions/mean_terminated_length": 377.046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20175037905573845, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 14.119359016418457, "learning_rate": 5.555555555555555e-08, "loss": -0.0, "num_tokens": 255907.0, "reward": 0.53125, "reward_std": 0.5093957781791687, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.2745848298072815, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 2, "step_time": 83.64522138307802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 660.625, "completions/mean_terminated_length": 638.6032104492188, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.14103460405021906, "epoch": 0.002142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.2874624729156494, "learning_rate": 1.111111111111111e-07, "loss": -0.0, "num_tokens": 381059.0, "reward": 0.43281248211860657, "reward_std": 0.4954730272293091, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.25539806485176086, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 3, "step_time": 131.4170093961293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 451.96875, "completions/mean_terminated_length": 400.4838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.19842500798404217, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 10.316102027893066, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "num_tokens": 483425.0, "reward": 0.24921873211860657, "reward_std": 0.4258970022201538, "rewards/format_reward/mean": 0.3046875, "rewards/format_reward/std": 0.2615155577659607, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 4, "step_time": 132.2972059249878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1611.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 623.953125, "completions/mean_terminated_length": 623.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1606526281684637, "epoch": 0.0035714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 2.722296714782715, "learning_rate": 2.222222222222222e-07, "loss": -0.0, "num_tokens": 604470.0, "reward": 0.39531248807907104, "reward_std": 0.4883336126804352, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.2592533528804779, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 5, "step_time": 119.59757148602512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 375.61907958984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1858495082706213, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 10.736406326293945, "learning_rate": 2.7777777777777776e-07, "loss": -0.0, "num_tokens": 713742.0, "reward": 0.33281245827674866, "reward_std": 0.4670252799987793, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.2741328477859497, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 6, "step_time": 122.30180106399348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 467.265625, "completions/mean_terminated_length": 467.265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16867963038384914, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 5.460102558135986, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 813639.0, "reward": 0.36796873807907104, "reward_std": 0.4780389070510864, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.31090864539146423, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 7, "step_time": 73.72938018315472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 508.4375, "completions/mean_terminated_length": 484.0000305175781, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14686184097081423, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 8.620323181152344, "learning_rate": 3.888888888888889e-07, "loss": -0.0, "num_tokens": 956587.0, "reward": 0.25312498211860657, "reward_std": 0.42565304040908813, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 8, "step_time": 147.8402461669757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 412.25, "completions/mean_terminated_length": 386.2857360839844, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18391940742731094, "epoch": 0.0064285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 9.683939933776855, "learning_rate": 4.444444444444444e-07, "loss": -0.0, "num_tokens": 1075259.0, "reward": 0.3140624761581421, "reward_std": 0.4561282992362976, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.2847827076911926, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 9, "step_time": 108.42918385588564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 521.125, "completions/mean_terminated_length": 496.888916015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.17283021286129951, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 1.3954830169677734, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 1193907.0, "reward": 0.16171874105930328, "reward_std": 0.3368554413318634, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.23974503576755524, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 10, "step_time": 118.48989919497399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 442.40625, "completions/mean_terminated_length": 442.40625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.19687055423855782, "epoch": 0.007857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 3.561274528503418, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "num_tokens": 1288341.0, "reward": 0.30000001192092896, "reward_std": 0.44818857312202454, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 11, "step_time": 55.07885626098141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 442.171875, "completions/mean_terminated_length": 416.68255615234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.22921719774603844, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 10.506412506103516, "learning_rate": 6.111111111111112e-07, "loss": 0.0, "num_tokens": 1388512.0, "reward": 0.3687499761581421, "reward_std": 0.48094648122787476, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.33184191584587097, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 12, "step_time": 119.49877157399897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 471.09375, "completions/mean_terminated_length": 420.2257995605469, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14649338461458683, "epoch": 0.009285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 16.722030639648438, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 1511798.0, "reward": 0.22499997913837433, "reward_std": 0.40029749274253845, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 13, "step_time": 160.80015432706568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 409.171875, "completions/mean_terminated_length": 356.3064270019531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14898105338215828, "epoch": 0.01, "frac_reward_zero_std": 0.125, "grad_norm": 38.745059967041016, "learning_rate": 7.222222222222221e-07, "loss": -0.0, "num_tokens": 1651729.0, "reward": 0.2789062261581421, "reward_std": 0.4511716961860657, "rewards/format_reward/mean": 0.2890625, "rewards/format_reward/std": 0.2789533734321594, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 14, "step_time": 193.12235332495766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 466.265625, "completions/mean_terminated_length": 466.265625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.19658867083489895, "epoch": 0.010714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 34.59446334838867, "learning_rate": 7.777777777777778e-07, "loss": 0.0, "num_tokens": 1746826.0, "reward": 0.23281247913837433, "reward_std": 0.4091433882713318, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.30496877431869507, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 15, "step_time": 68.50137870694743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 417.140625, "completions/mean_terminated_length": 364.5322570800781, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15556670725345612, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.25, "grad_norm": 6.721803188323975, "learning_rate": 8.333333333333333e-07, "loss": -0.0, "num_tokens": 1870859.0, "reward": 0.34062498807907104, "reward_std": 0.4673358201980591, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.3149704039096832, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 16, "step_time": 159.5833295909688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 444.390625, "completions/mean_terminated_length": 392.6612854003906, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18389248382300138, "epoch": 0.012142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 9.25290298461914, "learning_rate": 8.888888888888888e-07, "loss": 0.0, "num_tokens": 1987036.0, "reward": 0.16093748807907104, "reward_std": 0.34462639689445496, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.301698237657547, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 17, "step_time": 163.66844313696492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1489.0, "completions/mean_length": 497.984375, "completions/mean_terminated_length": 447.9838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2151591945439577, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 49.3928108215332, "learning_rate": 9.444444444444444e-07, "loss": -0.0, "num_tokens": 2084659.0, "reward": 0.32343748211860657, "reward_std": 0.4590334892272949, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.29839184880256653, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 18, "step_time": 127.23080269095954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.0, "completions/max_terminated_length": 1315.0, "completions/mean_length": 358.28125, "completions/mean_terminated_length": 358.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16461750492453575, "epoch": 0.013571428571428571, "frac_reward_zero_std": 0.125, "grad_norm": 33.24488830566406, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2189413.0, "reward": 0.21406248211860657, "reward_std": 0.3865258991718292, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2221602201461792, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 19, "step_time": 81.95023821806535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 443.28125, "completions/mean_terminated_length": 417.8095397949219, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16434035263955593, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 8.118695259094238, "learning_rate": 9.999776148326214e-07, "loss": -0.0, "num_tokens": 2326511.0, "reward": 0.42656248807907104, "reward_std": 0.48713353276252747, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.1985812783241272, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 20, "step_time": 171.49558448110474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 286.140625, "completions/mean_terminated_length": 286.140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2154129333794117, "epoch": 0.015, "frac_reward_zero_std": 0.125, "grad_norm": 16.506973266601562, "learning_rate": 9.999104613348689e-07, "loss": -0.0, "num_tokens": 2431592.0, "reward": 0.33203125, "reward_std": 0.45828935503959656, "rewards/format_reward/mean": 0.5078125, "rewards/format_reward/std": 0.18881812691688538, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 21, "step_time": 102.2051269490039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 351.015625, "completions/mean_terminated_length": 351.015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2384468913078308, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.125, "grad_norm": 8.052404403686523, "learning_rate": 9.997985455197113e-07, "loss": -0.0, "num_tokens": 2517985.0, "reward": 0.20859375596046448, "reward_std": 0.37886154651641846, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.28770697116851807, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 22, "step_time": 47.53120892500738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 493.734375, "completions/mean_terminated_length": 493.734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15667208284139633, "epoch": 0.016428571428571428, "frac_reward_zero_std": 0.125, "grad_norm": 4.182269096374512, "learning_rate": 9.996418774081656e-07, "loss": 0.0, "num_tokens": 2643640.0, "reward": 0.2679687440395355, "reward_std": 0.41770032048225403, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.22699186205863953, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 23, "step_time": 136.94159113999922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2605742085725069, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 2.6181070804595947, "learning_rate": 9.994404710283998e-07, "loss": 0.0, "num_tokens": 2743904.0, "reward": 0.08046875149011612, "reward_std": 0.17719532549381256, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.24384792149066925, "rewards/mcq_exact_match_reward/mean": 0.03125, "rewards/mcq_exact_match_reward/std": 0.17536810040473938, "step": 24, "step_time": 67.75085840100655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 167.71875, "completions/mean_terminated_length": 167.71875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2089465633034706, "epoch": 0.017857142857142856, "frac_reward_zero_std": 0.125, "grad_norm": 21.222145080566406, "learning_rate": 9.991943444144756e-07, "loss": -0.0, "num_tokens": 2839630.0, "reward": 0.3820312023162842, "reward_std": 0.4708458185195923, "rewards/format_reward/mean": 0.5390625, "rewards/format_reward/std": 0.18483558297157288, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 25, "step_time": 46.71443971898407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25668232701718807, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.25, "grad_norm": 17.222293853759766, "learning_rate": 9.989035196047348e-07, "loss": -0.0, "num_tokens": 2927590.0, "reward": 0.16249999403953552, "reward_std": 0.3169797956943512, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.109375, "rewards/mcq_exact_match_reward/std": 0.3145764470100403, "step": 26, "step_time": 41.34394903801149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 324.53125, "completions/mean_terminated_length": 297.17462158203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18609545193612576, "epoch": 0.019285714285714285, "frac_reward_zero_std": 0.25, "grad_norm": 17.366985321044922, "learning_rate": 9.98568022639826e-07, "loss": 0.0, "num_tokens": 3043752.0, "reward": 0.28359371423721313, "reward_std": 0.431318998336792, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.1406387835741043, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 27, "step_time": 145.95341787295183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 215.234375, "completions/mean_terminated_length": 215.234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2353730145841837, "epoch": 0.02, "frac_reward_zero_std": 0.25, "grad_norm": 17.270282745361328, "learning_rate": 9.981878835603716e-07, "loss": 0.0, "num_tokens": 3131783.0, "reward": 0.27578121423721313, "reward_std": 0.4189927279949188, "rewards/format_reward/mean": 0.5703125, "rewards/format_reward/std": 0.1751912236213684, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 28, "step_time": 48.64892271097051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 214.15625, "completions/mean_terminated_length": 214.15625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2909251060336828, "epoch": 0.020714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 14.325923919677734, "learning_rate": 9.977631364042794e-07, "loss": -0.0, "num_tokens": 3226177.0, "reward": 0.4117187261581421, "reward_std": 0.4837634861469269, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.1649840772151947, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 29, "step_time": 55.44644622900523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 281.2698669433594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2629696223884821, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 10.076370239257812, "learning_rate": 9.972938192036944e-07, "loss": 0.0, "num_tokens": 3343833.0, "reward": 0.27421873807907104, "reward_std": 0.4146132171154022, "rewards/format_reward/mean": 0.5546875, "rewards/format_reward/std": 0.26899561285972595, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 30, "step_time": 177.31874076800887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 236.171875, "completions/mean_terminated_length": 236.171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2660892754793167, "epoch": 0.02214285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 11.033559799194336, "learning_rate": 9.967799739815924e-07, "loss": 0.0, "num_tokens": 3407684.0, "reward": 0.4117187261581421, "reward_std": 0.4798099994659424, "rewards/format_reward/mean": 0.6796875, "rewards/format_reward/std": 0.30035942792892456, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 31, "step_time": 19.77746521908557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 267.140625, "completions/mean_terminated_length": 238.87303161621094, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2127502802759409, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 21.78681182861328, "learning_rate": 9.96221646748019e-07, "loss": -0.0, "num_tokens": 3501853.0, "reward": 0.390625, "reward_std": 0.47525057196617126, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 32, "step_time": 118.76398004795192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 195.171875, "completions/mean_terminated_length": 195.171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.26218850910663605, "epoch": 0.023571428571428573, "frac_reward_zero_std": 0.125, "grad_norm": 13.382572174072266, "learning_rate": 9.956188874959686e-07, "loss": 0.0, "num_tokens": 3603568.0, "reward": 0.19062499701976776, "reward_std": 0.3306888937950134, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 33, "step_time": 57.80779201700352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 203.953125, "completions/mean_terminated_length": 203.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21508901193737984, "epoch": 0.024285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 18.717557907104492, "learning_rate": 9.949717501969079e-07, "loss": 0.0, "num_tokens": 3688533.0, "reward": 0.5679687261581421, "reward_std": 0.5023252964019775, "rewards/format_reward/mean": 0.6796875, "rewards/format_reward/std": 0.27265870571136475, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 34, "step_time": 64.80360024399124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 254.78125, "completions/mean_terminated_length": 254.78125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.27256614714860916, "epoch": 0.025, "frac_reward_zero_std": 0.0, "grad_norm": 14.232938766479492, "learning_rate": 9.942802927959442e-07, "loss": -0.0, "num_tokens": 3775567.0, "reward": 0.38124996423721313, "reward_std": 0.46908387541770935, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.3149704039096832, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 35, "step_time": 69.50821864098543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 279.328125, "completions/mean_terminated_length": 279.328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21617292240262032, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.125, "grad_norm": 14.788715362548828, "learning_rate": 9.93544577206636e-07, "loss": 0.0, "num_tokens": 3873788.0, "reward": 0.24140623211860657, "reward_std": 0.38684260845184326, "rewards/format_reward/mean": 0.6953125, "rewards/format_reward/std": 0.2762732207775116, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 36, "step_time": 112.71978642407339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 204.046875, "completions/mean_terminated_length": 204.046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2291890811175108, "epoch": 0.02642857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 22.162996292114258, "learning_rate": 9.927646693054495e-07, "loss": 0.0, "num_tokens": 3949719.0, "reward": 0.43906253576278687, "reward_std": 0.4866037666797638, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.2777281701564789, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 37, "step_time": 62.183381506067235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 92.578125, "completions/mean_terminated_length": 92.578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.21844332106411457, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.25, "grad_norm": 21.4798641204834, "learning_rate": 9.919406389258606e-07, "loss": -0.0, "num_tokens": 4028188.0, "reward": 0.44453126192092896, "reward_std": 0.486411988735199, "rewards/format_reward/mean": 0.6953125, "rewards/format_reward/std": 0.2615155577659607, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 38, "step_time": 44.90790424309671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2506138999015093, "epoch": 0.027857142857142858, "frac_reward_zero_std": 0.25, "grad_norm": 27.89171028137207, "learning_rate": 9.910725598521012e-07, "loss": -0.0, "num_tokens": 4097708.0, "reward": 0.4664062261581421, "reward_std": 0.4868961274623871, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.19012710452079773, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 39, "step_time": 39.88727585604647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 136.984375, "completions/mean_terminated_length": 136.984375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20815920643508434, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.125, "grad_norm": 22.442346572875977, "learning_rate": 9.901605098125526e-07, "loss": -0.0, "num_tokens": 4190579.0, "reward": 0.38749998807907104, "reward_std": 0.4662412703037262, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.26726123690605164, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 40, "step_time": 62.169976764998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 59.859375, "completions/mean_terminated_length": 59.859375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2938700430095196, "epoch": 0.029285714285714286, "frac_reward_zero_std": 0.25, "grad_norm": 11.49561882019043, "learning_rate": 9.892045704727863e-07, "loss": -0.0, "num_tokens": 4283034.0, "reward": 0.16562500596046448, "reward_std": 0.2750000059604645, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.2182178944349289, "rewards/mcq_exact_match_reward/mean": 0.078125, "rewards/mcq_exact_match_reward/std": 0.27048972249031067, "step": 41, "step_time": 32.6582014990272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 75.765625, "completions/mean_terminated_length": 75.765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25041171722114086, "epoch": 0.03, "frac_reward_zero_std": 0.25, "grad_norm": 30.742996215820312, "learning_rate": 9.882048274282505e-07, "loss": 0.0, "num_tokens": 4361843.0, "reward": 0.57421875, "reward_std": 0.5030062198638916, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.20275264978408813, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 42, "step_time": 47.323365143092815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 47.390625, "completions/mean_terminated_length": 47.390625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18909681774675846, "epoch": 0.030714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 22.67137908935547, "learning_rate": 9.871613701966066e-07, "loss": 0.0, "num_tokens": 4457780.0, "reward": 0.7250000238418579, "reward_std": 0.485504150390625, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.233588308095932, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 43, "step_time": 47.4528916090494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 83.21875, "completions/mean_terminated_length": 52.0317497253418, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.17218941450119019, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 12.08895492553711, "learning_rate": 9.86074292209714e-07, "loss": 0.0, "num_tokens": 4527074.0, "reward": 0.628125011920929, "reward_std": 0.50661301612854, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 44, "step_time": 107.07343659299659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18100928142666817, "epoch": 0.03214285714285714, "frac_reward_zero_std": 0.25, "grad_norm": 13.066922187805176, "learning_rate": 9.849436908052636e-07, "loss": 0.0, "num_tokens": 4608754.0, "reward": 0.38203126192092896, "reward_std": 0.465761661529541, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.26246222853660583, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 45, "step_time": 30.311806608980987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 67.34375, "completions/mean_terminated_length": 67.34375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.19344223476946354, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 8.720047950744629, "learning_rate": 9.837696672180618e-07, "loss": 0.0, "num_tokens": 4691800.0, "reward": 0.328125, "reward_std": 0.4157489538192749, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 46, "step_time": 44.673023908922914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 20.140625, "completions/mean_terminated_length": 20.140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07546021463349462, "epoch": 0.03357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 9.505743980407715, "learning_rate": 9.825523265709665e-07, "loss": -0.0, "num_tokens": 4783617.0, "reward": 0.71875, "reward_std": 0.49629583954811096, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 47, "step_time": 28.177909465972334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 20.59375, "completions/mean_terminated_length": 20.59375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.10830738116055727, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 14.707826614379883, "learning_rate": 9.812917778654747e-07, "loss": 0.0, "num_tokens": 4861247.0, "reward": 0.43125003576278687, "reward_std": 0.4888843894004822, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 48, "step_time": 10.893696008017287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 66.125, "completions/mean_terminated_length": 34.66666793823242, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11854543350636959, "epoch": 0.035, "frac_reward_zero_std": 0.375, "grad_norm": 15.387039184570312, "learning_rate": 9.799881339719614e-07, "loss": 0.0, "num_tokens": 4968215.0, "reward": 0.5382812023162842, "reward_std": 0.504876434803009, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.24688033759593964, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 49, "step_time": 154.59990453493083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 77.125, "completions/mean_terminated_length": 77.125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.09524603839963675, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 10.865472793579102, "learning_rate": 9.786415116195732e-07, "loss": 0.0, "num_tokens": 5049023.0, "reward": 0.44218751788139343, "reward_std": 0.47993209958076477, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 50, "step_time": 96.41551529400749 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 5049023, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }