{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.14285714285714285, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 453.359375, "completions/mean_terminated_length": 428.0476379394531, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20064565539360046, "epoch": 0.0007142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 5.157586574554443, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 104207.0, "reward": 0.3671875, "reward_std": 0.47810959815979004, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.22658175230026245, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 1, "step_time": 163.2008375240257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 558.921875, "completions/mean_terminated_length": 535.2857666015625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.15827747248113155, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.5780975818634033, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "num_tokens": 224762.0, "reward": 0.24296873807907104, "reward_std": 0.4135470986366272, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.28423789143562317, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 2, "step_time": 134.8140414939844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 589.8125, "completions/mean_terminated_length": 566.6666870117188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1299030063673854, "epoch": 0.002142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 5.490052223205566, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "num_tokens": 346622.0, "reward": 0.32109373807907104, "reward_std": 0.4618633985519409, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.2387082874774933, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 3, "step_time": 124.3698272620677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 522.0625, "completions/mean_terminated_length": 522.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.16233543679118156, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 1.4298690557479858, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "num_tokens": 460962.0, "reward": 0.27656251192092896, "reward_std": 0.4340624213218689, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2847827076911926, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 4, "step_time": 90.78269547002856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 650.03125, "completions/mean_terminated_length": 581.2786865234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1047047358006239, "epoch": 0.0035714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 3.9983320236206055, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "num_tokens": 626636.0, "reward": 0.23515623807907104, "reward_std": 0.41525280475616455, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.24180518090724945, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 5, "step_time": 171.21265639393823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 379.109375, "completions/mean_terminated_length": 379.109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17905950360000134, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 33.00831604003906, "learning_rate": 2.7777777777777776e-07, "loss": -0.0, "num_tokens": 742675.0, "reward": 0.22421874105930328, "reward_std": 0.39880695939064026, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.28510910272598267, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 6, "step_time": 77.91981936100638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 479.34375, "completions/mean_terminated_length": 479.34375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11762952525168657, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 6.298130989074707, "learning_rate": 3.333333333333333e-07, "loss": -0.0, "num_tokens": 846929.0, "reward": 0.4898437261581421, "reward_std": 0.5113147497177124, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.255761981010437, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 7, "step_time": 89.83808165497612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 634.296875, "completions/mean_terminated_length": 634.296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.158901397138834, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 1.4249234199523926, "learning_rate": 3.888888888888889e-07, "loss": -0.0, "num_tokens": 984668.0, "reward": 0.3351562023162842, "reward_std": 0.465447336435318, "rewards/format_reward/mean": 0.3828125, "rewards/format_reward/std": 0.2634054720401764, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 8, "step_time": 113.26597948698327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 481.453125, "completions/mean_terminated_length": 481.453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1585789266973734, "epoch": 0.0064285714285714285, "frac_reward_zero_std": 0.375, "grad_norm": 3.944404125213623, "learning_rate": 4.444444444444444e-07, "loss": -0.0, "num_tokens": 1116649.0, "reward": 0.16718748211860657, "reward_std": 0.3370293378829956, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2221602201461792, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 9, "step_time": 132.11293392902007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 455.625, "completions/mean_terminated_length": 430.3492431640625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.15571903064846992, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 11.225871086120605, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 1237337.0, "reward": 0.29921871423721313, "reward_std": 0.45211073756217957, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2366211861371994, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 10, "step_time": 141.26959226594772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 530.84375, "completions/mean_terminated_length": 506.7619323730469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12162926886230707, "epoch": 0.007857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 26.4554443359375, "learning_rate": 5.555555555555555e-07, "loss": -0.0, "num_tokens": 1358607.0, "reward": 0.23671871423721313, "reward_std": 0.4144456684589386, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2366211861371994, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 11, "step_time": 136.65451408794615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 458.390625, "completions/mean_terminated_length": 433.15875244140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14663300476968288, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 11.722622871398926, "learning_rate": 6.111111111111112e-07, "loss": 0.0, "num_tokens": 1471192.0, "reward": 0.22499997913837433, "reward_std": 0.3999999761581421, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2182178944349289, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 12, "step_time": 133.89911664801184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 562.9375, "completions/mean_terminated_length": 539.3651123046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15633145160973072, "epoch": 0.009285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 2.441755771636963, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 1576364.0, "reward": 0.25703123211860657, "reward_std": 0.42340895533561707, "rewards/format_reward/mean": 0.3828125, "rewards/format_reward/std": 0.21347814798355103, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 13, "step_time": 121.01016625500051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 596.859375, "completions/mean_terminated_length": 573.825439453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11535925976932049, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 8.867958068847656, "learning_rate": 7.222222222222221e-07, "loss": 0.0, "num_tokens": 1739315.0, "reward": 0.1874999850988388, "reward_std": 0.375224769115448, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.3021090030670166, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 14, "step_time": 186.91490571206668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17848026007413864, "epoch": 0.010714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 7.981410503387451, "learning_rate": 7.777777777777778e-07, "loss": -0.0, "num_tokens": 1836883.0, "reward": 0.34843745827674866, "reward_std": 0.47941502928733826, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.32694777846336365, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 15, "step_time": 94.68117612809874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 598.296875, "completions/mean_terminated_length": 575.2857666015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12071683909744024, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 17.546281814575195, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "num_tokens": 1974854.0, "reward": 0.14687499403953552, "reward_std": 0.3447882831096649, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 16, "step_time": 148.51339858904248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 611.171875, "completions/mean_terminated_length": 564.8225708007812, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1321120047941804, "epoch": 0.012142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 5.823922157287598, "learning_rate": 8.888888888888888e-07, "loss": -0.0, "num_tokens": 2119753.0, "reward": 0.17578125, "reward_std": 0.3575702905654907, "rewards/format_reward/mean": 0.3515625, "rewards/format_reward/std": 0.29113471508026123, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 17, "step_time": 182.10150892706588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 559.859375, "completions/mean_terminated_length": 511.8548278808594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1420492259785533, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 9.969266891479492, "learning_rate": 9.444444444444444e-07, "loss": 0.0, "num_tokens": 2257376.0, "reward": 0.25624996423721313, "reward_std": 0.42393654584884644, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2357022762298584, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 18, "step_time": 162.68666934006615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 580.640625, "completions/mean_terminated_length": 580.640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.123278739862144, "epoch": 0.013571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 13.4293851852417, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2411657.0, "reward": 0.2242187261581421, "reward_std": 0.40247172117233276, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.23974503576755524, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 19, "step_time": 114.04774017888121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 439.953125, "completions/mean_terminated_length": 439.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10359836835414171, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 18.170589447021484, "learning_rate": 9.999776148326214e-07, "loss": -0.0, "num_tokens": 2542550.0, "reward": 0.20546874403953552, "reward_std": 0.38863998651504517, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2680720090866089, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 20, "step_time": 94.10481164290104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 583.359375, "completions/mean_terminated_length": 511.3278503417969, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12750612013041973, "epoch": 0.015, "frac_reward_zero_std": 0.25, "grad_norm": 10.771635055541992, "learning_rate": 9.999104613348689e-07, "loss": 0.0, "num_tokens": 2661437.0, "reward": 0.2593749761581421, "reward_std": 0.4222835302352905, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.233588308095932, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 21, "step_time": 178.62638382194564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 381.640625, "completions/mean_terminated_length": 355.19049072265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15890471823513508, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.125, "grad_norm": 11.206506729125977, "learning_rate": 9.997985455197113e-07, "loss": 0.0, "num_tokens": 2776894.0, "reward": 0.18593749403953552, "reward_std": 0.35270705819129944, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.1717960685491562, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 22, "step_time": 140.86376020603348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 310.015625, "completions/mean_terminated_length": 224.5409698486328, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18053795211017132, "epoch": 0.016428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 9.23369026184082, "learning_rate": 9.996418774081656e-07, "loss": 0.0, "num_tokens": 2898815.0, "reward": 0.3109374940395355, "reward_std": 0.4464558959007263, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.14689241349697113, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 23, "step_time": 148.10121405799873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 232.359375, "completions/mean_terminated_length": 232.359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16719974670559168, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 4.078112602233887, "learning_rate": 9.994404710283998e-07, "loss": 0.0, "num_tokens": 2993910.0, "reward": 0.28203123807907104, "reward_std": 0.4266301095485687, "rewards/format_reward/mean": 0.4765625, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 24, "step_time": 73.75073616893496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 111.78125, "completions/mean_terminated_length": 111.78125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25493066385388374, "epoch": 0.017857142857142856, "frac_reward_zero_std": 0.125, "grad_norm": 16.993078231811523, "learning_rate": 9.991943444144756e-07, "loss": -0.0, "num_tokens": 3085936.0, "reward": 0.3023437261581421, "reward_std": 0.4352912902832031, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 25, "step_time": 45.09112752194051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 132.671875, "completions/mean_terminated_length": 132.671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2280710767954588, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.375, "grad_norm": 15.601746559143066, "learning_rate": 9.989035196047348e-07, "loss": 0.0, "num_tokens": 3186115.0, "reward": 0.20468749105930328, "reward_std": 0.3647915720939636, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.1534975916147232, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 26, "step_time": 68.13181330589578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 64.9375, "completions/mean_terminated_length": 64.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.22438204288482666, "epoch": 0.019285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 11.98585033416748, "learning_rate": 9.98568022639826e-07, "loss": -0.0, "num_tokens": 3275551.0, "reward": 0.4406249523162842, "reward_std": 0.4918280243873596, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.08908708393573761, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 27, "step_time": 46.977470892015845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 50.109375, "completions/mean_terminated_length": 50.109375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2098379861563444, "epoch": 0.02, "frac_reward_zero_std": 0.375, "grad_norm": 18.382869720458984, "learning_rate": 9.981878835603716e-07, "loss": 0.0, "num_tokens": 3357406.0, "reward": 0.39531248807907104, "reward_std": 0.4776528775691986, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 28, "step_time": 35.3679761699168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10740844532847404, "epoch": 0.020714285714285713, "frac_reward_zero_std": 0.5, "grad_norm": 18.927444458007812, "learning_rate": 9.977631364042794e-07, "loss": -0.0, "num_tokens": 3456198.0, "reward": 0.5187499523162842, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.46875, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 29, "step_time": 4.8387698759906925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.197237528860569, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 18.760665893554688, "learning_rate": 9.972938192036944e-07, "loss": -0.0, "num_tokens": 3529918.0, "reward": 0.3781249523162842, "reward_std": 0.47324231266975403, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 30, "step_time": 19.270987000956666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 7.53125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14352155569940805, "epoch": 0.02214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 17.66056251525879, "learning_rate": 9.967799739815924e-07, "loss": -0.0, "num_tokens": 3604272.0, "reward": 0.27812498807907104, "reward_std": 0.4307500422000885, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 31, "step_time": 5.142326800851151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16279550455510616, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 19.25493621826172, "learning_rate": 9.96221646748019e-07, "loss": -0.0, "num_tokens": 3687568.0, "reward": 0.22187498211860657, "reward_std": 0.3802541494369507, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 32, "step_time": 3.4762189310858957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 7.59375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1700758682563901, "epoch": 0.023571428571428573, "frac_reward_zero_std": 0.5, "grad_norm": 23.92763328552246, "learning_rate": 9.956188874959686e-07, "loss": -0.0, "num_tokens": 3782558.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 33, "step_time": 7.063230810861569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10504745738580823, "epoch": 0.024285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 17.42229461669922, "learning_rate": 9.949717501969079e-07, "loss": -0.0, "num_tokens": 3872686.0, "reward": 0.26874998211860657, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 34, "step_time": 3.867844623979181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 6.359375, "completions/mean_terminated_length": 6.359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18720911256968975, "epoch": 0.025, "frac_reward_zero_std": 0.625, "grad_norm": 26.774076461791992, "learning_rate": 9.942802927959442e-07, "loss": 0.0, "num_tokens": 3981189.0, "reward": 0.29921871423721313, "reward_std": 0.43693482875823975, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 35, "step_time": 5.721210387942847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.13526835106313229, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 25.46927833557129, "learning_rate": 9.93544577206636e-07, "loss": -0.0, "num_tokens": 4056141.0, "reward": 0.3468749523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 36, "step_time": 3.36784387397347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17059525474905968, "epoch": 0.02642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 20.648895263671875, "learning_rate": 9.927646693054495e-07, "loss": 0.0, "num_tokens": 4163493.0, "reward": 0.2999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 37, "step_time": 9.884533652977552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.953125, "completions/mean_terminated_length": 5.953125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10067910794168711, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 24.11454963684082, "learning_rate": 9.919406389258606e-07, "loss": 0.0, "num_tokens": 4273250.0, "reward": 0.28359371423721313, "reward_std": 0.4274373948574066, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 38, "step_time": 6.0175170440925285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 6.953125, "completions/mean_terminated_length": 6.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14652357157319784, "epoch": 0.027857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 12.282570838928223, "learning_rate": 9.910725598521012e-07, "loss": -0.0, "num_tokens": 4366839.0, "reward": 0.4562499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 39, "step_time": 9.129241381015163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07260213000699878, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 11.272303581237793, "learning_rate": 9.901605098125526e-07, "loss": -0.0, "num_tokens": 4477503.0, "reward": 0.39374998211860657, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 40, "step_time": 4.785194783064071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 39.328125, "completions/mean_terminated_length": 39.328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12252951040863991, "epoch": 0.029285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 9.645527839660645, "learning_rate": 9.892045704727863e-07, "loss": -0.0, "num_tokens": 4603468.0, "reward": 0.3937499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 41, "step_time": 121.67112983297557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 7.203125, "completions/mean_terminated_length": 7.203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04064215812832117, "epoch": 0.03, "frac_reward_zero_std": 0.75, "grad_norm": 17.42051124572754, "learning_rate": 9.882048274282505e-07, "loss": -0.0, "num_tokens": 4709177.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 42, "step_time": 10.29577969602542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 6.03125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12579507380723953, "epoch": 0.030714285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 36.053192138671875, "learning_rate": 9.871613701966066e-07, "loss": 0.0, "num_tokens": 4787955.0, "reward": 0.40859371423721313, "reward_std": 0.4842400550842285, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 43, "step_time": 4.568186060001608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 6.4375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0865603880956769, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 14.81761360168457, "learning_rate": 9.86074292209714e-07, "loss": 0.0, "num_tokens": 4872295.0, "reward": 0.6539061665534973, "reward_std": 0.4988323152065277, "rewards/format_reward/mean": 0.4453125, "rewards/format_reward/std": 0.15728822350502014, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 44, "step_time": 4.5612655181321315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05028381128795445, "epoch": 0.03214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 18.105215072631836, "learning_rate": 9.849436908052636e-07, "loss": 0.0, "num_tokens": 4971903.0, "reward": 0.4718749523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 45, "step_time": 4.8814199649496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0737028606235981, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.625, "grad_norm": 36.69444274902344, "learning_rate": 9.837696672180618e-07, "loss": -0.0, "num_tokens": 5051695.0, "reward": 0.4874999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 46, "step_time": 3.359571039909497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0733404541388154, "epoch": 0.03357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 18.161724090576172, "learning_rate": 9.825523265709665e-07, "loss": 0.0, "num_tokens": 5126431.0, "reward": 0.5335937142372131, "reward_std": 0.504507839679718, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 47, "step_time": 17.83803274697857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.054197699995711446, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 19.445640563964844, "learning_rate": 9.812917778654747e-07, "loss": -0.0, "num_tokens": 5230503.0, "reward": 0.7531249523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 48, "step_time": 4.160447052039672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05320898490026593, "epoch": 0.035, "frac_reward_zero_std": 0.875, "grad_norm": 11.983798027038574, "learning_rate": 9.799881339719614e-07, "loss": -0.0, "num_tokens": 5297479.0, "reward": 0.3937499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 49, "step_time": 2.485525873955339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 6.09375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0547908297739923, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 15.623820304870605, "learning_rate": 9.786415116195732e-07, "loss": -0.0, "num_tokens": 5377581.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 50, "step_time": 4.019255019025877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07222574669867754, "epoch": 0.03642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 23.187658309936523, "learning_rate": 9.772520313857775e-07, "loss": -0.0, "num_tokens": 5490213.0, "reward": 0.4874999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 51, "step_time": 6.748388390929904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07622226374223828, "epoch": 0.037142857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 15.004591941833496, "learning_rate": 9.758198176855646e-07, "loss": 0.0, "num_tokens": 5583557.0, "reward": 0.4718749523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 52, "step_time": 4.643642266979441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.052028866310138255, "epoch": 0.03785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 18.302907943725586, "learning_rate": 9.74344998760308e-07, "loss": -0.0, "num_tokens": 5674525.0, "reward": 0.26874998211860657, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 53, "step_time": 3.6161275009508245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07992784632369876, "epoch": 0.03857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 50.737979888916016, "learning_rate": 9.72827706666282e-07, "loss": 0.0, "num_tokens": 5768773.0, "reward": 0.4093749523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 54, "step_time": 4.318731640116312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04811420664191246, "epoch": 0.039285714285714285, "frac_reward_zero_std": 0.875, "grad_norm": 8.901388168334961, "learning_rate": 9.712680772628363e-07, "loss": 0.0, "num_tokens": 5840629.0, "reward": 0.4406249523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 55, "step_time": 3.043417449865956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.034670245833694935, "epoch": 0.04, "frac_reward_zero_std": 0.75, "grad_norm": 38.62637710571289, "learning_rate": 9.696662502002318e-07, "loss": -0.0, "num_tokens": 5928837.0, "reward": 0.3468749523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 56, "step_time": 8.042339402018115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04627775074914098, "epoch": 0.04071428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 20.798757553100586, "learning_rate": 9.680223689071362e-07, "loss": 0.0, "num_tokens": 6012549.0, "reward": 0.41874998807907104, "reward_std": 0.4930870831012726, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 57, "step_time": 3.4416783688939176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.045997242676094174, "epoch": 0.041428571428571426, "frac_reward_zero_std": 0.875, "grad_norm": 14.777454376220703, "learning_rate": 9.663365805777814e-07, "loss": 0.0, "num_tokens": 6088061.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 58, "step_time": 3.701708526001312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.06566549651324749, "epoch": 0.04214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 30.113880157470703, "learning_rate": 9.646090361587827e-07, "loss": -0.0, "num_tokens": 6172757.0, "reward": 0.6437499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 59, "step_time": 3.5152428460423835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04948642780072987, "epoch": 0.04285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 19.78885841369629, "learning_rate": 9.628398903356239e-07, "loss": 0.0, "num_tokens": 6264821.0, "reward": 0.6124999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 60, "step_time": 4.421045998169575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03131909319199622, "epoch": 0.04357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 34.9882698059082, "learning_rate": 9.610293015188067e-07, "loss": -0.0, "num_tokens": 6347405.0, "reward": 0.7531249523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 61, "step_time": 6.333404837932903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04404397588223219, "epoch": 0.04428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 27.348413467407227, "learning_rate": 9.59177431829666e-07, "loss": -0.0, "num_tokens": 6419029.0, "reward": 0.5031249523162842, "reward_std": 0.5017330646514893, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 62, "step_time": 3.639777671021875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.028264216147363186, "epoch": 0.045, "frac_reward_zero_std": 0.875, "grad_norm": 25.774185180664062, "learning_rate": 9.572844470858537e-07, "loss": -0.0, "num_tokens": 6502877.0, "reward": 0.7531249523162842, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 63, "step_time": 5.851632744073868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04575204895809293, "epoch": 0.045714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.55350516786491e-07, "loss": 0.0, "num_tokens": 6591573.0, "reward": 0.29999998211860657, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 64, "step_time": 4.74795475701103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0018548955195001327, "epoch": 0.04642857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.533758140969912e-07, "loss": 0.0, "num_tokens": 6704141.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 65, "step_time": 4.814983140968252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.018711353768594563, "epoch": 0.047142857142857146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.513605158335562e-07, "loss": 0.0, "num_tokens": 6782221.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 66, "step_time": 3.5154523900710046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.057149365777149796, "epoch": 0.047857142857142855, "frac_reward_zero_std": 0.875, "grad_norm": 20.58985137939453, "learning_rate": 9.493048024473411e-07, "loss": 0.0, "num_tokens": 6851269.0, "reward": 0.6742187142372131, "reward_std": 0.489005446434021, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 67, "step_time": 2.9816590580740012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.030840615974739194, "epoch": 0.04857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 26.677947998046875, "learning_rate": 9.47208858008299e-07, "loss": -0.0, "num_tokens": 6936149.0, "reward": 0.6593749523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 68, "step_time": 5.698866136022843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 6.1875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.06430117995478213, "epoch": 0.04928571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 48.29899978637695, "learning_rate": 9.450728701886983e-07, "loss": -0.0, "num_tokens": 7002961.0, "reward": 0.6859374046325684, "reward_std": 0.49009785056114197, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.14689241349697113, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 69, "step_time": 3.3110440919408575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.06107360986061394, "epoch": 0.05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.428970302463184e-07, "loss": 0.0, "num_tokens": 7080225.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 70, "step_time": 3.588115891034249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.019286671769805253, "epoch": 0.05071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 10.453094482421875, "learning_rate": 9.406815330073244e-07, "loss": -0.0, "num_tokens": 7157009.0, "reward": 0.4093749523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 71, "step_time": 4.012491253030021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.005534585099667311, "epoch": 0.05142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.384265768488224e-07, "loss": 0.0, "num_tokens": 7251009.0, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 1.0, "rewards/mcq_exact_match_reward/std": 0.0, "step": 72, "step_time": 4.052724620094523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03679531207308173, "epoch": 0.052142857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 37.07649230957031, "learning_rate": 9.36132363681097e-07, "loss": 0.0, "num_tokens": 7329065.0, "reward": 0.34609371423721313, "reward_std": 0.461046427488327, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 73, "step_time": 3.5614252468803898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.06942176586017013, "epoch": 0.05285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 28.538206100463867, "learning_rate": 9.337990989295304e-07, "loss": -0.0, "num_tokens": 7427713.0, "reward": 0.26874998211860657, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 74, "step_time": 4.526347325881943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02778073405715986, "epoch": 0.05357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 17.399362564086914, "learning_rate": 9.314269915162114e-07, "loss": 0.0, "num_tokens": 7527089.0, "reward": 0.5812499523162842, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.53125, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 75, "step_time": 5.331380940915551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.06143995188176632, "epoch": 0.054285714285714284, "frac_reward_zero_std": 0.75, "grad_norm": 37.28912353515625, "learning_rate": 9.290162538412255e-07, "loss": 0.0, "num_tokens": 7613169.0, "reward": 0.5968749523162842, "reward_std": 0.5017330646514893, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 76, "step_time": 4.3536295120138675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0089755498120212, "epoch": 0.055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.265671017636382e-07, "loss": 0.0, "num_tokens": 7690257.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 77, "step_time": 3.8471228859853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0636584812309593, "epoch": 0.055714285714285716, "frac_reward_zero_std": 0.5, "grad_norm": 75.22062683105469, "learning_rate": 9.240797545821666e-07, "loss": -0.0, "num_tokens": 7786777.0, "reward": 0.5968749523162842, "reward_std": 0.5017330646514893, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 78, "step_time": 4.731224032060709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010438685640110634, "epoch": 0.056428571428571425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.215544350155422e-07, "loss": 0.0, "num_tokens": 7879257.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 79, "step_time": 5.060094357933849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.003100438669207506, "epoch": 0.05714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.189913691825699e-07, "loss": 0.0, "num_tokens": 7939841.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 80, "step_time": 2.383018973923754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0016284913563140435, "epoch": 0.05785714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.163907865818806e-07, "loss": 0.0, "num_tokens": 8000473.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 81, "step_time": 2.3158561410964467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01943864591885358, "epoch": 0.05857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 31.108356475830078, "learning_rate": 9.137529200713809e-07, "loss": -0.0, "num_tokens": 8074729.0, "reward": 0.3937499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 82, "step_time": 3.5820812580059282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.019582699635066092, "epoch": 0.05928571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.11078005847405e-07, "loss": 0.0, "num_tokens": 8149137.0, "reward": 0.29999998211860657, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 83, "step_time": 2.6342398920096457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04416700848378241, "epoch": 0.06, "frac_reward_zero_std": 0.625, "grad_norm": 41.02012634277344, "learning_rate": 9.083662834235629e-07, "loss": 0.0, "num_tokens": 8220905.0, "reward": 0.4406249523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 84, "step_time": 3.4052489919704385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02520724307396449, "epoch": 0.060714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 17.11612892150879, "learning_rate": 9.056179956092961e-07, "loss": 0.0, "num_tokens": 8312417.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 85, "step_time": 5.020384737115819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03604305279441178, "epoch": 0.06142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 81.0152359008789, "learning_rate": 9.028333884881356e-07, "loss": -0.0, "num_tokens": 8386809.0, "reward": 0.6124999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 86, "step_time": 3.479461514914874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04329964122734964, "epoch": 0.062142857142857146, "frac_reward_zero_std": 0.75, "grad_norm": 48.07200622558594, "learning_rate": 9.000127113956672e-07, "loss": 0.0, "num_tokens": 8471817.0, "reward": 0.3468749523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 87, "step_time": 4.475756738916971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.028254332719370723, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 135.7811737060547, "learning_rate": 8.971562168972064e-07, "loss": -0.0, "num_tokens": 8560945.0, "reward": 0.6437499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 88, "step_time": 6.017775994958356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04081520880572498, "epoch": 0.06357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 9.588271141052246, "learning_rate": 8.942641607651828e-07, "loss": 0.0, "num_tokens": 8668209.0, "reward": 0.6906249523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 89, "step_time": 3.933919732866343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.012149464862886816, "epoch": 0.06428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 12.276681900024414, "learning_rate": 8.91336801956239e-07, "loss": -0.0, "num_tokens": 8759393.0, "reward": 0.6593749523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 90, "step_time": 4.012565175944474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.00394415354821831, "epoch": 0.065, "frac_reward_zero_std": 0.875, "grad_norm": 42.945594787597656, "learning_rate": 8.883744025880427e-07, "loss": -0.0, "num_tokens": 8841201.0, "reward": 0.6593749523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 91, "step_time": 3.554952215985395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.020873073022812605, "epoch": 0.06571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.853772279158165e-07, "loss": 0.0, "num_tokens": 8926921.0, "reward": 0.4249999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.375, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 92, "step_time": 3.4018443040549755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.017624020569201093, "epoch": 0.06642857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.823455463085873e-07, "loss": 0.0, "num_tokens": 9020281.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 93, "step_time": 3.9275616111117415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.029963097535073757, "epoch": 0.06714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 21.280263900756836, "learning_rate": 8.792796292251559e-07, "loss": 0.0, "num_tokens": 9108273.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 94, "step_time": 4.094957825960591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01988843083381653, "epoch": 0.06785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.761797511897906e-07, "loss": 0.0, "num_tokens": 9201273.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 95, "step_time": 4.90473719505826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.002151613953174092, "epoch": 0.06857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.730461897676463e-07, "loss": 0.0, "num_tokens": 9289649.0, "reward": 1.0499999523162842, "reward_std": 0.0, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 1.0, "rewards/mcq_exact_match_reward/std": 0.0, "step": 96, "step_time": 5.864397282944992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.028051164787029848, "epoch": 0.06928571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.698792255399103e-07, "loss": 0.0, "num_tokens": 9385057.0, "reward": 0.6749999523162842, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 97, "step_time": 5.353533354995307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04004533472470939, "epoch": 0.07, "frac_reward_zero_std": 0.75, "grad_norm": 42.5, "learning_rate": 8.666791420786803e-07, "loss": -0.0, "num_tokens": 9463201.0, "reward": 0.8781249523162842, "reward_std": 0.38025417923927307, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.828125, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 98, "step_time": 3.3163292090175673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.033746465924195945, "epoch": 0.07071428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 36.30033874511719, "learning_rate": 8.634462259215718e-07, "loss": -0.0, "num_tokens": 9550801.0, "reward": 0.4718749523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 99, "step_time": 4.70873619185295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.020849275402724743, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 40.67396926879883, "learning_rate": 8.601807665460619e-07, "loss": -0.0, "num_tokens": 9638393.0, "reward": 0.7218749523162842, "reward_std": 0.47324231266975403, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.671875, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 100, "step_time": 4.529204568010755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.028578478610143065, "epoch": 0.07214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 13.613948822021484, "learning_rate": 8.568830563435694e-07, "loss": -0.0, "num_tokens": 9708137.0, "reward": 0.6437499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 101, "step_time": 3.1490642699063756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05988255422562361, "epoch": 0.07285714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 32.46657943725586, "learning_rate": 8.535533905932737e-07, "loss": -0.0, "num_tokens": 9788553.0, "reward": 0.6906249523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 102, "step_time": 3.1424633120768704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.017814256290876074, "epoch": 0.07357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.501920674356754e-07, "loss": 0.0, "num_tokens": 9868105.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 103, "step_time": 3.9018897020141594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.015304914733860642, "epoch": 0.07428571428571429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.467993878459003e-07, "loss": 0.0, "num_tokens": 9947649.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 104, "step_time": 3.0474197379662655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 6.703125, "completions/mean_terminated_length": 6.703125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07897318806499243, "epoch": 0.075, "frac_reward_zero_std": 0.875, "grad_norm": 58.019317626953125, "learning_rate": 8.433756556067505e-07, "loss": -0.0, "num_tokens": 10047606.0, "reward": 0.3312499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 105, "step_time": 7.077936801011674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.030407932470552623, "epoch": 0.07571428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 20.576940536499023, "learning_rate": 8.399211772815029e-07, "loss": 0.0, "num_tokens": 10164246.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 106, "step_time": 5.708010822942015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0643875045934692, "epoch": 0.07642857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 86.49934387207031, "learning_rate": 8.364362621864594e-07, "loss": -0.0, "num_tokens": 10297870.0, "reward": 0.4874999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 107, "step_time": 8.16701563395327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.024679650348844007, "epoch": 0.07714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 24.781404495239258, "learning_rate": 8.32921222363251e-07, "loss": -0.0, "num_tokens": 10376342.0, "reward": 0.7062499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.65625, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 108, "step_time": 3.7486861869692802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03000695165246725, "epoch": 0.07785714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.293763725508969e-07, "loss": 0.0, "num_tokens": 10474478.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 109, "step_time": 4.3634466278599575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0029143407664378174, "epoch": 0.07857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.258020301576223e-07, "loss": 0.0, "num_tokens": 10571134.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 110, "step_time": 3.7971292279544286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.019338089448865503, "epoch": 0.07928571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 72.22135925292969, "learning_rate": 8.221985152324384e-07, "loss": -0.0, "num_tokens": 10656846.0, "reward": 0.6281249523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 111, "step_time": 4.486735024023801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 6.03125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.00799583364278078, "epoch": 0.08, "frac_reward_zero_std": 0.875, "grad_norm": 14.136860847473145, "learning_rate": 8.185661504364844e-07, "loss": 0.0, "num_tokens": 10726288.0, "reward": 0.9406249523162842, "reward_std": 0.3145764172077179, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.890625, "rewards/mcq_exact_match_reward/std": 0.3145764470100403, "step": 112, "step_time": 2.845964161970187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.002038436970906332, "epoch": 0.08071428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.149052610141355e-07, "loss": 0.0, "num_tokens": 10812632.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 113, "step_time": 3.8674794240505435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.023922092164866626, "epoch": 0.08142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 23.515605926513672, "learning_rate": 8.112161747638821e-07, "loss": -0.0, "num_tokens": 10905936.0, "reward": 0.9718749523162842, "reward_std": 0.2704896926879883, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.921875, "rewards/mcq_exact_match_reward/std": 0.27048972249031067, "step": 114, "step_time": 3.664424007933121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0029691653471672907, "epoch": 0.08214285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.074992220089768e-07, "loss": 0.0, "num_tokens": 11023048.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 115, "step_time": 6.1637890610145405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.009863481915090233, "epoch": 0.08285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.037547355678576e-07, "loss": 0.0, "num_tokens": 11108232.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 116, "step_time": 3.388725136872381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01721668551908806, "epoch": 0.08357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 32.60246658325195, "learning_rate": 7.999830507243477e-07, "loss": -0.0, "num_tokens": 11191584.0, "reward": 0.6437499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.59375, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 117, "step_time": 3.8073597230250016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.031643704045563936, "epoch": 0.08428571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 60.023223876953125, "learning_rate": 7.961845051976332e-07, "loss": -0.0, "num_tokens": 11300448.0, "reward": 0.6124999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5625, "rewards/mcq_exact_match_reward/std": 0.5, "step": 118, "step_time": 4.292414934025146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.004885429982095957, "epoch": 0.085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.923594391120236e-07, "loss": 0.0, "num_tokens": 11384816.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 119, "step_time": 4.405862295941915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.022961166221648455, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 13.171720504760742, "learning_rate": 7.88508194966497e-07, "loss": -0.0, "num_tokens": 11491176.0, "reward": 0.6593749523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 120, "step_time": 4.337666148028802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.015833469369681552, "epoch": 0.08642857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 12.888790130615234, "learning_rate": 7.84631117604033e-07, "loss": -0.0, "num_tokens": 11571112.0, "reward": 0.8468749523162842, "reward_std": 0.4055052399635315, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.796875, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 121, "step_time": 2.9378147069946863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0003582504086807603, "epoch": 0.08714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.80728554180734e-07, "loss": 0.0, "num_tokens": 11676112.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 122, "step_time": 6.747522479039617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.022964858857449144, "epoch": 0.08785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 16.368377685546875, "learning_rate": 7.768008541347421e-07, "loss": -0.0, "num_tokens": 11779544.0, "reward": 0.7531249523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 123, "step_time": 4.132223132997751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05628858879208565, "epoch": 0.08857142857142856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.728483691549491e-07, "loss": 0.0, "num_tokens": 11868712.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 124, "step_time": 6.3635276320856065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.014705251320265234, "epoch": 0.08928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 25.80197525024414, "learning_rate": 7.688714531495059e-07, "loss": 0.0, "num_tokens": 11972512.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 125, "step_time": 5.704090942977928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.016387713549192995, "epoch": 0.09, "frac_reward_zero_std": 0.875, "grad_norm": 36.02985763549805, "learning_rate": 7.648704622141347e-07, "loss": -0.0, "num_tokens": 12060000.0, "reward": 0.8312499523162842, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.78125, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 126, "step_time": 4.272621555137448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.019242929047322832, "epoch": 0.09071428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 18.566303253173828, "learning_rate": 7.608457546002422e-07, "loss": 0.0, "num_tokens": 12136504.0, "reward": 0.8593749403953552, "reward_std": 0.40008679032325745, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.12198751419782639, "rewards/mcq_exact_match_reward/mean": 0.8125, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 127, "step_time": 3.5814500708365813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.006453944864915684, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 16.331865310668945, "learning_rate": 7.56797690682843e-07, "loss": -0.0, "num_tokens": 12217952.0, "reward": 0.9093749523162842, "reward_std": 0.35038241744041443, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.859375, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 128, "step_time": 5.136591883026995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.030486588017083704, "epoch": 0.09214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 18.421833038330078, "learning_rate": 7.527266329282905e-07, "loss": 0.0, "num_tokens": 12309872.0, "reward": 0.8156249523162842, "reward_std": 0.42695626616477966, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.765625, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 129, "step_time": 4.8528469749726355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02539128519129008, "epoch": 0.09285714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 13.853494644165039, "learning_rate": 7.486329458618215e-07, "loss": 0.0, "num_tokens": 12398056.0, "reward": 0.6906249523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 130, "step_time": 5.018374064005911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.002957444063213188, "epoch": 0.09357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.445169960349166e-07, "loss": 0.0, "num_tokens": 12488472.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 131, "step_time": 4.972345500893425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.022331225918605924, "epoch": 0.09428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 27.090051651000977, "learning_rate": 7.403791519924793e-07, "loss": -0.0, "num_tokens": 12577104.0, "reward": 0.9093749523162842, "reward_std": 0.35038241744041443, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.859375, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 132, "step_time": 4.115176132007036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.008792717853793874, "epoch": 0.095, "frac_reward_zero_std": 0.875, "grad_norm": 27.841257095336914, "learning_rate": 7.362197842398354e-07, "loss": 0.0, "num_tokens": 12651600.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 133, "step_time": 3.2441998279537074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.022069176658988, "epoch": 0.09571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.320392652095583e-07, "loss": 0.0, "num_tokens": 12740912.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 134, "step_time": 5.6722183779929765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.014933973550796509, "epoch": 0.09642857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 183.2643280029297, "learning_rate": 7.278379692281208e-07, "loss": -0.0, "num_tokens": 12821936.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 135, "step_time": 4.318428025871981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.036976943898480386, "epoch": 0.09714285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 44.74044418334961, "learning_rate": 7.236162724823778e-07, "loss": -0.0, "num_tokens": 12952720.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 136, "step_time": 7.915950812923256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.012101019237888977, "epoch": 0.09785714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.193745529858826e-07, "loss": 0.0, "num_tokens": 13040200.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 137, "step_time": 4.189123667951208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.019024420296773314, "epoch": 0.09857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 21.765090942382812, "learning_rate": 7.151131905450385e-07, "loss": -0.0, "num_tokens": 13144128.0, "reward": 0.7531249523162842, "reward_std": 0.46049273014068604, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 138, "step_time": 6.618631144170649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010866805270779878, "epoch": 0.09928571428571428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.10832566725092e-07, "loss": 0.0, "num_tokens": 13233000.0, "reward": 0.6749999523162842, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 139, "step_time": 4.987454570014961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0087249851130764, "epoch": 0.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.065330648159655e-07, "loss": 0.0, "num_tokens": 13290744.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 140, "step_time": 2.0643829210312106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02978032447572332, "epoch": 0.10071428571428571, "frac_reward_zero_std": 0.875, "grad_norm": 13.619489669799805, "learning_rate": 7.022150697979384e-07, "loss": -0.0, "num_tokens": 13358248.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 141, "step_time": 2.6800080239772797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.027033332939026877, "epoch": 0.10142857142857142, "frac_reward_zero_std": 0.875, "grad_norm": 96.12157440185547, "learning_rate": 6.978789683071759e-07, "loss": -0.0, "num_tokens": 13433672.0, "reward": 0.7843749523162842, "reward_std": 0.44515690207481384, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 142, "step_time": 2.9009379278868437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.021864378399186535, "epoch": 0.10214285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.935251486011086e-07, "loss": 0.0, "num_tokens": 13515472.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 143, "step_time": 4.081806018890347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.024838617178829736, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 28.892257690429688, "learning_rate": 6.891540005236674e-07, "loss": -0.0, "num_tokens": 13600024.0, "reward": 0.5968749523162842, "reward_std": 0.5017330646514893, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 144, "step_time": 4.945346346183214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02198075025808066, "epoch": 0.10357142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.847659154703785e-07, "loss": 0.0, "num_tokens": 13696928.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 145, "step_time": 3.499220358033199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.005274529721646104, "epoch": 0.10428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 10.472895622253418, "learning_rate": 6.803612863533149e-07, "loss": -0.0, "num_tokens": 13783760.0, "reward": 0.7843749523162842, "reward_std": 0.44515690207481384, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 146, "step_time": 4.068320885999128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.014849641811451875, "epoch": 0.105, "frac_reward_zero_std": 0.875, "grad_norm": 45.07769012451172, "learning_rate": 6.759405075659165e-07, "loss": -0.0, "num_tokens": 13859296.0, "reward": 0.7374999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 147, "step_time": 3.5780998170375824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.011419365764595568, "epoch": 0.10571428571428572, "frac_reward_zero_std": 0.875, "grad_norm": 13.905855178833008, "learning_rate": 6.715039749476763e-07, "loss": -0.0, "num_tokens": 13947984.0, "reward": 0.7374999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 148, "step_time": 5.263757238048129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.018238219687191304, "epoch": 0.10642857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 22.64560890197754, "learning_rate": 6.670520857486949e-07, "loss": 0.0, "num_tokens": 14033664.0, "reward": 0.8156249523162842, "reward_std": 0.42695626616477966, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.765625, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 149, "step_time": 4.154493458045181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010879674111492932, "epoch": 0.10714285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 34.28276062011719, "learning_rate": 6.625852385941118e-07, "loss": -0.0, "num_tokens": 14127608.0, "reward": 0.8937499523162842, "reward_std": 0.36596250534057617, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 150, "step_time": 11.891356678155717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.025685275555588305, "epoch": 0.10785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 36.59083557128906, "learning_rate": 6.58103833448412e-07, "loss": -0.0, "num_tokens": 14225632.0, "reward": 0.4562499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 151, "step_time": 5.057005219103303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03449344326509163, "epoch": 0.10857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.536082715796124e-07, "loss": 0.0, "num_tokens": 14318456.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 152, "step_time": 4.17643676197622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07026771118398756, "epoch": 0.10928571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 9.721390724182129, "learning_rate": 6.490989555233327e-07, "loss": -0.0, "num_tokens": 14425512.0, "reward": 0.5968749523162842, "reward_std": 0.5017330646514893, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.546875, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 153, "step_time": 5.342055176908616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.013852535252226517, "epoch": 0.11, "frac_reward_zero_std": 0.75, "grad_norm": 27.178972244262695, "learning_rate": 6.445762890467517e-07, "loss": -0.0, "num_tokens": 14518880.0, "reward": 0.7374999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 154, "step_time": 4.132746922026854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.044882668400532566, "epoch": 0.11071428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 26.709630966186523, "learning_rate": 6.400406771124535e-07, "loss": -0.0, "num_tokens": 14619344.0, "reward": 0.7374999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 155, "step_time": 6.531158309953753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.012610521174792666, "epoch": 0.11142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.354925258421675e-07, "loss": 0.0, "num_tokens": 14709472.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 156, "step_time": 6.000549168966245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04332200507633388, "epoch": 0.11214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 46.953636169433594, "learning_rate": 6.309322424804033e-07, "loss": -0.0, "num_tokens": 14790048.0, "reward": 0.6593749523162842, "reward_std": 0.49174734950065613, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 157, "step_time": 5.684053825039882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010729722736869007, "epoch": 0.11285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.263602353579866e-07, "loss": 0.0, "num_tokens": 14897088.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 158, "step_time": 5.917332597950008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02750105356972199, "epoch": 0.11357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 46.41020584106445, "learning_rate": 6.217769138554959e-07, "loss": -0.0, "num_tokens": 14974200.0, "reward": 0.8312499523162842, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.78125, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 159, "step_time": 4.165541312017012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.020890720130410045, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 32.159664154052734, "learning_rate": 6.171826883666074e-07, "loss": 0.0, "num_tokens": 15099272.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 160, "step_time": 8.19824515801156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.018137310704332776, "epoch": 0.115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.12577970261347e-07, "loss": 0.0, "num_tokens": 15189928.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 161, "step_time": 4.100585987849627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.00779947466799058, "epoch": 0.11571428571428571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.079631718492568e-07, "loss": 0.0, "num_tokens": 15272416.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 162, "step_time": 4.265750402060803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0057306156668346375, "epoch": 0.11642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.033387063424764e-07, "loss": 0.0, "num_tokens": 15370072.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 163, "step_time": 4.551348562003113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.023858943488448858, "epoch": 0.11714285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 21.742687225341797, "learning_rate": 5.987049878187436e-07, "loss": -0.0, "num_tokens": 15470064.0, "reward": 0.7374999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.6875, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 164, "step_time": 4.178184504038654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04080562340095639, "epoch": 0.11785714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 106.40847778320312, "learning_rate": 5.940624311843168e-07, "loss": -0.0, "num_tokens": 15553528.0, "reward": 0.9093749523162842, "reward_std": 0.35038241744041443, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.859375, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 165, "step_time": 3.9056500179576688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.03239465922524687, "epoch": 0.11857142857142858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.894114521368258e-07, "loss": 0.0, "num_tokens": 15651352.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 166, "step_time": 6.06811363005545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.021100470912642777, "epoch": 0.11928571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 27.465686798095703, "learning_rate": 5.847524671280483e-07, "loss": -0.0, "num_tokens": 15748824.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 167, "step_time": 4.068884873995557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010125938017154112, "epoch": 0.12, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.800858933266212e-07, "loss": 0.0, "num_tokens": 15841192.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 168, "step_time": 4.681547085056081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.007420915557304397, "epoch": 0.12071428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.75412148580687e-07, "loss": 0.0, "num_tokens": 15941912.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 169, "step_time": 4.018109558033757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.00387280583527172, "epoch": 0.12142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.707316513804792e-07, "loss": 0.0, "num_tokens": 16030976.0, "reward": 0.6749999523162842, "reward_std": 0.48795002698898315, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 170, "step_time": 3.581215404032264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.018240234567201696, "epoch": 0.12214285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 27.001724243164062, "learning_rate": 5.660448208208513e-07, "loss": 0.0, "num_tokens": 16131104.0, "reward": 0.7671874761581421, "reward_std": 0.4557584524154663, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 171, "step_time": 5.626854334026575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.008751161993131973, "epoch": 0.12285714285714286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.613520765637489e-07, "loss": 0.0, "num_tokens": 16199736.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 172, "step_time": 3.0112785029341467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0020948836317984387, "epoch": 0.12357142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.56653838800635e-07, "loss": 0.0, "num_tokens": 16305192.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 173, "step_time": 3.8440414160722867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.029801132390275598, "epoch": 0.12428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 38.89519119262695, "learning_rate": 5.519505282148643e-07, "loss": -0.0, "num_tokens": 16400128.0, "reward": 0.8937499523162842, "reward_std": 0.36596250534057617, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 174, "step_time": 4.306124185968656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01975566567853093, "epoch": 0.125, "frac_reward_zero_std": 0.875, "grad_norm": 14.759847640991211, "learning_rate": 5.472425659440156e-07, "loss": 0.0, "num_tokens": 16501992.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 175, "step_time": 6.891298751113936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.010644901332852896, "epoch": 0.12571428571428572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.425303735421828e-07, "loss": 0.0, "num_tokens": 16583760.0, "reward": 0.6749999523162842, "reward_std": 0.48794999718666077, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.625, "rewards/mcq_exact_match_reward/std": 0.48795005679130554, "step": 176, "step_time": 4.401262045954354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.013223175366874784, "epoch": 0.12642857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.378143729422284e-07, "loss": 0.0, "num_tokens": 16670544.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 177, "step_time": 3.597560814989265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.00034627606146386825, "epoch": 0.12714285714285714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.330949864180033e-07, "loss": 0.0, "num_tokens": 16742800.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 178, "step_time": 3.9288182540913112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02273468012572266, "epoch": 0.12785714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 10.492746353149414, "learning_rate": 5.28372636546537e-07, "loss": -0.0, "num_tokens": 16819288.0, "reward": 0.8937499523162842, "reward_std": 0.36596250534057617, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.84375, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 179, "step_time": 3.3139185319887474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.032032598566729575, "epoch": 0.12857142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 15.799858093261719, "learning_rate": 5.236477461701985e-07, "loss": 0.0, "num_tokens": 16902520.0, "reward": 0.6906249523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 180, "step_time": 4.982991532073356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.02958734449930489, "epoch": 0.12928571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 62.94922637939453, "learning_rate": 5.189207383588352e-07, "loss": -0.0, "num_tokens": 17025040.0, "reward": 0.3624999523162842, "reward_std": 0.46717655658721924, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 181, "step_time": 4.936472486006096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.005954080814262852, "epoch": 0.13, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.141920363718916e-07, "loss": 0.0, "num_tokens": 17112664.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 182, "step_time": 4.847438686934765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.020133527723373845, "epoch": 0.13071428571428573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.094620636205095e-07, "loss": 0.0, "num_tokens": 17195056.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 183, "step_time": 4.1526574228773825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01397450927470345, "epoch": 0.13142857142857142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 5.047312436296158e-07, "loss": 0.0, "num_tokens": 17281096.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 184, "step_time": 3.595313879137393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.026221045991405845, "epoch": 0.13214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 25.40279769897461, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 17367032.0, "reward": 0.5187499523162842, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.46875, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 185, "step_time": 3.9962045900174417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0004232673418300692, "epoch": 0.13285714285714287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.952687563703841e-07, "loss": 0.0, "num_tokens": 17495344.0, "reward": 0.9249999523162842, "reward_std": 0.3333333134651184, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.875, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 186, "step_time": 6.927359046996571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.009157578751910478, "epoch": 0.13357142857142856, "frac_reward_zero_std": 0.875, "grad_norm": 21.018695831298828, "learning_rate": 4.905379363794906e-07, "loss": 0.0, "num_tokens": 17599416.0, "reward": 0.5656249523162842, "reward_std": 0.5037064552307129, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.515625, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 187, "step_time": 7.725423171068542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.025402174986083992, "epoch": 0.13428571428571429, "frac_reward_zero_std": 0.875, "grad_norm": 26.784770965576172, "learning_rate": 4.858079636281084e-07, "loss": -0.0, "num_tokens": 17672048.0, "reward": 0.7218749523162842, "reward_std": 0.47324231266975403, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.671875, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 188, "step_time": 3.3486171750118956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0412696284474805, "epoch": 0.135, "frac_reward_zero_std": 0.875, "grad_norm": 27.7330322265625, "learning_rate": 4.810792616411649e-07, "loss": -0.0, "num_tokens": 17767608.0, "reward": 0.7843749523162842, "reward_std": 0.44515690207481384, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 189, "step_time": 5.364009258977603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.014710754854604602, "epoch": 0.1357142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 12.3643159866333, "learning_rate": 4.763522538298017e-07, "loss": -0.0, "num_tokens": 17877352.0, "reward": 0.7843749523162842, "reward_std": 0.44515690207481384, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 190, "step_time": 4.777944088040385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.045710146659985185, "epoch": 0.13642857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 109.233642578125, "learning_rate": 4.7162736345346296e-07, "loss": -0.0, "num_tokens": 17981952.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 191, "step_time": 3.6734825060120784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.002387989137787372, "epoch": 0.13714285714285715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.6690501358199655e-07, "loss": 0.0, "num_tokens": 18077648.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 192, "step_time": 4.10004890995333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05728395436744904, "epoch": 0.13785714285714284, "frac_reward_zero_std": 0.75, "grad_norm": 87.72314453125, "learning_rate": 4.621856270577718e-07, "loss": 0.0, "num_tokens": 18187616.0, "reward": 0.6281249523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.578125, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 193, "step_time": 5.712224373128265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.005229042188148014, "epoch": 0.13857142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.5746962645781723e-07, "loss": 0.0, "num_tokens": 18277576.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 194, "step_time": 4.505508521979209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.01992573600000469, "epoch": 0.1392857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 28.904830932617188, "learning_rate": 4.5275743405598437e-07, "loss": -0.0, "num_tokens": 18367728.0, "reward": 0.7804687023162842, "reward_std": 0.45185837149620056, "rewards/format_reward/mean": 0.4609375, "rewards/format_reward/std": 0.13524486124515533, "rewards/mcq_exact_match_reward/mean": 0.734375, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 195, "step_time": 4.144765653065406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.012862415402196348, "epoch": 0.14, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.480494717851358e-07, "loss": 0.0, "num_tokens": 18431632.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 196, "step_time": 2.3946178509504534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0024675551831023768, "epoch": 0.1407142857142857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.433461611993651e-07, "loss": 0.0, "num_tokens": 18520896.0, "reward": 0.7999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.75, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 197, "step_time": 3.6817237000796013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.014663272420875728, "epoch": 0.14142857142857143, "frac_reward_zero_std": 0.875, "grad_norm": 14.525986671447754, "learning_rate": 4.3864792343625115e-07, "loss": 0.0, "num_tokens": 18620728.0, "reward": 0.6906249523162842, "reward_std": 0.4836103320121765, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.640625, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 198, "step_time": 4.85546653799247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.048036441701697186, "epoch": 0.14214285714285715, "frac_reward_zero_std": 0.875, "grad_norm": 61.86164093017578, "learning_rate": 4.3395517917914894e-07, "loss": -0.0, "num_tokens": 18706736.0, "reward": 0.22187498211860657, "reward_std": 0.3802541494369507, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 199, "step_time": 3.4838617978966795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0018978773259732407, "epoch": 0.14285714285714285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 4.2926834861952077e-07, "loss": 0.0, "num_tokens": 18798064.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 200, "step_time": 4.239792971056886 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 18798064, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }