{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03571428571428571, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 453.359375, "completions/mean_terminated_length": 428.0476379394531, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.20064565539360046, "epoch": 0.0007142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 5.157586574554443, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 104207.0, "reward": 0.3671875, "reward_std": 0.47810959815979004, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.22658175230026245, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 1, "step_time": 163.2008375240257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 558.921875, "completions/mean_terminated_length": 535.2857666015625, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.15827747248113155, "epoch": 0.0014285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 1.5780975818634033, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "num_tokens": 224762.0, "reward": 0.24296873807907104, "reward_std": 0.4135470986366272, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.28423789143562317, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 2, "step_time": 134.8140414939844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 589.8125, "completions/mean_terminated_length": 566.6666870117188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1299030063673854, "epoch": 0.002142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 5.490052223205566, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "num_tokens": 346622.0, "reward": 0.32109373807907104, "reward_std": 0.4618633985519409, "rewards/format_reward/mean": 0.3984375, "rewards/format_reward/std": 0.2387082874774933, "rewards/mcq_exact_match_reward/mean": 0.28125, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 3, "step_time": 124.3698272620677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 522.0625, "completions/mean_terminated_length": 522.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.16233543679118156, "epoch": 0.002857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 1.4298690557479858, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "num_tokens": 460962.0, "reward": 0.27656251192092896, "reward_std": 0.4340624213218689, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2847827076911926, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 4, "step_time": 90.78269547002856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 650.03125, "completions/mean_terminated_length": 581.2786865234375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1047047358006239, "epoch": 0.0035714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 3.9983320236206055, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "num_tokens": 626636.0, "reward": 0.23515623807907104, "reward_std": 0.41525280475616455, "rewards/format_reward/mean": 0.3203125, "rewards/format_reward/std": 0.24180518090724945, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 5, "step_time": 171.21265639393823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 379.109375, "completions/mean_terminated_length": 379.109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17905950360000134, "epoch": 0.004285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 33.00831604003906, "learning_rate": 2.7777777777777776e-07, "loss": -0.0, "num_tokens": 742675.0, "reward": 0.22421874105930328, "reward_std": 0.39880695939064026, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.28510910272598267, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 6, "step_time": 77.91981936100638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1596.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 479.34375, "completions/mean_terminated_length": 479.34375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11762952525168657, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 6.298130989074707, "learning_rate": 3.333333333333333e-07, "loss": -0.0, "num_tokens": 846929.0, "reward": 0.4898437261581421, "reward_std": 0.5113147497177124, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.255761981010437, "rewards/mcq_exact_match_reward/mean": 0.453125, "rewards/mcq_exact_match_reward/std": 0.501733124256134, "step": 7, "step_time": 89.83808165497612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 634.296875, "completions/mean_terminated_length": 634.296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.158901397138834, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 1.4249234199523926, "learning_rate": 3.888888888888889e-07, "loss": -0.0, "num_tokens": 984668.0, "reward": 0.3351562023162842, "reward_std": 0.465447336435318, "rewards/format_reward/mean": 0.3828125, "rewards/format_reward/std": 0.2634054720401764, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 8, "step_time": 113.26597948698327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 481.453125, "completions/mean_terminated_length": 481.453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1585789266973734, "epoch": 0.0064285714285714285, "frac_reward_zero_std": 0.375, "grad_norm": 3.944404125213623, "learning_rate": 4.444444444444444e-07, "loss": -0.0, "num_tokens": 1116649.0, "reward": 0.16718748211860657, "reward_std": 0.3370293378829956, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.2221602201461792, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 9, "step_time": 132.11293392902007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 455.625, "completions/mean_terminated_length": 430.3492431640625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.15571903064846992, "epoch": 0.007142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 11.225871086120605, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 1237337.0, "reward": 0.29921871423721313, "reward_std": 0.45211073756217957, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2366211861371994, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 10, "step_time": 141.26959226594772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 530.84375, "completions/mean_terminated_length": 506.7619323730469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.12162926886230707, "epoch": 0.007857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 26.4554443359375, "learning_rate": 5.555555555555555e-07, "loss": -0.0, "num_tokens": 1358607.0, "reward": 0.23671871423721313, "reward_std": 0.4144456684589386, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2366211861371994, "rewards/mcq_exact_match_reward/mean": 0.203125, "rewards/mcq_exact_match_reward/std": 0.40550529956817627, "step": 11, "step_time": 136.65451408794615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 458.390625, "completions/mean_terminated_length": 433.15875244140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14663300476968288, "epoch": 0.008571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 11.722622871398926, "learning_rate": 6.111111111111112e-07, "loss": 0.0, "num_tokens": 1471192.0, "reward": 0.22499997913837433, "reward_std": 0.3999999761581421, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2182178944349289, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 12, "step_time": 133.89911664801184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1495.0, "completions/mean_length": 562.9375, "completions/mean_terminated_length": 539.3651123046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15633145160973072, "epoch": 0.009285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 2.441755771636963, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 1576364.0, "reward": 0.25703123211860657, "reward_std": 0.42340895533561707, "rewards/format_reward/mean": 0.3828125, "rewards/format_reward/std": 0.21347814798355103, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 13, "step_time": 121.01016625500051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 596.859375, "completions/mean_terminated_length": 573.825439453125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.11535925976932049, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 8.867958068847656, "learning_rate": 7.222222222222221e-07, "loss": 0.0, "num_tokens": 1739315.0, "reward": 0.1874999850988388, "reward_std": 0.375224769115448, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.3021090030670166, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 14, "step_time": 186.91490571206668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17848026007413864, "epoch": 0.010714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 7.981410503387451, "learning_rate": 7.777777777777778e-07, "loss": -0.0, "num_tokens": 1836883.0, "reward": 0.34843745827674866, "reward_std": 0.47941502928733826, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.32694777846336365, "rewards/mcq_exact_match_reward/mean": 0.3125, "rewards/mcq_exact_match_reward/std": 0.467176616191864, "step": 15, "step_time": 94.68117612809874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 598.296875, "completions/mean_terminated_length": 575.2857666015625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12071683909744024, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 17.546281814575195, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "num_tokens": 1974854.0, "reward": 0.14687499403953552, "reward_std": 0.3447882831096649, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.25, "rewards/mcq_exact_match_reward/mean": 0.125, "rewards/mcq_exact_match_reward/std": 0.3333333432674408, "step": 16, "step_time": 148.51339858904248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 611.171875, "completions/mean_terminated_length": 564.8225708007812, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1321120047941804, "epoch": 0.012142857142857143, "frac_reward_zero_std": 0.125, "grad_norm": 5.823922157287598, "learning_rate": 8.888888888888888e-07, "loss": -0.0, "num_tokens": 2119753.0, "reward": 0.17578125, "reward_std": 0.3575702905654907, "rewards/format_reward/mean": 0.3515625, "rewards/format_reward/std": 0.29113471508026123, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 17, "step_time": 182.10150892706588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 559.859375, "completions/mean_terminated_length": 511.8548278808594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1420492259785533, "epoch": 0.012857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 9.969266891479492, "learning_rate": 9.444444444444444e-07, "loss": 0.0, "num_tokens": 2257376.0, "reward": 0.25624996423721313, "reward_std": 0.42393654584884644, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.2357022762298584, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 18, "step_time": 162.68666934006615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 580.640625, "completions/mean_terminated_length": 580.640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.123278739862144, "epoch": 0.013571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 13.4293851852417, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 2411657.0, "reward": 0.2242187261581421, "reward_std": 0.40247172117233276, "rewards/format_reward/mean": 0.3671875, "rewards/format_reward/std": 0.23974503576755524, "rewards/mcq_exact_match_reward/mean": 0.1875, "rewards/mcq_exact_match_reward/std": 0.39339789748191833, "step": 19, "step_time": 114.04774017888121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 439.953125, "completions/mean_terminated_length": 439.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10359836835414171, "epoch": 0.014285714285714285, "frac_reward_zero_std": 0.125, "grad_norm": 18.170589447021484, "learning_rate": 9.999776148326214e-07, "loss": -0.0, "num_tokens": 2542550.0, "reward": 0.20546874403953552, "reward_std": 0.38863998651504517, "rewards/format_reward/mean": 0.3359375, "rewards/format_reward/std": 0.2680720090866089, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 20, "step_time": 94.10481164290104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 583.359375, "completions/mean_terminated_length": 511.3278503417969, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12750612013041973, "epoch": 0.015, "frac_reward_zero_std": 0.25, "grad_norm": 10.771635055541992, "learning_rate": 9.999104613348689e-07, "loss": 0.0, "num_tokens": 2661437.0, "reward": 0.2593749761581421, "reward_std": 0.4222835302352905, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.233588308095932, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 21, "step_time": 178.62638382194564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 381.640625, "completions/mean_terminated_length": 355.19049072265625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.15890471823513508, "epoch": 0.015714285714285715, "frac_reward_zero_std": 0.125, "grad_norm": 11.206506729125977, "learning_rate": 9.997985455197113e-07, "loss": 0.0, "num_tokens": 2776894.0, "reward": 0.18593749403953552, "reward_std": 0.35270705819129944, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.1717960685491562, "rewards/mcq_exact_match_reward/mean": 0.140625, "rewards/mcq_exact_match_reward/std": 0.3503824472427368, "step": 22, "step_time": 140.86376020603348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 310.015625, "completions/mean_terminated_length": 224.5409698486328, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18053795211017132, "epoch": 0.016428571428571428, "frac_reward_zero_std": 0.25, "grad_norm": 9.23369026184082, "learning_rate": 9.996418774081656e-07, "loss": 0.0, "num_tokens": 2898815.0, "reward": 0.3109374940395355, "reward_std": 0.4464558959007263, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.14689241349697113, "rewards/mcq_exact_match_reward/mean": 0.265625, "rewards/mcq_exact_match_reward/std": 0.44515693187713623, "step": 23, "step_time": 148.10121405799873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 232.359375, "completions/mean_terminated_length": 232.359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16719974670559168, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.25, "grad_norm": 4.078112602233887, "learning_rate": 9.994404710283998e-07, "loss": 0.0, "num_tokens": 2993910.0, "reward": 0.28203123807907104, "reward_std": 0.4266301095485687, "rewards/format_reward/mean": 0.4765625, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 24, "step_time": 73.75073616893496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 111.78125, "completions/mean_terminated_length": 111.78125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.25493066385388374, "epoch": 0.017857142857142856, "frac_reward_zero_std": 0.125, "grad_norm": 16.993078231811523, "learning_rate": 9.991943444144756e-07, "loss": -0.0, "num_tokens": 3085936.0, "reward": 0.3023437261581421, "reward_std": 0.4352912902832031, "rewards/format_reward/mean": 0.5234375, "rewards/format_reward/std": 0.13886408507823944, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 25, "step_time": 45.09112752194051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 132.671875, "completions/mean_terminated_length": 132.671875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2280710767954588, "epoch": 0.018571428571428572, "frac_reward_zero_std": 0.375, "grad_norm": 15.601746559143066, "learning_rate": 9.989035196047348e-07, "loss": 0.0, "num_tokens": 3186115.0, "reward": 0.20468749105930328, "reward_std": 0.3647915720939636, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.1534975916147232, "rewards/mcq_exact_match_reward/mean": 0.15625, "rewards/mcq_exact_match_reward/std": 0.36596253514289856, "step": 26, "step_time": 68.13181330589578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 64.9375, "completions/mean_terminated_length": 64.9375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.22438204288482666, "epoch": 0.019285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 11.98585033416748, "learning_rate": 9.98568022639826e-07, "loss": -0.0, "num_tokens": 3275551.0, "reward": 0.4406249523162842, "reward_std": 0.4918280243873596, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.08908708393573761, "rewards/mcq_exact_match_reward/mean": 0.390625, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 27, "step_time": 46.977470892015845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 50.109375, "completions/mean_terminated_length": 50.109375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.2098379861563444, "epoch": 0.02, "frac_reward_zero_std": 0.375, "grad_norm": 18.382869720458984, "learning_rate": 9.981878835603716e-07, "loss": 0.0, "num_tokens": 3357406.0, "reward": 0.39531248807907104, "reward_std": 0.4776528775691986, "rewards/format_reward/mean": 0.515625, "rewards/format_reward/std": 0.08768405020236969, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 28, "step_time": 35.3679761699168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10740844532847404, "epoch": 0.020714285714285713, "frac_reward_zero_std": 0.5, "grad_norm": 18.927444458007812, "learning_rate": 9.977631364042794e-07, "loss": -0.0, "num_tokens": 3456198.0, "reward": 0.5187499523162842, "reward_std": 0.502967357635498, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.46875, "rewards/mcq_exact_match_reward/std": 0.5029674172401428, "step": 29, "step_time": 4.8387698759906925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 15.125, "completions/mean_terminated_length": 15.125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.197237528860569, "epoch": 0.02142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 18.760665893554688, "learning_rate": 9.972938192036944e-07, "loss": -0.0, "num_tokens": 3529918.0, "reward": 0.3781249523162842, "reward_std": 0.47324231266975403, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.328125, "rewards/mcq_exact_match_reward/std": 0.4732423722743988, "step": 30, "step_time": 19.270987000956666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 7.53125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14352155569940805, "epoch": 0.02214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 17.66056251525879, "learning_rate": 9.967799739815924e-07, "loss": -0.0, "num_tokens": 3604272.0, "reward": 0.27812498807907104, "reward_std": 0.4307500422000885, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.1666666716337204, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 31, "step_time": 5.142326800851151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.16279550455510616, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 19.25493621826172, "learning_rate": 9.96221646748019e-07, "loss": -0.0, "num_tokens": 3687568.0, "reward": 0.22187498211860657, "reward_std": 0.3802541494369507, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.171875, "rewards/mcq_exact_match_reward/std": 0.38025420904159546, "step": 32, "step_time": 3.4762189310858957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 7.59375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.1700758682563901, "epoch": 0.023571428571428573, "frac_reward_zero_std": 0.5, "grad_norm": 23.92763328552246, "learning_rate": 9.956188874959686e-07, "loss": -0.0, "num_tokens": 3782558.0, "reward": 0.5499999523162842, "reward_std": 0.5039525628089905, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.5, "rewards/mcq_exact_match_reward/std": 0.5039526224136353, "step": 33, "step_time": 7.063230810861569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.10504745738580823, "epoch": 0.024285714285714285, "frac_reward_zero_std": 0.75, "grad_norm": 17.42229461669922, "learning_rate": 9.949717501969079e-07, "loss": -0.0, "num_tokens": 3872686.0, "reward": 0.26874998211860657, "reward_std": 0.4166666269302368, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.21875, "rewards/mcq_exact_match_reward/std": 0.4166666865348816, "step": 34, "step_time": 3.867844623979181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 6.359375, "completions/mean_terminated_length": 6.359375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.18720911256968975, "epoch": 0.025, "frac_reward_zero_std": 0.625, "grad_norm": 26.774076461791992, "learning_rate": 9.942802927959442e-07, "loss": 0.0, "num_tokens": 3981189.0, "reward": 0.29921871423721313, "reward_std": 0.43693482875823975, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 35, "step_time": 5.721210387942847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.13526835106313229, "epoch": 0.025714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 25.46927833557129, "learning_rate": 9.93544577206636e-07, "loss": -0.0, "num_tokens": 4056141.0, "reward": 0.3468749523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.296875, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 36, "step_time": 3.36784387397347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.17059525474905968, "epoch": 0.02642857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 20.648895263671875, "learning_rate": 9.927646693054495e-07, "loss": 0.0, "num_tokens": 4163493.0, "reward": 0.2999999523162842, "reward_std": 0.4364357590675354, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.25, "rewards/mcq_exact_match_reward/std": 0.4364357888698578, "step": 37, "step_time": 9.884533652977552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 5.953125, "completions/mean_terminated_length": 5.953125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10067910794168711, "epoch": 0.027142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 24.11454963684082, "learning_rate": 9.919406389258606e-07, "loss": 0.0, "num_tokens": 4273250.0, "reward": 0.28359371423721313, "reward_std": 0.4274373948574066, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.234375, "rewards/mcq_exact_match_reward/std": 0.42695629596710205, "step": 38, "step_time": 6.0175170440925285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 6.953125, "completions/mean_terminated_length": 6.953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.14652357157319784, "epoch": 0.027857142857142858, "frac_reward_zero_std": 0.875, "grad_norm": 12.282570838928223, "learning_rate": 9.910725598521012e-07, "loss": -0.0, "num_tokens": 4366839.0, "reward": 0.4562499523162842, "reward_std": 0.49501481652259827, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.40625, "rewards/mcq_exact_match_reward/std": 0.49501484632492065, "step": 39, "step_time": 9.129241381015163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.07260213000699878, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.875, "grad_norm": 11.272303581237793, "learning_rate": 9.901605098125526e-07, "loss": -0.0, "num_tokens": 4477503.0, "reward": 0.39374998211860657, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 40, "step_time": 4.785194783064071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 39.328125, "completions/mean_terminated_length": 39.328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12252951040863991, "epoch": 0.029285714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 9.645527839660645, "learning_rate": 9.892045704727863e-07, "loss": -0.0, "num_tokens": 4603468.0, "reward": 0.3937499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 41, "step_time": 121.67112983297557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 7.203125, "completions/mean_terminated_length": 7.203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.04064215812832117, "epoch": 0.03, "frac_reward_zero_std": 0.75, "grad_norm": 17.42051124572754, "learning_rate": 9.882048274282505e-07, "loss": -0.0, "num_tokens": 4709177.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 42, "step_time": 10.29577969602542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 6.03125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.12579507380723953, "epoch": 0.030714285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 36.053192138671875, "learning_rate": 9.871613701966066e-07, "loss": 0.0, "num_tokens": 4787955.0, "reward": 0.40859371423721313, "reward_std": 0.4842400550842285, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.359375, "rewards/mcq_exact_match_reward/std": 0.4836103618144989, "step": 43, "step_time": 4.568186060001608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 6.4375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0865603880956769, "epoch": 0.03142857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 14.81761360168457, "learning_rate": 9.86074292209714e-07, "loss": 0.0, "num_tokens": 4872295.0, "reward": 0.6539061665534973, "reward_std": 0.4988323152065277, "rewards/format_reward/mean": 0.4453125, "rewards/format_reward/std": 0.15728822350502014, "rewards/mcq_exact_match_reward/mean": 0.609375, "rewards/mcq_exact_match_reward/std": 0.4917473793029785, "step": 44, "step_time": 4.5612655181321315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05028381128795445, "epoch": 0.03214285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 18.105215072631836, "learning_rate": 9.849436908052636e-07, "loss": 0.0, "num_tokens": 4971903.0, "reward": 0.4718749523162842, "reward_std": 0.4977628290653229, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.421875, "rewards/mcq_exact_match_reward/std": 0.49776285886764526, "step": 45, "step_time": 4.8814199649496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0737028606235981, "epoch": 0.032857142857142856, "frac_reward_zero_std": 0.625, "grad_norm": 36.69444274902344, "learning_rate": 9.837696672180618e-07, "loss": -0.0, "num_tokens": 5051695.0, "reward": 0.4874999523162842, "reward_std": 0.4999999701976776, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.4375, "rewards/mcq_exact_match_reward/std": 0.5, "step": 46, "step_time": 3.359571039909497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0733404541388154, "epoch": 0.03357142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 18.161724090576172, "learning_rate": 9.825523265709665e-07, "loss": 0.0, "num_tokens": 5126431.0, "reward": 0.5335937142372131, "reward_std": 0.504507839679718, "rewards/format_reward/mean": 0.4921875, "rewards/format_reward/std": 0.0625, "rewards/mcq_exact_match_reward/mean": 0.484375, "rewards/mcq_exact_match_reward/std": 0.5037065148353577, "step": 47, "step_time": 17.83803274697857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.054197699995711446, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.75, "grad_norm": 19.445640563964844, "learning_rate": 9.812917778654747e-07, "loss": -0.0, "num_tokens": 5230503.0, "reward": 0.7531249523162842, "reward_std": 0.46049270033836365, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.703125, "rewards/mcq_exact_match_reward/std": 0.4604927599430084, "step": 48, "step_time": 4.160447052039672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 6.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.05320898490026593, "epoch": 0.035, "frac_reward_zero_std": 0.875, "grad_norm": 11.983798027038574, "learning_rate": 9.799881339719614e-07, "loss": -0.0, "num_tokens": 5297479.0, "reward": 0.3937499523162842, "reward_std": 0.4787135422229767, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.34375, "rewards/mcq_exact_match_reward/std": 0.4787135720252991, "step": 49, "step_time": 2.485525873955339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 6.09375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.0547908297739923, "epoch": 0.03571428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 15.623820304870605, "learning_rate": 9.786415116195732e-07, "loss": -0.0, "num_tokens": 5377581.0, "reward": 0.7687499523162842, "reward_std": 0.4531634449958801, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.0, "rewards/mcq_exact_match_reward/mean": 0.71875, "rewards/mcq_exact_match_reward/std": 0.4531635046005249, "step": 50, "step_time": 4.019255019025877 } ], "logging_steps": 1, "max_steps": 350, "num_input_tokens_seen": 5377581, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }