{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.375, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.03125, "frac_reward_zero_std": 0.0, "grad_norm": 7.3125, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 12528.0, "reward": 0.39676433801651, "reward_std": 0.11280547827482224, "rewards/grpo_reward_func/mean": 0.39676433801651, "rewards/grpo_reward_func/std": 0.13478560745716095, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0625, "frac_reward_zero_std": 0.0, "grad_norm": 12.8125, "learning_rate": 9.966666666666667e-07, "loss": 0.0, "num_tokens": 24884.0, "reward": 0.4752987027168274, "reward_std": 0.13702644407749176, "rewards/grpo_reward_func/mean": 0.4752987027168274, "rewards/grpo_reward_func/std": 0.17374587059020996, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.09375, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "learning_rate": 9.933333333333333e-07, "loss": -0.0, "num_tokens": 37352.0, "reward": 0.44525083899497986, "reward_std": 0.10103905200958252, "rewards/grpo_reward_func/mean": 0.44525083899497986, "rewards/grpo_reward_func/std": 0.0979275107383728, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 14.125, "learning_rate": 9.9e-07, "loss": 0.0, "num_tokens": 49744.0, "reward": 0.399270236492157, "reward_std": 0.10935800522565842, "rewards/grpo_reward_func/mean": 0.399270236492157, "rewards/grpo_reward_func/std": 0.10536573082208633, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.15625, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "learning_rate": 9.866666666666666e-07, "loss": 0.0, "num_tokens": 62224.0, "reward": 0.3989260196685791, "reward_std": 0.11544467508792877, "rewards/grpo_reward_func/mean": 0.3989260196685791, "rewards/grpo_reward_func/std": 0.11394146084785461, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1875, "frac_reward_zero_std": 0.0, "grad_norm": 11.125, "learning_rate": 9.833333333333332e-07, "loss": -0.0, "num_tokens": 74736.0, "reward": 0.42444688081741333, "reward_std": 0.14600424468517303, "rewards/grpo_reward_func/mean": 0.42444688081741333, "rewards/grpo_reward_func/std": 0.17498743534088135, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "learning_rate": 9.8e-07, "loss": 0.0, "num_tokens": 87100.0, "reward": 0.4266095757484436, "reward_std": 0.0954706147313118, "rewards/grpo_reward_func/mean": 0.4266095757484436, "rewards/grpo_reward_func/std": 0.09790605306625366, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "learning_rate": 9.766666666666667e-07, "loss": -0.0, "num_tokens": 99496.0, "reward": 0.4947161376476288, "reward_std": 0.07030671834945679, "rewards/grpo_reward_func/mean": 0.4947161376476288, "rewards/grpo_reward_func/std": 0.07488483190536499, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.28125, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "learning_rate": 9.733333333333333e-07, "loss": -0.0, "num_tokens": 111844.0, "reward": 0.4835072159767151, "reward_std": 0.1621960997581482, "rewards/grpo_reward_func/mean": 0.4835072159767151, "rewards/grpo_reward_func/std": 0.17284278571605682, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3125, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "learning_rate": 9.7e-07, "loss": -0.0, "num_tokens": 124240.0, "reward": 0.4783210754394531, "reward_std": 0.09915027022361755, "rewards/grpo_reward_func/mean": 0.4783210754394531, "rewards/grpo_reward_func/std": 0.11161749064922333, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.34375, "frac_reward_zero_std": 0.0, "grad_norm": 6.6875, "learning_rate": 9.666666666666666e-07, "loss": -0.0, "num_tokens": 136652.0, "reward": 0.40330448746681213, "reward_std": 0.10881966352462769, "rewards/grpo_reward_func/mean": 0.40330448746681213, "rewards/grpo_reward_func/std": 0.1156788170337677, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 10.75, "learning_rate": 9.633333333333334e-07, "loss": -0.0, "num_tokens": 149048.0, "reward": 0.41300415992736816, "reward_std": 0.13600921630859375, "rewards/grpo_reward_func/mean": 0.41300415992736816, "rewards/grpo_reward_func/std": 0.1646273136138916, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.40625, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "learning_rate": 9.6e-07, "loss": 0.0, "num_tokens": 161276.0, "reward": 0.4857324957847595, "reward_std": 0.09516896307468414, "rewards/grpo_reward_func/mean": 0.4857324957847595, "rewards/grpo_reward_func/std": 0.09173914790153503, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4375, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "learning_rate": 9.566666666666667e-07, "loss": 0.0, "num_tokens": 173780.0, "reward": 0.4015089273452759, "reward_std": 0.06604111194610596, "rewards/grpo_reward_func/mean": 0.4015089273452759, "rewards/grpo_reward_func/std": 0.07018419355154037, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.46875, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "learning_rate": 9.533333333333333e-07, "loss": -0.0, "num_tokens": 186192.0, "reward": 0.31999891996383667, "reward_std": 0.0805739015340805, "rewards/grpo_reward_func/mean": 0.31999891996383667, "rewards/grpo_reward_func/std": 0.08632533997297287, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "learning_rate": 9.499999999999999e-07, "loss": -0.0, "num_tokens": 198684.0, "reward": 0.39560186862945557, "reward_std": 0.09632067382335663, "rewards/grpo_reward_func/mean": 0.39560186862945557, "rewards/grpo_reward_func/std": 0.09369846433401108, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.53125, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "learning_rate": 9.466666666666666e-07, "loss": 0.0, "num_tokens": 211096.0, "reward": 0.48571068048477173, "reward_std": 0.15206970274448395, "rewards/grpo_reward_func/mean": 0.48571068048477173, "rewards/grpo_reward_func/std": 0.1438637524843216, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5625, "frac_reward_zero_std": 0.0, "grad_norm": 7.09375, "learning_rate": 9.433333333333333e-07, "loss": -0.0, "num_tokens": 223552.0, "reward": 0.45060235261917114, "reward_std": 0.05437461659312248, "rewards/grpo_reward_func/mean": 0.45060235261917114, "rewards/grpo_reward_func/std": 0.140779510140419, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.59375, "frac_reward_zero_std": 0.0, "grad_norm": 5.78125, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "num_tokens": 236036.0, "reward": 0.4261874556541443, "reward_std": 0.09510611742734909, "rewards/grpo_reward_func/mean": 0.4261874556541443, "rewards/grpo_reward_func/std": 0.10084228217601776, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.625, "frac_reward_zero_std": 0.0, "grad_norm": 5.375, "learning_rate": 9.366666666666666e-07, "loss": -0.0, "num_tokens": 248448.0, "reward": 0.29703885316848755, "reward_std": 0.046393271535634995, "rewards/grpo_reward_func/mean": 0.29703885316848755, "rewards/grpo_reward_func/std": 0.04335997626185417, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.65625, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "learning_rate": 9.333333333333333e-07, "loss": 0.0, "num_tokens": 260776.0, "reward": 0.45774269104003906, "reward_std": 0.16561079025268555, "rewards/grpo_reward_func/mean": 0.45774269104003906, "rewards/grpo_reward_func/std": 0.15406657755374908, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "learning_rate": 9.3e-07, "loss": 0.0, "num_tokens": 273248.0, "reward": 0.4235140085220337, "reward_std": 0.06906857341527939, "rewards/grpo_reward_func/mean": 0.4235140085220337, "rewards/grpo_reward_func/std": 0.07242283225059509, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.71875, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "learning_rate": 9.266666666666665e-07, "loss": 0.0, "num_tokens": 285724.0, "reward": 0.36918026208877563, "reward_std": 0.06028338894248009, "rewards/grpo_reward_func/mean": 0.36918026208877563, "rewards/grpo_reward_func/std": 0.0693485215306282, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "learning_rate": 9.233333333333333e-07, "loss": 0.0, "num_tokens": 298132.0, "reward": 0.3204312324523926, "reward_std": 0.07052356004714966, "rewards/grpo_reward_func/mean": 0.3204312324523926, "rewards/grpo_reward_func/std": 0.09546414762735367, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.78125, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "learning_rate": 9.2e-07, "loss": -0.0, "num_tokens": 310584.0, "reward": 0.38078033924102783, "reward_std": 0.13373351097106934, "rewards/grpo_reward_func/mean": 0.38078033924102783, "rewards/grpo_reward_func/std": 0.13402824103832245, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8125, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "learning_rate": 9.166666666666665e-07, "loss": 0.0, "num_tokens": 323076.0, "reward": 0.3454480767250061, "reward_std": 0.10349850356578827, "rewards/grpo_reward_func/mean": 0.3454480767250061, "rewards/grpo_reward_func/std": 0.12671217322349548, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.84375, "frac_reward_zero_std": 0.0, "grad_norm": 7.96875, "learning_rate": 9.133333333333333e-07, "loss": 0.0, "num_tokens": 335520.0, "reward": 0.3619287312030792, "reward_std": 0.12553678452968597, "rewards/grpo_reward_func/mean": 0.3619287312030792, "rewards/grpo_reward_func/std": 0.1537715494632721, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.875, "frac_reward_zero_std": 0.0, "grad_norm": 14.9375, "learning_rate": 9.1e-07, "loss": 0.0, "num_tokens": 347940.0, "reward": 0.3436325788497925, "reward_std": 0.09887667000293732, "rewards/grpo_reward_func/mean": 0.3436325788497925, "rewards/grpo_reward_func/std": 0.12251166999340057, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.90625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "learning_rate": 9.066666666666665e-07, "loss": -0.0, "num_tokens": 360388.0, "reward": 0.4369204044342041, "reward_std": 0.19640696048736572, "rewards/grpo_reward_func/mean": 0.4369204044342041, "rewards/grpo_reward_func/std": 0.1927463412284851, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9375, "frac_reward_zero_std": 0.0, "grad_norm": 7.8125, "learning_rate": 9.033333333333333e-07, "loss": 0.0, "num_tokens": 372832.0, "reward": 0.4874047338962555, "reward_std": 0.053364820778369904, "rewards/grpo_reward_func/mean": 0.4874047338962555, "rewards/grpo_reward_func/std": 0.08248723298311234, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.96875, "frac_reward_zero_std": 0.0, "grad_norm": 6.78125, "learning_rate": 9e-07, "loss": -0.0, "num_tokens": 385256.0, "reward": 0.4391651451587677, "reward_std": 0.07597412914037704, "rewards/grpo_reward_func/mean": 0.4391651451587677, "rewards/grpo_reward_func/std": 0.13502921164035797, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 8.5, "learning_rate": 8.966666666666666e-07, "loss": 0.0, "num_tokens": 397784.0, "reward": 0.552140474319458, "reward_std": 0.1218448132276535, "rewards/grpo_reward_func/mean": 0.552140474319458, "rewards/grpo_reward_func/std": 0.11282333731651306, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.03125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "learning_rate": 8.933333333333333e-07, "loss": -0.0, "num_tokens": 410304.0, "reward": 0.4041430950164795, "reward_std": 0.1936928927898407, "rewards/grpo_reward_func/mean": 0.4041430950164795, "rewards/grpo_reward_func/std": 0.18484662473201752, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.0625, "frac_reward_zero_std": 0.0, "grad_norm": 7.90625, "learning_rate": 8.9e-07, "loss": -0.0, "num_tokens": 422796.0, "reward": 0.41248780488967896, "reward_std": 0.15024888515472412, "rewards/grpo_reward_func/mean": 0.41248780488967896, "rewards/grpo_reward_func/std": 0.16827252507209778, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.09375, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "learning_rate": 8.866666666666667e-07, "loss": 0.0, "num_tokens": 435260.0, "reward": 0.4898865818977356, "reward_std": 0.11311106383800507, "rewards/grpo_reward_func/mean": 0.4898865818977356, "rewards/grpo_reward_func/std": 0.11546135693788528, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.125, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "learning_rate": 8.833333333333333e-07, "loss": -0.0, "num_tokens": 447656.0, "reward": 0.402587354183197, "reward_std": 0.07555107772350311, "rewards/grpo_reward_func/mean": 0.402587354183197, "rewards/grpo_reward_func/std": 0.07951883971691132, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.15625, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 460100.0, "reward": 0.4937467575073242, "reward_std": 0.11035488545894623, "rewards/grpo_reward_func/mean": 0.4937467575073242, "rewards/grpo_reward_func/std": 0.11266050487756729, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.1875, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 8.766666666666667e-07, "loss": -0.0, "num_tokens": 472580.0, "reward": 0.42728495597839355, "reward_std": 0.05418732762336731, "rewards/grpo_reward_func/mean": 0.42728495597839355, "rewards/grpo_reward_func/std": 0.05117730051279068, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.21875, "frac_reward_zero_std": 0.0, "grad_norm": 5.9375, "learning_rate": 8.733333333333333e-07, "loss": -0.0, "num_tokens": 485016.0, "reward": 0.3464398980140686, "reward_std": 0.05486953258514404, "rewards/grpo_reward_func/mean": 0.3464398980140686, "rewards/grpo_reward_func/std": 0.10943454504013062, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.25, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "learning_rate": 8.699999999999999e-07, "loss": -0.0, "num_tokens": 497416.0, "reward": 0.43631184101104736, "reward_std": 0.09718433767557144, "rewards/grpo_reward_func/mean": 0.43631184101104736, "rewards/grpo_reward_func/std": 0.17311933636665344, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.28125, "frac_reward_zero_std": 0.0, "grad_norm": 6.78125, "learning_rate": 8.666666666666667e-07, "loss": -0.0, "num_tokens": 509832.0, "reward": 0.5329959392547607, "reward_std": 0.11580680310726166, "rewards/grpo_reward_func/mean": 0.5329959392547607, "rewards/grpo_reward_func/std": 0.11687568575143814, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.3125, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "learning_rate": 8.633333333333333e-07, "loss": -0.0, "num_tokens": 522252.0, "reward": 0.44177818298339844, "reward_std": 0.13238248229026794, "rewards/grpo_reward_func/mean": 0.44177818298339844, "rewards/grpo_reward_func/std": 0.12943537533283234, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.34375, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "learning_rate": 8.599999999999999e-07, "loss": -0.0, "num_tokens": 534660.0, "reward": 0.5416427850723267, "reward_std": 0.09374570846557617, "rewards/grpo_reward_func/mean": 0.5416427850723267, "rewards/grpo_reward_func/std": 0.11684079468250275, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "learning_rate": 8.566666666666667e-07, "loss": 0.0, "num_tokens": 547064.0, "reward": 0.3880234658718109, "reward_std": 0.06982941925525665, "rewards/grpo_reward_func/mean": 0.3880234658718109, "rewards/grpo_reward_func/std": 0.09098156541585922, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.40625, "frac_reward_zero_std": 0.0, "grad_norm": 7.34375, "learning_rate": 8.533333333333334e-07, "loss": -0.0, "num_tokens": 559488.0, "reward": 0.33481428027153015, "reward_std": 0.06352214515209198, "rewards/grpo_reward_func/mean": 0.33481428027153015, "rewards/grpo_reward_func/std": 0.08472999185323715, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.4375, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "learning_rate": 8.499999999999999e-07, "loss": 0.0, "num_tokens": 571944.0, "reward": 0.387703001499176, "reward_std": 0.07385663688182831, "rewards/grpo_reward_func/mean": 0.387703001499176, "rewards/grpo_reward_func/std": 0.11046246439218521, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.125, "learning_rate": 8.466666666666667e-07, "loss": 0.0, "num_tokens": 584324.0, "reward": 0.5441805124282837, "reward_std": 0.11389695107936859, "rewards/grpo_reward_func/mean": 0.5441805124282837, "rewards/grpo_reward_func/std": 0.13207265734672546, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.5, "frac_reward_zero_std": 0.0, "grad_norm": 7.8125, "learning_rate": 8.433333333333333e-07, "loss": 0.0, "num_tokens": 596692.0, "reward": 0.488021582365036, "reward_std": 0.13947440683841705, "rewards/grpo_reward_func/mean": 0.488021582365036, "rewards/grpo_reward_func/std": 0.15811356902122498, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.53125, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "learning_rate": 8.399999999999999e-07, "loss": -0.0, "num_tokens": 609168.0, "reward": 0.3698539733886719, "reward_std": 0.04929333180189133, "rewards/grpo_reward_func/mean": 0.3698539733886719, "rewards/grpo_reward_func/std": 0.05231497436761856, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.5625, "frac_reward_zero_std": 0.0, "grad_norm": 6.59375, "learning_rate": 8.366666666666667e-07, "loss": -0.0, "num_tokens": 621624.0, "reward": 0.46477562189102173, "reward_std": 0.07750491052865982, "rewards/grpo_reward_func/mean": 0.46477562189102173, "rewards/grpo_reward_func/std": 0.15642288327217102, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.59375, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "num_tokens": 633868.0, "reward": 0.43864211440086365, "reward_std": 0.13110151886940002, "rewards/grpo_reward_func/mean": 0.43864211440086365, "rewards/grpo_reward_func/std": 0.14933471381664276, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.625, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "learning_rate": 8.299999999999999e-07, "loss": 0.0, "num_tokens": 646324.0, "reward": 0.3448641300201416, "reward_std": 0.06778337061405182, "rewards/grpo_reward_func/mean": 0.3448641300201416, "rewards/grpo_reward_func/std": 0.06967282295227051, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.65625, "frac_reward_zero_std": 0.0, "grad_norm": 13.9375, "learning_rate": 8.266666666666667e-07, "loss": 0.0, "num_tokens": 658764.0, "reward": 0.4265494644641876, "reward_std": 0.11092057079076767, "rewards/grpo_reward_func/mean": 0.4265494644641876, "rewards/grpo_reward_func/std": 0.11681105941534042, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.6875, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "learning_rate": 8.233333333333333e-07, "loss": 0.0, "num_tokens": 671084.0, "reward": 0.3909933567047119, "reward_std": 0.062042489647865295, "rewards/grpo_reward_func/mean": 0.3909933567047119, "rewards/grpo_reward_func/std": 0.12040998041629791, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.71875, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "learning_rate": 8.199999999999999e-07, "loss": -0.0, "num_tokens": 683556.0, "reward": 0.3827119469642639, "reward_std": 0.0810474157333374, "rewards/grpo_reward_func/mean": 0.3827119469642639, "rewards/grpo_reward_func/std": 0.10648734867572784, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.75, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "learning_rate": 8.166666666666666e-07, "loss": 0.0, "num_tokens": 696044.0, "reward": 0.43536075949668884, "reward_std": 0.13194429874420166, "rewards/grpo_reward_func/mean": 0.43536075949668884, "rewards/grpo_reward_func/std": 0.14542116224765778, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.78125, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "learning_rate": 8.133333333333333e-07, "loss": -0.0, "num_tokens": 708468.0, "reward": 0.4286166727542877, "reward_std": 0.07387880980968475, "rewards/grpo_reward_func/mean": 0.4286166727542877, "rewards/grpo_reward_func/std": 0.10452007502317429, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.8125, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "learning_rate": 8.1e-07, "loss": -0.0, "num_tokens": 720940.0, "reward": 0.38893401622772217, "reward_std": 0.0943751409649849, "rewards/grpo_reward_func/mean": 0.38893401622772217, "rewards/grpo_reward_func/std": 0.12028432637453079, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.84375, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "learning_rate": 8.066666666666666e-07, "loss": -0.0, "num_tokens": 733360.0, "reward": 0.4644596576690674, "reward_std": 0.16205663979053497, "rewards/grpo_reward_func/mean": 0.4644596576690674, "rewards/grpo_reward_func/std": 0.15505553781986237, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.875, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "learning_rate": 8.033333333333333e-07, "loss": 0.0, "num_tokens": 745704.0, "reward": 0.46369504928588867, "reward_std": 0.0912257730960846, "rewards/grpo_reward_func/mean": 0.46369504928588867, "rewards/grpo_reward_func/std": 0.09050611406564713, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.90625, "frac_reward_zero_std": 0.0, "grad_norm": 7.84375, "learning_rate": 8e-07, "loss": -0.0, "num_tokens": 758080.0, "reward": 0.4551791548728943, "reward_std": 0.12297463417053223, "rewards/grpo_reward_func/mean": 0.4551791548728943, "rewards/grpo_reward_func/std": 0.14138628542423248, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.9375, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "learning_rate": 7.966666666666666e-07, "loss": 0.0, "num_tokens": 770712.0, "reward": 0.39730104804039, "reward_std": 0.06629657000303268, "rewards/grpo_reward_func/mean": 0.39730104804039, "rewards/grpo_reward_func/std": 0.08781840652227402, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 1.96875, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "learning_rate": 7.933333333333333e-07, "loss": 0.0, "num_tokens": 783120.0, "reward": 0.40575429797172546, "reward_std": 0.09323962777853012, "rewards/grpo_reward_func/mean": 0.40575429797172546, "rewards/grpo_reward_func/std": 0.1281837671995163, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.0, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "learning_rate": 7.9e-07, "loss": 0.0, "num_tokens": 795568.0, "reward": 0.4539070129394531, "reward_std": 0.1893976330757141, "rewards/grpo_reward_func/mean": 0.4539070129394531, "rewards/grpo_reward_func/std": 0.17878401279449463, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.03125, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "learning_rate": 7.866666666666666e-07, "loss": 0.0, "num_tokens": 808072.0, "reward": 0.42031583189964294, "reward_std": 0.06885866075754166, "rewards/grpo_reward_func/mean": 0.42031583189964294, "rewards/grpo_reward_func/std": 0.06720545887947083, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.0625, "frac_reward_zero_std": 0.0, "grad_norm": 6.9375, "learning_rate": 7.833333333333333e-07, "loss": -0.0, "num_tokens": 820592.0, "reward": 0.446481317281723, "reward_std": 0.06617365032434464, "rewards/grpo_reward_func/mean": 0.446481317281723, "rewards/grpo_reward_func/std": 0.11224810034036636, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.09375, "frac_reward_zero_std": 0.0, "grad_norm": 10.0625, "learning_rate": 7.799999999999999e-07, "loss": -0.0, "num_tokens": 833008.0, "reward": 0.29850703477859497, "reward_std": 0.07878842949867249, "rewards/grpo_reward_func/mean": 0.29850703477859497, "rewards/grpo_reward_func/std": 0.09381019324064255, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.125, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "learning_rate": 7.766666666666666e-07, "loss": 0.0, "num_tokens": 845476.0, "reward": 0.3700866401195526, "reward_std": 0.11176452040672302, "rewards/grpo_reward_func/mean": 0.3700866401195526, "rewards/grpo_reward_func/std": 0.1271413266658783, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.15625, "frac_reward_zero_std": 0.0, "grad_norm": 12.75, "learning_rate": 7.733333333333333e-07, "loss": -0.0, "num_tokens": 857896.0, "reward": 0.4782499074935913, "reward_std": 0.10448910295963287, "rewards/grpo_reward_func/mean": 0.4782499074935913, "rewards/grpo_reward_func/std": 0.125322625041008, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.1875, "frac_reward_zero_std": 0.0, "grad_norm": 13.5, "learning_rate": 7.699999999999999e-07, "loss": -0.0, "num_tokens": 870308.0, "reward": 0.44694995880126953, "reward_std": 0.11892125755548477, "rewards/grpo_reward_func/mean": 0.44694995880126953, "rewards/grpo_reward_func/std": 0.15172399580478668, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "learning_rate": 7.666666666666667e-07, "loss": -0.0, "num_tokens": 882696.0, "reward": 0.48773661255836487, "reward_std": 0.18720099329948425, "rewards/grpo_reward_func/mean": 0.48773661255836487, "rewards/grpo_reward_func/std": 0.19652612507343292, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.25, "frac_reward_zero_std": 0.0, "grad_norm": 5.21875, "learning_rate": 7.633333333333333e-07, "loss": 0.0, "num_tokens": 895208.0, "reward": 0.360309362411499, "reward_std": 0.05594930052757263, "rewards/grpo_reward_func/mean": 0.360309362411499, "rewards/grpo_reward_func/std": 0.08431853353977203, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.28125, "frac_reward_zero_std": 0.0, "grad_norm": 15.3125, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "num_tokens": 907548.0, "reward": 0.4548572897911072, "reward_std": 0.1430705040693283, "rewards/grpo_reward_func/mean": 0.4548572897911072, "rewards/grpo_reward_func/std": 0.144826740026474, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.3125, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "learning_rate": 7.566666666666667e-07, "loss": -0.0, "num_tokens": 919976.0, "reward": 0.43647801876068115, "reward_std": 0.10883159935474396, "rewards/grpo_reward_func/mean": 0.43647801876068115, "rewards/grpo_reward_func/std": 0.13386793434619904, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.34375, "frac_reward_zero_std": 0.0, "grad_norm": 6.96875, "learning_rate": 7.533333333333332e-07, "loss": 0.0, "num_tokens": 932436.0, "reward": 0.3631000518798828, "reward_std": 0.055175162851810455, "rewards/grpo_reward_func/mean": 0.3631000518798828, "rewards/grpo_reward_func/std": 0.061299730092287064, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.625, "learning_rate": 7.5e-07, "loss": -0.0, "num_tokens": 944744.0, "reward": 0.3734683394432068, "reward_std": 0.07731673121452332, "rewards/grpo_reward_func/mean": 0.3734683394432068, "rewards/grpo_reward_func/std": 0.1018432006239891, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.40625, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "learning_rate": 7.466666666666667e-07, "loss": -0.0, "num_tokens": 957140.0, "reward": 0.3586929738521576, "reward_std": 0.08576677739620209, "rewards/grpo_reward_func/mean": 0.3586929738521576, "rewards/grpo_reward_func/std": 0.09627655893564224, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.4375, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "learning_rate": 7.433333333333332e-07, "loss": -0.0, "num_tokens": 969588.0, "reward": 0.3304125964641571, "reward_std": 0.09432289004325867, "rewards/grpo_reward_func/mean": 0.3304125964641571, "rewards/grpo_reward_func/std": 0.12439437210559845, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.625, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 982032.0, "reward": 0.4600115418434143, "reward_std": 0.11891645193099976, "rewards/grpo_reward_func/mean": 0.4600115418434143, "rewards/grpo_reward_func/std": 0.11769349873065948, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "learning_rate": 7.366666666666667e-07, "loss": 0.0, "num_tokens": 994440.0, "reward": 0.4921344816684723, "reward_std": 0.18801572918891907, "rewards/grpo_reward_func/mean": 0.4921344816684723, "rewards/grpo_reward_func/std": 0.17593181133270264, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.53125, "frac_reward_zero_std": 0.0, "grad_norm": 7.03125, "learning_rate": 7.333333333333332e-07, "loss": -0.0, "num_tokens": 1006908.0, "reward": 0.44369810819625854, "reward_std": 0.11731548607349396, "rewards/grpo_reward_func/mean": 0.44369810819625854, "rewards/grpo_reward_func/std": 0.13351494073867798, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.5625, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "learning_rate": 7.3e-07, "loss": -0.0, "num_tokens": 1019360.0, "reward": 0.4988051652908325, "reward_std": 0.08421847224235535, "rewards/grpo_reward_func/mean": 0.4988051652908325, "rewards/grpo_reward_func/std": 0.12857672572135925, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.59375, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "learning_rate": 7.266666666666667e-07, "loss": 0.0, "num_tokens": 1031756.0, "reward": 0.4094837009906769, "reward_std": 0.10778755694627762, "rewards/grpo_reward_func/mean": 0.4094837009906769, "rewards/grpo_reward_func/std": 0.11033328622579575, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.625, "frac_reward_zero_std": 0.0, "grad_norm": 11.375, "learning_rate": 7.233333333333333e-07, "loss": -0.0, "num_tokens": 1044160.0, "reward": 0.3499518632888794, "reward_std": 0.07542143762111664, "rewards/grpo_reward_func/mean": 0.3499518632888794, "rewards/grpo_reward_func/std": 0.08578986674547195, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.65625, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "learning_rate": 7.2e-07, "loss": -0.0, "num_tokens": 1056580.0, "reward": 0.4997272491455078, "reward_std": 0.1262975037097931, "rewards/grpo_reward_func/mean": 0.4997272491455078, "rewards/grpo_reward_func/std": 0.1279306709766388, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.6875, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "learning_rate": 7.166666666666667e-07, "loss": 0.0, "num_tokens": 1069020.0, "reward": 0.46792131662368774, "reward_std": 0.13234254717826843, "rewards/grpo_reward_func/mean": 0.46792131662368774, "rewards/grpo_reward_func/std": 0.1700320839881897, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.71875, "frac_reward_zero_std": 0.0, "grad_norm": 7.375, "learning_rate": 7.133333333333333e-07, "loss": -0.0, "num_tokens": 1081496.0, "reward": 0.4166927635669708, "reward_std": 0.07564548403024673, "rewards/grpo_reward_func/mean": 0.4166927635669708, "rewards/grpo_reward_func/std": 0.19586633145809174, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.75, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "learning_rate": 7.1e-07, "loss": 0.0, "num_tokens": 1093816.0, "reward": 0.42825716733932495, "reward_std": 0.1293352246284485, "rewards/grpo_reward_func/mean": 0.42825716733932495, "rewards/grpo_reward_func/std": 0.1340746283531189, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.78125, "frac_reward_zero_std": 0.0, "grad_norm": 6.96875, "learning_rate": 7.066666666666666e-07, "loss": 0.0, "num_tokens": 1106336.0, "reward": 0.40863943099975586, "reward_std": 0.061242297291755676, "rewards/grpo_reward_func/mean": 0.40863943099975586, "rewards/grpo_reward_func/std": 0.11059094965457916, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.8125, "frac_reward_zero_std": 0.0, "grad_norm": 13.875, "learning_rate": 7.033333333333333e-07, "loss": 0.0, "num_tokens": 1118756.0, "reward": 0.44183290004730225, "reward_std": 0.1359260380268097, "rewards/grpo_reward_func/mean": 0.44183290004730225, "rewards/grpo_reward_func/std": 0.15313053131103516, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.84375, "frac_reward_zero_std": 0.0, "grad_norm": 6.9375, "learning_rate": 7e-07, "loss": -0.0, "num_tokens": 1131108.0, "reward": 0.4604765474796295, "reward_std": 0.09057141840457916, "rewards/grpo_reward_func/mean": 0.4604765474796295, "rewards/grpo_reward_func/std": 0.17239472270011902, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.875, "frac_reward_zero_std": 0.0, "grad_norm": 5.84375, "learning_rate": 6.966666666666666e-07, "loss": -0.0, "num_tokens": 1143520.0, "reward": 0.37243229150772095, "reward_std": 0.07444402575492859, "rewards/grpo_reward_func/mean": 0.37243229150772095, "rewards/grpo_reward_func/std": 0.1061118021607399, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.90625, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "learning_rate": 6.933333333333333e-07, "loss": -0.0, "num_tokens": 1156008.0, "reward": 0.5441325902938843, "reward_std": 0.11369525641202927, "rewards/grpo_reward_func/mean": 0.5441325902938843, "rewards/grpo_reward_func/std": 0.11172118782997131, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.9375, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "learning_rate": 6.9e-07, "loss": -0.0, "num_tokens": 1168452.0, "reward": 0.4581533670425415, "reward_std": 0.11172451823949814, "rewards/grpo_reward_func/mean": 0.4581533670425415, "rewards/grpo_reward_func/std": 0.1257813274860382, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 2.96875, "frac_reward_zero_std": 0.0, "grad_norm": 11.6875, "learning_rate": 6.866666666666666e-07, "loss": -0.0, "num_tokens": 1180928.0, "reward": 0.4434836208820343, "reward_std": 0.14923422038555145, "rewards/grpo_reward_func/mean": 0.4434836208820343, "rewards/grpo_reward_func/std": 0.1542947143316269, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0, "frac_reward_zero_std": 0.0, "grad_norm": 13.5625, "learning_rate": 6.833333333333333e-07, "loss": -0.0, "num_tokens": 1193352.0, "reward": 0.3983464241027832, "reward_std": 0.08742759376764297, "rewards/grpo_reward_func/mean": 0.3983464241027832, "rewards/grpo_reward_func/std": 0.12986424565315247, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.03125, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "learning_rate": 6.800000000000001e-07, "loss": -0.0, "num_tokens": 1205796.0, "reward": 0.5015304088592529, "reward_std": 0.08956287801265717, "rewards/grpo_reward_func/mean": 0.5015304088592529, "rewards/grpo_reward_func/std": 0.08333175629377365, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.0625, "frac_reward_zero_std": 0.0, "grad_norm": 16.875, "learning_rate": 6.766666666666666e-07, "loss": 0.0, "num_tokens": 1218244.0, "reward": 0.47066164016723633, "reward_std": 0.19255688786506653, "rewards/grpo_reward_func/mean": 0.47066164016723633, "rewards/grpo_reward_func/std": 0.1828991174697876, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.09375, "frac_reward_zero_std": 0.0, "grad_norm": 14.5625, "learning_rate": 6.733333333333333e-07, "loss": 0.0, "num_tokens": 1230632.0, "reward": 0.49644234776496887, "reward_std": 0.10233695805072784, "rewards/grpo_reward_func/mean": 0.49644234776496887, "rewards/grpo_reward_func/std": 0.09938962757587433, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.125, "frac_reward_zero_std": 0.0, "grad_norm": 17.0, "learning_rate": 6.7e-07, "loss": -0.0, "num_tokens": 1243024.0, "reward": 0.48214682936668396, "reward_std": 0.1728937327861786, "rewards/grpo_reward_func/mean": 0.48214682936668396, "rewards/grpo_reward_func/std": 0.16634704172611237, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "num_tokens": 1255484.0, "reward": 0.4351156949996948, "reward_std": 0.12486094236373901, "rewards/grpo_reward_func/mean": 0.4351156949996948, "rewards/grpo_reward_func/std": 0.1314164698123932, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.1875, "frac_reward_zero_std": 0.0, "grad_norm": 13.4375, "learning_rate": 6.633333333333333e-07, "loss": 0.0, "num_tokens": 1267856.0, "reward": 0.38795578479766846, "reward_std": 0.1968497335910797, "rewards/grpo_reward_func/mean": 0.38795578479766846, "rewards/grpo_reward_func/std": 0.18232691287994385, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.21875, "frac_reward_zero_std": 0.0, "grad_norm": 15.0625, "learning_rate": 6.6e-07, "loss": 0.0, "num_tokens": 1280280.0, "reward": 0.3891702890396118, "reward_std": 0.09787960350513458, "rewards/grpo_reward_func/mean": 0.3891702890396118, "rewards/grpo_reward_func/std": 0.09284209460020065, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.06298828125, "learning_rate": 6.566666666666666e-07, "loss": 0.0, "num_tokens": 1292752.0, "reward": 0.39056217670440674, "reward_std": 0.04999999329447746, "rewards/grpo_reward_func/mean": 0.39056217670440674, "rewards/grpo_reward_func/std": 0.04636901617050171, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.28125, "frac_reward_zero_std": 0.0, "grad_norm": 10.75, "learning_rate": 6.533333333333333e-07, "loss": -0.0, "num_tokens": 1305200.0, "reward": 0.4605242609977722, "reward_std": 0.13093939423561096, "rewards/grpo_reward_func/mean": 0.4605242609977722, "rewards/grpo_reward_func/std": 0.15952207148075104, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.3125, "frac_reward_zero_std": 0.0, "grad_norm": 15.6875, "learning_rate": 6.5e-07, "loss": -0.0, "num_tokens": 1317660.0, "reward": 0.3946014940738678, "reward_std": 0.09192033857107162, "rewards/grpo_reward_func/mean": 0.3946014940738678, "rewards/grpo_reward_func/std": 0.10782631486654282, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.34375, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "learning_rate": 6.466666666666666e-07, "loss": 0.0, "num_tokens": 1330068.0, "reward": 0.4714941084384918, "reward_std": 0.09265273809432983, "rewards/grpo_reward_func/mean": 0.4714941084384918, "rewards/grpo_reward_func/std": 0.12330163270235062, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.375, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "learning_rate": 6.433333333333332e-07, "loss": 0.0, "num_tokens": 1342620.0, "reward": 0.49374109506607056, "reward_std": 0.0895591527223587, "rewards/grpo_reward_func/mean": 0.49374109506607056, "rewards/grpo_reward_func/std": 0.1332620531320572, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.40625, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "learning_rate": 6.4e-07, "loss": 0.0, "num_tokens": 1355016.0, "reward": 0.3305853009223938, "reward_std": 0.04621565341949463, "rewards/grpo_reward_func/mean": 0.3305853009223938, "rewards/grpo_reward_func/std": 0.04419610649347305, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.4375, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "learning_rate": 6.366666666666667e-07, "loss": -0.0, "num_tokens": 1367452.0, "reward": 0.5173900723457336, "reward_std": 0.14908233284950256, "rewards/grpo_reward_func/mean": 0.5173900723457336, "rewards/grpo_reward_func/std": 0.15880633890628815, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.46875, "frac_reward_zero_std": 0.0, "grad_norm": 11.9375, "learning_rate": 6.333333333333332e-07, "loss": -0.0, "num_tokens": 1379760.0, "reward": 0.3797125816345215, "reward_std": 0.10961093008518219, "rewards/grpo_reward_func/mean": 0.3797125816345215, "rewards/grpo_reward_func/std": 0.12369874864816666, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.5, "frac_reward_zero_std": 0.0, "grad_norm": 12.875, "learning_rate": 6.3e-07, "loss": 0.0, "num_tokens": 1392296.0, "reward": 0.3952435255050659, "reward_std": 0.07089774310588837, "rewards/grpo_reward_func/mean": 0.3952435255050659, "rewards/grpo_reward_func/std": 0.09734237939119339, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.53125, "frac_reward_zero_std": 0.0, "grad_norm": 6.25, "learning_rate": 6.266666666666667e-07, "loss": 0.0, "num_tokens": 1404748.0, "reward": 0.4383198916912079, "reward_std": 0.08845233917236328, "rewards/grpo_reward_func/mean": 0.4383198916912079, "rewards/grpo_reward_func/std": 0.08347002416849136, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.5625, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "learning_rate": 6.233333333333332e-07, "loss": -0.0, "num_tokens": 1417172.0, "reward": 0.3984643220901489, "reward_std": 0.08412055671215057, "rewards/grpo_reward_func/mean": 0.3984643220901489, "rewards/grpo_reward_func/std": 0.08139137923717499, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.59375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0, "learning_rate": 6.2e-07, "loss": -0.0, "num_tokens": 1429572.0, "reward": 0.3756071925163269, "reward_std": 0.1621457189321518, "rewards/grpo_reward_func/mean": 0.3756071925163269, "rewards/grpo_reward_func/std": 0.16212420165538788, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.625, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "learning_rate": 6.166666666666667e-07, "loss": -0.0, "num_tokens": 1441984.0, "reward": 0.3367416262626648, "reward_std": 0.10579686611890793, "rewards/grpo_reward_func/mean": 0.3367416262626648, "rewards/grpo_reward_func/std": 0.12276742607355118, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.65625, "frac_reward_zero_std": 0.0, "grad_norm": 6.09375, "learning_rate": 6.133333333333332e-07, "loss": -0.0, "num_tokens": 1454520.0, "reward": 0.33171868324279785, "reward_std": 0.05540106073021889, "rewards/grpo_reward_func/mean": 0.33171868324279785, "rewards/grpo_reward_func/std": 0.05543047562241554, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.6875, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "learning_rate": 6.1e-07, "loss": -0.0, "num_tokens": 1466968.0, "reward": 0.46069973707199097, "reward_std": 0.08953073620796204, "rewards/grpo_reward_func/mean": 0.46069973707199097, "rewards/grpo_reward_func/std": 0.10067260265350342, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.71875, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "learning_rate": 6.066666666666666e-07, "loss": 0.0, "num_tokens": 1479328.0, "reward": 0.49788278341293335, "reward_std": 0.12688566744327545, "rewards/grpo_reward_func/mean": 0.49788278341293335, "rewards/grpo_reward_func/std": 0.12214919179677963, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.75, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 6.033333333333333e-07, "loss": 0.0, "num_tokens": 1491788.0, "reward": 0.35892003774642944, "reward_std": 0.0625436007976532, "rewards/grpo_reward_func/mean": 0.35892003774642944, "rewards/grpo_reward_func/std": 0.09081238508224487, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.78125, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 1504220.0, "reward": 0.38591668009757996, "reward_std": 0.15822480618953705, "rewards/grpo_reward_func/mean": 0.38591668009757996, "rewards/grpo_reward_func/std": 0.16854539513587952, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.8125, "frac_reward_zero_std": 0.0, "grad_norm": 12.25, "learning_rate": 5.966666666666666e-07, "loss": -0.0, "num_tokens": 1516652.0, "reward": 0.43537092208862305, "reward_std": 0.14132292568683624, "rewards/grpo_reward_func/mean": 0.43537092208862305, "rewards/grpo_reward_func/std": 0.15050342679023743, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.84375, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "learning_rate": 5.933333333333334e-07, "loss": -0.0, "num_tokens": 1529072.0, "reward": 0.4112863540649414, "reward_std": 0.08730175346136093, "rewards/grpo_reward_func/mean": 0.4112863540649414, "rewards/grpo_reward_func/std": 0.09073270857334137, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.875, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 5.9e-07, "loss": -0.0, "num_tokens": 1541488.0, "reward": 0.3833653926849365, "reward_std": 0.09057098627090454, "rewards/grpo_reward_func/mean": 0.3833653926849365, "rewards/grpo_reward_func/std": 0.08530126512050629, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.90625, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 5.866666666666666e-07, "loss": 0.0, "num_tokens": 1553812.0, "reward": 0.5172641277313232, "reward_std": 0.08300620317459106, "rewards/grpo_reward_func/mean": 0.5172641277313232, "rewards/grpo_reward_func/std": 0.18922077119350433, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.9375, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "learning_rate": 5.833333333333334e-07, "loss": 0.0, "num_tokens": 1566244.0, "reward": 0.45866021513938904, "reward_std": 0.13558343052864075, "rewards/grpo_reward_func/mean": 0.45866021513938904, "rewards/grpo_reward_func/std": 0.12821511924266815, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 3.96875, "frac_reward_zero_std": 0.0, "grad_norm": 5.46875, "learning_rate": 5.8e-07, "loss": -0.0, "num_tokens": 1578680.0, "reward": 0.4404694437980652, "reward_std": 0.058066606521606445, "rewards/grpo_reward_func/mean": 0.4404694437980652, "rewards/grpo_reward_func/std": 0.057657789438962936, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.0, "frac_reward_zero_std": 0.0, "grad_norm": 14.9375, "learning_rate": 5.766666666666666e-07, "loss": -0.0, "num_tokens": 1591136.0, "reward": 0.3580424189567566, "reward_std": 0.07987552881240845, "rewards/grpo_reward_func/mean": 0.3580424189567566, "rewards/grpo_reward_func/std": 0.0977816954255104, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.03125, "frac_reward_zero_std": 0.0, "grad_norm": 10.875, "learning_rate": 5.733333333333334e-07, "loss": 0.0, "num_tokens": 1603604.0, "reward": 0.3891274929046631, "reward_std": 0.15381482243537903, "rewards/grpo_reward_func/mean": 0.3891274929046631, "rewards/grpo_reward_func/std": 0.17152857780456543, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.0625, "frac_reward_zero_std": 0.0, "grad_norm": 10.25, "learning_rate": 5.699999999999999e-07, "loss": 0.0, "num_tokens": 1616044.0, "reward": 0.27857083082199097, "reward_std": 0.09501777589321136, "rewards/grpo_reward_func/mean": 0.27857083082199097, "rewards/grpo_reward_func/std": 0.1052025854587555, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.09375, "frac_reward_zero_std": 0.0, "grad_norm": 11.1875, "learning_rate": 5.666666666666666e-07, "loss": -0.0, "num_tokens": 1628436.0, "reward": 0.35340362787246704, "reward_std": 0.16999280452728271, "rewards/grpo_reward_func/mean": 0.35340362787246704, "rewards/grpo_reward_func/std": 0.16278210282325745, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.125, "frac_reward_zero_std": 0.0, "grad_norm": 12.3125, "learning_rate": 5.633333333333334e-07, "loss": -0.0, "num_tokens": 1640824.0, "reward": 0.4199197590351105, "reward_std": 0.08985067158937454, "rewards/grpo_reward_func/mean": 0.4199197590351105, "rewards/grpo_reward_func/std": 0.09818078577518463, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.15625, "frac_reward_zero_std": 0.0, "grad_norm": 7.0, "learning_rate": 5.6e-07, "loss": -0.0, "num_tokens": 1653220.0, "reward": 0.44602805376052856, "reward_std": 0.10932175815105438, "rewards/grpo_reward_func/mean": 0.44602805376052856, "rewards/grpo_reward_func/std": 0.11537235230207443, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.1875, "frac_reward_zero_std": 0.0, "grad_norm": 12.5625, "learning_rate": 5.566666666666666e-07, "loss": -0.0, "num_tokens": 1665684.0, "reward": 0.4218568205833435, "reward_std": 0.09915173053741455, "rewards/grpo_reward_func/mean": 0.4218568205833435, "rewards/grpo_reward_func/std": 0.1479072868824005, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "learning_rate": 5.533333333333334e-07, "loss": -0.0, "num_tokens": 1678120.0, "reward": 0.3699283301830292, "reward_std": 0.05628474801778793, "rewards/grpo_reward_func/mean": 0.3699283301830292, "rewards/grpo_reward_func/std": 0.055360160768032074, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.25, "frac_reward_zero_std": 0.0, "grad_norm": 12.625, "learning_rate": 5.5e-07, "loss": 0.0, "num_tokens": 1690616.0, "reward": 0.43144893646240234, "reward_std": 0.097145214676857, "rewards/grpo_reward_func/mean": 0.43144893646240234, "rewards/grpo_reward_func/std": 0.09757841378450394, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.28125, "frac_reward_zero_std": 0.0, "grad_norm": 7.875, "learning_rate": 5.466666666666666e-07, "loss": -0.0, "num_tokens": 1703048.0, "reward": 0.37039631605148315, "reward_std": 0.06340405344963074, "rewards/grpo_reward_func/mean": 0.37039631605148315, "rewards/grpo_reward_func/std": 0.10630898922681808, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.3125, "frac_reward_zero_std": 0.0, "grad_norm": 5.5, "learning_rate": 5.433333333333334e-07, "loss": -0.0, "num_tokens": 1715404.0, "reward": 0.44485020637512207, "reward_std": 0.061223354190588, "rewards/grpo_reward_func/mean": 0.44485020637512207, "rewards/grpo_reward_func/std": 0.0653579831123352, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.34375, "frac_reward_zero_std": 0.0, "grad_norm": 18.75, "learning_rate": 5.4e-07, "loss": -0.0, "num_tokens": 1727876.0, "reward": 0.4389991760253906, "reward_std": 0.12622228264808655, "rewards/grpo_reward_func/mean": 0.4389991760253906, "rewards/grpo_reward_func/std": 0.1206517443060875, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.375, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "learning_rate": 5.366666666666666e-07, "loss": 0.0, "num_tokens": 1740252.0, "reward": 0.3506331741809845, "reward_std": 0.1391739398241043, "rewards/grpo_reward_func/mean": 0.3506331741809845, "rewards/grpo_reward_func/std": 0.14306746423244476, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.40625, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "learning_rate": 5.333333333333333e-07, "loss": -0.0, "num_tokens": 1752632.0, "reward": 0.5316411256790161, "reward_std": 0.10773089528083801, "rewards/grpo_reward_func/mean": 0.5316411256790161, "rewards/grpo_reward_func/std": 0.16645555198192596, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.4375, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "learning_rate": 5.3e-07, "loss": -0.0, "num_tokens": 1765040.0, "reward": 0.3930637836456299, "reward_std": 0.07452228665351868, "rewards/grpo_reward_func/mean": 0.3930637836456299, "rewards/grpo_reward_func/std": 0.07487671822309494, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.875, "completions/mean_terminated_length": 11.875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 4.46875, "frac_reward_zero_std": 0.0, "grad_norm": 15.25, "learning_rate": 5.266666666666666e-07, "loss": -0.0142, "num_tokens": 1777423.0, "reward": 0.3444192409515381, "reward_std": 0.1598653644323349, "rewards/grpo_reward_func/mean": 0.3444192409515381, "rewards/grpo_reward_func/std": 0.18078266084194183, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.5, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "learning_rate": 5.233333333333333e-07, "loss": 0.0, "num_tokens": 1789683.0, "reward": 0.5174664258956909, "reward_std": 0.07813962548971176, "rewards/grpo_reward_func/mean": 0.5174664258956909, "rewards/grpo_reward_func/std": 0.10316640138626099, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.53125, "frac_reward_zero_std": 0.0, "grad_norm": 7.625, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 1802119.0, "reward": 0.3699246048927307, "reward_std": 0.08162573724985123, "rewards/grpo_reward_func/mean": 0.3699246048927307, "rewards/grpo_reward_func/std": 0.09686075896024704, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.5625, "frac_reward_zero_std": 0.0, "grad_norm": 10.0, "learning_rate": 5.166666666666667e-07, "loss": 0.0, "num_tokens": 1814483.0, "reward": 0.4604162275791168, "reward_std": 0.19567811489105225, "rewards/grpo_reward_func/mean": 0.4604162275791168, "rewards/grpo_reward_func/std": 0.19948698580265045, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.59375, "frac_reward_zero_std": 0.0, "grad_norm": 7.90625, "learning_rate": 5.133333333333333e-07, "loss": -0.0, "num_tokens": 1827055.0, "reward": 0.41122761368751526, "reward_std": 0.08153079450130463, "rewards/grpo_reward_func/mean": 0.41122761368751526, "rewards/grpo_reward_func/std": 0.08045266568660736, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.625, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125, "learning_rate": 5.1e-07, "loss": 0.0, "num_tokens": 1839535.0, "reward": 0.3292653560638428, "reward_std": 0.04870126396417618, "rewards/grpo_reward_func/mean": 0.3292653560638428, "rewards/grpo_reward_func/std": 0.07768747955560684, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.65625, "frac_reward_zero_std": 0.0, "grad_norm": 17.5, "learning_rate": 5.066666666666667e-07, "loss": -0.0, "num_tokens": 1852003.0, "reward": 0.4356845021247864, "reward_std": 0.11020061373710632, "rewards/grpo_reward_func/mean": 0.4356845021247864, "rewards/grpo_reward_func/std": 0.12760911881923676, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.6875, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "learning_rate": 5.033333333333333e-07, "loss": 0.0, "num_tokens": 1864487.0, "reward": 0.47176241874694824, "reward_std": 0.1466352343559265, "rewards/grpo_reward_func/mean": 0.47176241874694824, "rewards/grpo_reward_func/std": 0.15562468767166138, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.71875, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 1876939.0, "reward": 0.49293607473373413, "reward_std": 0.15847747027873993, "rewards/grpo_reward_func/mean": 0.49293607473373413, "rewards/grpo_reward_func/std": 0.16349899768829346, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.75, "frac_reward_zero_std": 0.0, "grad_norm": 9.1875, "learning_rate": 4.966666666666666e-07, "loss": -0.0, "num_tokens": 1889499.0, "reward": 0.4915664792060852, "reward_std": 0.19223570823669434, "rewards/grpo_reward_func/mean": 0.4915664792060852, "rewards/grpo_reward_func/std": 0.1780252456665039, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.78125, "frac_reward_zero_std": 0.0, "grad_norm": 5.1875, "learning_rate": 4.933333333333333e-07, "loss": -0.0, "num_tokens": 1901911.0, "reward": 0.39836806058883667, "reward_std": 0.08220314979553223, "rewards/grpo_reward_func/mean": 0.39836806058883667, "rewards/grpo_reward_func/std": 0.09293971210718155, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.8125, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "learning_rate": 4.9e-07, "loss": 0.0, "num_tokens": 1914267.0, "reward": 0.5052293539047241, "reward_std": 0.05901884660124779, "rewards/grpo_reward_func/mean": 0.5052293539047241, "rewards/grpo_reward_func/std": 0.07250750809907913, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.84375, "frac_reward_zero_std": 0.0, "grad_norm": 15.8125, "learning_rate": 4.866666666666666e-07, "loss": -0.0, "num_tokens": 1926679.0, "reward": 0.2826748192310333, "reward_std": 0.0776633769273758, "rewards/grpo_reward_func/mean": 0.2826748192310333, "rewards/grpo_reward_func/std": 0.07334372401237488, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.875, "frac_reward_zero_std": 0.0, "grad_norm": 13.125, "learning_rate": 4.833333333333333e-07, "loss": -0.0, "num_tokens": 1939135.0, "reward": 0.38298332691192627, "reward_std": 0.15204550325870514, "rewards/grpo_reward_func/mean": 0.38298332691192627, "rewards/grpo_reward_func/std": 0.17793436348438263, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.90625, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 1951555.0, "reward": 0.45585888624191284, "reward_std": 0.08215408027172089, "rewards/grpo_reward_func/mean": 0.45585888624191284, "rewards/grpo_reward_func/std": 0.08240208774805069, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.9375, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "learning_rate": 4.7666666666666667e-07, "loss": 0.0, "num_tokens": 1964007.0, "reward": 0.4660055935382843, "reward_std": 0.17032964527606964, "rewards/grpo_reward_func/mean": 0.4660055935382843, "rewards/grpo_reward_func/std": 0.1751418560743332, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 4.96875, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "learning_rate": 4.733333333333333e-07, "loss": 0.0, "num_tokens": 1976483.0, "reward": 0.5173270106315613, "reward_std": 0.17288881540298462, "rewards/grpo_reward_func/mean": 0.5173270106315613, "rewards/grpo_reward_func/std": 0.1660362035036087, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.0, "frac_reward_zero_std": 0.0, "grad_norm": 12.125, "learning_rate": 4.6999999999999995e-07, "loss": 0.0, "num_tokens": 1988919.0, "reward": 0.4136430323123932, "reward_std": 0.22560492157936096, "rewards/grpo_reward_func/mean": 0.4136430323123932, "rewards/grpo_reward_func/std": 0.21545714139938354, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.03125, "frac_reward_zero_std": 0.0, "grad_norm": 13.1875, "learning_rate": 4.6666666666666666e-07, "loss": -0.0, "num_tokens": 2001379.0, "reward": 0.43014535307884216, "reward_std": 0.08317069709300995, "rewards/grpo_reward_func/mean": 0.43014535307884216, "rewards/grpo_reward_func/std": 0.07750457525253296, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.0625, "frac_reward_zero_std": 0.0, "grad_norm": 5.90625, "learning_rate": 4.633333333333333e-07, "loss": -0.0, "num_tokens": 2013971.0, "reward": 0.34895196557044983, "reward_std": 0.0512375608086586, "rewards/grpo_reward_func/mean": 0.34895196557044983, "rewards/grpo_reward_func/std": 0.04775034263730049, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.09375, "frac_reward_zero_std": 0.0, "grad_norm": 6.34375, "learning_rate": 4.6e-07, "loss": 0.0, "num_tokens": 2026375.0, "reward": 0.3551255464553833, "reward_std": 0.12043958902359009, "rewards/grpo_reward_func/mean": 0.3551255464553833, "rewards/grpo_reward_func/std": 0.13196633756160736, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.125, "frac_reward_zero_std": 0.0, "grad_norm": 14.1875, "learning_rate": 4.5666666666666665e-07, "loss": 0.0, "num_tokens": 2038823.0, "reward": 0.41090184450149536, "reward_std": 0.11341163516044617, "rewards/grpo_reward_func/mean": 0.41090184450149536, "rewards/grpo_reward_func/std": 0.11507044732570648, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.15625, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "learning_rate": 4.5333333333333326e-07, "loss": -0.0, "num_tokens": 2051219.0, "reward": 0.3054584860801697, "reward_std": 0.08504727482795715, "rewards/grpo_reward_func/mean": 0.3054584860801697, "rewards/grpo_reward_func/std": 0.09004215151071548, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.1875, "frac_reward_zero_std": 0.0, "grad_norm": 13.9375, "learning_rate": 4.5e-07, "loss": -0.0, "num_tokens": 2063651.0, "reward": 0.47015416622161865, "reward_std": 0.15467038750648499, "rewards/grpo_reward_func/mean": 0.47015416622161865, "rewards/grpo_reward_func/std": 0.153534397482872, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.21875, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "learning_rate": 4.4666666666666664e-07, "loss": -0.0, "num_tokens": 2075955.0, "reward": 0.4894865155220032, "reward_std": 0.07816055417060852, "rewards/grpo_reward_func/mean": 0.4894865155220032, "rewards/grpo_reward_func/std": 0.07534909248352051, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.25, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "learning_rate": 4.4333333333333336e-07, "loss": -0.0, "num_tokens": 2088435.0, "reward": 0.4603702425956726, "reward_std": 0.15144219994544983, "rewards/grpo_reward_func/mean": 0.4603702425956726, "rewards/grpo_reward_func/std": 0.16273239254951477, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.28125, "frac_reward_zero_std": 0.0, "grad_norm": 8.75, "learning_rate": 4.3999999999999997e-07, "loss": -0.0, "num_tokens": 2100919.0, "reward": 0.3637647032737732, "reward_std": 0.06757047772407532, "rewards/grpo_reward_func/mean": 0.3637647032737732, "rewards/grpo_reward_func/std": 0.08233585953712463, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.3125, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "learning_rate": 4.3666666666666663e-07, "loss": 0.0, "num_tokens": 2113343.0, "reward": 0.3543202579021454, "reward_std": 0.08441969752311707, "rewards/grpo_reward_func/mean": 0.3543202579021454, "rewards/grpo_reward_func/std": 0.08902662247419357, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.34375, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "num_tokens": 2125791.0, "reward": 0.47929999232292175, "reward_std": 0.17670738697052002, "rewards/grpo_reward_func/mean": 0.47929999232292175, "rewards/grpo_reward_func/std": 0.17999567091464996, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.375, "frac_reward_zero_std": 0.0, "grad_norm": 8.5625, "learning_rate": 4.2999999999999996e-07, "loss": -0.0, "num_tokens": 2138251.0, "reward": 0.3452494740486145, "reward_std": 0.08022183179855347, "rewards/grpo_reward_func/mean": 0.3452494740486145, "rewards/grpo_reward_func/std": 0.08067353814840317, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.40625, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "learning_rate": 4.266666666666667e-07, "loss": -0.0, "num_tokens": 2150659.0, "reward": 0.42406925559043884, "reward_std": 0.2445584237575531, "rewards/grpo_reward_func/mean": 0.42406925559043884, "rewards/grpo_reward_func/std": 0.22746475040912628, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.4375, "frac_reward_zero_std": 0.0, "grad_norm": 6.34375, "learning_rate": 4.2333333333333334e-07, "loss": -0.0, "num_tokens": 2163163.0, "reward": 0.4360213279724121, "reward_std": 0.07188587635755539, "rewards/grpo_reward_func/mean": 0.4360213279724121, "rewards/grpo_reward_func/std": 0.07112448662519455, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.46875, "frac_reward_zero_std": 0.0, "grad_norm": 7.65625, "learning_rate": 4.1999999999999995e-07, "loss": -0.0, "num_tokens": 2175679.0, "reward": 0.40680232644081116, "reward_std": 0.054570674896240234, "rewards/grpo_reward_func/mean": 0.40680232644081116, "rewards/grpo_reward_func/std": 0.05052686110138893, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.5, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "learning_rate": 4.1666666666666667e-07, "loss": -0.0, "num_tokens": 2188199.0, "reward": 0.4191306233406067, "reward_std": 0.11386445164680481, "rewards/grpo_reward_func/mean": 0.4191306233406067, "rewards/grpo_reward_func/std": 0.1961081475019455, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.53125, "frac_reward_zero_std": 0.0, "grad_norm": 13.4375, "learning_rate": 4.1333333333333333e-07, "loss": -0.0, "num_tokens": 2200571.0, "reward": 0.49487611651420593, "reward_std": 0.18403539061546326, "rewards/grpo_reward_func/mean": 0.49487611651420593, "rewards/grpo_reward_func/std": 0.17239995300769806, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.5625, "frac_reward_zero_std": 0.0, "grad_norm": 13.0625, "learning_rate": 4.0999999999999994e-07, "loss": 0.0, "num_tokens": 2212955.0, "reward": 0.5409983396530151, "reward_std": 0.12222976982593536, "rewards/grpo_reward_func/mean": 0.5409983396530151, "rewards/grpo_reward_func/std": 0.11841105669736862, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.59375, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "learning_rate": 4.0666666666666666e-07, "loss": 0.0, "num_tokens": 2225275.0, "reward": 0.3465573191642761, "reward_std": 0.05415717512369156, "rewards/grpo_reward_func/mean": 0.3465573191642761, "rewards/grpo_reward_func/std": 0.08967100828886032, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.625, "frac_reward_zero_std": 0.0, "grad_norm": 6.625, "learning_rate": 4.033333333333333e-07, "loss": -0.0, "num_tokens": 2237591.0, "reward": 0.40769240260124207, "reward_std": 0.061508819460868835, "rewards/grpo_reward_func/mean": 0.40769240260124207, "rewards/grpo_reward_func/std": 0.11384513974189758, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.65625, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2249983.0, "reward": 0.4172666072845459, "reward_std": 0.04795217514038086, "rewards/grpo_reward_func/mean": 0.4172666072845459, "rewards/grpo_reward_func/std": 0.06908071041107178, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.6875, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "learning_rate": 3.9666666666666665e-07, "loss": 0.0, "num_tokens": 2262391.0, "reward": 0.4887160658836365, "reward_std": 0.0936364233493805, "rewards/grpo_reward_func/mean": 0.4887160658836365, "rewards/grpo_reward_func/std": 0.12800146639347076, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.71875, "frac_reward_zero_std": 0.0, "grad_norm": 8.8125, "learning_rate": 3.933333333333333e-07, "loss": 0.0, "num_tokens": 2274847.0, "reward": 0.5363283157348633, "reward_std": 0.09925331920385361, "rewards/grpo_reward_func/mean": 0.5363283157348633, "rewards/grpo_reward_func/std": 0.09658796340227127, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.75, "frac_reward_zero_std": 0.0, "grad_norm": 10.625, "learning_rate": 3.8999999999999997e-07, "loss": 0.0, "num_tokens": 2287319.0, "reward": 0.41678112745285034, "reward_std": 0.13148340582847595, "rewards/grpo_reward_func/mean": 0.41678112745285034, "rewards/grpo_reward_func/std": 0.1416279673576355, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.78125, "frac_reward_zero_std": 0.0, "grad_norm": 9.5, "learning_rate": 3.8666666666666664e-07, "loss": -0.0, "num_tokens": 2299739.0, "reward": 0.49028465151786804, "reward_std": 0.09930803626775742, "rewards/grpo_reward_func/mean": 0.49028465151786804, "rewards/grpo_reward_func/std": 0.1043338030576706, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.8125, "frac_reward_zero_std": 0.0, "grad_norm": 7.1875, "learning_rate": 3.8333333333333335e-07, "loss": 0.0, "num_tokens": 2312179.0, "reward": 0.42906028032302856, "reward_std": 0.09733951836824417, "rewards/grpo_reward_func/mean": 0.42906028032302856, "rewards/grpo_reward_func/std": 0.101521797478199, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.84375, "frac_reward_zero_std": 0.0, "grad_norm": 6.59375, "learning_rate": 3.7999999999999996e-07, "loss": -0.0, "num_tokens": 2324643.0, "reward": 0.5322451591491699, "reward_std": 0.05065479129552841, "rewards/grpo_reward_func/mean": 0.5322451591491699, "rewards/grpo_reward_func/std": 0.10973110795021057, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.875, "frac_reward_zero_std": 0.0, "grad_norm": 15.25, "learning_rate": 3.766666666666666e-07, "loss": 0.0, "num_tokens": 2337095.0, "reward": 0.357377290725708, "reward_std": 0.11668767035007477, "rewards/grpo_reward_func/mean": 0.357377290725708, "rewards/grpo_reward_func/std": 0.11811976879835129, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.90625, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "learning_rate": 3.7333333333333334e-07, "loss": 0.0, "num_tokens": 2349527.0, "reward": 0.4484630823135376, "reward_std": 0.13092045485973358, "rewards/grpo_reward_func/mean": 0.4484630823135376, "rewards/grpo_reward_func/std": 0.18394383788108826, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.9375, "frac_reward_zero_std": 0.0, "grad_norm": 16.625, "learning_rate": 3.7e-07, "loss": 0.0, "num_tokens": 2361935.0, "reward": 0.5536394119262695, "reward_std": 0.1294117271900177, "rewards/grpo_reward_func/mean": 0.5536394119262695, "rewards/grpo_reward_func/std": 0.1366083174943924, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 5.96875, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "learning_rate": 3.666666666666666e-07, "loss": -0.0, "num_tokens": 2374335.0, "reward": 0.43622025847435, "reward_std": 0.036504555493593216, "rewards/grpo_reward_func/mean": 0.43622025847435, "rewards/grpo_reward_func/std": 0.04184015840291977, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.0, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "learning_rate": 3.6333333333333333e-07, "loss": 0.0, "num_tokens": 2386703.0, "reward": 0.32082119584083557, "reward_std": 0.08303728699684143, "rewards/grpo_reward_func/mean": 0.32082119584083557, "rewards/grpo_reward_func/std": 0.09865312278270721, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.03125, "frac_reward_zero_std": 0.0, "grad_norm": 12.0, "learning_rate": 3.6e-07, "loss": -0.0, "num_tokens": 2399127.0, "reward": 0.40079742670059204, "reward_std": 0.12725131213665009, "rewards/grpo_reward_func/mean": 0.40079742670059204, "rewards/grpo_reward_func/std": 0.16112373769283295, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.0625, "frac_reward_zero_std": 0.0, "grad_norm": 8.6875, "learning_rate": 3.5666666666666666e-07, "loss": -0.0, "num_tokens": 2411571.0, "reward": 0.43647855520248413, "reward_std": 0.10620959103107452, "rewards/grpo_reward_func/mean": 0.43647855520248413, "rewards/grpo_reward_func/std": 0.09909818321466446, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.09375, "frac_reward_zero_std": 0.0, "grad_norm": 10.6875, "learning_rate": 3.533333333333333e-07, "loss": 0.0, "num_tokens": 2423979.0, "reward": 0.3994408845901489, "reward_std": 0.17607587575912476, "rewards/grpo_reward_func/mean": 0.3994408845901489, "rewards/grpo_reward_func/std": 0.17166729271411896, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.125, "frac_reward_zero_std": 0.0, "grad_norm": 7.78125, "learning_rate": 3.5e-07, "loss": -0.0, "num_tokens": 2436363.0, "reward": 0.4736449420452118, "reward_std": 0.09779857844114304, "rewards/grpo_reward_func/mean": 0.4736449420452118, "rewards/grpo_reward_func/std": 0.11548104882240295, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0, "learning_rate": 3.4666666666666665e-07, "loss": 0.0, "num_tokens": 2448911.0, "reward": 0.38275349140167236, "reward_std": 0.07293462753295898, "rewards/grpo_reward_func/mean": 0.38275349140167236, "rewards/grpo_reward_func/std": 0.0916486382484436, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.1875, "frac_reward_zero_std": 0.0, "grad_norm": 14.125, "learning_rate": 3.433333333333333e-07, "loss": 0.0, "num_tokens": 2461263.0, "reward": 0.5372081995010376, "reward_std": 0.20845532417297363, "rewards/grpo_reward_func/mean": 0.5372081995010376, "rewards/grpo_reward_func/std": 0.21515534818172455, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.21875, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "num_tokens": 2473707.0, "reward": 0.415330171585083, "reward_std": 0.15996377170085907, "rewards/grpo_reward_func/mean": 0.415330171585083, "rewards/grpo_reward_func/std": 0.18506671488285065, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.25, "frac_reward_zero_std": 0.0, "grad_norm": 8.0625, "learning_rate": 3.3666666666666664e-07, "loss": -0.0, "num_tokens": 2486079.0, "reward": 0.41273248195648193, "reward_std": 0.050071652978658676, "rewards/grpo_reward_func/mean": 0.41273248195648193, "rewards/grpo_reward_func/std": 0.11006694287061691, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.28125, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "learning_rate": 3.333333333333333e-07, "loss": -0.0, "num_tokens": 2498575.0, "reward": 0.40237534046173096, "reward_std": 0.16011598706245422, "rewards/grpo_reward_func/mean": 0.40237534046173096, "rewards/grpo_reward_func/std": 0.1658114790916443, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.3125, "frac_reward_zero_std": 0.0, "grad_norm": 14.125, "learning_rate": 3.3e-07, "loss": 0.0, "num_tokens": 2511047.0, "reward": 0.40720921754837036, "reward_std": 0.10842312127351761, "rewards/grpo_reward_func/mean": 0.40720921754837036, "rewards/grpo_reward_func/std": 0.15348494052886963, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.34375, "frac_reward_zero_std": 0.0, "grad_norm": 6.75, "learning_rate": 3.2666666666666663e-07, "loss": -0.0, "num_tokens": 2523499.0, "reward": 0.46542418003082275, "reward_std": 0.1260077953338623, "rewards/grpo_reward_func/mean": 0.46542418003082275, "rewards/grpo_reward_func/std": 0.1437770575284958, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.375, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "learning_rate": 3.233333333333333e-07, "loss": -0.0, "num_tokens": 2535931.0, "reward": 0.4416119456291199, "reward_std": 0.10100536048412323, "rewards/grpo_reward_func/mean": 0.4416119456291199, "rewards/grpo_reward_func/std": 0.11735321581363678, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.40625, "frac_reward_zero_std": 0.0, "grad_norm": 6.1875, "learning_rate": 3.2e-07, "loss": -0.0, "num_tokens": 2548287.0, "reward": 0.40553370118141174, "reward_std": 0.13550561666488647, "rewards/grpo_reward_func/mean": 0.40553370118141174, "rewards/grpo_reward_func/std": 0.13743624091148376, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.4375, "frac_reward_zero_std": 0.0, "grad_norm": 9.75, "learning_rate": 3.166666666666666e-07, "loss": -0.0, "num_tokens": 2560711.0, "reward": 0.35497111082077026, "reward_std": 0.11463560163974762, "rewards/grpo_reward_func/mean": 0.35497111082077026, "rewards/grpo_reward_func/std": 0.12006353586912155, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.46875, "frac_reward_zero_std": 0.0, "grad_norm": 11.5625, "learning_rate": 3.1333333333333333e-07, "loss": 0.0, "num_tokens": 2573143.0, "reward": 0.4096822142601013, "reward_std": 0.05833249166607857, "rewards/grpo_reward_func/mean": 0.4096822142601013, "rewards/grpo_reward_func/std": 0.08212708681821823, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.3125, "learning_rate": 3.1e-07, "loss": -0.0, "num_tokens": 2585583.0, "reward": 0.4554346799850464, "reward_std": 0.12953370809555054, "rewards/grpo_reward_func/mean": 0.4554346799850464, "rewards/grpo_reward_func/std": 0.1593649685382843, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.53125, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "learning_rate": 3.066666666666666e-07, "loss": -0.0, "num_tokens": 2598031.0, "reward": 0.5756185054779053, "reward_std": 0.0809057205915451, "rewards/grpo_reward_func/mean": 0.5756185054779053, "rewards/grpo_reward_func/std": 0.10910212993621826, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.5625, "frac_reward_zero_std": 0.0, "grad_norm": 12.9375, "learning_rate": 3.033333333333333e-07, "loss": 0.0, "num_tokens": 2610507.0, "reward": 0.39368146657943726, "reward_std": 0.1372520923614502, "rewards/grpo_reward_func/mean": 0.39368146657943726, "rewards/grpo_reward_func/std": 0.13048243522644043, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.59375, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 2623039.0, "reward": 0.3540037274360657, "reward_std": 0.08001622557640076, "rewards/grpo_reward_func/mean": 0.3540037274360657, "rewards/grpo_reward_func/std": 0.08400996774435043, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.625, "frac_reward_zero_std": 0.0, "grad_norm": 7.8125, "learning_rate": 2.966666666666667e-07, "loss": 0.0, "num_tokens": 2635395.0, "reward": 0.44302040338516235, "reward_std": 0.08312968909740448, "rewards/grpo_reward_func/mean": 0.44302040338516235, "rewards/grpo_reward_func/std": 0.0891660526394844, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.65625, "frac_reward_zero_std": 0.0, "grad_norm": 7.78125, "learning_rate": 2.933333333333333e-07, "loss": -0.0, "num_tokens": 2647843.0, "reward": 0.3831726610660553, "reward_std": 0.05200519412755966, "rewards/grpo_reward_func/mean": 0.3831726610660553, "rewards/grpo_reward_func/std": 0.09167957305908203, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.6875, "frac_reward_zero_std": 0.0, "grad_norm": 4.4375, "learning_rate": 2.9e-07, "loss": 0.0, "num_tokens": 2660251.0, "reward": 0.3554950952529907, "reward_std": 0.05713435262441635, "rewards/grpo_reward_func/mean": 0.3554950952529907, "rewards/grpo_reward_func/std": 0.0688985213637352, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.71875, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "learning_rate": 2.866666666666667e-07, "loss": -0.0, "num_tokens": 2672727.0, "reward": 0.326229453086853, "reward_std": 0.08010618388652802, "rewards/grpo_reward_func/mean": 0.326229453086853, "rewards/grpo_reward_func/std": 0.08994052559137344, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.75, "frac_reward_zero_std": 0.0, "grad_norm": 12.125, "learning_rate": 2.833333333333333e-07, "loss": -0.0, "num_tokens": 2685119.0, "reward": 0.4351205825805664, "reward_std": 0.08398930728435516, "rewards/grpo_reward_func/mean": 0.4351205825805664, "rewards/grpo_reward_func/std": 0.08400265872478485, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.78125, "frac_reward_zero_std": 0.0, "grad_norm": 7.09375, "learning_rate": 2.8e-07, "loss": 0.0, "num_tokens": 2697507.0, "reward": 0.4564037621021271, "reward_std": 0.08567321300506592, "rewards/grpo_reward_func/mean": 0.4564037621021271, "rewards/grpo_reward_func/std": 0.08155813813209534, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.8125, "frac_reward_zero_std": 0.0, "grad_norm": 13.8125, "learning_rate": 2.766666666666667e-07, "loss": -0.0, "num_tokens": 2709927.0, "reward": 0.4280545115470886, "reward_std": 0.13084210455417633, "rewards/grpo_reward_func/mean": 0.4280545115470886, "rewards/grpo_reward_func/std": 0.13759230077266693, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.84375, "frac_reward_zero_std": 0.0, "grad_norm": 6.25, "learning_rate": 2.733333333333333e-07, "loss": 0.0, "num_tokens": 2722331.0, "reward": 0.44869235157966614, "reward_std": 0.0902545154094696, "rewards/grpo_reward_func/mean": 0.44869235157966614, "rewards/grpo_reward_func/std": 0.1127212718129158, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.875, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "learning_rate": 2.7e-07, "loss": 0.0, "num_tokens": 2734779.0, "reward": 0.4759725332260132, "reward_std": 0.12860512733459473, "rewards/grpo_reward_func/mean": 0.4759725332260132, "rewards/grpo_reward_func/std": 0.1384066343307495, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.90625, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "learning_rate": 2.6666666666666667e-07, "loss": -0.0, "num_tokens": 2747235.0, "reward": 0.5738496780395508, "reward_std": 0.11320274323225021, "rewards/grpo_reward_func/mean": 0.5738496780395508, "rewards/grpo_reward_func/std": 0.10654186457395554, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.9375, "frac_reward_zero_std": 0.0, "grad_norm": 8.9375, "learning_rate": 2.633333333333333e-07, "loss": -0.0, "num_tokens": 2759643.0, "reward": 0.33652713894844055, "reward_std": 0.10561183094978333, "rewards/grpo_reward_func/mean": 0.33652713894844055, "rewards/grpo_reward_func/std": 0.11127988249063492, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 6.96875, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "learning_rate": 2.6e-07, "loss": 0.0, "num_tokens": 2772083.0, "reward": 0.45456087589263916, "reward_std": 0.21474137902259827, "rewards/grpo_reward_func/mean": 0.45456087589263916, "rewards/grpo_reward_func/std": 0.20742803812026978, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.0, "frac_reward_zero_std": 0.0, "grad_norm": 12.25, "learning_rate": 2.5666666666666666e-07, "loss": -0.0, "num_tokens": 2784487.0, "reward": 0.36959922313690186, "reward_std": 0.12393350899219513, "rewards/grpo_reward_func/mean": 0.36959922313690186, "rewards/grpo_reward_func/std": 0.18545781075954437, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.03125, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "learning_rate": 2.533333333333333e-07, "loss": 0.0, "num_tokens": 2796895.0, "reward": 0.5148861408233643, "reward_std": 0.10401658713817596, "rewards/grpo_reward_func/mean": 0.5148861408233643, "rewards/grpo_reward_func/std": 0.10146593302488327, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.0625, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 2809283.0, "reward": 0.3833024799823761, "reward_std": 0.07489189505577087, "rewards/grpo_reward_func/mean": 0.3833024799823761, "rewards/grpo_reward_func/std": 0.07100249826908112, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.09375, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "learning_rate": 2.4666666666666665e-07, "loss": 0.0, "num_tokens": 2821811.0, "reward": 0.37905335426330566, "reward_std": 0.09207235276699066, "rewards/grpo_reward_func/mean": 0.37905335426330566, "rewards/grpo_reward_func/std": 0.10075780749320984, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.125, "frac_reward_zero_std": 0.0, "grad_norm": 0.0693359375, "learning_rate": 2.433333333333333e-07, "loss": 0.0, "num_tokens": 2834259.0, "reward": 0.5241298079490662, "reward_std": 0.050000011920928955, "rewards/grpo_reward_func/mean": 0.5241298079490662, "rewards/grpo_reward_func/std": 0.11461541801691055, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "learning_rate": 2.4e-07, "loss": -0.0, "num_tokens": 2846667.0, "reward": 0.38863605260849, "reward_std": 0.09145700931549072, "rewards/grpo_reward_func/mean": 0.38863605260849, "rewards/grpo_reward_func/std": 0.0854310691356659, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.1875, "frac_reward_zero_std": 0.0, "grad_norm": 13.6875, "learning_rate": 2.3666666666666664e-07, "loss": 0.0, "num_tokens": 2859183.0, "reward": 0.48604702949523926, "reward_std": 0.12953568994998932, "rewards/grpo_reward_func/mean": 0.48604702949523926, "rewards/grpo_reward_func/std": 0.12877187132835388, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.21875, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "num_tokens": 2871631.0, "reward": 0.49290764331817627, "reward_std": 0.1408785730600357, "rewards/grpo_reward_func/mean": 0.49290764331817627, "rewards/grpo_reward_func/std": 0.16115672886371613, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.25, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "learning_rate": 2.3e-07, "loss": 0.0, "num_tokens": 2884087.0, "reward": 0.389384388923645, "reward_std": 0.08452893793582916, "rewards/grpo_reward_func/mean": 0.389384388923645, "rewards/grpo_reward_func/std": 0.09952805191278458, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.28125, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "learning_rate": 2.2666666666666663e-07, "loss": 0.0, "num_tokens": 2896567.0, "reward": 0.42921292781829834, "reward_std": 0.12179729342460632, "rewards/grpo_reward_func/mean": 0.42921292781829834, "rewards/grpo_reward_func/std": 0.14654681086540222, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.3125, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 2.2333333333333332e-07, "loss": 0.0, "num_tokens": 2909027.0, "reward": 0.3906250596046448, "reward_std": 0.07476774603128433, "rewards/grpo_reward_func/mean": 0.3906250596046448, "rewards/grpo_reward_func/std": 0.07509444653987885, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.34375, "frac_reward_zero_std": 0.0, "grad_norm": 15.6875, "learning_rate": 2.1999999999999998e-07, "loss": -0.0, "num_tokens": 2921395.0, "reward": 0.590385913848877, "reward_std": 0.07703244686126709, "rewards/grpo_reward_func/mean": 0.590385913848877, "rewards/grpo_reward_func/std": 0.10371364653110504, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.375, "frac_reward_zero_std": 0.0, "grad_norm": 13.375, "learning_rate": 2.1666666666666667e-07, "loss": 0.0, "num_tokens": 2933635.0, "reward": 0.45836111903190613, "reward_std": 0.1561897248029709, "rewards/grpo_reward_func/mean": 0.45836111903190613, "rewards/grpo_reward_func/std": 0.1763034164905548, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.40625, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "learning_rate": 2.1333333333333334e-07, "loss": 0.0, "num_tokens": 2946019.0, "reward": 0.3915758430957794, "reward_std": 0.09102918207645416, "rewards/grpo_reward_func/mean": 0.3915758430957794, "rewards/grpo_reward_func/std": 0.09686020016670227, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.4375, "frac_reward_zero_std": 0.0, "grad_norm": 7.4375, "learning_rate": 2.0999999999999997e-07, "loss": 0.0, "num_tokens": 2958339.0, "reward": 0.5137478709220886, "reward_std": 0.06453146040439606, "rewards/grpo_reward_func/mean": 0.5137478709220886, "rewards/grpo_reward_func/std": 0.08145393431186676, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.46875, "frac_reward_zero_std": 0.0, "grad_norm": 7.6875, "learning_rate": 2.0666666666666666e-07, "loss": 0.0, "num_tokens": 2970795.0, "reward": 0.3901534080505371, "reward_std": 0.08677110075950623, "rewards/grpo_reward_func/mean": 0.3901534080505371, "rewards/grpo_reward_func/std": 0.0884600430727005, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.25, "learning_rate": 2.0333333333333333e-07, "loss": -0.0, "num_tokens": 2983199.0, "reward": 0.3966296911239624, "reward_std": 0.11454164981842041, "rewards/grpo_reward_func/mean": 0.3966296911239624, "rewards/grpo_reward_func/std": 0.11218782514333725, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.53125, "frac_reward_zero_std": 0.0, "grad_norm": 11.8125, "learning_rate": 2e-07, "loss": -0.0, "num_tokens": 2995671.0, "reward": 0.4128722548484802, "reward_std": 0.1050279289484024, "rewards/grpo_reward_func/mean": 0.4128722548484802, "rewards/grpo_reward_func/std": 0.15005381405353546, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.5625, "frac_reward_zero_std": 0.0, "grad_norm": 7.40625, "learning_rate": 1.9666666666666665e-07, "loss": -0.0, "num_tokens": 3008163.0, "reward": 0.41674578189849854, "reward_std": 0.130544051527977, "rewards/grpo_reward_func/mean": 0.41674578189849854, "rewards/grpo_reward_func/std": 0.1530657559633255, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.59375, "frac_reward_zero_std": 0.0, "grad_norm": 15.125, "learning_rate": 1.9333333333333332e-07, "loss": 0.0, "num_tokens": 3020547.0, "reward": 0.44154661893844604, "reward_std": 0.11442729830741882, "rewards/grpo_reward_func/mean": 0.44154661893844604, "rewards/grpo_reward_func/std": 0.15436801314353943, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.625, "frac_reward_zero_std": 0.0, "grad_norm": 11.875, "learning_rate": 1.8999999999999998e-07, "loss": -0.0, "num_tokens": 3032907.0, "reward": 0.41183507442474365, "reward_std": 0.11221058666706085, "rewards/grpo_reward_func/mean": 0.41183507442474365, "rewards/grpo_reward_func/std": 0.10689571499824524, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.65625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 1.8666666666666667e-07, "loss": -0.0, "num_tokens": 3045283.0, "reward": 0.3969360589981079, "reward_std": 0.13579751551151276, "rewards/grpo_reward_func/mean": 0.3969360589981079, "rewards/grpo_reward_func/std": 0.14742760360240936, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.6875, "frac_reward_zero_std": 0.0, "grad_norm": 14.5625, "learning_rate": 1.833333333333333e-07, "loss": 0.0, "num_tokens": 3057703.0, "reward": 0.3443870544433594, "reward_std": 0.20534491539001465, "rewards/grpo_reward_func/mean": 0.3443870544433594, "rewards/grpo_reward_func/std": 0.19916358590126038, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.71875, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "learning_rate": 1.8e-07, "loss": 0.0, "num_tokens": 3070135.0, "reward": 0.3964824378490448, "reward_std": 0.13892269134521484, "rewards/grpo_reward_func/mean": 0.3964824378490448, "rewards/grpo_reward_func/std": 0.15906588733196259, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.75, "frac_reward_zero_std": 0.0, "grad_norm": 9.5625, "learning_rate": 1.7666666666666666e-07, "loss": -0.0, "num_tokens": 3082619.0, "reward": 0.4210782051086426, "reward_std": 0.12285022437572479, "rewards/grpo_reward_func/mean": 0.4210782051086426, "rewards/grpo_reward_func/std": 0.1384182870388031, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.78125, "frac_reward_zero_std": 0.0, "grad_norm": 9.9375, "learning_rate": 1.7333333333333332e-07, "loss": -0.0, "num_tokens": 3095171.0, "reward": 0.49818533658981323, "reward_std": 0.10502855479717255, "rewards/grpo_reward_func/mean": 0.49818533658981323, "rewards/grpo_reward_func/std": 0.13689859211444855, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.8125, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "learning_rate": 1.7000000000000001e-07, "loss": -0.0, "num_tokens": 3107575.0, "reward": 0.39785051345825195, "reward_std": 0.0737057775259018, "rewards/grpo_reward_func/mean": 0.39785051345825195, "rewards/grpo_reward_func/std": 0.08374593406915665, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.84375, "frac_reward_zero_std": 0.0, "grad_norm": 7.1875, "learning_rate": 1.6666666666666665e-07, "loss": -0.0, "num_tokens": 3119987.0, "reward": 0.3956165909767151, "reward_std": 0.08730382472276688, "rewards/grpo_reward_func/mean": 0.3956165909767151, "rewards/grpo_reward_func/std": 0.12590822577476501, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.875, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "learning_rate": 1.6333333333333331e-07, "loss": -0.0, "num_tokens": 3132419.0, "reward": 0.40047013759613037, "reward_std": 0.09308422356843948, "rewards/grpo_reward_func/mean": 0.40047013759613037, "rewards/grpo_reward_func/std": 0.10088325291872025, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.90625, "frac_reward_zero_std": 0.0, "grad_norm": 9.125, "learning_rate": 1.6e-07, "loss": -0.0, "num_tokens": 3144851.0, "reward": 0.33004331588745117, "reward_std": 0.04140020161867142, "rewards/grpo_reward_func/mean": 0.33004331588745117, "rewards/grpo_reward_func/std": 0.04383409395813942, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.9375, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "learning_rate": 1.5666666666666667e-07, "loss": -0.0, "num_tokens": 3157367.0, "reward": 0.495669424533844, "reward_std": 0.095655158162117, "rewards/grpo_reward_func/mean": 0.495669424533844, "rewards/grpo_reward_func/std": 0.10840737819671631, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 7.96875, "frac_reward_zero_std": 0.0, "grad_norm": 10.3125, "learning_rate": 1.533333333333333e-07, "loss": 0.0, "num_tokens": 3169795.0, "reward": 0.398048460483551, "reward_std": 0.08092916011810303, "rewards/grpo_reward_func/mean": 0.398048460483551, "rewards/grpo_reward_func/std": 0.08040700852870941, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.0, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "learning_rate": 1.5e-07, "loss": 0.0, "num_tokens": 3182271.0, "reward": 0.4666450321674347, "reward_std": 0.08053655922412872, "rewards/grpo_reward_func/mean": 0.4666450321674347, "rewards/grpo_reward_func/std": 0.11888416111469269, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.03125, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "learning_rate": 1.4666666666666666e-07, "loss": 0.0, "num_tokens": 3194699.0, "reward": 0.4015364646911621, "reward_std": 0.16598042845726013, "rewards/grpo_reward_func/mean": 0.4015364646911621, "rewards/grpo_reward_func/std": 0.16788989305496216, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.0625, "frac_reward_zero_std": 0.0, "grad_norm": 10.375, "learning_rate": 1.4333333333333335e-07, "loss": -0.0, "num_tokens": 3207091.0, "reward": 0.48480066657066345, "reward_std": 0.15683354437351227, "rewards/grpo_reward_func/mean": 0.48480066657066345, "rewards/grpo_reward_func/std": 0.14960500597953796, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.09375, "frac_reward_zero_std": 0.0, "grad_norm": 9.8125, "learning_rate": 1.4e-07, "loss": 0.0, "num_tokens": 3219447.0, "reward": 0.49088042974472046, "reward_std": 0.16376182436943054, "rewards/grpo_reward_func/mean": 0.49088042974472046, "rewards/grpo_reward_func/std": 0.17037776112556458, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.125, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "learning_rate": 1.3666666666666665e-07, "loss": 0.0, "num_tokens": 3231843.0, "reward": 0.4621606469154358, "reward_std": 0.16308224201202393, "rewards/grpo_reward_func/mean": 0.4621606469154358, "rewards/grpo_reward_func/std": 0.18942511081695557, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.15625, "frac_reward_zero_std": 0.0, "grad_norm": 9.375, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "num_tokens": 3244199.0, "reward": 0.521634578704834, "reward_std": 0.08799108862876892, "rewards/grpo_reward_func/mean": 0.521634578704834, "rewards/grpo_reward_func/std": 0.08898300677537918, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.1875, "frac_reward_zero_std": 0.0, "grad_norm": 12.375, "learning_rate": 1.3e-07, "loss": -0.0, "num_tokens": 3256663.0, "reward": 0.5014014840126038, "reward_std": 0.10305628925561905, "rewards/grpo_reward_func/mean": 0.5014014840126038, "rewards/grpo_reward_func/std": 0.11243268102407455, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.21875, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "learning_rate": 1.2666666666666666e-07, "loss": 0.0, "num_tokens": 3269115.0, "reward": 0.49657315015792847, "reward_std": 0.14654701948165894, "rewards/grpo_reward_func/mean": 0.49657315015792847, "rewards/grpo_reward_func/std": 0.14595918357372284, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.25, "frac_reward_zero_std": 0.0, "grad_norm": 11.75, "learning_rate": 1.2333333333333333e-07, "loss": 0.0, "num_tokens": 3281535.0, "reward": 0.4041872024536133, "reward_std": 0.1379416286945343, "rewards/grpo_reward_func/mean": 0.4041872024536133, "rewards/grpo_reward_func/std": 0.1561095267534256, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.28125, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "learning_rate": 1.2e-07, "loss": -0.0, "num_tokens": 3293927.0, "reward": 0.5414110422134399, "reward_std": 0.1973114013671875, "rewards/grpo_reward_func/mean": 0.5414110422134399, "rewards/grpo_reward_func/std": 0.18588195741176605, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.3125, "frac_reward_zero_std": 0.0, "grad_norm": 11.5, "learning_rate": 1.1666666666666667e-07, "loss": -0.0, "num_tokens": 3306379.0, "reward": 0.3934594988822937, "reward_std": 0.025219213217496872, "rewards/grpo_reward_func/mean": 0.3934594988822937, "rewards/grpo_reward_func/std": 0.027134951204061508, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.34375, "frac_reward_zero_std": 0.0, "grad_norm": 14.4375, "learning_rate": 1.1333333333333332e-07, "loss": -0.0, "num_tokens": 3318815.0, "reward": 0.40915048122406006, "reward_std": 0.09651514887809753, "rewards/grpo_reward_func/mean": 0.40915048122406006, "rewards/grpo_reward_func/std": 0.11164474487304688, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.375, "frac_reward_zero_std": 0.0, "grad_norm": 13.3125, "learning_rate": 1.0999999999999999e-07, "loss": 0.0, "num_tokens": 3331207.0, "reward": 0.3795730471611023, "reward_std": 0.08440607786178589, "rewards/grpo_reward_func/mean": 0.3795730471611023, "rewards/grpo_reward_func/std": 0.08232571184635162, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.40625, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "learning_rate": 1.0666666666666667e-07, "loss": -0.0, "num_tokens": 3343791.0, "reward": 0.45081427693367004, "reward_std": 0.13623002171516418, "rewards/grpo_reward_func/mean": 0.45081427693367004, "rewards/grpo_reward_func/std": 0.14548543095588684, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.4375, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 1.0333333333333333e-07, "loss": -0.0, "num_tokens": 3356207.0, "reward": 0.46069252490997314, "reward_std": 0.07286226749420166, "rewards/grpo_reward_func/mean": 0.46069252490997314, "rewards/grpo_reward_func/std": 0.08740860968828201, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.46875, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 1e-07, "loss": 0.0, "num_tokens": 3368691.0, "reward": 0.37001582980155945, "reward_std": 0.08882021903991699, "rewards/grpo_reward_func/mean": 0.37001582980155945, "rewards/grpo_reward_func/std": 0.08371038734912872, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.5, "frac_reward_zero_std": 0.0, "grad_norm": 11.625, "learning_rate": 9.666666666666666e-08, "loss": -0.0, "num_tokens": 3380935.0, "reward": 0.46963435411453247, "reward_std": 0.12529392540454865, "rewards/grpo_reward_func/mean": 0.46963435411453247, "rewards/grpo_reward_func/std": 0.13837039470672607, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.53125, "frac_reward_zero_std": 0.0, "grad_norm": 13.3125, "learning_rate": 9.333333333333334e-08, "loss": -0.0, "num_tokens": 3393443.0, "reward": 0.5679957866668701, "reward_std": 0.08565768599510193, "rewards/grpo_reward_func/mean": 0.5679957866668701, "rewards/grpo_reward_func/std": 0.08279130607843399, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.5625, "frac_reward_zero_std": 0.0, "grad_norm": 9.25, "learning_rate": 9e-08, "loss": 0.0, "num_tokens": 3405771.0, "reward": 0.3129928410053253, "reward_std": 0.07984557747840881, "rewards/grpo_reward_func/mean": 0.3129928410053253, "rewards/grpo_reward_func/std": 0.08136677742004395, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.59375, "frac_reward_zero_std": 0.0, "grad_norm": 14.0625, "learning_rate": 8.666666666666666e-08, "loss": 0.0, "num_tokens": 3418243.0, "reward": 0.3354572653770447, "reward_std": 0.09963542222976685, "rewards/grpo_reward_func/mean": 0.3354572653770447, "rewards/grpo_reward_func/std": 0.09654007852077484, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.625, "frac_reward_zero_std": 0.0, "grad_norm": 15.25, "learning_rate": 8.333333333333333e-08, "loss": 0.0, "num_tokens": 3430691.0, "reward": 0.41226309537887573, "reward_std": 0.1296028345823288, "rewards/grpo_reward_func/mean": 0.41226309537887573, "rewards/grpo_reward_func/std": 0.12655113637447357, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.65625, "frac_reward_zero_std": 0.0, "grad_norm": 10.125, "learning_rate": 8e-08, "loss": 0.0, "num_tokens": 3443151.0, "reward": 0.4148029088973999, "reward_std": 0.1445026993751526, "rewards/grpo_reward_func/mean": 0.4148029088973999, "rewards/grpo_reward_func/std": 0.1527920663356781, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.6875, "frac_reward_zero_std": 0.0, "grad_norm": 12.1875, "learning_rate": 7.666666666666665e-08, "loss": 0.0, "num_tokens": 3455579.0, "reward": 0.3091464638710022, "reward_std": 0.09873013943433762, "rewards/grpo_reward_func/mean": 0.3091464638710022, "rewards/grpo_reward_func/std": 0.12618468701839447, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.71875, "frac_reward_zero_std": 0.0, "grad_norm": 8.375, "learning_rate": 7.333333333333333e-08, "loss": 0.0, "num_tokens": 3468011.0, "reward": 0.412067174911499, "reward_std": 0.11878905445337296, "rewards/grpo_reward_func/mean": 0.412067174911499, "rewards/grpo_reward_func/std": 0.12399723380804062, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.75, "frac_reward_zero_std": 0.0, "grad_norm": 12.6875, "learning_rate": 7e-08, "loss": -0.0, "num_tokens": 3480459.0, "reward": 0.3863711953163147, "reward_std": 0.1872004270553589, "rewards/grpo_reward_func/mean": 0.3863711953163147, "rewards/grpo_reward_func/std": 0.18860581517219543, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.78125, "frac_reward_zero_std": 0.0, "grad_norm": 16.125, "learning_rate": 6.666666666666667e-08, "loss": -0.0, "num_tokens": 3492923.0, "reward": 0.40867847204208374, "reward_std": 0.14625820517539978, "rewards/grpo_reward_func/mean": 0.40867847204208374, "rewards/grpo_reward_func/std": 0.14255009591579437, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.8125, "frac_reward_zero_std": 0.0, "grad_norm": 8.875, "learning_rate": 6.333333333333333e-08, "loss": 0.0, "num_tokens": 3505387.0, "reward": 0.45097100734710693, "reward_std": 0.21717840433120728, "rewards/grpo_reward_func/mean": 0.45097100734710693, "rewards/grpo_reward_func/std": 0.20403653383255005, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.84375, "frac_reward_zero_std": 0.0, "grad_norm": 10.1875, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 3517903.0, "reward": 0.49071210622787476, "reward_std": 0.13102422654628754, "rewards/grpo_reward_func/mean": 0.49071210622787476, "rewards/grpo_reward_func/std": 0.14358305931091309, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.875, "frac_reward_zero_std": 0.0, "grad_norm": 8.0, "learning_rate": 5.666666666666666e-08, "loss": -0.0, "num_tokens": 3530331.0, "reward": 0.471984326839447, "reward_std": 0.11608313769102097, "rewards/grpo_reward_func/mean": 0.471984326839447, "rewards/grpo_reward_func/std": 0.12841607630252838, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.90625, "frac_reward_zero_std": 0.0, "grad_norm": 8.1875, "learning_rate": 5.3333333333333334e-08, "loss": -0.0, "num_tokens": 3542763.0, "reward": 0.39699018001556396, "reward_std": 0.11195935308933258, "rewards/grpo_reward_func/mean": 0.39699018001556396, "rewards/grpo_reward_func/std": 0.16198311746120453, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.9375, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "learning_rate": 5e-08, "loss": 0.0, "num_tokens": 3555171.0, "reward": 0.40294522047042847, "reward_std": 0.11233559250831604, "rewards/grpo_reward_func/mean": 0.40294522047042847, "rewards/grpo_reward_func/std": 0.12867507338523865, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 8.96875, "frac_reward_zero_std": 0.0, "grad_norm": 8.125, "learning_rate": 4.666666666666667e-08, "loss": 0.0, "num_tokens": 3567639.0, "reward": 0.45153820514678955, "reward_std": 0.10483110696077347, "rewards/grpo_reward_func/mean": 0.45153820514678955, "rewards/grpo_reward_func/std": 0.11334265768527985, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.0, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "learning_rate": 4.333333333333333e-08, "loss": 0.0, "num_tokens": 3580055.0, "reward": 0.39824116230010986, "reward_std": 0.0965305045247078, "rewards/grpo_reward_func/mean": 0.39824116230010986, "rewards/grpo_reward_func/std": 0.10601532459259033, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.03125, "frac_reward_zero_std": 0.0, "grad_norm": 10.8125, "learning_rate": 4e-08, "loss": -0.0, "num_tokens": 3592511.0, "reward": 0.3396638035774231, "reward_std": 0.0737166702747345, "rewards/grpo_reward_func/mean": 0.3396638035774231, "rewards/grpo_reward_func/std": 0.07909521460533142, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.0625, "frac_reward_zero_std": 0.0, "grad_norm": 12.0625, "learning_rate": 3.6666666666666664e-08, "loss": -0.0, "num_tokens": 3604847.0, "reward": 0.4459681212902069, "reward_std": 0.13664312660694122, "rewards/grpo_reward_func/mean": 0.4459681212902069, "rewards/grpo_reward_func/std": 0.1500515192747116, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.09375, "frac_reward_zero_std": 0.0, "grad_norm": 16.625, "learning_rate": 3.3333333333333334e-08, "loss": -0.0, "num_tokens": 3617299.0, "reward": 0.35913753509521484, "reward_std": 0.10111263394355774, "rewards/grpo_reward_func/mean": 0.35913753509521484, "rewards/grpo_reward_func/std": 0.10508442670106888, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.125, "frac_reward_zero_std": 0.0, "grad_norm": 10.5625, "learning_rate": 3e-08, "loss": -0.0, "num_tokens": 3629567.0, "reward": 0.4349736273288727, "reward_std": 0.12172282487154007, "rewards/grpo_reward_func/mean": 0.4349736273288727, "rewards/grpo_reward_func/std": 0.11470159143209457, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.15625, "frac_reward_zero_std": 0.0, "grad_norm": 11.0625, "learning_rate": 2.6666666666666667e-08, "loss": 0.0, "num_tokens": 3642071.0, "reward": 0.396597683429718, "reward_std": 0.12911826372146606, "rewards/grpo_reward_func/mean": 0.396597683429718, "rewards/grpo_reward_func/std": 0.12233106046915054, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.1875, "frac_reward_zero_std": 0.0, "grad_norm": 17.5, "learning_rate": 2.3333333333333334e-08, "loss": 0.0, "num_tokens": 3654479.0, "reward": 0.5098578929901123, "reward_std": 0.1227826401591301, "rewards/grpo_reward_func/mean": 0.5098578929901123, "rewards/grpo_reward_func/std": 0.11480940878391266, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.21875, "frac_reward_zero_std": 0.0, "grad_norm": 7.5625, "learning_rate": 2e-08, "loss": 0.0, "num_tokens": 3666891.0, "reward": 0.40734565258026123, "reward_std": 0.11240965127944946, "rewards/grpo_reward_func/mean": 0.40734565258026123, "rewards/grpo_reward_func/std": 0.13429103791713715, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.25, "frac_reward_zero_std": 0.0, "grad_norm": 3.21875, "learning_rate": 1.6666666666666667e-08, "loss": -0.0, "num_tokens": 3679471.0, "reward": 0.37585046887397766, "reward_std": 0.048339828848838806, "rewards/grpo_reward_func/mean": 0.37585046887397766, "rewards/grpo_reward_func/std": 0.059352707117795944, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.28125, "frac_reward_zero_std": 0.0, "grad_norm": 11.4375, "learning_rate": 1.3333333333333334e-08, "loss": 0.0, "num_tokens": 3691927.0, "reward": 0.3830341100692749, "reward_std": 0.09623756259679794, "rewards/grpo_reward_func/mean": 0.3830341100692749, "rewards/grpo_reward_func/std": 0.0935094878077507, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.3125, "frac_reward_zero_std": 0.0, "grad_norm": 10.9375, "learning_rate": 1e-08, "loss": 0.0, "num_tokens": 3704431.0, "reward": 0.5307860374450684, "reward_std": 0.15707515180110931, "rewards/grpo_reward_func/mean": 0.5307860374450684, "rewards/grpo_reward_func/std": 0.15192177891731262, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.34375, "frac_reward_zero_std": 0.0, "grad_norm": 12.4375, "learning_rate": 6.666666666666667e-09, "loss": 0.0, "num_tokens": 3716835.0, "reward": 0.5075388550758362, "reward_std": 0.13507473468780518, "rewards/grpo_reward_func/mean": 0.5075388550758362, "rewards/grpo_reward_func/std": 0.19023331999778748, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 9.375, "frac_reward_zero_std": 0.0, "grad_norm": 13.0, "learning_rate": 3.3333333333333334e-09, "loss": -0.0, "num_tokens": 3729299.0, "reward": 0.4454175531864166, "reward_std": 0.07014341652393341, "rewards/grpo_reward_func/mean": 0.4454175531864166, "rewards/grpo_reward_func/std": 0.1258506029844284, "step": 300 } ], "logging_steps": 1, "max_steps": 300, "num_input_tokens_seen": 3729299, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }