brainrl-grpo-single-m/checkpoint-300/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.375,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.3125,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 12528.0,
      "reward": 0.39676433801651,
      "reward_std": 0.11280547827482224,
      "rewards/grpo_reward_func/mean": 0.39676433801651,
      "rewards/grpo_reward_func/std": 0.13478560745716095,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.8125,
      "learning_rate": 9.966666666666667e-07,
      "loss": 0.0,
      "num_tokens": 24884.0,
      "reward": 0.4752987027168274,
      "reward_std": 0.13702644407749176,
      "rewards/grpo_reward_func/mean": 0.4752987027168274,
      "rewards/grpo_reward_func/std": 0.17374587059020996,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.625,
      "learning_rate": 9.933333333333333e-07,
      "loss": -0.0,
      "num_tokens": 37352.0,
      "reward": 0.44525083899497986,
      "reward_std": 0.10103905200958252,
      "rewards/grpo_reward_func/mean": 0.44525083899497986,
      "rewards/grpo_reward_func/std": 0.0979275107383728,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.125,
      "learning_rate": 9.9e-07,
      "loss": 0.0,
      "num_tokens": 49744.0,
      "reward": 0.399270236492157,
      "reward_std": 0.10935800522565842,
      "rewards/grpo_reward_func/mean": 0.399270236492157,
      "rewards/grpo_reward_func/std": 0.10536573082208633,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.75,
      "learning_rate": 9.866666666666666e-07,
      "loss": 0.0,
      "num_tokens": 62224.0,
      "reward": 0.3989260196685791,
      "reward_std": 0.11544467508792877,
      "rewards/grpo_reward_func/mean": 0.3989260196685791,
      "rewards/grpo_reward_func/std": 0.11394146084785461,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.125,
      "learning_rate": 9.833333333333332e-07,
      "loss": -0.0,
      "num_tokens": 74736.0,
      "reward": 0.42444688081741333,
      "reward_std": 0.14600424468517303,
      "rewards/grpo_reward_func/mean": 0.42444688081741333,
      "rewards/grpo_reward_func/std": 0.17498743534088135,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.125,
      "learning_rate": 9.8e-07,
      "loss": 0.0,
      "num_tokens": 87100.0,
      "reward": 0.4266095757484436,
      "reward_std": 0.0954706147313118,
      "rewards/grpo_reward_func/mean": 0.4266095757484436,
      "rewards/grpo_reward_func/std": 0.09790605306625366,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5,
      "learning_rate": 9.766666666666667e-07,
      "loss": -0.0,
      "num_tokens": 99496.0,
      "reward": 0.4947161376476288,
      "reward_std": 0.07030671834945679,
      "rewards/grpo_reward_func/mean": 0.4947161376476288,
      "rewards/grpo_reward_func/std": 0.07488483190536499,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.8125,
      "learning_rate": 9.733333333333333e-07,
      "loss": -0.0,
      "num_tokens": 111844.0,
      "reward": 0.4835072159767151,
      "reward_std": 0.1621960997581482,
      "rewards/grpo_reward_func/mean": 0.4835072159767151,
      "rewards/grpo_reward_func/std": 0.17284278571605682,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0,
      "learning_rate": 9.7e-07,
      "loss": -0.0,
      "num_tokens": 124240.0,
      "reward": 0.4783210754394531,
      "reward_std": 0.09915027022361755,
      "rewards/grpo_reward_func/mean": 0.4783210754394531,
      "rewards/grpo_reward_func/std": 0.11161749064922333,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.6875,
      "learning_rate": 9.666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 136652.0,
      "reward": 0.40330448746681213,
      "reward_std": 0.10881966352462769,
      "rewards/grpo_reward_func/mean": 0.40330448746681213,
      "rewards/grpo_reward_func/std": 0.1156788170337677,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.75,
      "learning_rate": 9.633333333333334e-07,
      "loss": -0.0,
      "num_tokens": 149048.0,
      "reward": 0.41300415992736816,
      "reward_std": 0.13600921630859375,
      "rewards/grpo_reward_func/mean": 0.41300415992736816,
      "rewards/grpo_reward_func/std": 0.1646273136138916,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.625,
      "learning_rate": 9.6e-07,
      "loss": 0.0,
      "num_tokens": 161276.0,
      "reward": 0.4857324957847595,
      "reward_std": 0.09516896307468414,
      "rewards/grpo_reward_func/mean": 0.4857324957847595,
      "rewards/grpo_reward_func/std": 0.09173914790153503,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.8125,
      "learning_rate": 9.566666666666667e-07,
      "loss": 0.0,
      "num_tokens": 173780.0,
      "reward": 0.4015089273452759,
      "reward_std": 0.06604111194610596,
      "rewards/grpo_reward_func/mean": 0.4015089273452759,
      "rewards/grpo_reward_func/std": 0.07018419355154037,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.6875,
      "learning_rate": 9.533333333333333e-07,
      "loss": -0.0,
      "num_tokens": 186192.0,
      "reward": 0.31999891996383667,
      "reward_std": 0.0805739015340805,
      "rewards/grpo_reward_func/mean": 0.31999891996383667,
      "rewards/grpo_reward_func/std": 0.08632533997297287,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.1875,
      "learning_rate": 9.499999999999999e-07,
      "loss": -0.0,
      "num_tokens": 198684.0,
      "reward": 0.39560186862945557,
      "reward_std": 0.09632067382335663,
      "rewards/grpo_reward_func/mean": 0.39560186862945557,
      "rewards/grpo_reward_func/std": 0.09369846433401108,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5,
      "learning_rate": 9.466666666666666e-07,
      "loss": 0.0,
      "num_tokens": 211096.0,
      "reward": 0.48571068048477173,
      "reward_std": 0.15206970274448395,
      "rewards/grpo_reward_func/mean": 0.48571068048477173,
      "rewards/grpo_reward_func/std": 0.1438637524843216,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.09375,
      "learning_rate": 9.433333333333333e-07,
      "loss": -0.0,
      "num_tokens": 223552.0,
      "reward": 0.45060235261917114,
      "reward_std": 0.05437461659312248,
      "rewards/grpo_reward_func/mean": 0.45060235261917114,
      "rewards/grpo_reward_func/std": 0.140779510140419,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.78125,
      "learning_rate": 9.399999999999999e-07,
      "loss": 0.0,
      "num_tokens": 236036.0,
      "reward": 0.4261874556541443,
      "reward_std": 0.09510611742734909,
      "rewards/grpo_reward_func/mean": 0.4261874556541443,
      "rewards/grpo_reward_func/std": 0.10084228217601776,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.375,
      "learning_rate": 9.366666666666666e-07,
      "loss": -0.0,
      "num_tokens": 248448.0,
      "reward": 0.29703885316848755,
      "reward_std": 0.046393271535634995,
      "rewards/grpo_reward_func/mean": 0.29703885316848755,
      "rewards/grpo_reward_func/std": 0.04335997626185417,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0625,
      "learning_rate": 9.333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 260776.0,
      "reward": 0.45774269104003906,
      "reward_std": 0.16561079025268555,
      "rewards/grpo_reward_func/mean": 0.45774269104003906,
      "rewards/grpo_reward_func/std": 0.15406657755374908,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.125,
      "learning_rate": 9.3e-07,
      "loss": 0.0,
      "num_tokens": 273248.0,
      "reward": 0.4235140085220337,
      "reward_std": 0.06906857341527939,
      "rewards/grpo_reward_func/mean": 0.4235140085220337,
      "rewards/grpo_reward_func/std": 0.07242283225059509,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.25,
      "learning_rate": 9.266666666666665e-07,
      "loss": 0.0,
      "num_tokens": 285724.0,
      "reward": 0.36918026208877563,
      "reward_std": 0.06028338894248009,
      "rewards/grpo_reward_func/mean": 0.36918026208877563,
      "rewards/grpo_reward_func/std": 0.0693485215306282,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0625,
      "learning_rate": 9.233333333333333e-07,
      "loss": 0.0,
      "num_tokens": 298132.0,
      "reward": 0.3204312324523926,
      "reward_std": 0.07052356004714966,
      "rewards/grpo_reward_func/mean": 0.3204312324523926,
      "rewards/grpo_reward_func/std": 0.09546414762735367,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.0625,
      "learning_rate": 9.2e-07,
      "loss": -0.0,
      "num_tokens": 310584.0,
      "reward": 0.38078033924102783,
      "reward_std": 0.13373351097106934,
      "rewards/grpo_reward_func/mean": 0.38078033924102783,
      "rewards/grpo_reward_func/std": 0.13402824103832245,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.6875,
      "learning_rate": 9.166666666666665e-07,
      "loss": 0.0,
      "num_tokens": 323076.0,
      "reward": 0.3454480767250061,
      "reward_std": 0.10349850356578827,
      "rewards/grpo_reward_func/mean": 0.3454480767250061,
      "rewards/grpo_reward_func/std": 0.12671217322349548,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.96875,
      "learning_rate": 9.133333333333333e-07,
      "loss": 0.0,
      "num_tokens": 335520.0,
      "reward": 0.3619287312030792,
      "reward_std": 0.12553678452968597,
      "rewards/grpo_reward_func/mean": 0.3619287312030792,
      "rewards/grpo_reward_func/std": 0.1537715494632721,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.9375,
      "learning_rate": 9.1e-07,
      "loss": 0.0,
      "num_tokens": 347940.0,
      "reward": 0.3436325788497925,
      "reward_std": 0.09887667000293732,
      "rewards/grpo_reward_func/mean": 0.3436325788497925,
      "rewards/grpo_reward_func/std": 0.12251166999340057,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0,
      "learning_rate": 9.066666666666665e-07,
      "loss": -0.0,
      "num_tokens": 360388.0,
      "reward": 0.4369204044342041,
      "reward_std": 0.19640696048736572,
      "rewards/grpo_reward_func/mean": 0.4369204044342041,
      "rewards/grpo_reward_func/std": 0.1927463412284851,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.8125,
      "learning_rate": 9.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 372832.0,
      "reward": 0.4874047338962555,
      "reward_std": 0.053364820778369904,
      "rewards/grpo_reward_func/mean": 0.4874047338962555,
      "rewards/grpo_reward_func/std": 0.08248723298311234,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.78125,
      "learning_rate": 9e-07,
      "loss": -0.0,
      "num_tokens": 385256.0,
      "reward": 0.4391651451587677,
      "reward_std": 0.07597412914037704,
      "rewards/grpo_reward_func/mean": 0.4391651451587677,
      "rewards/grpo_reward_func/std": 0.13502921164035797,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5,
      "learning_rate": 8.966666666666666e-07,
      "loss": 0.0,
      "num_tokens": 397784.0,
      "reward": 0.552140474319458,
      "reward_std": 0.1218448132276535,
      "rewards/grpo_reward_func/mean": 0.552140474319458,
      "rewards/grpo_reward_func/std": 0.11282333731651306,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5625,
      "learning_rate": 8.933333333333333e-07,
      "loss": -0.0,
      "num_tokens": 410304.0,
      "reward": 0.4041430950164795,
      "reward_std": 0.1936928927898407,
      "rewards/grpo_reward_func/mean": 0.4041430950164795,
      "rewards/grpo_reward_func/std": 0.18484662473201752,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.90625,
      "learning_rate": 8.9e-07,
      "loss": -0.0,
      "num_tokens": 422796.0,
      "reward": 0.41248780488967896,
      "reward_std": 0.15024888515472412,
      "rewards/grpo_reward_func/mean": 0.41248780488967896,
      "rewards/grpo_reward_func/std": 0.16827252507209778,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.625,
      "learning_rate": 8.866666666666667e-07,
      "loss": 0.0,
      "num_tokens": 435260.0,
      "reward": 0.4898865818977356,
      "reward_std": 0.11311106383800507,
      "rewards/grpo_reward_func/mean": 0.4898865818977356,
      "rewards/grpo_reward_func/std": 0.11546135693788528,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.25,
      "learning_rate": 8.833333333333333e-07,
      "loss": -0.0,
      "num_tokens": 447656.0,
      "reward": 0.402587354183197,
      "reward_std": 0.07555107772350311,
      "rewards/grpo_reward_func/mean": 0.402587354183197,
      "rewards/grpo_reward_func/std": 0.07951883971691132,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.25,
      "learning_rate": 8.799999999999999e-07,
      "loss": 0.0,
      "num_tokens": 460100.0,
      "reward": 0.4937467575073242,
      "reward_std": 0.11035488545894623,
      "rewards/grpo_reward_func/mean": 0.4937467575073242,
      "rewards/grpo_reward_func/std": 0.11266050487756729,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 8.766666666666667e-07,
      "loss": -0.0,
      "num_tokens": 472580.0,
      "reward": 0.42728495597839355,
      "reward_std": 0.05418732762336731,
      "rewards/grpo_reward_func/mean": 0.42728495597839355,
      "rewards/grpo_reward_func/std": 0.05117730051279068,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.9375,
      "learning_rate": 8.733333333333333e-07,
      "loss": -0.0,
      "num_tokens": 485016.0,
      "reward": 0.3464398980140686,
      "reward_std": 0.05486953258514404,
      "rewards/grpo_reward_func/mean": 0.3464398980140686,
      "rewards/grpo_reward_func/std": 0.10943454504013062,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.40625,
      "learning_rate": 8.699999999999999e-07,
      "loss": -0.0,
      "num_tokens": 497416.0,
      "reward": 0.43631184101104736,
      "reward_std": 0.09718433767557144,
      "rewards/grpo_reward_func/mean": 0.43631184101104736,
      "rewards/grpo_reward_func/std": 0.17311933636665344,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.78125,
      "learning_rate": 8.666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 509832.0,
      "reward": 0.5329959392547607,
      "reward_std": 0.11580680310726166,
      "rewards/grpo_reward_func/mean": 0.5329959392547607,
      "rewards/grpo_reward_func/std": 0.11687568575143814,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.75,
      "learning_rate": 8.633333333333333e-07,
      "loss": -0.0,
      "num_tokens": 522252.0,
      "reward": 0.44177818298339844,
      "reward_std": 0.13238248229026794,
      "rewards/grpo_reward_func/mean": 0.44177818298339844,
      "rewards/grpo_reward_func/std": 0.12943537533283234,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.375,
      "learning_rate": 8.599999999999999e-07,
      "loss": -0.0,
      "num_tokens": 534660.0,
      "reward": 0.5416427850723267,
      "reward_std": 0.09374570846557617,
      "rewards/grpo_reward_func/mean": 0.5416427850723267,
      "rewards/grpo_reward_func/std": 0.11684079468250275,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.125,
      "learning_rate": 8.566666666666667e-07,
      "loss": 0.0,
      "num_tokens": 547064.0,
      "reward": 0.3880234658718109,
      "reward_std": 0.06982941925525665,
      "rewards/grpo_reward_func/mean": 0.3880234658718109,
      "rewards/grpo_reward_func/std": 0.09098156541585922,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.34375,
      "learning_rate": 8.533333333333334e-07,
      "loss": -0.0,
      "num_tokens": 559488.0,
      "reward": 0.33481428027153015,
      "reward_std": 0.06352214515209198,
      "rewards/grpo_reward_func/mean": 0.33481428027153015,
      "rewards/grpo_reward_func/std": 0.08472999185323715,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.0625,
      "learning_rate": 8.499999999999999e-07,
      "loss": 0.0,
      "num_tokens": 571944.0,
      "reward": 0.387703001499176,
      "reward_std": 0.07385663688182831,
      "rewards/grpo_reward_func/mean": 0.387703001499176,
      "rewards/grpo_reward_func/std": 0.11046246439218521,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.125,
      "learning_rate": 8.466666666666667e-07,
      "loss": 0.0,
      "num_tokens": 584324.0,
      "reward": 0.5441805124282837,
      "reward_std": 0.11389695107936859,
      "rewards/grpo_reward_func/mean": 0.5441805124282837,
      "rewards/grpo_reward_func/std": 0.13207265734672546,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.8125,
      "learning_rate": 8.433333333333333e-07,
      "loss": 0.0,
      "num_tokens": 596692.0,
      "reward": 0.488021582365036,
      "reward_std": 0.13947440683841705,
      "rewards/grpo_reward_func/mean": 0.488021582365036,
      "rewards/grpo_reward_func/std": 0.15811356902122498,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.421875,
      "learning_rate": 8.399999999999999e-07,
      "loss": -0.0,
      "num_tokens": 609168.0,
      "reward": 0.3698539733886719,
      "reward_std": 0.04929333180189133,
      "rewards/grpo_reward_func/mean": 0.3698539733886719,
      "rewards/grpo_reward_func/std": 0.05231497436761856,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.59375,
      "learning_rate": 8.366666666666667e-07,
      "loss": -0.0,
      "num_tokens": 621624.0,
      "reward": 0.46477562189102173,
      "reward_std": 0.07750491052865982,
      "rewards/grpo_reward_func/mean": 0.46477562189102173,
      "rewards/grpo_reward_func/std": 0.15642288327217102,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.0,
      "learning_rate": 8.333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 633868.0,
      "reward": 0.43864211440086365,
      "reward_std": 0.13110151886940002,
      "rewards/grpo_reward_func/mean": 0.43864211440086365,
      "rewards/grpo_reward_func/std": 0.14933471381664276,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.625,
      "learning_rate": 8.299999999999999e-07,
      "loss": 0.0,
      "num_tokens": 646324.0,
      "reward": 0.3448641300201416,
      "reward_std": 0.06778337061405182,
      "rewards/grpo_reward_func/mean": 0.3448641300201416,
      "rewards/grpo_reward_func/std": 0.06967282295227051,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.9375,
      "learning_rate": 8.266666666666667e-07,
      "loss": 0.0,
      "num_tokens": 658764.0,
      "reward": 0.4265494644641876,
      "reward_std": 0.11092057079076767,
      "rewards/grpo_reward_func/mean": 0.4265494644641876,
      "rewards/grpo_reward_func/std": 0.11681105941534042,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.3125,
      "learning_rate": 8.233333333333333e-07,
      "loss": 0.0,
      "num_tokens": 671084.0,
      "reward": 0.3909933567047119,
      "reward_std": 0.062042489647865295,
      "rewards/grpo_reward_func/mean": 0.3909933567047119,
      "rewards/grpo_reward_func/std": 0.12040998041629791,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.6875,
      "learning_rate": 8.199999999999999e-07,
      "loss": -0.0,
      "num_tokens": 683556.0,
      "reward": 0.3827119469642639,
      "reward_std": 0.0810474157333374,
      "rewards/grpo_reward_func/mean": 0.3827119469642639,
      "rewards/grpo_reward_func/std": 0.10648734867572784,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.625,
      "learning_rate": 8.166666666666666e-07,
      "loss": 0.0,
      "num_tokens": 696044.0,
      "reward": 0.43536075949668884,
      "reward_std": 0.13194429874420166,
      "rewards/grpo_reward_func/mean": 0.43536075949668884,
      "rewards/grpo_reward_func/std": 0.14542116224765778,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5625,
      "learning_rate": 8.133333333333333e-07,
      "loss": -0.0,
      "num_tokens": 708468.0,
      "reward": 0.4286166727542877,
      "reward_std": 0.07387880980968475,
      "rewards/grpo_reward_func/mean": 0.4286166727542877,
      "rewards/grpo_reward_func/std": 0.10452007502317429,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5625,
      "learning_rate": 8.1e-07,
      "loss": -0.0,
      "num_tokens": 720940.0,
      "reward": 0.38893401622772217,
      "reward_std": 0.0943751409649849,
      "rewards/grpo_reward_func/mean": 0.38893401622772217,
      "rewards/grpo_reward_func/std": 0.12028432637453079,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.1875,
      "learning_rate": 8.066666666666666e-07,
      "loss": -0.0,
      "num_tokens": 733360.0,
      "reward": 0.4644596576690674,
      "reward_std": 0.16205663979053497,
      "rewards/grpo_reward_func/mean": 0.4644596576690674,
      "rewards/grpo_reward_func/std": 0.15505553781986237,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.1875,
      "learning_rate": 8.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 745704.0,
      "reward": 0.46369504928588867,
      "reward_std": 0.0912257730960846,
      "rewards/grpo_reward_func/mean": 0.46369504928588867,
      "rewards/grpo_reward_func/std": 0.09050611406564713,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.84375,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 758080.0,
      "reward": 0.4551791548728943,
      "reward_std": 0.12297463417053223,
      "rewards/grpo_reward_func/mean": 0.4551791548728943,
      "rewards/grpo_reward_func/std": 0.14138628542423248,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0,
      "learning_rate": 7.966666666666666e-07,
      "loss": 0.0,
      "num_tokens": 770712.0,
      "reward": 0.39730104804039,
      "reward_std": 0.06629657000303268,
      "rewards/grpo_reward_func/mean": 0.39730104804039,
      "rewards/grpo_reward_func/std": 0.08781840652227402,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 1.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.625,
      "learning_rate": 7.933333333333333e-07,
      "loss": 0.0,
      "num_tokens": 783120.0,
      "reward": 0.40575429797172546,
      "reward_std": 0.09323962777853012,
      "rewards/grpo_reward_func/mean": 0.40575429797172546,
      "rewards/grpo_reward_func/std": 0.1281837671995163,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.0,
      "learning_rate": 7.9e-07,
      "loss": 0.0,
      "num_tokens": 795568.0,
      "reward": 0.4539070129394531,
      "reward_std": 0.1893976330757141,
      "rewards/grpo_reward_func/mean": 0.4539070129394531,
      "rewards/grpo_reward_func/std": 0.17878401279449463,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.625,
      "learning_rate": 7.866666666666666e-07,
      "loss": 0.0,
      "num_tokens": 808072.0,
      "reward": 0.42031583189964294,
      "reward_std": 0.06885866075754166,
      "rewards/grpo_reward_func/mean": 0.42031583189964294,
      "rewards/grpo_reward_func/std": 0.06720545887947083,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.9375,
      "learning_rate": 7.833333333333333e-07,
      "loss": -0.0,
      "num_tokens": 820592.0,
      "reward": 0.446481317281723,
      "reward_std": 0.06617365032434464,
      "rewards/grpo_reward_func/mean": 0.446481317281723,
      "rewards/grpo_reward_func/std": 0.11224810034036636,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.0625,
      "learning_rate": 7.799999999999999e-07,
      "loss": -0.0,
      "num_tokens": 833008.0,
      "reward": 0.29850703477859497,
      "reward_std": 0.07878842949867249,
      "rewards/grpo_reward_func/mean": 0.29850703477859497,
      "rewards/grpo_reward_func/std": 0.09381019324064255,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.8125,
      "learning_rate": 7.766666666666666e-07,
      "loss": 0.0,
      "num_tokens": 845476.0,
      "reward": 0.3700866401195526,
      "reward_std": 0.11176452040672302,
      "rewards/grpo_reward_func/mean": 0.3700866401195526,
      "rewards/grpo_reward_func/std": 0.1271413266658783,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.75,
      "learning_rate": 7.733333333333333e-07,
      "loss": -0.0,
      "num_tokens": 857896.0,
      "reward": 0.4782499074935913,
      "reward_std": 0.10448910295963287,
      "rewards/grpo_reward_func/mean": 0.4782499074935913,
      "rewards/grpo_reward_func/std": 0.125322625041008,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.5,
      "learning_rate": 7.699999999999999e-07,
      "loss": -0.0,
      "num_tokens": 870308.0,
      "reward": 0.44694995880126953,
      "reward_std": 0.11892125755548477,
      "rewards/grpo_reward_func/mean": 0.44694995880126953,
      "rewards/grpo_reward_func/std": 0.15172399580478668,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25,
      "learning_rate": 7.666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 882696.0,
      "reward": 0.48773661255836487,
      "reward_std": 0.18720099329948425,
      "rewards/grpo_reward_func/mean": 0.48773661255836487,
      "rewards/grpo_reward_func/std": 0.19652612507343292,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.21875,
      "learning_rate": 7.633333333333333e-07,
      "loss": 0.0,
      "num_tokens": 895208.0,
      "reward": 0.360309362411499,
      "reward_std": 0.05594930052757263,
      "rewards/grpo_reward_func/mean": 0.360309362411499,
      "rewards/grpo_reward_func/std": 0.08431853353977203,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.3125,
      "learning_rate": 7.599999999999999e-07,
      "loss": 0.0,
      "num_tokens": 907548.0,
      "reward": 0.4548572897911072,
      "reward_std": 0.1430705040693283,
      "rewards/grpo_reward_func/mean": 0.4548572897911072,
      "rewards/grpo_reward_func/std": 0.144826740026474,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.40625,
      "learning_rate": 7.566666666666667e-07,
      "loss": -0.0,
      "num_tokens": 919976.0,
      "reward": 0.43647801876068115,
      "reward_std": 0.10883159935474396,
      "rewards/grpo_reward_func/mean": 0.43647801876068115,
      "rewards/grpo_reward_func/std": 0.13386793434619904,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.96875,
      "learning_rate": 7.533333333333332e-07,
      "loss": 0.0,
      "num_tokens": 932436.0,
      "reward": 0.3631000518798828,
      "reward_std": 0.055175162851810455,
      "rewards/grpo_reward_func/mean": 0.3631000518798828,
      "rewards/grpo_reward_func/std": 0.061299730092287064,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.625,
      "learning_rate": 7.5e-07,
      "loss": -0.0,
      "num_tokens": 944744.0,
      "reward": 0.3734683394432068,
      "reward_std": 0.07731673121452332,
      "rewards/grpo_reward_func/mean": 0.3734683394432068,
      "rewards/grpo_reward_func/std": 0.1018432006239891,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.75,
      "learning_rate": 7.466666666666667e-07,
      "loss": -0.0,
      "num_tokens": 957140.0,
      "reward": 0.3586929738521576,
      "reward_std": 0.08576677739620209,
      "rewards/grpo_reward_func/mean": 0.3586929738521576,
      "rewards/grpo_reward_func/std": 0.09627655893564224,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.1875,
      "learning_rate": 7.433333333333332e-07,
      "loss": -0.0,
      "num_tokens": 969588.0,
      "reward": 0.3304125964641571,
      "reward_std": 0.09432289004325867,
      "rewards/grpo_reward_func/mean": 0.3304125964641571,
      "rewards/grpo_reward_func/std": 0.12439437210559845,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.625,
      "learning_rate": 7.4e-07,
      "loss": -0.0,
      "num_tokens": 982032.0,
      "reward": 0.4600115418434143,
      "reward_std": 0.11891645193099976,
      "rewards/grpo_reward_func/mean": 0.4600115418434143,
      "rewards/grpo_reward_func/std": 0.11769349873065948,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5625,
      "learning_rate": 7.366666666666667e-07,
      "loss": 0.0,
      "num_tokens": 994440.0,
      "reward": 0.4921344816684723,
      "reward_std": 0.18801572918891907,
      "rewards/grpo_reward_func/mean": 0.4921344816684723,
      "rewards/grpo_reward_func/std": 0.17593181133270264,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.03125,
      "learning_rate": 7.333333333333332e-07,
      "loss": -0.0,
      "num_tokens": 1006908.0,
      "reward": 0.44369810819625854,
      "reward_std": 0.11731548607349396,
      "rewards/grpo_reward_func/mean": 0.44369810819625854,
      "rewards/grpo_reward_func/std": 0.13351494073867798,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.40625,
      "learning_rate": 7.3e-07,
      "loss": -0.0,
      "num_tokens": 1019360.0,
      "reward": 0.4988051652908325,
      "reward_std": 0.08421847224235535,
      "rewards/grpo_reward_func/mean": 0.4988051652908325,
      "rewards/grpo_reward_func/std": 0.12857672572135925,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.1875,
      "learning_rate": 7.266666666666667e-07,
      "loss": 0.0,
      "num_tokens": 1031756.0,
      "reward": 0.4094837009906769,
      "reward_std": 0.10778755694627762,
      "rewards/grpo_reward_func/mean": 0.4094837009906769,
      "rewards/grpo_reward_func/std": 0.11033328622579575,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.375,
      "learning_rate": 7.233333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1044160.0,
      "reward": 0.3499518632888794,
      "reward_std": 0.07542143762111664,
      "rewards/grpo_reward_func/mean": 0.3499518632888794,
      "rewards/grpo_reward_func/std": 0.08578986674547195,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.875,
      "learning_rate": 7.2e-07,
      "loss": -0.0,
      "num_tokens": 1056580.0,
      "reward": 0.4997272491455078,
      "reward_std": 0.1262975037097931,
      "rewards/grpo_reward_func/mean": 0.4997272491455078,
      "rewards/grpo_reward_func/std": 0.1279306709766388,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.0625,
      "learning_rate": 7.166666666666667e-07,
      "loss": 0.0,
      "num_tokens": 1069020.0,
      "reward": 0.46792131662368774,
      "reward_std": 0.13234254717826843,
      "rewards/grpo_reward_func/mean": 0.46792131662368774,
      "rewards/grpo_reward_func/std": 0.1700320839881897,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.375,
      "learning_rate": 7.133333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1081496.0,
      "reward": 0.4166927635669708,
      "reward_std": 0.07564548403024673,
      "rewards/grpo_reward_func/mean": 0.4166927635669708,
      "rewards/grpo_reward_func/std": 0.19586633145809174,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.5,
      "learning_rate": 7.1e-07,
      "loss": 0.0,
      "num_tokens": 1093816.0,
      "reward": 0.42825716733932495,
      "reward_std": 0.1293352246284485,
      "rewards/grpo_reward_func/mean": 0.42825716733932495,
      "rewards/grpo_reward_func/std": 0.1340746283531189,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.96875,
      "learning_rate": 7.066666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1106336.0,
      "reward": 0.40863943099975586,
      "reward_std": 0.061242297291755676,
      "rewards/grpo_reward_func/mean": 0.40863943099975586,
      "rewards/grpo_reward_func/std": 0.11059094965457916,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.875,
      "learning_rate": 7.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1118756.0,
      "reward": 0.44183290004730225,
      "reward_std": 0.1359260380268097,
      "rewards/grpo_reward_func/mean": 0.44183290004730225,
      "rewards/grpo_reward_func/std": 0.15313053131103516,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.9375,
      "learning_rate": 7e-07,
      "loss": -0.0,
      "num_tokens": 1131108.0,
      "reward": 0.4604765474796295,
      "reward_std": 0.09057141840457916,
      "rewards/grpo_reward_func/mean": 0.4604765474796295,
      "rewards/grpo_reward_func/std": 0.17239472270011902,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.84375,
      "learning_rate": 6.966666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1143520.0,
      "reward": 0.37243229150772095,
      "reward_std": 0.07444402575492859,
      "rewards/grpo_reward_func/mean": 0.37243229150772095,
      "rewards/grpo_reward_func/std": 0.1061118021607399,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25,
      "learning_rate": 6.933333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1156008.0,
      "reward": 0.5441325902938843,
      "reward_std": 0.11369525641202927,
      "rewards/grpo_reward_func/mean": 0.5441325902938843,
      "rewards/grpo_reward_func/std": 0.11172118782997131,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.3125,
      "learning_rate": 6.9e-07,
      "loss": -0.0,
      "num_tokens": 1168452.0,
      "reward": 0.4581533670425415,
      "reward_std": 0.11172451823949814,
      "rewards/grpo_reward_func/mean": 0.4581533670425415,
      "rewards/grpo_reward_func/std": 0.1257813274860382,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 2.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.6875,
      "learning_rate": 6.866666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1180928.0,
      "reward": 0.4434836208820343,
      "reward_std": 0.14923422038555145,
      "rewards/grpo_reward_func/mean": 0.4434836208820343,
      "rewards/grpo_reward_func/std": 0.1542947143316269,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.5625,
      "learning_rate": 6.833333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1193352.0,
      "reward": 0.3983464241027832,
      "reward_std": 0.08742759376764297,
      "rewards/grpo_reward_func/mean": 0.3983464241027832,
      "rewards/grpo_reward_func/std": 0.12986424565315247,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.875,
      "learning_rate": 6.800000000000001e-07,
      "loss": -0.0,
      "num_tokens": 1205796.0,
      "reward": 0.5015304088592529,
      "reward_std": 0.08956287801265717,
      "rewards/grpo_reward_func/mean": 0.5015304088592529,
      "rewards/grpo_reward_func/std": 0.08333175629377365,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.875,
      "learning_rate": 6.766666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1218244.0,
      "reward": 0.47066164016723633,
      "reward_std": 0.19255688786506653,
      "rewards/grpo_reward_func/mean": 0.47066164016723633,
      "rewards/grpo_reward_func/std": 0.1828991174697876,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.5625,
      "learning_rate": 6.733333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1230632.0,
      "reward": 0.49644234776496887,
      "reward_std": 0.10233695805072784,
      "rewards/grpo_reward_func/mean": 0.49644234776496887,
      "rewards/grpo_reward_func/std": 0.09938962757587433,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.0,
      "learning_rate": 6.7e-07,
      "loss": -0.0,
      "num_tokens": 1243024.0,
      "reward": 0.48214682936668396,
      "reward_std": 0.1728937327861786,
      "rewards/grpo_reward_func/mean": 0.48214682936668396,
      "rewards/grpo_reward_func/std": 0.16634704172611237,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.1875,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1255484.0,
      "reward": 0.4351156949996948,
      "reward_std": 0.12486094236373901,
      "rewards/grpo_reward_func/mean": 0.4351156949996948,
      "rewards/grpo_reward_func/std": 0.1314164698123932,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.4375,
      "learning_rate": 6.633333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1267856.0,
      "reward": 0.38795578479766846,
      "reward_std": 0.1968497335910797,
      "rewards/grpo_reward_func/mean": 0.38795578479766846,
      "rewards/grpo_reward_func/std": 0.18232691287994385,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.0625,
      "learning_rate": 6.6e-07,
      "loss": 0.0,
      "num_tokens": 1280280.0,
      "reward": 0.3891702890396118,
      "reward_std": 0.09787960350513458,
      "rewards/grpo_reward_func/mean": 0.3891702890396118,
      "rewards/grpo_reward_func/std": 0.09284209460020065,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.06298828125,
      "learning_rate": 6.566666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1292752.0,
      "reward": 0.39056217670440674,
      "reward_std": 0.04999999329447746,
      "rewards/grpo_reward_func/mean": 0.39056217670440674,
      "rewards/grpo_reward_func/std": 0.04636901617050171,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.75,
      "learning_rate": 6.533333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1305200.0,
      "reward": 0.4605242609977722,
      "reward_std": 0.13093939423561096,
      "rewards/grpo_reward_func/mean": 0.4605242609977722,
      "rewards/grpo_reward_func/std": 0.15952207148075104,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.6875,
      "learning_rate": 6.5e-07,
      "loss": -0.0,
      "num_tokens": 1317660.0,
      "reward": 0.3946014940738678,
      "reward_std": 0.09192033857107162,
      "rewards/grpo_reward_func/mean": 0.3946014940738678,
      "rewards/grpo_reward_func/std": 0.10782631486654282,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.6875,
      "learning_rate": 6.466666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1330068.0,
      "reward": 0.4714941084384918,
      "reward_std": 0.09265273809432983,
      "rewards/grpo_reward_func/mean": 0.4714941084384918,
      "rewards/grpo_reward_func/std": 0.12330163270235062,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.5625,
      "learning_rate": 6.433333333333332e-07,
      "loss": 0.0,
      "num_tokens": 1342620.0,
      "reward": 0.49374109506607056,
      "reward_std": 0.0895591527223587,
      "rewards/grpo_reward_func/mean": 0.49374109506607056,
      "rewards/grpo_reward_func/std": 0.1332620531320572,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.625,
      "learning_rate": 6.4e-07,
      "loss": 0.0,
      "num_tokens": 1355016.0,
      "reward": 0.3305853009223938,
      "reward_std": 0.04621565341949463,
      "rewards/grpo_reward_func/mean": 0.3305853009223938,
      "rewards/grpo_reward_func/std": 0.04419610649347305,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.6875,
      "learning_rate": 6.366666666666667e-07,
      "loss": -0.0,
      "num_tokens": 1367452.0,
      "reward": 0.5173900723457336,
      "reward_std": 0.14908233284950256,
      "rewards/grpo_reward_func/mean": 0.5173900723457336,
      "rewards/grpo_reward_func/std": 0.15880633890628815,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.9375,
      "learning_rate": 6.333333333333332e-07,
      "loss": -0.0,
      "num_tokens": 1379760.0,
      "reward": 0.3797125816345215,
      "reward_std": 0.10961093008518219,
      "rewards/grpo_reward_func/mean": 0.3797125816345215,
      "rewards/grpo_reward_func/std": 0.12369874864816666,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.875,
      "learning_rate": 6.3e-07,
      "loss": 0.0,
      "num_tokens": 1392296.0,
      "reward": 0.3952435255050659,
      "reward_std": 0.07089774310588837,
      "rewards/grpo_reward_func/mean": 0.3952435255050659,
      "rewards/grpo_reward_func/std": 0.09734237939119339,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.25,
      "learning_rate": 6.266666666666667e-07,
      "loss": 0.0,
      "num_tokens": 1404748.0,
      "reward": 0.4383198916912079,
      "reward_std": 0.08845233917236328,
      "rewards/grpo_reward_func/mean": 0.4383198916912079,
      "rewards/grpo_reward_func/std": 0.08347002416849136,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.125,
      "learning_rate": 6.233333333333332e-07,
      "loss": -0.0,
      "num_tokens": 1417172.0,
      "reward": 0.3984643220901489,
      "reward_std": 0.08412055671215057,
      "rewards/grpo_reward_func/mean": 0.3984643220901489,
      "rewards/grpo_reward_func/std": 0.08139137923717499,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.0,
      "learning_rate": 6.2e-07,
      "loss": -0.0,
      "num_tokens": 1429572.0,
      "reward": 0.3756071925163269,
      "reward_std": 0.1621457189321518,
      "rewards/grpo_reward_func/mean": 0.3756071925163269,
      "rewards/grpo_reward_func/std": 0.16212420165538788,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0625,
      "learning_rate": 6.166666666666667e-07,
      "loss": -0.0,
      "num_tokens": 1441984.0,
      "reward": 0.3367416262626648,
      "reward_std": 0.10579686611890793,
      "rewards/grpo_reward_func/mean": 0.3367416262626648,
      "rewards/grpo_reward_func/std": 0.12276742607355118,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.09375,
      "learning_rate": 6.133333333333332e-07,
      "loss": -0.0,
      "num_tokens": 1454520.0,
      "reward": 0.33171868324279785,
      "reward_std": 0.05540106073021889,
      "rewards/grpo_reward_func/mean": 0.33171868324279785,
      "rewards/grpo_reward_func/std": 0.05543047562241554,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.4375,
      "learning_rate": 6.1e-07,
      "loss": -0.0,
      "num_tokens": 1466968.0,
      "reward": 0.46069973707199097,
      "reward_std": 0.08953073620796204,
      "rewards/grpo_reward_func/mean": 0.46069973707199097,
      "rewards/grpo_reward_func/std": 0.10067260265350342,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5,
      "learning_rate": 6.066666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1479328.0,
      "reward": 0.49788278341293335,
      "reward_std": 0.12688566744327545,
      "rewards/grpo_reward_func/mean": 0.49788278341293335,
      "rewards/grpo_reward_func/std": 0.12214919179677963,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 6.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1491788.0,
      "reward": 0.35892003774642944,
      "reward_std": 0.0625436007976532,
      "rewards/grpo_reward_func/mean": 0.35892003774642944,
      "rewards/grpo_reward_func/std": 0.09081238508224487,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 6e-07,
      "loss": 0.0,
      "num_tokens": 1504220.0,
      "reward": 0.38591668009757996,
      "reward_std": 0.15822480618953705,
      "rewards/grpo_reward_func/mean": 0.38591668009757996,
      "rewards/grpo_reward_func/std": 0.16854539513587952,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.25,
      "learning_rate": 5.966666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1516652.0,
      "reward": 0.43537092208862305,
      "reward_std": 0.14132292568683624,
      "rewards/grpo_reward_func/mean": 0.43537092208862305,
      "rewards/grpo_reward_func/std": 0.15050342679023743,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.125,
      "learning_rate": 5.933333333333334e-07,
      "loss": -0.0,
      "num_tokens": 1529072.0,
      "reward": 0.4112863540649414,
      "reward_std": 0.08730175346136093,
      "rewards/grpo_reward_func/mean": 0.4112863540649414,
      "rewards/grpo_reward_func/std": 0.09073270857334137,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 5.9e-07,
      "loss": -0.0,
      "num_tokens": 1541488.0,
      "reward": 0.3833653926849365,
      "reward_std": 0.09057098627090454,
      "rewards/grpo_reward_func/mean": 0.3833653926849365,
      "rewards/grpo_reward_func/std": 0.08530126512050629,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 5.866666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1553812.0,
      "reward": 0.5172641277313232,
      "reward_std": 0.08300620317459106,
      "rewards/grpo_reward_func/mean": 0.5172641277313232,
      "rewards/grpo_reward_func/std": 0.18922077119350433,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.9375,
      "learning_rate": 5.833333333333334e-07,
      "loss": 0.0,
      "num_tokens": 1566244.0,
      "reward": 0.45866021513938904,
      "reward_std": 0.13558343052864075,
      "rewards/grpo_reward_func/mean": 0.45866021513938904,
      "rewards/grpo_reward_func/std": 0.12821511924266815,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 3.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.46875,
      "learning_rate": 5.8e-07,
      "loss": -0.0,
      "num_tokens": 1578680.0,
      "reward": 0.4404694437980652,
      "reward_std": 0.058066606521606445,
      "rewards/grpo_reward_func/mean": 0.4404694437980652,
      "rewards/grpo_reward_func/std": 0.057657789438962936,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.9375,
      "learning_rate": 5.766666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1591136.0,
      "reward": 0.3580424189567566,
      "reward_std": 0.07987552881240845,
      "rewards/grpo_reward_func/mean": 0.3580424189567566,
      "rewards/grpo_reward_func/std": 0.0977816954255104,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.875,
      "learning_rate": 5.733333333333334e-07,
      "loss": 0.0,
      "num_tokens": 1603604.0,
      "reward": 0.3891274929046631,
      "reward_std": 0.15381482243537903,
      "rewards/grpo_reward_func/mean": 0.3891274929046631,
      "rewards/grpo_reward_func/std": 0.17152857780456543,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.25,
      "learning_rate": 5.699999999999999e-07,
      "loss": 0.0,
      "num_tokens": 1616044.0,
      "reward": 0.27857083082199097,
      "reward_std": 0.09501777589321136,
      "rewards/grpo_reward_func/mean": 0.27857083082199097,
      "rewards/grpo_reward_func/std": 0.1052025854587555,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.1875,
      "learning_rate": 5.666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1628436.0,
      "reward": 0.35340362787246704,
      "reward_std": 0.16999280452728271,
      "rewards/grpo_reward_func/mean": 0.35340362787246704,
      "rewards/grpo_reward_func/std": 0.16278210282325745,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.3125,
      "learning_rate": 5.633333333333334e-07,
      "loss": -0.0,
      "num_tokens": 1640824.0,
      "reward": 0.4199197590351105,
      "reward_std": 0.08985067158937454,
      "rewards/grpo_reward_func/mean": 0.4199197590351105,
      "rewards/grpo_reward_func/std": 0.09818078577518463,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.0,
      "learning_rate": 5.6e-07,
      "loss": -0.0,
      "num_tokens": 1653220.0,
      "reward": 0.44602805376052856,
      "reward_std": 0.10932175815105438,
      "rewards/grpo_reward_func/mean": 0.44602805376052856,
      "rewards/grpo_reward_func/std": 0.11537235230207443,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.5625,
      "learning_rate": 5.566666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1665684.0,
      "reward": 0.4218568205833435,
      "reward_std": 0.09915173053741455,
      "rewards/grpo_reward_func/mean": 0.4218568205833435,
      "rewards/grpo_reward_func/std": 0.1479072868824005,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.375,
      "learning_rate": 5.533333333333334e-07,
      "loss": -0.0,
      "num_tokens": 1678120.0,
      "reward": 0.3699283301830292,
      "reward_std": 0.05628474801778793,
      "rewards/grpo_reward_func/mean": 0.3699283301830292,
      "rewards/grpo_reward_func/std": 0.055360160768032074,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.625,
      "learning_rate": 5.5e-07,
      "loss": 0.0,
      "num_tokens": 1690616.0,
      "reward": 0.43144893646240234,
      "reward_std": 0.097145214676857,
      "rewards/grpo_reward_func/mean": 0.43144893646240234,
      "rewards/grpo_reward_func/std": 0.09757841378450394,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.875,
      "learning_rate": 5.466666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1703048.0,
      "reward": 0.37039631605148315,
      "reward_std": 0.06340405344963074,
      "rewards/grpo_reward_func/mean": 0.37039631605148315,
      "rewards/grpo_reward_func/std": 0.10630898922681808,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.5,
      "learning_rate": 5.433333333333334e-07,
      "loss": -0.0,
      "num_tokens": 1715404.0,
      "reward": 0.44485020637512207,
      "reward_std": 0.061223354190588,
      "rewards/grpo_reward_func/mean": 0.44485020637512207,
      "rewards/grpo_reward_func/std": 0.0653579831123352,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 18.75,
      "learning_rate": 5.4e-07,
      "loss": -0.0,
      "num_tokens": 1727876.0,
      "reward": 0.4389991760253906,
      "reward_std": 0.12622228264808655,
      "rewards/grpo_reward_func/mean": 0.4389991760253906,
      "rewards/grpo_reward_func/std": 0.1206517443060875,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.3125,
      "learning_rate": 5.366666666666666e-07,
      "loss": 0.0,
      "num_tokens": 1740252.0,
      "reward": 0.3506331741809845,
      "reward_std": 0.1391739398241043,
      "rewards/grpo_reward_func/mean": 0.3506331741809845,
      "rewards/grpo_reward_func/std": 0.14306746423244476,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.8125,
      "learning_rate": 5.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1752632.0,
      "reward": 0.5316411256790161,
      "reward_std": 0.10773089528083801,
      "rewards/grpo_reward_func/mean": 0.5316411256790161,
      "rewards/grpo_reward_func/std": 0.16645555198192596,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.40625,
      "learning_rate": 5.3e-07,
      "loss": -0.0,
      "num_tokens": 1765040.0,
      "reward": 0.3930637836456299,
      "reward_std": 0.07452228665351868,
      "rewards/grpo_reward_func/mean": 0.3930637836456299,
      "rewards/grpo_reward_func/std": 0.07487671822309494,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 11.875,
      "completions/mean_terminated_length": 11.875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 4.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.25,
      "learning_rate": 5.266666666666666e-07,
      "loss": -0.0142,
      "num_tokens": 1777423.0,
      "reward": 0.3444192409515381,
      "reward_std": 0.1598653644323349,
      "rewards/grpo_reward_func/mean": 0.3444192409515381,
      "rewards/grpo_reward_func/std": 0.18078266084194183,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0,
      "learning_rate": 5.233333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1789683.0,
      "reward": 0.5174664258956909,
      "reward_std": 0.07813962548971176,
      "rewards/grpo_reward_func/mean": 0.5174664258956909,
      "rewards/grpo_reward_func/std": 0.10316640138626099,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.625,
      "learning_rate": 5.2e-07,
      "loss": 0.0,
      "num_tokens": 1802119.0,
      "reward": 0.3699246048927307,
      "reward_std": 0.08162573724985123,
      "rewards/grpo_reward_func/mean": 0.3699246048927307,
      "rewards/grpo_reward_func/std": 0.09686075896024704,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.0,
      "learning_rate": 5.166666666666667e-07,
      "loss": 0.0,
      "num_tokens": 1814483.0,
      "reward": 0.4604162275791168,
      "reward_std": 0.19567811489105225,
      "rewards/grpo_reward_func/mean": 0.4604162275791168,
      "rewards/grpo_reward_func/std": 0.19948698580265045,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.90625,
      "learning_rate": 5.133333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1827055.0,
      "reward": 0.41122761368751526,
      "reward_std": 0.08153079450130463,
      "rewards/grpo_reward_func/mean": 0.41122761368751526,
      "rewards/grpo_reward_func/std": 0.08045266568660736,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.8125,
      "learning_rate": 5.1e-07,
      "loss": 0.0,
      "num_tokens": 1839535.0,
      "reward": 0.3292653560638428,
      "reward_std": 0.04870126396417618,
      "rewards/grpo_reward_func/mean": 0.3292653560638428,
      "rewards/grpo_reward_func/std": 0.07768747955560684,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.5,
      "learning_rate": 5.066666666666667e-07,
      "loss": -0.0,
      "num_tokens": 1852003.0,
      "reward": 0.4356845021247864,
      "reward_std": 0.11020061373710632,
      "rewards/grpo_reward_func/mean": 0.4356845021247864,
      "rewards/grpo_reward_func/std": 0.12760911881923676,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.1875,
      "learning_rate": 5.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1864487.0,
      "reward": 0.47176241874694824,
      "reward_std": 0.1466352343559265,
      "rewards/grpo_reward_func/mean": 0.47176241874694824,
      "rewards/grpo_reward_func/std": 0.15562468767166138,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 1876939.0,
      "reward": 0.49293607473373413,
      "reward_std": 0.15847747027873993,
      "rewards/grpo_reward_func/mean": 0.49293607473373413,
      "rewards/grpo_reward_func/std": 0.16349899768829346,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.1875,
      "learning_rate": 4.966666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1889499.0,
      "reward": 0.4915664792060852,
      "reward_std": 0.19223570823669434,
      "rewards/grpo_reward_func/mean": 0.4915664792060852,
      "rewards/grpo_reward_func/std": 0.1780252456665039,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.1875,
      "learning_rate": 4.933333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1901911.0,
      "reward": 0.39836806058883667,
      "reward_std": 0.08220314979553223,
      "rewards/grpo_reward_func/mean": 0.39836806058883667,
      "rewards/grpo_reward_func/std": 0.09293971210718155,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.0625,
      "learning_rate": 4.9e-07,
      "loss": 0.0,
      "num_tokens": 1914267.0,
      "reward": 0.5052293539047241,
      "reward_std": 0.05901884660124779,
      "rewards/grpo_reward_func/mean": 0.5052293539047241,
      "rewards/grpo_reward_func/std": 0.07250750809907913,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.8125,
      "learning_rate": 4.866666666666666e-07,
      "loss": -0.0,
      "num_tokens": 1926679.0,
      "reward": 0.2826748192310333,
      "reward_std": 0.0776633769273758,
      "rewards/grpo_reward_func/mean": 0.2826748192310333,
      "rewards/grpo_reward_func/std": 0.07334372401237488,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.125,
      "learning_rate": 4.833333333333333e-07,
      "loss": -0.0,
      "num_tokens": 1939135.0,
      "reward": 0.38298332691192627,
      "reward_std": 0.15204550325870514,
      "rewards/grpo_reward_func/mean": 0.38298332691192627,
      "rewards/grpo_reward_func/std": 0.17793436348438263,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.625,
      "learning_rate": 4.8e-07,
      "loss": 0.0,
      "num_tokens": 1951555.0,
      "reward": 0.45585888624191284,
      "reward_std": 0.08215408027172089,
      "rewards/grpo_reward_func/mean": 0.45585888624191284,
      "rewards/grpo_reward_func/std": 0.08240208774805069,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.8125,
      "learning_rate": 4.7666666666666667e-07,
      "loss": 0.0,
      "num_tokens": 1964007.0,
      "reward": 0.4660055935382843,
      "reward_std": 0.17032964527606964,
      "rewards/grpo_reward_func/mean": 0.4660055935382843,
      "rewards/grpo_reward_func/std": 0.1751418560743332,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 4.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5,
      "learning_rate": 4.733333333333333e-07,
      "loss": 0.0,
      "num_tokens": 1976483.0,
      "reward": 0.5173270106315613,
      "reward_std": 0.17288881540298462,
      "rewards/grpo_reward_func/mean": 0.5173270106315613,
      "rewards/grpo_reward_func/std": 0.1660362035036087,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.125,
      "learning_rate": 4.6999999999999995e-07,
      "loss": 0.0,
      "num_tokens": 1988919.0,
      "reward": 0.4136430323123932,
      "reward_std": 0.22560492157936096,
      "rewards/grpo_reward_func/mean": 0.4136430323123932,
      "rewards/grpo_reward_func/std": 0.21545714139938354,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.1875,
      "learning_rate": 4.6666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2001379.0,
      "reward": 0.43014535307884216,
      "reward_std": 0.08317069709300995,
      "rewards/grpo_reward_func/mean": 0.43014535307884216,
      "rewards/grpo_reward_func/std": 0.07750457525253296,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.90625,
      "learning_rate": 4.633333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2013971.0,
      "reward": 0.34895196557044983,
      "reward_std": 0.0512375608086586,
      "rewards/grpo_reward_func/mean": 0.34895196557044983,
      "rewards/grpo_reward_func/std": 0.04775034263730049,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.34375,
      "learning_rate": 4.6e-07,
      "loss": 0.0,
      "num_tokens": 2026375.0,
      "reward": 0.3551255464553833,
      "reward_std": 0.12043958902359009,
      "rewards/grpo_reward_func/mean": 0.3551255464553833,
      "rewards/grpo_reward_func/std": 0.13196633756160736,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.1875,
      "learning_rate": 4.5666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 2038823.0,
      "reward": 0.41090184450149536,
      "reward_std": 0.11341163516044617,
      "rewards/grpo_reward_func/mean": 0.41090184450149536,
      "rewards/grpo_reward_func/std": 0.11507044732570648,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5625,
      "learning_rate": 4.5333333333333326e-07,
      "loss": -0.0,
      "num_tokens": 2051219.0,
      "reward": 0.3054584860801697,
      "reward_std": 0.08504727482795715,
      "rewards/grpo_reward_func/mean": 0.3054584860801697,
      "rewards/grpo_reward_func/std": 0.09004215151071548,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.9375,
      "learning_rate": 4.5e-07,
      "loss": -0.0,
      "num_tokens": 2063651.0,
      "reward": 0.47015416622161865,
      "reward_std": 0.15467038750648499,
      "rewards/grpo_reward_func/mean": 0.47015416622161865,
      "rewards/grpo_reward_func/std": 0.153534397482872,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.25,
      "learning_rate": 4.4666666666666664e-07,
      "loss": -0.0,
      "num_tokens": 2075955.0,
      "reward": 0.4894865155220032,
      "reward_std": 0.07816055417060852,
      "rewards/grpo_reward_func/mean": 0.4894865155220032,
      "rewards/grpo_reward_func/std": 0.07534909248352051,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.8125,
      "learning_rate": 4.4333333333333336e-07,
      "loss": -0.0,
      "num_tokens": 2088435.0,
      "reward": 0.4603702425956726,
      "reward_std": 0.15144219994544983,
      "rewards/grpo_reward_func/mean": 0.4603702425956726,
      "rewards/grpo_reward_func/std": 0.16273239254951477,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.75,
      "learning_rate": 4.3999999999999997e-07,
      "loss": -0.0,
      "num_tokens": 2100919.0,
      "reward": 0.3637647032737732,
      "reward_std": 0.06757047772407532,
      "rewards/grpo_reward_func/mean": 0.3637647032737732,
      "rewards/grpo_reward_func/std": 0.08233585953712463,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.8125,
      "learning_rate": 4.3666666666666663e-07,
      "loss": 0.0,
      "num_tokens": 2113343.0,
      "reward": 0.3543202579021454,
      "reward_std": 0.08441969752311707,
      "rewards/grpo_reward_func/mean": 0.3543202579021454,
      "rewards/grpo_reward_func/std": 0.08902662247419357,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.4375,
      "learning_rate": 4.3333333333333335e-07,
      "loss": 0.0,
      "num_tokens": 2125791.0,
      "reward": 0.47929999232292175,
      "reward_std": 0.17670738697052002,
      "rewards/grpo_reward_func/mean": 0.47929999232292175,
      "rewards/grpo_reward_func/std": 0.17999567091464996,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.5625,
      "learning_rate": 4.2999999999999996e-07,
      "loss": -0.0,
      "num_tokens": 2138251.0,
      "reward": 0.3452494740486145,
      "reward_std": 0.08022183179855347,
      "rewards/grpo_reward_func/mean": 0.3452494740486145,
      "rewards/grpo_reward_func/std": 0.08067353814840317,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.3125,
      "learning_rate": 4.266666666666667e-07,
      "loss": -0.0,
      "num_tokens": 2150659.0,
      "reward": 0.42406925559043884,
      "reward_std": 0.2445584237575531,
      "rewards/grpo_reward_func/mean": 0.42406925559043884,
      "rewards/grpo_reward_func/std": 0.22746475040912628,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.34375,
      "learning_rate": 4.2333333333333334e-07,
      "loss": -0.0,
      "num_tokens": 2163163.0,
      "reward": 0.4360213279724121,
      "reward_std": 0.07188587635755539,
      "rewards/grpo_reward_func/mean": 0.4360213279724121,
      "rewards/grpo_reward_func/std": 0.07112448662519455,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.65625,
      "learning_rate": 4.1999999999999995e-07,
      "loss": -0.0,
      "num_tokens": 2175679.0,
      "reward": 0.40680232644081116,
      "reward_std": 0.054570674896240234,
      "rewards/grpo_reward_func/mean": 0.40680232644081116,
      "rewards/grpo_reward_func/std": 0.05052686110138893,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.1875,
      "learning_rate": 4.1666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 2188199.0,
      "reward": 0.4191306233406067,
      "reward_std": 0.11386445164680481,
      "rewards/grpo_reward_func/mean": 0.4191306233406067,
      "rewards/grpo_reward_func/std": 0.1961081475019455,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.4375,
      "learning_rate": 4.1333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2200571.0,
      "reward": 0.49487611651420593,
      "reward_std": 0.18403539061546326,
      "rewards/grpo_reward_func/mean": 0.49487611651420593,
      "rewards/grpo_reward_func/std": 0.17239995300769806,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.0625,
      "learning_rate": 4.0999999999999994e-07,
      "loss": 0.0,
      "num_tokens": 2212955.0,
      "reward": 0.5409983396530151,
      "reward_std": 0.12222976982593536,
      "rewards/grpo_reward_func/mean": 0.5409983396530151,
      "rewards/grpo_reward_func/std": 0.11841105669736862,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1484375,
      "learning_rate": 4.0666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 2225275.0,
      "reward": 0.3465573191642761,
      "reward_std": 0.05415717512369156,
      "rewards/grpo_reward_func/mean": 0.3465573191642761,
      "rewards/grpo_reward_func/std": 0.08967100828886032,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.625,
      "learning_rate": 4.033333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2237591.0,
      "reward": 0.40769240260124207,
      "reward_std": 0.061508819460868835,
      "rewards/grpo_reward_func/mean": 0.40769240260124207,
      "rewards/grpo_reward_func/std": 0.11384513974189758,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.75,
      "learning_rate": 4e-07,
      "loss": 0.0,
      "num_tokens": 2249983.0,
      "reward": 0.4172666072845459,
      "reward_std": 0.04795217514038086,
      "rewards/grpo_reward_func/mean": 0.4172666072845459,
      "rewards/grpo_reward_func/std": 0.06908071041107178,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.0,
      "learning_rate": 3.9666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 2262391.0,
      "reward": 0.4887160658836365,
      "reward_std": 0.0936364233493805,
      "rewards/grpo_reward_func/mean": 0.4887160658836365,
      "rewards/grpo_reward_func/std": 0.12800146639347076,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.8125,
      "learning_rate": 3.933333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2274847.0,
      "reward": 0.5363283157348633,
      "reward_std": 0.09925331920385361,
      "rewards/grpo_reward_func/mean": 0.5363283157348633,
      "rewards/grpo_reward_func/std": 0.09658796340227127,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.625,
      "learning_rate": 3.8999999999999997e-07,
      "loss": 0.0,
      "num_tokens": 2287319.0,
      "reward": 0.41678112745285034,
      "reward_std": 0.13148340582847595,
      "rewards/grpo_reward_func/mean": 0.41678112745285034,
      "rewards/grpo_reward_func/std": 0.1416279673576355,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5,
      "learning_rate": 3.8666666666666664e-07,
      "loss": -0.0,
      "num_tokens": 2299739.0,
      "reward": 0.49028465151786804,
      "reward_std": 0.09930803626775742,
      "rewards/grpo_reward_func/mean": 0.49028465151786804,
      "rewards/grpo_reward_func/std": 0.1043338030576706,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.1875,
      "learning_rate": 3.8333333333333335e-07,
      "loss": 0.0,
      "num_tokens": 2312179.0,
      "reward": 0.42906028032302856,
      "reward_std": 0.09733951836824417,
      "rewards/grpo_reward_func/mean": 0.42906028032302856,
      "rewards/grpo_reward_func/std": 0.101521797478199,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.59375,
      "learning_rate": 3.7999999999999996e-07,
      "loss": -0.0,
      "num_tokens": 2324643.0,
      "reward": 0.5322451591491699,
      "reward_std": 0.05065479129552841,
      "rewards/grpo_reward_func/mean": 0.5322451591491699,
      "rewards/grpo_reward_func/std": 0.10973110795021057,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.25,
      "learning_rate": 3.766666666666666e-07,
      "loss": 0.0,
      "num_tokens": 2337095.0,
      "reward": 0.357377290725708,
      "reward_std": 0.11668767035007477,
      "rewards/grpo_reward_func/mean": 0.357377290725708,
      "rewards/grpo_reward_func/std": 0.11811976879835129,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.4375,
      "learning_rate": 3.7333333333333334e-07,
      "loss": 0.0,
      "num_tokens": 2349527.0,
      "reward": 0.4484630823135376,
      "reward_std": 0.13092045485973358,
      "rewards/grpo_reward_func/mean": 0.4484630823135376,
      "rewards/grpo_reward_func/std": 0.18394383788108826,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.625,
      "learning_rate": 3.7e-07,
      "loss": 0.0,
      "num_tokens": 2361935.0,
      "reward": 0.5536394119262695,
      "reward_std": 0.1294117271900177,
      "rewards/grpo_reward_func/mean": 0.5536394119262695,
      "rewards/grpo_reward_func/std": 0.1366083174943924,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 5.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.6875,
      "learning_rate": 3.666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2374335.0,
      "reward": 0.43622025847435,
      "reward_std": 0.036504555493593216,
      "rewards/grpo_reward_func/mean": 0.43622025847435,
      "rewards/grpo_reward_func/std": 0.04184015840291977,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.375,
      "learning_rate": 3.6333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2386703.0,
      "reward": 0.32082119584083557,
      "reward_std": 0.08303728699684143,
      "rewards/grpo_reward_func/mean": 0.32082119584083557,
      "rewards/grpo_reward_func/std": 0.09865312278270721,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.0,
      "learning_rate": 3.6e-07,
      "loss": -0.0,
      "num_tokens": 2399127.0,
      "reward": 0.40079742670059204,
      "reward_std": 0.12725131213665009,
      "rewards/grpo_reward_func/mean": 0.40079742670059204,
      "rewards/grpo_reward_func/std": 0.16112373769283295,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.6875,
      "learning_rate": 3.5666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2411571.0,
      "reward": 0.43647855520248413,
      "reward_std": 0.10620959103107452,
      "rewards/grpo_reward_func/mean": 0.43647855520248413,
      "rewards/grpo_reward_func/std": 0.09909818321466446,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.6875,
      "learning_rate": 3.533333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2423979.0,
      "reward": 0.3994408845901489,
      "reward_std": 0.17607587575912476,
      "rewards/grpo_reward_func/mean": 0.3994408845901489,
      "rewards/grpo_reward_func/std": 0.17166729271411896,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.78125,
      "learning_rate": 3.5e-07,
      "loss": -0.0,
      "num_tokens": 2436363.0,
      "reward": 0.4736449420452118,
      "reward_std": 0.09779857844114304,
      "rewards/grpo_reward_func/mean": 0.4736449420452118,
      "rewards/grpo_reward_func/std": 0.11548104882240295,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0,
      "learning_rate": 3.4666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 2448911.0,
      "reward": 0.38275349140167236,
      "reward_std": 0.07293462753295898,
      "rewards/grpo_reward_func/mean": 0.38275349140167236,
      "rewards/grpo_reward_func/std": 0.0916486382484436,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.125,
      "learning_rate": 3.433333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2461263.0,
      "reward": 0.5372081995010376,
      "reward_std": 0.20845532417297363,
      "rewards/grpo_reward_func/mean": 0.5372081995010376,
      "rewards/grpo_reward_func/std": 0.21515534818172455,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.125,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 0.0,
      "num_tokens": 2473707.0,
      "reward": 0.415330171585083,
      "reward_std": 0.15996377170085907,
      "rewards/grpo_reward_func/mean": 0.415330171585083,
      "rewards/grpo_reward_func/std": 0.18506671488285065,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0625,
      "learning_rate": 3.3666666666666664e-07,
      "loss": -0.0,
      "num_tokens": 2486079.0,
      "reward": 0.41273248195648193,
      "reward_std": 0.050071652978658676,
      "rewards/grpo_reward_func/mean": 0.41273248195648193,
      "rewards/grpo_reward_func/std": 0.11006694287061691,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.0625,
      "learning_rate": 3.333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2498575.0,
      "reward": 0.40237534046173096,
      "reward_std": 0.16011598706245422,
      "rewards/grpo_reward_func/mean": 0.40237534046173096,
      "rewards/grpo_reward_func/std": 0.1658114790916443,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.125,
      "learning_rate": 3.3e-07,
      "loss": 0.0,
      "num_tokens": 2511047.0,
      "reward": 0.40720921754837036,
      "reward_std": 0.10842312127351761,
      "rewards/grpo_reward_func/mean": 0.40720921754837036,
      "rewards/grpo_reward_func/std": 0.15348494052886963,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.75,
      "learning_rate": 3.2666666666666663e-07,
      "loss": -0.0,
      "num_tokens": 2523499.0,
      "reward": 0.46542418003082275,
      "reward_std": 0.1260077953338623,
      "rewards/grpo_reward_func/mean": 0.46542418003082275,
      "rewards/grpo_reward_func/std": 0.1437770575284958,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.4375,
      "learning_rate": 3.233333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2535931.0,
      "reward": 0.4416119456291199,
      "reward_std": 0.10100536048412323,
      "rewards/grpo_reward_func/mean": 0.4416119456291199,
      "rewards/grpo_reward_func/std": 0.11735321581363678,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.1875,
      "learning_rate": 3.2e-07,
      "loss": -0.0,
      "num_tokens": 2548287.0,
      "reward": 0.40553370118141174,
      "reward_std": 0.13550561666488647,
      "rewards/grpo_reward_func/mean": 0.40553370118141174,
      "rewards/grpo_reward_func/std": 0.13743624091148376,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.75,
      "learning_rate": 3.166666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2560711.0,
      "reward": 0.35497111082077026,
      "reward_std": 0.11463560163974762,
      "rewards/grpo_reward_func/mean": 0.35497111082077026,
      "rewards/grpo_reward_func/std": 0.12006353586912155,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.5625,
      "learning_rate": 3.1333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2573143.0,
      "reward": 0.4096822142601013,
      "reward_std": 0.05833249166607857,
      "rewards/grpo_reward_func/mean": 0.4096822142601013,
      "rewards/grpo_reward_func/std": 0.08212708681821823,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.3125,
      "learning_rate": 3.1e-07,
      "loss": -0.0,
      "num_tokens": 2585583.0,
      "reward": 0.4554346799850464,
      "reward_std": 0.12953370809555054,
      "rewards/grpo_reward_func/mean": 0.4554346799850464,
      "rewards/grpo_reward_func/std": 0.1593649685382843,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.375,
      "learning_rate": 3.066666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2598031.0,
      "reward": 0.5756185054779053,
      "reward_std": 0.0809057205915451,
      "rewards/grpo_reward_func/mean": 0.5756185054779053,
      "rewards/grpo_reward_func/std": 0.10910212993621826,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.9375,
      "learning_rate": 3.033333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2610507.0,
      "reward": 0.39368146657943726,
      "reward_std": 0.1372520923614502,
      "rewards/grpo_reward_func/mean": 0.39368146657943726,
      "rewards/grpo_reward_func/std": 0.13048243522644043,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.1875,
      "learning_rate": 3e-07,
      "loss": 0.0,
      "num_tokens": 2623039.0,
      "reward": 0.3540037274360657,
      "reward_std": 0.08001622557640076,
      "rewards/grpo_reward_func/mean": 0.3540037274360657,
      "rewards/grpo_reward_func/std": 0.08400996774435043,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.8125,
      "learning_rate": 2.966666666666667e-07,
      "loss": 0.0,
      "num_tokens": 2635395.0,
      "reward": 0.44302040338516235,
      "reward_std": 0.08312968909740448,
      "rewards/grpo_reward_func/mean": 0.44302040338516235,
      "rewards/grpo_reward_func/std": 0.0891660526394844,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.78125,
      "learning_rate": 2.933333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2647843.0,
      "reward": 0.3831726610660553,
      "reward_std": 0.05200519412755966,
      "rewards/grpo_reward_func/mean": 0.3831726610660553,
      "rewards/grpo_reward_func/std": 0.09167957305908203,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.4375,
      "learning_rate": 2.9e-07,
      "loss": 0.0,
      "num_tokens": 2660251.0,
      "reward": 0.3554950952529907,
      "reward_std": 0.05713435262441635,
      "rewards/grpo_reward_func/mean": 0.3554950952529907,
      "rewards/grpo_reward_func/std": 0.0688985213637352,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.375,
      "learning_rate": 2.866666666666667e-07,
      "loss": -0.0,
      "num_tokens": 2672727.0,
      "reward": 0.326229453086853,
      "reward_std": 0.08010618388652802,
      "rewards/grpo_reward_func/mean": 0.326229453086853,
      "rewards/grpo_reward_func/std": 0.08994052559137344,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.125,
      "learning_rate": 2.833333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2685119.0,
      "reward": 0.4351205825805664,
      "reward_std": 0.08398930728435516,
      "rewards/grpo_reward_func/mean": 0.4351205825805664,
      "rewards/grpo_reward_func/std": 0.08400265872478485,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.09375,
      "learning_rate": 2.8e-07,
      "loss": 0.0,
      "num_tokens": 2697507.0,
      "reward": 0.4564037621021271,
      "reward_std": 0.08567321300506592,
      "rewards/grpo_reward_func/mean": 0.4564037621021271,
      "rewards/grpo_reward_func/std": 0.08155813813209534,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.8125,
      "learning_rate": 2.766666666666667e-07,
      "loss": -0.0,
      "num_tokens": 2709927.0,
      "reward": 0.4280545115470886,
      "reward_std": 0.13084210455417633,
      "rewards/grpo_reward_func/mean": 0.4280545115470886,
      "rewards/grpo_reward_func/std": 0.13759230077266693,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.25,
      "learning_rate": 2.733333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2722331.0,
      "reward": 0.44869235157966614,
      "reward_std": 0.0902545154094696,
      "rewards/grpo_reward_func/mean": 0.44869235157966614,
      "rewards/grpo_reward_func/std": 0.1127212718129158,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.375,
      "learning_rate": 2.7e-07,
      "loss": 0.0,
      "num_tokens": 2734779.0,
      "reward": 0.4759725332260132,
      "reward_std": 0.12860512733459473,
      "rewards/grpo_reward_func/mean": 0.4759725332260132,
      "rewards/grpo_reward_func/std": 0.1384066343307495,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.75,
      "learning_rate": 2.6666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 2747235.0,
      "reward": 0.5738496780395508,
      "reward_std": 0.11320274323225021,
      "rewards/grpo_reward_func/mean": 0.5738496780395508,
      "rewards/grpo_reward_func/std": 0.10654186457395554,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.9375,
      "learning_rate": 2.633333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2759643.0,
      "reward": 0.33652713894844055,
      "reward_std": 0.10561183094978333,
      "rewards/grpo_reward_func/mean": 0.33652713894844055,
      "rewards/grpo_reward_func/std": 0.11127988249063492,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 6.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25,
      "learning_rate": 2.6e-07,
      "loss": 0.0,
      "num_tokens": 2772083.0,
      "reward": 0.45456087589263916,
      "reward_std": 0.21474137902259827,
      "rewards/grpo_reward_func/mean": 0.45456087589263916,
      "rewards/grpo_reward_func/std": 0.20742803812026978,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.25,
      "learning_rate": 2.5666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 2784487.0,
      "reward": 0.36959922313690186,
      "reward_std": 0.12393350899219513,
      "rewards/grpo_reward_func/mean": 0.36959922313690186,
      "rewards/grpo_reward_func/std": 0.18545781075954437,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.375,
      "learning_rate": 2.533333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2796895.0,
      "reward": 0.5148861408233643,
      "reward_std": 0.10401658713817596,
      "rewards/grpo_reward_func/mean": 0.5148861408233643,
      "rewards/grpo_reward_func/std": 0.10146593302488327,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.6875,
      "learning_rate": 2.5e-07,
      "loss": 0.0,
      "num_tokens": 2809283.0,
      "reward": 0.3833024799823761,
      "reward_std": 0.07489189505577087,
      "rewards/grpo_reward_func/mean": 0.3833024799823761,
      "rewards/grpo_reward_func/std": 0.07100249826908112,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.5625,
      "learning_rate": 2.4666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 2821811.0,
      "reward": 0.37905335426330566,
      "reward_std": 0.09207235276699066,
      "rewards/grpo_reward_func/mean": 0.37905335426330566,
      "rewards/grpo_reward_func/std": 0.10075780749320984,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0693359375,
      "learning_rate": 2.433333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2834259.0,
      "reward": 0.5241298079490662,
      "reward_std": 0.050000011920928955,
      "rewards/grpo_reward_func/mean": 0.5241298079490662,
      "rewards/grpo_reward_func/std": 0.11461541801691055,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.4375,
      "learning_rate": 2.4e-07,
      "loss": -0.0,
      "num_tokens": 2846667.0,
      "reward": 0.38863605260849,
      "reward_std": 0.09145700931549072,
      "rewards/grpo_reward_func/mean": 0.38863605260849,
      "rewards/grpo_reward_func/std": 0.0854310691356659,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.6875,
      "learning_rate": 2.3666666666666664e-07,
      "loss": 0.0,
      "num_tokens": 2859183.0,
      "reward": 0.48604702949523926,
      "reward_std": 0.12953568994998932,
      "rewards/grpo_reward_func/mean": 0.48604702949523926,
      "rewards/grpo_reward_func/std": 0.12877187132835388,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 2.3333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 2871631.0,
      "reward": 0.49290764331817627,
      "reward_std": 0.1408785730600357,
      "rewards/grpo_reward_func/mean": 0.49290764331817627,
      "rewards/grpo_reward_func/std": 0.16115672886371613,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.4375,
      "learning_rate": 2.3e-07,
      "loss": 0.0,
      "num_tokens": 2884087.0,
      "reward": 0.389384388923645,
      "reward_std": 0.08452893793582916,
      "rewards/grpo_reward_func/mean": 0.389384388923645,
      "rewards/grpo_reward_func/std": 0.09952805191278458,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.8125,
      "learning_rate": 2.2666666666666663e-07,
      "loss": 0.0,
      "num_tokens": 2896567.0,
      "reward": 0.42921292781829834,
      "reward_std": 0.12179729342460632,
      "rewards/grpo_reward_func/mean": 0.42921292781829834,
      "rewards/grpo_reward_func/std": 0.14654681086540222,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 2.2333333333333332e-07,
      "loss": 0.0,
      "num_tokens": 2909027.0,
      "reward": 0.3906250596046448,
      "reward_std": 0.07476774603128433,
      "rewards/grpo_reward_func/mean": 0.3906250596046448,
      "rewards/grpo_reward_func/std": 0.07509444653987885,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.6875,
      "learning_rate": 2.1999999999999998e-07,
      "loss": -0.0,
      "num_tokens": 2921395.0,
      "reward": 0.590385913848877,
      "reward_std": 0.07703244686126709,
      "rewards/grpo_reward_func/mean": 0.590385913848877,
      "rewards/grpo_reward_func/std": 0.10371364653110504,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.375,
      "learning_rate": 2.1666666666666667e-07,
      "loss": 0.0,
      "num_tokens": 2933635.0,
      "reward": 0.45836111903190613,
      "reward_std": 0.1561897248029709,
      "rewards/grpo_reward_func/mean": 0.45836111903190613,
      "rewards/grpo_reward_func/std": 0.1763034164905548,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.75,
      "learning_rate": 2.1333333333333334e-07,
      "loss": 0.0,
      "num_tokens": 2946019.0,
      "reward": 0.3915758430957794,
      "reward_std": 0.09102918207645416,
      "rewards/grpo_reward_func/mean": 0.3915758430957794,
      "rewards/grpo_reward_func/std": 0.09686020016670227,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.4375,
      "learning_rate": 2.0999999999999997e-07,
      "loss": 0.0,
      "num_tokens": 2958339.0,
      "reward": 0.5137478709220886,
      "reward_std": 0.06453146040439606,
      "rewards/grpo_reward_func/mean": 0.5137478709220886,
      "rewards/grpo_reward_func/std": 0.08145393431186676,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.6875,
      "learning_rate": 2.0666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 2970795.0,
      "reward": 0.3901534080505371,
      "reward_std": 0.08677110075950623,
      "rewards/grpo_reward_func/mean": 0.3901534080505371,
      "rewards/grpo_reward_func/std": 0.0884600430727005,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.25,
      "learning_rate": 2.0333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 2983199.0,
      "reward": 0.3966296911239624,
      "reward_std": 0.11454164981842041,
      "rewards/grpo_reward_func/mean": 0.3966296911239624,
      "rewards/grpo_reward_func/std": 0.11218782514333725,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.8125,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 2995671.0,
      "reward": 0.4128722548484802,
      "reward_std": 0.1050279289484024,
      "rewards/grpo_reward_func/mean": 0.4128722548484802,
      "rewards/grpo_reward_func/std": 0.15005381405353546,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.40625,
      "learning_rate": 1.9666666666666665e-07,
      "loss": -0.0,
      "num_tokens": 3008163.0,
      "reward": 0.41674578189849854,
      "reward_std": 0.130544051527977,
      "rewards/grpo_reward_func/mean": 0.41674578189849854,
      "rewards/grpo_reward_func/std": 0.1530657559633255,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.125,
      "learning_rate": 1.9333333333333332e-07,
      "loss": 0.0,
      "num_tokens": 3020547.0,
      "reward": 0.44154661893844604,
      "reward_std": 0.11442729830741882,
      "rewards/grpo_reward_func/mean": 0.44154661893844604,
      "rewards/grpo_reward_func/std": 0.15436801314353943,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.875,
      "learning_rate": 1.8999999999999998e-07,
      "loss": -0.0,
      "num_tokens": 3032907.0,
      "reward": 0.41183507442474365,
      "reward_std": 0.11221058666706085,
      "rewards/grpo_reward_func/mean": 0.41183507442474365,
      "rewards/grpo_reward_func/std": 0.10689571499824524,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 1.8666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 3045283.0,
      "reward": 0.3969360589981079,
      "reward_std": 0.13579751551151276,
      "rewards/grpo_reward_func/mean": 0.3969360589981079,
      "rewards/grpo_reward_func/std": 0.14742760360240936,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.5625,
      "learning_rate": 1.833333333333333e-07,
      "loss": 0.0,
      "num_tokens": 3057703.0,
      "reward": 0.3443870544433594,
      "reward_std": 0.20534491539001465,
      "rewards/grpo_reward_func/mean": 0.3443870544433594,
      "rewards/grpo_reward_func/std": 0.19916358590126038,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.5625,
      "learning_rate": 1.8e-07,
      "loss": 0.0,
      "num_tokens": 3070135.0,
      "reward": 0.3964824378490448,
      "reward_std": 0.13892269134521484,
      "rewards/grpo_reward_func/mean": 0.3964824378490448,
      "rewards/grpo_reward_func/std": 0.15906588733196259,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.5625,
      "learning_rate": 1.7666666666666666e-07,
      "loss": -0.0,
      "num_tokens": 3082619.0,
      "reward": 0.4210782051086426,
      "reward_std": 0.12285022437572479,
      "rewards/grpo_reward_func/mean": 0.4210782051086426,
      "rewards/grpo_reward_func/std": 0.1384182870388031,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.9375,
      "learning_rate": 1.7333333333333332e-07,
      "loss": -0.0,
      "num_tokens": 3095171.0,
      "reward": 0.49818533658981323,
      "reward_std": 0.10502855479717255,
      "rewards/grpo_reward_func/mean": 0.49818533658981323,
      "rewards/grpo_reward_func/std": 0.13689859211444855,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.6875,
      "learning_rate": 1.7000000000000001e-07,
      "loss": -0.0,
      "num_tokens": 3107575.0,
      "reward": 0.39785051345825195,
      "reward_std": 0.0737057775259018,
      "rewards/grpo_reward_func/mean": 0.39785051345825195,
      "rewards/grpo_reward_func/std": 0.08374593406915665,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.1875,
      "learning_rate": 1.6666666666666665e-07,
      "loss": -0.0,
      "num_tokens": 3119987.0,
      "reward": 0.3956165909767151,
      "reward_std": 0.08730382472276688,
      "rewards/grpo_reward_func/mean": 0.3956165909767151,
      "rewards/grpo_reward_func/std": 0.12590822577476501,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.125,
      "learning_rate": 1.6333333333333331e-07,
      "loss": -0.0,
      "num_tokens": 3132419.0,
      "reward": 0.40047013759613037,
      "reward_std": 0.09308422356843948,
      "rewards/grpo_reward_func/mean": 0.40047013759613037,
      "rewards/grpo_reward_func/std": 0.10088325291872025,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.125,
      "learning_rate": 1.6e-07,
      "loss": -0.0,
      "num_tokens": 3144851.0,
      "reward": 0.33004331588745117,
      "reward_std": 0.04140020161867142,
      "rewards/grpo_reward_func/mean": 0.33004331588745117,
      "rewards/grpo_reward_func/std": 0.04383409395813942,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.4375,
      "learning_rate": 1.5666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 3157367.0,
      "reward": 0.495669424533844,
      "reward_std": 0.095655158162117,
      "rewards/grpo_reward_func/mean": 0.495669424533844,
      "rewards/grpo_reward_func/std": 0.10840737819671631,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 7.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.3125,
      "learning_rate": 1.533333333333333e-07,
      "loss": 0.0,
      "num_tokens": 3169795.0,
      "reward": 0.398048460483551,
      "reward_std": 0.08092916011810303,
      "rewards/grpo_reward_func/mean": 0.398048460483551,
      "rewards/grpo_reward_func/std": 0.08040700852870941,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.0625,
      "learning_rate": 1.5e-07,
      "loss": 0.0,
      "num_tokens": 3182271.0,
      "reward": 0.4666450321674347,
      "reward_std": 0.08053655922412872,
      "rewards/grpo_reward_func/mean": 0.4666450321674347,
      "rewards/grpo_reward_func/std": 0.11888416111469269,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.4375,
      "learning_rate": 1.4666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 3194699.0,
      "reward": 0.4015364646911621,
      "reward_std": 0.16598042845726013,
      "rewards/grpo_reward_func/mean": 0.4015364646911621,
      "rewards/grpo_reward_func/std": 0.16788989305496216,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.375,
      "learning_rate": 1.4333333333333335e-07,
      "loss": -0.0,
      "num_tokens": 3207091.0,
      "reward": 0.48480066657066345,
      "reward_std": 0.15683354437351227,
      "rewards/grpo_reward_func/mean": 0.48480066657066345,
      "rewards/grpo_reward_func/std": 0.14960500597953796,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.8125,
      "learning_rate": 1.4e-07,
      "loss": 0.0,
      "num_tokens": 3219447.0,
      "reward": 0.49088042974472046,
      "reward_std": 0.16376182436943054,
      "rewards/grpo_reward_func/mean": 0.49088042974472046,
      "rewards/grpo_reward_func/std": 0.17037776112556458,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.6875,
      "learning_rate": 1.3666666666666665e-07,
      "loss": 0.0,
      "num_tokens": 3231843.0,
      "reward": 0.4621606469154358,
      "reward_std": 0.16308224201202393,
      "rewards/grpo_reward_func/mean": 0.4621606469154358,
      "rewards/grpo_reward_func/std": 0.18942511081695557,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.375,
      "learning_rate": 1.3333333333333334e-07,
      "loss": 0.0,
      "num_tokens": 3244199.0,
      "reward": 0.521634578704834,
      "reward_std": 0.08799108862876892,
      "rewards/grpo_reward_func/mean": 0.521634578704834,
      "rewards/grpo_reward_func/std": 0.08898300677537918,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.375,
      "learning_rate": 1.3e-07,
      "loss": -0.0,
      "num_tokens": 3256663.0,
      "reward": 0.5014014840126038,
      "reward_std": 0.10305628925561905,
      "rewards/grpo_reward_func/mean": 0.5014014840126038,
      "rewards/grpo_reward_func/std": 0.11243268102407455,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25,
      "learning_rate": 1.2666666666666666e-07,
      "loss": 0.0,
      "num_tokens": 3269115.0,
      "reward": 0.49657315015792847,
      "reward_std": 0.14654701948165894,
      "rewards/grpo_reward_func/mean": 0.49657315015792847,
      "rewards/grpo_reward_func/std": 0.14595918357372284,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.75,
      "learning_rate": 1.2333333333333333e-07,
      "loss": 0.0,
      "num_tokens": 3281535.0,
      "reward": 0.4041872024536133,
      "reward_std": 0.1379416286945343,
      "rewards/grpo_reward_func/mean": 0.4041872024536133,
      "rewards/grpo_reward_func/std": 0.1561095267534256,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.5,
      "learning_rate": 1.2e-07,
      "loss": -0.0,
      "num_tokens": 3293927.0,
      "reward": 0.5414110422134399,
      "reward_std": 0.1973114013671875,
      "rewards/grpo_reward_func/mean": 0.5414110422134399,
      "rewards/grpo_reward_func/std": 0.18588195741176605,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.5,
      "learning_rate": 1.1666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 3306379.0,
      "reward": 0.3934594988822937,
      "reward_std": 0.025219213217496872,
      "rewards/grpo_reward_func/mean": 0.3934594988822937,
      "rewards/grpo_reward_func/std": 0.027134951204061508,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.4375,
      "learning_rate": 1.1333333333333332e-07,
      "loss": -0.0,
      "num_tokens": 3318815.0,
      "reward": 0.40915048122406006,
      "reward_std": 0.09651514887809753,
      "rewards/grpo_reward_func/mean": 0.40915048122406006,
      "rewards/grpo_reward_func/std": 0.11164474487304688,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.3125,
      "learning_rate": 1.0999999999999999e-07,
      "loss": 0.0,
      "num_tokens": 3331207.0,
      "reward": 0.3795730471611023,
      "reward_std": 0.08440607786178589,
      "rewards/grpo_reward_func/mean": 0.3795730471611023,
      "rewards/grpo_reward_func/std": 0.08232571184635162,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.40625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.125,
      "learning_rate": 1.0666666666666667e-07,
      "loss": -0.0,
      "num_tokens": 3343791.0,
      "reward": 0.45081427693367004,
      "reward_std": 0.13623002171516418,
      "rewards/grpo_reward_func/mean": 0.45081427693367004,
      "rewards/grpo_reward_func/std": 0.14548543095588684,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.4375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 1.0333333333333333e-07,
      "loss": -0.0,
      "num_tokens": 3356207.0,
      "reward": 0.46069252490997314,
      "reward_std": 0.07286226749420166,
      "rewards/grpo_reward_func/mean": 0.46069252490997314,
      "rewards/grpo_reward_func/std": 0.08740860968828201,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.46875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 1e-07,
      "loss": 0.0,
      "num_tokens": 3368691.0,
      "reward": 0.37001582980155945,
      "reward_std": 0.08882021903991699,
      "rewards/grpo_reward_func/mean": 0.37001582980155945,
      "rewards/grpo_reward_func/std": 0.08371038734912872,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.625,
      "learning_rate": 9.666666666666666e-08,
      "loss": -0.0,
      "num_tokens": 3380935.0,
      "reward": 0.46963435411453247,
      "reward_std": 0.12529392540454865,
      "rewards/grpo_reward_func/mean": 0.46963435411453247,
      "rewards/grpo_reward_func/std": 0.13837039470672607,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.53125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.3125,
      "learning_rate": 9.333333333333334e-08,
      "loss": -0.0,
      "num_tokens": 3393443.0,
      "reward": 0.5679957866668701,
      "reward_std": 0.08565768599510193,
      "rewards/grpo_reward_func/mean": 0.5679957866668701,
      "rewards/grpo_reward_func/std": 0.08279130607843399,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.5625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.25,
      "learning_rate": 9e-08,
      "loss": 0.0,
      "num_tokens": 3405771.0,
      "reward": 0.3129928410053253,
      "reward_std": 0.07984557747840881,
      "rewards/grpo_reward_func/mean": 0.3129928410053253,
      "rewards/grpo_reward_func/std": 0.08136677742004395,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.59375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.0625,
      "learning_rate": 8.666666666666666e-08,
      "loss": 0.0,
      "num_tokens": 3418243.0,
      "reward": 0.3354572653770447,
      "reward_std": 0.09963542222976685,
      "rewards/grpo_reward_func/mean": 0.3354572653770447,
      "rewards/grpo_reward_func/std": 0.09654007852077484,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.25,
      "learning_rate": 8.333333333333333e-08,
      "loss": 0.0,
      "num_tokens": 3430691.0,
      "reward": 0.41226309537887573,
      "reward_std": 0.1296028345823288,
      "rewards/grpo_reward_func/mean": 0.41226309537887573,
      "rewards/grpo_reward_func/std": 0.12655113637447357,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.65625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.125,
      "learning_rate": 8e-08,
      "loss": 0.0,
      "num_tokens": 3443151.0,
      "reward": 0.4148029088973999,
      "reward_std": 0.1445026993751526,
      "rewards/grpo_reward_func/mean": 0.4148029088973999,
      "rewards/grpo_reward_func/std": 0.1527920663356781,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.6875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.1875,
      "learning_rate": 7.666666666666665e-08,
      "loss": 0.0,
      "num_tokens": 3455579.0,
      "reward": 0.3091464638710022,
      "reward_std": 0.09873013943433762,
      "rewards/grpo_reward_func/mean": 0.3091464638710022,
      "rewards/grpo_reward_func/std": 0.12618468701839447,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.71875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.375,
      "learning_rate": 7.333333333333333e-08,
      "loss": 0.0,
      "num_tokens": 3468011.0,
      "reward": 0.412067174911499,
      "reward_std": 0.11878905445337296,
      "rewards/grpo_reward_func/mean": 0.412067174911499,
      "rewards/grpo_reward_func/std": 0.12399723380804062,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.6875,
      "learning_rate": 7e-08,
      "loss": -0.0,
      "num_tokens": 3480459.0,
      "reward": 0.3863711953163147,
      "reward_std": 0.1872004270553589,
      "rewards/grpo_reward_func/mean": 0.3863711953163147,
      "rewards/grpo_reward_func/std": 0.18860581517219543,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.78125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.125,
      "learning_rate": 6.666666666666667e-08,
      "loss": -0.0,
      "num_tokens": 3492923.0,
      "reward": 0.40867847204208374,
      "reward_std": 0.14625820517539978,
      "rewards/grpo_reward_func/mean": 0.40867847204208374,
      "rewards/grpo_reward_func/std": 0.14255009591579437,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.8125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.875,
      "learning_rate": 6.333333333333333e-08,
      "loss": 0.0,
      "num_tokens": 3505387.0,
      "reward": 0.45097100734710693,
      "reward_std": 0.21717840433120728,
      "rewards/grpo_reward_func/mean": 0.45097100734710693,
      "rewards/grpo_reward_func/std": 0.20403653383255005,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.84375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.1875,
      "learning_rate": 6e-08,
      "loss": -0.0,
      "num_tokens": 3517903.0,
      "reward": 0.49071210622787476,
      "reward_std": 0.13102422654628754,
      "rewards/grpo_reward_func/mean": 0.49071210622787476,
      "rewards/grpo_reward_func/std": 0.14358305931091309,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.0,
      "learning_rate": 5.666666666666666e-08,
      "loss": -0.0,
      "num_tokens": 3530331.0,
      "reward": 0.471984326839447,
      "reward_std": 0.11608313769102097,
      "rewards/grpo_reward_func/mean": 0.471984326839447,
      "rewards/grpo_reward_func/std": 0.12841607630252838,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.90625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.1875,
      "learning_rate": 5.3333333333333334e-08,
      "loss": -0.0,
      "num_tokens": 3542763.0,
      "reward": 0.39699018001556396,
      "reward_std": 0.11195935308933258,
      "rewards/grpo_reward_func/mean": 0.39699018001556396,
      "rewards/grpo_reward_func/std": 0.16198311746120453,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.9375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.9375,
      "learning_rate": 5e-08,
      "loss": 0.0,
      "num_tokens": 3555171.0,
      "reward": 0.40294522047042847,
      "reward_std": 0.11233559250831604,
      "rewards/grpo_reward_func/mean": 0.40294522047042847,
      "rewards/grpo_reward_func/std": 0.12867507338523865,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 8.96875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.125,
      "learning_rate": 4.666666666666667e-08,
      "loss": 0.0,
      "num_tokens": 3567639.0,
      "reward": 0.45153820514678955,
      "reward_std": 0.10483110696077347,
      "rewards/grpo_reward_func/mean": 0.45153820514678955,
      "rewards/grpo_reward_func/std": 0.11334265768527985,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.9375,
      "learning_rate": 4.333333333333333e-08,
      "loss": 0.0,
      "num_tokens": 3580055.0,
      "reward": 0.39824116230010986,
      "reward_std": 0.0965305045247078,
      "rewards/grpo_reward_func/mean": 0.39824116230010986,
      "rewards/grpo_reward_func/std": 0.10601532459259033,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.8125,
      "learning_rate": 4e-08,
      "loss": -0.0,
      "num_tokens": 3592511.0,
      "reward": 0.3396638035774231,
      "reward_std": 0.0737166702747345,
      "rewards/grpo_reward_func/mean": 0.3396638035774231,
      "rewards/grpo_reward_func/std": 0.07909521460533142,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.0625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.0625,
      "learning_rate": 3.6666666666666664e-08,
      "loss": -0.0,
      "num_tokens": 3604847.0,
      "reward": 0.4459681212902069,
      "reward_std": 0.13664312660694122,
      "rewards/grpo_reward_func/mean": 0.4459681212902069,
      "rewards/grpo_reward_func/std": 0.1500515192747116,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.09375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.625,
      "learning_rate": 3.3333333333333334e-08,
      "loss": -0.0,
      "num_tokens": 3617299.0,
      "reward": 0.35913753509521484,
      "reward_std": 0.10111263394355774,
      "rewards/grpo_reward_func/mean": 0.35913753509521484,
      "rewards/grpo_reward_func/std": 0.10508442670106888,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.5625,
      "learning_rate": 3e-08,
      "loss": -0.0,
      "num_tokens": 3629567.0,
      "reward": 0.4349736273288727,
      "reward_std": 0.12172282487154007,
      "rewards/grpo_reward_func/mean": 0.4349736273288727,
      "rewards/grpo_reward_func/std": 0.11470159143209457,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.15625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.0625,
      "learning_rate": 2.6666666666666667e-08,
      "loss": 0.0,
      "num_tokens": 3642071.0,
      "reward": 0.396597683429718,
      "reward_std": 0.12911826372146606,
      "rewards/grpo_reward_func/mean": 0.396597683429718,
      "rewards/grpo_reward_func/std": 0.12233106046915054,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.1875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.5,
      "learning_rate": 2.3333333333333334e-08,
      "loss": 0.0,
      "num_tokens": 3654479.0,
      "reward": 0.5098578929901123,
      "reward_std": 0.1227826401591301,
      "rewards/grpo_reward_func/mean": 0.5098578929901123,
      "rewards/grpo_reward_func/std": 0.11480940878391266,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.21875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.5625,
      "learning_rate": 2e-08,
      "loss": 0.0,
      "num_tokens": 3666891.0,
      "reward": 0.40734565258026123,
      "reward_std": 0.11240965127944946,
      "rewards/grpo_reward_func/mean": 0.40734565258026123,
      "rewards/grpo_reward_func/std": 0.13429103791713715,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.21875,
      "learning_rate": 1.6666666666666667e-08,
      "loss": -0.0,
      "num_tokens": 3679471.0,
      "reward": 0.37585046887397766,
      "reward_std": 0.048339828848838806,
      "rewards/grpo_reward_func/mean": 0.37585046887397766,
      "rewards/grpo_reward_func/std": 0.059352707117795944,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.28125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.4375,
      "learning_rate": 1.3333333333333334e-08,
      "loss": 0.0,
      "num_tokens": 3691927.0,
      "reward": 0.3830341100692749,
      "reward_std": 0.09623756259679794,
      "rewards/grpo_reward_func/mean": 0.3830341100692749,
      "rewards/grpo_reward_func/std": 0.0935094878077507,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.3125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.9375,
      "learning_rate": 1e-08,
      "loss": 0.0,
      "num_tokens": 3704431.0,
      "reward": 0.5307860374450684,
      "reward_std": 0.15707515180110931,
      "rewards/grpo_reward_func/mean": 0.5307860374450684,
      "rewards/grpo_reward_func/std": 0.15192177891731262,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.34375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 12.4375,
      "learning_rate": 6.666666666666667e-09,
      "loss": 0.0,
      "num_tokens": 3716835.0,
      "reward": 0.5075388550758362,
      "reward_std": 0.13507473468780518,
      "rewards/grpo_reward_func/mean": 0.5075388550758362,
      "rewards/grpo_reward_func/std": 0.19023331999778748,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 9.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 13.0,
      "learning_rate": 3.3333333333333334e-09,
      "loss": -0.0,
      "num_tokens": 3729299.0,
      "reward": 0.4454175531864166,
      "reward_std": 0.07014341652393341,
      "rewards/grpo_reward_func/mean": 0.4454175531864166,
      "rewards/grpo_reward_func/std": 0.1258506029844284,
      "step": 300
    }
  ],
  "logging_steps": 1,
  "max_steps": 300,
  "num_input_tokens_seen": 3729299,
  "num_train_epochs": 10,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}