TT_L0.2_H0.2_grpo/checkpoint-100/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.11428571428571428,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1702.03125,
      "completions/mean_terminated_length": 993.6190795898438,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 0.001142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2544386684894562,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 118418.0,
      "reward": 0.17899775505065918,
      "reward_std": 0.7650213241577148,
      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1738.90625,
      "completions/mean_terminated_length": 949.0,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.002285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2436082512140274,
      "learning_rate": 5e-08,
      "loss": -0.0,
      "num_tokens": 239748.0,
      "reward": 0.3848632574081421,
      "reward_std": 0.9111153483390808,
      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1545.0,
      "completions/mean_length": 1989.015625,
      "completions/mean_terminated_length": 1104.25,
      "completions/min_length": 706.0,
      "completions/min_terminated_length": 706.0,
      "epoch": 0.0034285714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2544717788696289,
      "learning_rate": 1e-07,
      "loss": -0.0,
      "num_tokens": 377517.0,
      "reward": -0.3279358148574829,
      "reward_std": 0.33216947317123413,
      "rewards/cosine_scaled_reward/mean": -0.20303040742874146,
      "rewards/cosine_scaled_reward/std": 0.179075226187706,
      "rewards/format_reward/mean": 0.078125,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1566.421875,
      "completions/mean_terminated_length": 1084.84375,
      "completions/min_length": 502.0,
      "completions/min_terminated_length": 502.0,
      "epoch": 0.004571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28807103633880615,
      "learning_rate": 1.5e-07,
      "loss": -0.0,
      "num_tokens": 487576.0,
      "reward": 0.2716121971607208,
      "reward_std": 0.6643469333648682,
      "rewards/cosine_scaled_reward/mean": -0.12981891632080078,
      "rewards/cosine_scaled_reward/std": 0.3019586503505707,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1807.0,
      "completions/mean_length": 1936.84375,
      "completions/mean_terminated_length": 1031.71435546875,
      "completions/min_length": 463.0,
      "completions/min_terminated_length": 463.0,
      "epoch": 0.005714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26783761382102966,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 622350.0,
      "reward": -0.3612896800041199,
      "reward_std": 0.41048353910446167,
      "rewards/cosine_scaled_reward/mean": -0.23533234000205994,
      "rewards/cosine_scaled_reward/std": 0.20467400550842285,
      "rewards/format_reward/mean": 0.109375,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1301.0,
      "completions/mean_length": 1889.453125,
      "completions/mean_terminated_length": 779.625,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 0.006857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.262518972158432,
      "learning_rate": 2.5e-07,
      "loss": 0.0,
      "num_tokens": 754923.0,
      "reward": -0.29250282049179077,
      "reward_std": 0.5422531962394714,
      "rewards/cosine_scaled_reward/mean": -0.22437641024589539,
      "rewards/cosine_scaled_reward/std": 0.22509199380874634,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1568.0,
      "completions/mean_length": 1921.921875,
      "completions/mean_terminated_length": 1314.45458984375,
      "completions/min_length": 927.0,
      "completions/min_terminated_length": 927.0,
      "epoch": 0.008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22601397335529327,
      "learning_rate": 3e-07,
      "loss": 0.0,
      "num_tokens": 888334.0,
      "reward": 0.025340259075164795,
      "reward_std": 0.7285393476486206,
      "rewards/cosine_scaled_reward/mean": -0.1279548704624176,
      "rewards/cosine_scaled_reward/std": 0.40222346782684326,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 1736.859375,
      "completions/mean_terminated_length": 999.9473876953125,
      "completions/min_length": 305.0,
      "completions/min_terminated_length": 305.0,
      "epoch": 0.009142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24552854895591736,
      "learning_rate": 3.5e-07,
      "loss": 0.0,
      "num_tokens": 1009909.0,
      "reward": 0.21729671955108643,
      "reward_std": 0.6989120244979858,
      "rewards/cosine_scaled_reward/mean": -0.055414143949747086,
      "rewards/cosine_scaled_reward/std": 0.47493892908096313,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.4732423722743988,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1963.0,
      "completions/mean_length": 1967.53125,
      "completions/mean_terminated_length": 1475.77783203125,
      "completions/min_length": 856.0,
      "completions/min_terminated_length": 856.0,
      "epoch": 0.010285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2430322915315628,
      "learning_rate": 4e-07,
      "loss": 0.0,
      "num_tokens": 1147287.0,
      "reward": -0.21451422572135925,
      "reward_std": 0.587526798248291,
      "rewards/cosine_scaled_reward/mean": -0.19319462776184082,
      "rewards/cosine_scaled_reward/std": 0.29357606172561646,
      "rewards/format_reward/mean": 0.171875,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 1708.546875,
      "completions/mean_terminated_length": 961.75,
      "completions/min_length": 388.0,
      "completions/min_terminated_length": 388.0,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2543582320213318,
      "learning_rate": 4.5e-07,
      "loss": 0.0,
      "num_tokens": 1267466.0,
      "reward": 0.02539752423763275,
      "reward_std": 0.545810341835022,
      "rewards/cosine_scaled_reward/mean": -0.14355123043060303,
      "rewards/cosine_scaled_reward/std": 0.36147356033325195,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.90625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1579.0,
      "completions/mean_length": 1967.734375,
      "completions/mean_terminated_length": 1191.8333740234375,
      "completions/min_length": 843.0,
      "completions/min_terminated_length": 843.0,
      "epoch": 0.012571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24583907425403595,
      "learning_rate": 5e-07,
      "loss": -0.0,
      "num_tokens": 1405073.0,
      "reward": -0.46971434354782104,
      "reward_std": 0.36104393005371094,
      "rewards/cosine_scaled_reward/mean": -0.28173214197158813,
      "rewards/cosine_scaled_reward/std": 0.17775526642799377,
      "rewards/format_reward/mean": 0.09375,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1707.5625,
      "completions/mean_terminated_length": 1176.47998046875,
      "completions/min_length": 330.0,
      "completions/min_terminated_length": 330.0,
      "epoch": 0.013714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3135142922401428,
      "learning_rate": 5.5e-07,
      "loss": -0.0,
      "num_tokens": 1525301.0,
      "reward": 0.0018395520746707916,
      "reward_std": 0.7012988328933716,
      "rewards/cosine_scaled_reward/mean": -0.21783021092414856,
      "rewards/cosine_scaled_reward/std": 0.324150949716568,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1745.0,
      "completions/mean_length": 1841.96875,
      "completions/mean_terminated_length": 1168.933349609375,
      "completions/min_length": 442.0,
      "completions/min_terminated_length": 442.0,
      "epoch": 0.014857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2532394826412201,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 1654227.0,
      "reward": -0.10322706401348114,
      "reward_std": 0.6915165185928345,
      "rewards/cosine_scaled_reward/mean": -0.17661353945732117,
      "rewards/cosine_scaled_reward/std": 0.329875111579895,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 1816.390625,
      "completions/mean_terminated_length": 1306.8499755859375,
      "completions/min_length": 520.0,
      "completions/min_terminated_length": 520.0,
      "epoch": 0.016,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28405147790908813,
      "learning_rate": 6.5e-07,
      "loss": 0.0,
      "num_tokens": 1781084.0,
      "reward": 0.10602855682373047,
      "reward_std": 0.630502462387085,
      "rewards/cosine_scaled_reward/mean": -0.11104822158813477,
      "rewards/cosine_scaled_reward/std": 0.3846627473831177,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.4732423722743988,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1574.0,
      "completions/mean_length": 1702.109375,
      "completions/mean_terminated_length": 818.1666870117188,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.017142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28779250383377075,
      "learning_rate": 7e-07,
      "loss": 0.0,
      "num_tokens": 1900939.0,
      "reward": 0.32734519243240356,
      "reward_std": 0.3870265483856201,
      "rewards/cosine_scaled_reward/mean": 0.007422588765621185,
      "rewards/cosine_scaled_reward/std": 0.45787373185157776,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 2048.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 2048.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.018285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2337152510881424,
      "learning_rate": 7.5e-07,
      "loss": -0.0,
      "num_tokens": 2042451.0,
      "reward": -0.5429925918579102,
      "reward_std": 0.3153150975704193,
      "rewards/cosine_scaled_reward/mean": -0.2714962661266327,
      "rewards/cosine_scaled_reward/std": 0.1678173691034317,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1564.921875,
      "completions/mean_terminated_length": 858.8846435546875,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.019428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33599403500556946,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 2153126.0,
      "reward": 0.17696775496006012,
      "reward_std": 0.6489306688308716,
      "rewards/cosine_scaled_reward/mean": -0.11464111506938934,
      "rewards/cosine_scaled_reward/std": 0.3551919758319855,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.49501484632492065,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1298.0,
      "completions/mean_length": 1795.390625,
      "completions/mean_terminated_length": 893.21435546875,
      "completions/min_length": 619.0,
      "completions/min_terminated_length": 619.0,
      "epoch": 0.02057142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22697053849697113,
      "learning_rate": 8.499999999999999e-07,
      "loss": -0.0,
      "num_tokens": 2278407.0,
      "reward": -0.10711958259344101,
      "reward_std": 0.5238703489303589,
      "rewards/cosine_scaled_reward/mean": -0.1785597801208496,
      "rewards/cosine_scaled_reward/std": 0.2545098662376404,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 1921.484375,
      "completions/mean_terminated_length": 1238.300048828125,
      "completions/min_length": 623.0,
      "completions/min_terminated_length": 623.0,
      "epoch": 0.021714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23972108960151672,
      "learning_rate": 9e-07,
      "loss": 0.0,
      "num_tokens": 2412638.0,
      "reward": 0.029344379901885986,
      "reward_std": 0.6719281077384949,
      "rewards/cosine_scaled_reward/mean": -0.086890310049057,
      "rewards/cosine_scaled_reward/std": 0.40220555663108826,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.734375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 1728.5625,
      "completions/mean_terminated_length": 845.4117431640625,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23309311270713806,
      "learning_rate": 9.499999999999999e-07,
      "loss": 0.0,
      "num_tokens": 2534618.0,
      "reward": 0.0131673663854599,
      "reward_std": 0.4436222314834595,
      "rewards/cosine_scaled_reward/mean": -0.13404130935668945,
      "rewards/cosine_scaled_reward/std": 0.32819250226020813,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 1777.953125,
      "completions/mean_terminated_length": 1087.8333740234375,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29990270733833313,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 2659215.0,
      "reward": -0.1764472872018814,
      "reward_std": 0.5121938586235046,
      "rewards/cosine_scaled_reward/mean": -0.2444736361503601,
      "rewards/cosine_scaled_reward/std": 0.289971262216568,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1361.28125,
      "completions/mean_terminated_length": 921.0769653320312,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 0.025142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29922786355018616,
      "learning_rate": 9.99931462820376e-07,
      "loss": -0.0,
      "num_tokens": 2755353.0,
      "reward": 0.6089149713516235,
      "reward_std": 0.5986809730529785,
      "rewards/cosine_scaled_reward/mean": -0.05491749942302704,
      "rewards/cosine_scaled_reward/std": 0.39076483249664307,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1565.046875,
      "completions/mean_terminated_length": 903.2222290039062,
      "completions/min_length": 405.0,
      "completions/min_terminated_length": 405.0,
      "epoch": 0.026285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27512773871421814,
      "learning_rate": 9.997258721585931e-07,
      "loss": -0.0,
      "num_tokens": 2866308.0,
      "reward": 0.21871733665466309,
      "reward_std": 0.5976030826568604,
      "rewards/cosine_scaled_reward/mean": -0.10157884657382965,
      "rewards/cosine_scaled_reward/std": 0.3856185972690582,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 1801.671875,
      "completions/mean_terminated_length": 1259.75,
      "completions/min_length": 573.0,
      "completions/min_terminated_length": 573.0,
      "epoch": 0.027428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22642865777015686,
      "learning_rate": 9.993832906395582e-07,
      "loss": -0.0,
      "num_tokens": 2992543.0,
      "reward": 0.04899948835372925,
      "reward_std": 0.8525694608688354,
      "rewards/cosine_scaled_reward/mean": -0.17081275582313538,
      "rewards/cosine_scaled_reward/std": 0.3993513882160187,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1715.765625,
      "completions/mean_terminated_length": 1035.4761962890625,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25316134095191956,
      "learning_rate": 9.989038226169207e-07,
      "loss": -0.0,
      "num_tokens": 3112648.0,
      "reward": 0.10585837811231613,
      "reward_std": 0.7828943729400635,
      "rewards/cosine_scaled_reward/mean": -0.11894579976797104,
      "rewards/cosine_scaled_reward/std": 0.4141720235347748,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1917.703125,
      "completions/mean_terminated_length": 1452.357177734375,
      "completions/min_length": 840.0,
      "completions/min_terminated_length": 840.0,
      "epoch": 0.029714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2521306574344635,
      "learning_rate": 9.982876141412855e-07,
      "loss": -0.0,
      "num_tokens": 3246013.0,
      "reward": 0.17620250582695007,
      "reward_std": 0.6548349857330322,
      "rewards/cosine_scaled_reward/mean": -0.08377375453710556,
      "rewards/cosine_scaled_reward/std": 0.3527655303478241,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 1851.015625,
      "completions/mean_terminated_length": 1147.5,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 0.030857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2730060815811157,
      "learning_rate": 9.975348529157229e-07,
      "loss": -0.0,
      "num_tokens": 3374766.0,
      "reward": -0.18854813277721405,
      "reward_std": 0.49348777532577515,
      "rewards/cosine_scaled_reward/mean": -0.21146157383918762,
      "rewards/cosine_scaled_reward/std": 0.2601618766784668,
      "rewards/format_reward/mean": 0.234375,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1798.328125,
      "completions/mean_terminated_length": 1049.3125,
      "completions/min_length": 403.0,
      "completions/min_terminated_length": 403.0,
      "epoch": 0.032,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2566036880016327,
      "learning_rate": 9.96645768238595e-07,
      "loss": 0.0,
      "num_tokens": 3500195.0,
      "reward": 0.06705980002880096,
      "reward_std": 0.7090284824371338,
      "rewards/cosine_scaled_reward/mean": -0.10709509253501892,
      "rewards/cosine_scaled_reward/std": 0.4101051986217499,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1930.203125,
      "completions/mean_terminated_length": 1210.3333740234375,
      "completions/min_length": 582.0,
      "completions/min_terminated_length": 582.0,
      "epoch": 0.03314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25197461247444153,
      "learning_rate": 9.956206309337066e-07,
      "loss": 0.0,
      "num_tokens": 3634200.0,
      "reward": -0.2462695688009262,
      "reward_std": 0.5237302780151367,
      "rewards/cosine_scaled_reward/mean": -0.2012597918510437,
      "rewards/cosine_scaled_reward/std": 0.23252712190151215,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 1847.65625,
      "completions/mean_terminated_length": 1061.6923828125,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30431485176086426,
      "learning_rate": 9.944597532678119e-07,
      "loss": 0.0,
      "num_tokens": 3762986.0,
      "reward": -0.05392302945256233,
      "reward_std": 0.7249555587768555,
      "rewards/cosine_scaled_reward/mean": -0.15196150541305542,
      "rewards/cosine_scaled_reward/std": 0.34566983580589294,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 1838.671875,
      "completions/mean_terminated_length": 931.5833740234375,
      "completions/min_length": 399.0,
      "completions/min_terminated_length": 399.0,
      "epoch": 0.03542857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2484513372182846,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0,
      "num_tokens": 3891157.0,
      "reward": -0.11271396279335022,
      "reward_std": 0.6705260872840881,
      "rewards/cosine_scaled_reward/mean": -0.1813569962978363,
      "rewards/cosine_scaled_reward/std": 0.4071698486804962,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1715.0,
      "completions/mean_length": 1910.109375,
      "completions/mean_terminated_length": 1417.6429443359375,
      "completions/min_length": 906.0,
      "completions/min_terminated_length": 906.0,
      "epoch": 0.036571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25329527258872986,
      "learning_rate": 9.917322325514487e-07,
      "loss": -0.0,
      "num_tokens": 4023756.0,
      "reward": -0.08931556344032288,
      "reward_std": 0.6381070613861084,
      "rewards/cosine_scaled_reward/mean": -0.16965776681900024,
      "rewards/cosine_scaled_reward/std": 0.37385129928588867,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.953125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 2023.71875,
      "completions/mean_terminated_length": 1530.0,
      "completions/min_length": 1107.0,
      "completions/min_terminated_length": 1107.0,
      "epoch": 0.037714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22758109867572784,
      "learning_rate": 9.901664203302124e-07,
      "loss": 0.0,
      "num_tokens": 4164490.0,
      "reward": -0.4589868187904358,
      "reward_std": 0.5177067518234253,
      "rewards/cosine_scaled_reward/mean": -0.2919934093952179,
      "rewards/cosine_scaled_reward/std": 0.2252870500087738,
      "rewards/format_reward/mean": 0.125,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1454.78125,
      "completions/mean_terminated_length": 963.2571411132812,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.038857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3234354257583618,
      "learning_rate": 9.88466529153356e-07,
      "loss": 0.0,
      "num_tokens": 4267148.0,
      "reward": 0.656031608581543,
      "reward_std": 0.7529654502868652,
      "rewards/cosine_scaled_reward/mean": 0.05457830801606178,
      "rewards/cosine_scaled_reward/std": 0.49684229493141174,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1724.0,
      "completions/mean_length": 1819.078125,
      "completions/mean_terminated_length": 716.0909423828125,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2821458876132965,
      "learning_rate": 9.866330768241983e-07,
      "loss": -0.0,
      "num_tokens": 4395065.0,
      "reward": -0.09630556404590607,
      "reward_std": 0.7089139223098755,
      "rewards/cosine_scaled_reward/mean": -0.15752778947353363,
      "rewards/cosine_scaled_reward/std": 0.3647947609424591,
      "rewards/format_reward/mean": 0.21875,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1811.0,
      "completions/mean_length": 1954.34375,
      "completions/mean_terminated_length": 1382.0,
      "completions/min_length": 949.0,
      "completions/min_terminated_length": 949.0,
      "epoch": 0.04114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24163897335529327,
      "learning_rate": 9.846666218300807e-07,
      "loss": -0.0,
      "num_tokens": 4531255.0,
      "reward": -0.34593287110328674,
      "reward_std": 0.44493502378463745,
      "rewards/cosine_scaled_reward/mean": -0.24327893555164337,
      "rewards/cosine_scaled_reward/std": 0.24784433841705322,
      "rewards/format_reward/mean": 0.140625,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1723.0,
      "completions/mean_length": 1868.921875,
      "completions/mean_terminated_length": 1092.916748046875,
      "completions/min_length": 620.0,
      "completions/min_terminated_length": 620.0,
      "epoch": 0.04228571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24795544147491455,
      "learning_rate": 9.825677631722435e-07,
      "loss": -0.0,
      "num_tokens": 4661890.0,
      "reward": -0.23053905367851257,
      "reward_std": 0.34036368131637573,
      "rewards/cosine_scaled_reward/mean": -0.2246445268392563,
      "rewards/cosine_scaled_reward/std": 0.15942412614822388,
      "rewards/format_reward/mean": 0.21875,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1397.0,
      "completions/mean_length": 1889.53125,
      "completions/mean_terminated_length": 1033.800048828125,
      "completions/min_length": 810.0,
      "completions/min_terminated_length": 810.0,
      "epoch": 0.04342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24283826351165771,
      "learning_rate": 9.80337140183366e-07,
      "loss": 0.0,
      "num_tokens": 4794532.0,
      "reward": -0.10043507814407349,
      "reward_std": 0.47925832867622375,
      "rewards/cosine_scaled_reward/mean": -0.13615503907203674,
      "rewards/cosine_scaled_reward/std": 0.3336707651615143,
      "rewards/format_reward/mean": 0.171875,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1515.0,
      "completions/mean_length": 1644.828125,
      "completions/mean_terminated_length": 689.9473876953125,
      "completions/min_length": 279.0,
      "completions/min_terminated_length": 279.0,
      "epoch": 0.044571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28362998366355896,
      "learning_rate": 9.779754323328192e-07,
      "loss": 0.0,
      "num_tokens": 4910585.0,
      "reward": 0.12284853309392929,
      "reward_std": 0.4183085858821869,
      "rewards/cosine_scaled_reward/mean": -0.11045074462890625,
      "rewards/cosine_scaled_reward/std": 0.30217844247817993,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1632.0,
      "completions/mean_length": 1618.28125,
      "completions/mean_terminated_length": 902.0833740234375,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.045714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.262617826461792,
      "learning_rate": 9.754833590196926e-07,
      "loss": 0.0,
      "num_tokens": 5024227.0,
      "reward": 0.2076582908630371,
      "reward_std": 0.42125773429870605,
      "rewards/cosine_scaled_reward/mean": -0.12273336946964264,
      "rewards/cosine_scaled_reward/std": 0.4404613971710205,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1914.0,
      "completions/mean_length": 1717.734375,
      "completions/mean_terminated_length": 1235.0384521484375,
      "completions/min_length": 664.0,
      "completions/min_terminated_length": 664.0,
      "epoch": 0.046857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23294499516487122,
      "learning_rate": 9.728616793536587e-07,
      "loss": -0.0,
      "num_tokens": 5145314.0,
      "reward": 0.011502981185913086,
      "reward_std": 0.6816084980964661,
      "rewards/cosine_scaled_reward/mean": -0.22081100940704346,
      "rewards/cosine_scaled_reward/std": 0.37589573860168457,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1672.0,
      "completions/mean_length": 1703.921875,
      "completions/mean_terminated_length": 579.933349609375,
      "completions/min_length": 183.0,
      "completions/min_terminated_length": 183.0,
      "epoch": 0.048,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34672290086746216,
      "learning_rate": 9.701111919237408e-07,
      "loss": -0.0,
      "num_tokens": 5264725.0,
      "reward": -0.2616002857685089,
      "reward_std": 0.37952175736427307,
      "rewards/cosine_scaled_reward/mean": -0.26361262798309326,
      "rewards/cosine_scaled_reward/std": 0.17531204223632812,
      "rewards/format_reward/mean": 0.265625,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1370.0,
      "completions/mean_length": 1681.84375,
      "completions/mean_terminated_length": 814.631591796875,
      "completions/min_length": 308.0,
      "completions/min_terminated_length": 308.0,
      "epoch": 0.04914285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.263967901468277,
      "learning_rate": 9.672327345550543e-07,
      "loss": -0.0,
      "num_tokens": 5383979.0,
      "reward": 0.13376155495643616,
      "reward_std": 0.46012288331985474,
      "rewards/cosine_scaled_reward/mean": -0.08155670762062073,
      "rewards/cosine_scaled_reward/std": 0.3612325191497803,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1830.0,
      "completions/mean_length": 1624.625,
      "completions/mean_terminated_length": 869.9130859375,
      "completions/min_length": 385.0,
      "completions/min_terminated_length": 385.0,
      "epoch": 0.05028571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28927963972091675,
      "learning_rate": 9.64227184053598e-07,
      "loss": -0.0,
      "num_tokens": 5498651.0,
      "reward": 0.20869271457195282,
      "reward_std": 0.5558150410652161,
      "rewards/cosine_scaled_reward/mean": -0.0987786278128624,
      "rewards/cosine_scaled_reward/std": 0.42912590503692627,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.49501484632492065,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.921875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1851.0,
      "completions/mean_length": 2006.96875,
      "completions/mean_terminated_length": 1522.800048828125,
      "completions/min_length": 955.0,
      "completions/min_terminated_length": 955.0,
      "epoch": 0.05142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24254000186920166,
      "learning_rate": 9.610954559391704e-07,
      "loss": 0.0,
      "num_tokens": 5638753.0,
      "reward": -0.2540697157382965,
      "reward_std": 0.4600578844547272,
      "rewards/cosine_scaled_reward/mean": -0.20515984296798706,
      "rewards/cosine_scaled_reward/std": 0.3251590430736542,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1563.0,
      "completions/mean_length": 1765.984375,
      "completions/mean_terminated_length": 919.9375,
      "completions/min_length": 571.0,
      "completions/min_terminated_length": 571.0,
      "epoch": 0.052571428571428575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2645930349826813,
      "learning_rate": 9.578385041664925e-07,
      "loss": 0.0,
      "num_tokens": 5762944.0,
      "reward": -0.213707834482193,
      "reward_std": 0.38778313994407654,
      "rewards/cosine_scaled_reward/mean": -0.2318539321422577,
      "rewards/cosine_scaled_reward/std": 0.21436986327171326,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 1583.40625,
      "completions/mean_terminated_length": 986.0714721679688,
      "completions/min_length": 436.0,
      "completions/min_terminated_length": 436.0,
      "epoch": 0.053714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.311797559261322,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0,
      "num_tokens": 5874682.0,
      "reward": 0.27925533056259155,
      "reward_std": 0.6467443704605103,
      "rewards/cosine_scaled_reward/mean": -0.07912233471870422,
      "rewards/cosine_scaled_reward/std": 0.4737093150615692,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1527.0,
      "completions/mean_length": 1690.0625,
      "completions/mean_terminated_length": 1006.727294921875,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 0.054857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26644304394721985,
      "learning_rate": 9.509529358847654e-07,
      "loss": -0.0,
      "num_tokens": 5993390.0,
      "reward": 0.13692031800746918,
      "reward_std": 0.5655145049095154,
      "rewards/cosine_scaled_reward/mean": -0.12685233354568481,
      "rewards/cosine_scaled_reward/std": 0.32320985198020935,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1387.140625,
      "completions/mean_terminated_length": 804.0294189453125,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3078882396221161,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0,
      "num_tokens": 6092231.0,
      "reward": 0.35559189319610596,
      "reward_std": 0.5927403569221497,
      "rewards/cosine_scaled_reward/mean": -0.09564155340194702,
      "rewards/cosine_scaled_reward/std": 0.4046906530857086,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1830.0,
      "completions/mean_length": 1674.890625,
      "completions/mean_terminated_length": 962.5909423828125,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23925544321537018,
      "learning_rate": 9.43578868212728e-07,
      "loss": -0.0,
      "num_tokens": 6210240.0,
      "reward": 0.18573230504989624,
      "reward_std": 0.5264967083930969,
      "rewards/cosine_scaled_reward/mean": -0.09463384002447128,
      "rewards/cosine_scaled_reward/std": 0.4100942015647888,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1347.40625,
      "completions/mean_terminated_length": 836.1621704101562,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 0.05828571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.325811505317688,
      "learning_rate": 9.397114317029974e-07,
      "loss": 0.0,
      "num_tokens": 6306682.0,
      "reward": 0.1735648661851883,
      "reward_std": 0.5335988998413086,
      "rewards/cosine_scaled_reward/mean": -0.21009255945682526,
      "rewards/cosine_scaled_reward/std": 0.2623959481716156,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.49501484632492065,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1390.0,
      "completions/mean_length": 1727.765625,
      "completions/mean_terminated_length": 767.0625,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 246.0,
      "epoch": 0.05942857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27392977476119995,
      "learning_rate": 9.357252853159505e-07,
      "loss": 0.0,
      "num_tokens": 6428611.0,
      "reward": -0.16267812252044678,
      "reward_std": 0.5682471990585327,
      "rewards/cosine_scaled_reward/mean": -0.2219640612602234,
      "rewards/cosine_scaled_reward/std": 0.36739134788513184,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1907.0,
      "completions/mean_length": 1609.171875,
      "completions/mean_terminated_length": 924.5999755859375,
      "completions/min_length": 513.0,
      "completions/min_terminated_length": 513.0,
      "epoch": 0.060571428571428575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28155064582824707,
      "learning_rate": 9.316216432703916e-07,
      "loss": -0.0,
      "num_tokens": 6542430.0,
      "reward": 0.0752667784690857,
      "reward_std": 0.7118167281150818,
      "rewards/cosine_scaled_reward/mean": -0.18892911076545715,
      "rewards/cosine_scaled_reward/std": 0.3222156763076782,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1588.234375,
      "completions/mean_terminated_length": 1067.166748046875,
      "completions/min_length": 519.0,
      "completions/min_terminated_length": 519.0,
      "epoch": 0.061714285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2555343806743622,
      "learning_rate": 9.274017555754407e-07,
      "loss": 0.0,
      "num_tokens": 6655221.0,
      "reward": 0.6341299414634705,
      "reward_std": 1.0656921863555908,
      "rewards/cosine_scaled_reward/mean": 0.05143994837999344,
      "rewards/cosine_scaled_reward/std": 0.5348308086395264,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1420.0,
      "completions/mean_length": 1549.5625,
      "completions/mean_terminated_length": 821.0769653320312,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.06285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30243629217147827,
      "learning_rate": 9.230669076497687e-07,
      "loss": -0.0,
      "num_tokens": 6764681.0,
      "reward": 0.13021975755691528,
      "reward_std": 0.3984764516353607,
      "rewards/cosine_scaled_reward/mean": -0.13801513612270355,
      "rewards/cosine_scaled_reward/std": 0.41228073835372925,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.49501484632492065,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1633.25,
      "completions/mean_terminated_length": 1132.689697265625,
      "completions/min_length": 543.0,
      "completions/min_terminated_length": 543.0,
      "epoch": 0.064,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23835402727127075,
      "learning_rate": 9.186184199300463e-07,
      "loss": -0.0,
      "num_tokens": 6880169.0,
      "reward": 0.27981996536254883,
      "reward_std": 0.5018116235733032,
      "rewards/cosine_scaled_reward/mean": -0.10227750986814499,
      "rewards/cosine_scaled_reward/std": 0.481824666261673,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1807.0,
      "completions/mean_length": 1699.875,
      "completions/mean_terminated_length": 1156.7999267578125,
      "completions/min_length": 642.0,
      "completions/min_terminated_length": 642.0,
      "epoch": 0.06514285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22349494695663452,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.0,
      "num_tokens": 7000529.0,
      "reward": -0.026505012065172195,
      "reward_std": 0.5785415172576904,
      "rewards/cosine_scaled_reward/mean": -0.20856501162052155,
      "rewards/cosine_scaled_reward/std": 0.2749907374382019,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1457.875,
      "completions/mean_terminated_length": 1054.105224609375,
      "completions/min_length": 447.0,
      "completions/min_terminated_length": 447.0,
      "epoch": 0.06628571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.261942595243454,
      "learning_rate": 9.093859795212817e-07,
      "loss": 0.0,
      "num_tokens": 7103929.0,
      "reward": 0.5745843648910522,
      "reward_std": 0.8671218156814575,
      "rewards/cosine_scaled_reward/mean": -0.03302033245563507,
      "rewards/cosine_scaled_reward/std": 0.45529407262802124,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1590.0625,
      "completions/mean_terminated_length": 1159.8787841796875,
      "completions/min_length": 591.0,
      "completions/min_terminated_length": 591.0,
      "epoch": 0.06742857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24828943610191345,
      "learning_rate": 9.046048391230247e-07,
      "loss": -0.0,
      "num_tokens": 7216157.0,
      "reward": 0.3377103805541992,
      "reward_std": 0.5543617010116577,
      "rewards/cosine_scaled_reward/mean": -0.1045822948217392,
      "rewards/cosine_scaled_reward/std": 0.39040952920913696,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 1622.84375,
      "completions/mean_terminated_length": 1076.21435546875,
      "completions/min_length": 555.0,
      "completions/min_terminated_length": 555.0,
      "epoch": 0.06857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2752656936645508,
      "learning_rate": 8.997156826556369e-07,
      "loss": -0.0,
      "num_tokens": 7330907.0,
      "reward": 0.11114693433046341,
      "reward_std": 0.6926254034042358,
      "rewards/cosine_scaled_reward/mean": -0.1788015365600586,
      "rewards/cosine_scaled_reward/std": 0.39409172534942627,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1708.859375,
      "completions/mean_terminated_length": 1014.4285888671875,
      "completions/min_length": 411.0,
      "completions/min_terminated_length": 411.0,
      "epoch": 0.06971428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22669929265975952,
      "learning_rate": 8.9471999940354e-07,
      "loss": -0.0,
      "num_tokens": 7451794.0,
      "reward": 0.2345120906829834,
      "reward_std": 0.6293160319328308,
      "rewards/cosine_scaled_reward/mean": -0.1093064472079277,
      "rewards/cosine_scaled_reward/std": 0.29189831018447876,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1281.53125,
      "completions/mean_terminated_length": 1004.2978515625,
      "completions/min_length": 391.0,
      "completions/min_terminated_length": 391.0,
      "epoch": 0.07085714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25438693165779114,
      "learning_rate": 8.896193111002475e-07,
      "loss": 0.0,
      "num_tokens": 7544044.0,
      "reward": 0.9180847406387329,
      "reward_std": 0.6390912532806396,
      "rewards/cosine_scaled_reward/mean": 0.06841734796762466,
      "rewards/cosine_scaled_reward/std": 0.48315128684043884,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1678.0,
      "completions/mean_length": 1310.46875,
      "completions/mean_terminated_length": 896.731689453125,
      "completions/min_length": 295.0,
      "completions/min_terminated_length": 295.0,
      "epoch": 0.072,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28795576095581055,
      "learning_rate": 8.844151714648274e-07,
      "loss": -0.0,
      "num_tokens": 7638170.0,
      "reward": 0.6488770246505737,
      "reward_std": 0.7876260876655579,
      "rewards/cosine_scaled_reward/mean": -0.019311510026454926,
      "rewards/cosine_scaled_reward/std": 0.4736698865890503,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 1307.625,
      "completions/mean_terminated_length": 1039.8297119140625,
      "completions/min_length": 376.0,
      "completions/min_terminated_length": 376.0,
      "epoch": 0.07314285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25637197494506836,
      "learning_rate": 8.791091657286267e-07,
      "loss": -0.0,
      "num_tokens": 7732810.0,
      "reward": 0.8280279636383057,
      "reward_std": 0.6804471015930176,
      "rewards/cosine_scaled_reward/mean": 0.015576483681797981,
      "rewards/cosine_scaled_reward/std": 0.44819310307502747,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.359375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1846.0,
      "completions/mean_length": 1322.125,
      "completions/mean_terminated_length": 914.9268188476562,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.07428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2944399118423462,
      "learning_rate": 8.737029101523929e-07,
      "loss": -0.0,
      "num_tokens": 7828130.0,
      "reward": 0.15610456466674805,
      "reward_std": 0.4606686234474182,
      "rewards/cosine_scaled_reward/mean": -0.24226020276546478,
      "rewards/cosine_scaled_reward/std": 0.33131492137908936,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1803.0,
      "completions/mean_length": 1020.21875,
      "completions/mean_terminated_length": 806.9057006835938,
      "completions/min_length": 208.0,
      "completions/min_terminated_length": 208.0,
      "epoch": 0.07542857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32644009590148926,
      "learning_rate": 8.681980515339463e-07,
      "loss": 0.0,
      "num_tokens": 7903656.0,
      "reward": 0.7972471714019775,
      "reward_std": 0.7674820423126221,
      "rewards/cosine_scaled_reward/mean": -0.031063925474882126,
      "rewards/cosine_scaled_reward/std": 0.5106223225593567,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1910.0,
      "completions/mean_length": 1750.859375,
      "completions/mean_terminated_length": 1142.4285888671875,
      "completions/min_length": 585.0,
      "completions/min_terminated_length": 585.0,
      "epoch": 0.07657142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2270829975605011,
      "learning_rate": 8.625962667065487e-07,
      "loss": 0.0,
      "num_tokens": 8026447.0,
      "reward": -0.1400720775127411,
      "reward_std": 0.3325888514518738,
      "rewards/cosine_scaled_reward/mean": -0.24972353875637054,
      "rewards/cosine_scaled_reward/std": 0.16404789686203003,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1424.0,
      "completions/mean_length": 769.546875,
      "completions/mean_terminated_length": 637.2930908203125,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.07771428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.37025144696235657,
      "learning_rate": 8.568992620281243e-07,
      "loss": -0.0,
      "num_tokens": 8084954.0,
      "reward": 0.9792699813842773,
      "reward_std": 0.804767370223999,
      "rewards/cosine_scaled_reward/mean": 0.03651002421975136,
      "rewards/cosine_scaled_reward/std": 0.46041443943977356,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1701.0,
      "completions/mean_length": 1086.234375,
      "completions/mean_terminated_length": 886.6226806640625,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.07885714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3763800263404846,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.0,
      "num_tokens": 8164817.0,
      "reward": 0.35803771018981934,
      "reward_std": 0.5702667236328125,
      "rewards/cosine_scaled_reward/mean": -0.24285613000392914,
      "rewards/cosine_scaled_reward/std": 0.3019825220108032,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1463.375,
      "completions/mean_terminated_length": 1112.5999755859375,
      "completions/min_length": 503.0,
      "completions/min_terminated_length": 503.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24232418835163116,
      "learning_rate": 8.452265630457282e-07,
      "loss": -0.0,
      "num_tokens": 8269929.0,
      "reward": 0.3703588843345642,
      "reward_std": 0.7288752794265747,
      "rewards/cosine_scaled_reward/mean": -0.1351330280303955,
      "rewards/cosine_scaled_reward/std": 0.3751916289329529,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1409.859375,
      "completions/mean_terminated_length": 973.2368774414062,
      "completions/min_length": 404.0,
      "completions/min_terminated_length": 404.0,
      "epoch": 0.08114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.300010621547699,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.0,
      "num_tokens": 8370880.0,
      "reward": 0.5196826457977295,
      "reward_std": 0.7097917795181274,
      "rewards/cosine_scaled_reward/mean": -0.044846177101135254,
      "rewards/cosine_scaled_reward/std": 0.508389949798584,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1801.0,
      "completions/mean_length": 1228.046875,
      "completions/mean_terminated_length": 931.4680786132812,
      "completions/min_length": 331.0,
      "completions/min_terminated_length": 331.0,
      "epoch": 0.08228571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30454304814338684,
      "learning_rate": 8.331941759724268e-07,
      "loss": -0.0,
      "num_tokens": 8459827.0,
      "reward": 0.41365131735801697,
      "reward_std": 0.5005639791488647,
      "rewards/cosine_scaled_reward/mean": -0.1759868562221527,
      "rewards/cosine_scaled_reward/std": 0.19868774712085724,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1719.0,
      "completions/mean_length": 1513.28125,
      "completions/mean_terminated_length": 1192.4500732421875,
      "completions/min_length": 557.0,
      "completions/min_terminated_length": 557.0,
      "epoch": 0.08342857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27848970890045166,
      "learning_rate": 8.270476638965461e-07,
      "loss": -0.0,
      "num_tokens": 8567405.0,
      "reward": 0.09570223093032837,
      "reward_std": 0.5445049405097961,
      "rewards/cosine_scaled_reward/mean": -0.2802739143371582,
      "rewards/cosine_scaled_reward/std": 0.25603488087654114,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 1240.125,
      "completions/mean_terminated_length": 924.0,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 0.08457142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2789021134376526,
      "learning_rate": 8.208167604184217e-07,
      "loss": 0.0,
      "num_tokens": 8656701.0,
      "reward": 0.7823752760887146,
      "reward_std": 0.6479132175445557,
      "rewards/cosine_scaled_reward/mean": 0.031812600791454315,
      "rewards/cosine_scaled_reward/std": 0.5397623181343079,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1455.953125,
      "completions/mean_terminated_length": 1186.8409423828125,
      "completions/min_length": 695.0,
      "completions/min_terminated_length": 695.0,
      "epoch": 0.08571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22443196177482605,
      "learning_rate": 8.145033635316128e-07,
      "loss": 0.0,
      "num_tokens": 8760842.0,
      "reward": 0.8040015697479248,
      "reward_std": 0.5675323009490967,
      "rewards/cosine_scaled_reward/mean": 0.027000809088349342,
      "rewards/cosine_scaled_reward/std": 0.5096040964126587,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1720.0,
      "completions/mean_length": 1177.859375,
      "completions/mean_terminated_length": 863.1276245117188,
      "completions/min_length": 372.0,
      "completions/min_terminated_length": 372.0,
      "epoch": 0.08685714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32647648453712463,
      "learning_rate": 8.081093963579707e-07,
      "loss": 0.0,
      "num_tokens": 8846625.0,
      "reward": 0.310506671667099,
      "reward_std": 0.5110941529273987,
      "rewards/cosine_scaled_reward/mean": -0.2119341641664505,
      "rewards/cosine_scaled_reward/std": 0.24737994372844696,
      "rewards/format_reward/mean": 0.734375,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1787.0,
      "completions/mean_length": 1263.4375,
      "completions/mean_terminated_length": 1043.760009765625,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 0.088,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2545543611049652,
      "learning_rate": 8.01636806561836e-07,
      "loss": -0.0,
      "num_tokens": 8939061.0,
      "reward": 0.5484907031059265,
      "reward_std": 0.48998576402664185,
      "rewards/cosine_scaled_reward/mean": -0.13200464844703674,
      "rewards/cosine_scaled_reward/std": 0.3430649936199188,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 1460.78125,
      "completions/mean_terminated_length": 1059.0,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 0.08914285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2583931088447571,
      "learning_rate": 7.950875657567621e-07,
      "loss": 0.0,
      "num_tokens": 9043271.0,
      "reward": 0.6075442433357239,
      "reward_std": 0.6895643472671509,
      "rewards/cosine_scaled_reward/mean": -0.0009153857827186584,
      "rewards/cosine_scaled_reward/std": 0.48922818899154663,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1054.875,
      "completions/mean_terminated_length": 892.3635864257812,
      "completions/min_length": 261.0,
      "completions/min_terminated_length": 261.0,
      "epoch": 0.09028571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29089078307151794,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0,
      "num_tokens": 9120879.0,
      "reward": 0.6885831356048584,
      "reward_std": 0.508629322052002,
      "rewards/cosine_scaled_reward/mean": -0.09320840239524841,
      "rewards/cosine_scaled_reward/std": 0.38835227489471436,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1399.046875,
      "completions/mean_terminated_length": 1145.1087646484375,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 0.09142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27458345890045166,
      "learning_rate": 7.817671337095244e-07,
      "loss": 0.0,
      "num_tokens": 9220810.0,
      "reward": 0.5549384355545044,
      "reward_std": 0.7092134952545166,
      "rewards/cosine_scaled_reward/mean": -0.09753081202507019,
      "rewards/cosine_scaled_reward/std": 0.4125780463218689,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1833.0,
      "completions/mean_length": 1084.984375,
      "completions/mean_terminated_length": 906.6481323242188,
      "completions/min_length": 274.0,
      "completions/min_terminated_length": 274.0,
      "epoch": 0.09257142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.37247684597969055,
      "learning_rate": 7.75e-07,
      "loss": -0.0,
      "num_tokens": 9301521.0,
      "reward": 0.5357480049133301,
      "reward_std": 0.5661624670028687,
      "rewards/cosine_scaled_reward/mean": -0.18525099754333496,
      "rewards/cosine_scaled_reward/std": 0.3385297954082489,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 1260.921875,
      "completions/mean_terminated_length": 998.5625,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.09371428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27329322695732117,
      "learning_rate": 7.681643291108517e-07,
      "loss": -0.0,
      "num_tokens": 9392548.0,
      "reward": 0.9478914737701416,
      "reward_std": 0.4313860237598419,
      "rewards/cosine_scaled_reward/mean": 0.09894578158855438,
      "rewards/cosine_scaled_reward/std": 0.5477120876312256,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1309.671875,
      "completions/mean_terminated_length": 922.9285888671875,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.09485714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3202998638153076,
      "learning_rate": 7.612622032536507e-07,
      "loss": -0.0,
      "num_tokens": 9487455.0,
      "reward": 0.5201998949050903,
      "reward_std": 0.6858996152877808,
      "rewards/cosine_scaled_reward/mean": -0.09927503764629364,
      "rewards/cosine_scaled_reward/std": 0.37909674644470215,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1685.0,
      "completions/mean_length": 1185.703125,
      "completions/mean_terminated_length": 965.9019775390625,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29646041989326477,
      "learning_rate": 7.54295724882796e-07,
      "loss": -0.0,
      "num_tokens": 9574036.0,
      "reward": 0.6779025793075562,
      "reward_std": 0.557724118232727,
      "rewards/cosine_scaled_reward/mean": -0.09073619544506073,
      "rewards/cosine_scaled_reward/std": 0.3855368196964264,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1297.828125,
      "completions/mean_terminated_length": 1158.907470703125,
      "completions/min_length": 601.0,
      "completions/min_terminated_length": 601.0,
      "epoch": 0.09714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21307455003261566,
      "learning_rate": 7.472670160550848e-07,
      "loss": 0.0,
      "num_tokens": 9667417.0,
      "reward": 0.5093189477920532,
      "reward_std": 0.6006681323051453,
      "rewards/cosine_scaled_reward/mean": -0.1672155261039734,
      "rewards/cosine_scaled_reward/std": 0.34896284341812134,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1859.0,
      "completions/mean_length": 1348.90625,
      "completions/mean_terminated_length": 1096.04248046875,
      "completions/min_length": 501.0,
      "completions/min_terminated_length": 501.0,
      "epoch": 0.09828571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2883393168449402,
      "learning_rate": 7.401782177833147e-07,
      "loss": -0.0,
      "num_tokens": 9764603.0,
      "reward": 0.8025823831558228,
      "reward_std": 0.547119677066803,
      "rewards/cosine_scaled_reward/mean": 0.01847870647907257,
      "rewards/cosine_scaled_reward/std": 0.4346420168876648,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1782.0,
      "completions/mean_length": 1086.96875,
      "completions/mean_terminated_length": 909.0,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.09942857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31888866424560547,
      "learning_rate": 7.330314893841101e-07,
      "loss": -0.0,
      "num_tokens": 9844289.0,
      "reward": 0.5533354878425598,
      "reward_std": 0.5319498777389526,
      "rewards/cosine_scaled_reward/mean": -0.1530197560787201,
      "rewards/cosine_scaled_reward/std": 0.2434682846069336,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 954.921875,
      "completions/mean_terminated_length": 919.6612548828125,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 114.0,
      "epoch": 0.10057142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3025936484336853,
      "learning_rate": 7.258290078201731e-07,
      "loss": -0.0,
      "num_tokens": 9915916.0,
      "reward": 1.2692296504974365,
      "reward_std": 0.5115163326263428,
      "rewards/cosine_scaled_reward/mean": 0.13461479544639587,
      "rewards/cosine_scaled_reward/std": 0.506001353263855,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1926.0,
      "completions/mean_length": 1351.8125,
      "completions/mean_terminated_length": 1174.35302734375,
      "completions/min_length": 650.0,
      "completions/min_terminated_length": 650.0,
      "epoch": 0.10171428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23423585295677185,
      "learning_rate": 7.185729670371604e-07,
      "loss": -0.0,
      "num_tokens": 10013432.0,
      "reward": 0.724889874458313,
      "reward_std": 0.7425336837768555,
      "rewards/cosine_scaled_reward/mean": -0.0828675627708435,
      "rewards/cosine_scaled_reward/std": 0.3893774449825287,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 1153.28125,
      "completions/mean_terminated_length": 1025.46435546875,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 0.10285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3860023021697998,
      "learning_rate": 7.11265577295385e-07,
      "loss": -0.0,
      "num_tokens": 10097242.0,
      "reward": 0.5000253915786743,
      "reward_std": 0.5103108286857605,
      "rewards/cosine_scaled_reward/mean": -0.18748730421066284,
      "rewards/cosine_scaled_reward/std": 0.2787182629108429,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1455.484375,
      "completions/mean_terminated_length": 1166.1163330078125,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.104,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2551063895225525,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.0,
      "num_tokens": 10200961.0,
      "reward": 0.4053259789943695,
      "reward_std": 0.663999617099762,
      "rewards/cosine_scaled_reward/mean": -0.18796202540397644,
      "rewards/cosine_scaled_reward/std": 0.35777655243873596,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1176.953125,
      "completions/mean_terminated_length": 1015.6481323242188,
      "completions/min_length": 451.0,
      "completions/min_terminated_length": 451.0,
      "epoch": 0.10514285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27449366450309753,
      "learning_rate": 6.965056695057204e-07,
      "loss": -0.0,
      "num_tokens": 10286278.0,
      "reward": 0.5743436217308044,
      "reward_std": 0.6229422092437744,
      "rewards/cosine_scaled_reward/mean": -0.15032817423343658,
      "rewards/cosine_scaled_reward/std": 0.2899566888809204,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 1434.875,
      "completions/mean_terminated_length": 1156.181884765625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.10628571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2839376926422119,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.0,
      "num_tokens": 10389454.0,
      "reward": 0.30658647418022156,
      "reward_std": 0.5343226194381714,
      "rewards/cosine_scaled_reward/mean": -0.22951926290988922,
      "rewards/cosine_scaled_reward/std": 0.2324177473783493,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1684.0,
      "completions/mean_length": 1242.390625,
      "completions/mean_terminated_length": 927.1522216796875,
      "completions/min_length": 508.0,
      "completions/min_terminated_length": 508.0,
      "epoch": 0.10742857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2985072433948517,
      "learning_rate": 6.815672671252315e-07,
      "loss": 0.0,
      "num_tokens": 10478735.0,
      "reward": 0.6593698263168335,
      "reward_std": 0.5845412015914917,
      "rewards/cosine_scaled_reward/mean": -0.02969011664390564,
      "rewards/cosine_scaled_reward/std": 0.47056320309638977,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 1203.265625,
      "completions/mean_terminated_length": 1082.58935546875,
      "completions/min_length": 573.0,
      "completions/min_terminated_length": 573.0,
      "epoch": 0.10857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2689598798751831,
      "learning_rate": 6.740368101176495e-07,
      "loss": 0.0,
      "num_tokens": 10566272.0,
      "reward": 0.4301251173019409,
      "reward_std": 0.4795047640800476,
      "rewards/cosine_scaled_reward/mean": -0.22243742644786835,
      "rewards/cosine_scaled_reward/std": 0.2575407326221466,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1827.0,
      "completions/mean_length": 1205.5625,
      "completions/mean_terminated_length": 990.8235473632812,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 0.10971428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30502915382385254,
      "learning_rate": 6.664685702961344e-07,
      "loss": -0.0,
      "num_tokens": 10654564.0,
      "reward": 0.896080493927002,
      "reward_std": 0.6987663507461548,
      "rewards/cosine_scaled_reward/mean": 0.02616523765027523,
      "rewards/cosine_scaled_reward/std": 0.460237056016922,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1777.0,
      "completions/mean_length": 1170.390625,
      "completions/mean_terminated_length": 988.2453002929688,
      "completions/min_length": 430.0,
      "completions/min_terminated_length": 430.0,
      "epoch": 0.11085714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3103901743888855,
      "learning_rate": 6.588648530198504e-07,
      "loss": -0.0,
      "num_tokens": 10739733.0,
      "reward": 0.6633297204971313,
      "reward_std": 0.609075665473938,
      "rewards/cosine_scaled_reward/mean": -0.12927262485027313,
      "rewards/cosine_scaled_reward/std": 0.4114542305469513,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1136.5625,
      "completions/mean_terminated_length": 947.396240234375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.112,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2510873079299927,
      "learning_rate": 6.512279744547392e-07,
      "loss": 0.0,
      "num_tokens": 10823537.0,
      "reward": 0.6613268256187439,
      "reward_std": 0.4785424768924713,
      "rewards/cosine_scaled_reward/mean": -0.09902409464120865,
      "rewards/cosine_scaled_reward/std": 0.4345317482948303,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 1171.8125,
      "completions/mean_terminated_length": 1081.17236328125,
      "completions/min_length": 225.0,
      "completions/min_terminated_length": 225.0,
      "epoch": 0.11314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.281054824590683,
      "learning_rate": 6.435602608679916e-07,
      "loss": -0.0,
      "num_tokens": 10909701.0,
      "reward": 1.0416245460510254,
      "reward_std": 0.6949809789657593,
      "rewards/cosine_scaled_reward/mean": 0.0520622618496418,
      "rewards/cosine_scaled_reward/std": 0.508481502532959,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 1120.8125,
      "completions/mean_terminated_length": 1024.8966064453125,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 0.11428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2910788655281067,
      "learning_rate": 6.358640479194451e-07,
      "loss": 0.0,
      "num_tokens": 10991145.0,
      "reward": 1.2036188840866089,
      "reward_std": 0.8533884286880493,
      "rewards/cosine_scaled_reward/mean": 0.14087192714214325,
      "rewards/cosine_scaled_reward/std": 0.5375887751579285,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 10991145,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}