goldengoose-gumbel_tau1.00-…/checkpoint-100/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.25,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1464.0,
      "completions/mean_length": 477.234375,
      "completions/mean_terminated_length": 426.56451416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.820798873901367,
      "learning_rate": 0.0,
      "loss": 0.0,
      "num_tokens": 128463.0,
      "reward": 0.30078125,
      "reward_std": 0.2949070334434509,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.3294980227947235,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1324.0,
      "completions/max_terminated_length": 1324.0,
      "completions/mean_length": 549.921875,
      "completions/mean_terminated_length": 549.921875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.992785453796387,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 224890.0,
      "reward": 0.2890625,
      "reward_std": 0.39714252948760986,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.301698237657547,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1812.0,
      "completions/mean_length": 556.71875,
      "completions/mean_terminated_length": 533.0476684570312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0075,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.250213146209717,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 361720.0,
      "reward": 0.1874999850988388,
      "reward_std": 0.3287465274333954,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.2597312331199646,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2010.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 388.546875,
      "completions/mean_terminated_length": 388.546875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.673406600952148,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 469803.0,
      "reward": 0.41718748211860657,
      "reward_std": 0.42704811692237854,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1047.0,
      "completions/max_terminated_length": 1047.0,
      "completions/mean_length": 299.203125,
      "completions/mean_terminated_length": 299.203125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.0125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.532109260559082,
      "learning_rate": 8e-07,
      "loss": 0.0,
      "num_tokens": 581328.0,
      "reward": 0.32656246423721313,
      "reward_std": 0.25986582040786743,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.3177144229412079,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1405.0,
      "completions/max_terminated_length": 1405.0,
      "completions/mean_length": 549.515625,
      "completions/mean_terminated_length": 549.515625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.551382064819336,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 695497.0,
      "reward": 0.25078123807907104,
      "reward_std": 0.3316580057144165,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.28770697116851807,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1751.0,
      "completions/mean_length": 496.078125,
      "completions/mean_terminated_length": 446.01611328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.0175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.735747337341309,
      "learning_rate": 9.99726628670463e-07,
      "loss": 0.0,
      "num_tokens": 819054.0,
      "reward": 0.41015625,
      "reward_std": 0.42701074481010437,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.24688033759593964,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1369.0,
      "completions/max_terminated_length": 1369.0,
      "completions/mean_length": 461.125,
      "completions/mean_terminated_length": 461.125,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.02,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.3624932765960693,
      "learning_rate": 9.989068136093872e-07,
      "loss": 0.0,
      "num_tokens": 918694.0,
      "reward": 0.42656248807907104,
      "reward_std": 0.3252020478248596,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2741328477859497,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1707.0,
      "completions/mean_length": 575.140625,
      "completions/mean_terminated_length": 502.70489501953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.942022800445557,
      "learning_rate": 9.975414512725056e-07,
      "loss": -0.0,
      "num_tokens": 1059111.0,
      "reward": 0.32343748211860657,
      "reward_std": 0.38642236590385437,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.18298126757144928,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1615.0,
      "completions/mean_length": 552.0625,
      "completions/mean_terminated_length": 528.3175048828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.24199390411377,
      "learning_rate": 9.956320346634875e-07,
      "loss": -0.0,
      "num_tokens": 1199859.0,
      "reward": 0.29140621423721313,
      "reward_std": 0.28808674216270447,
      "rewards/format_reward/mean": 0.4140625,
      "rewards/format_reward/std": 0.209963858127594,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 323.359375,
      "completions/mean_terminated_length": 295.984130859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0275,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 11.983031272888184,
      "learning_rate": 9.931806517013612e-07,
      "loss": -0.0,
      "num_tokens": 1308242.0,
      "reward": 0.24374999105930328,
      "reward_std": 0.26579102873802185,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.19669894874095917,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1475.0,
      "completions/max_terminated_length": 1475.0,
      "completions/mean_length": 494.640625,
      "completions/mean_terminated_length": 494.640625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 13.534098625183105,
      "learning_rate": 9.901899829374047e-07,
      "loss": -0.0,
      "num_tokens": 1427139.0,
      "reward": 0.453125,
      "reward_std": 0.37656593322753906,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.21593283116817474,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1420.0,
      "completions/max_terminated_length": 1420.0,
      "completions/mean_length": 591.890625,
      "completions/mean_terminated_length": 591.890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0325,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 3.6356875896453857,
      "learning_rate": 9.866632986240029e-07,
      "loss": 0.0,
      "num_tokens": 1555604.0,
      "reward": 0.24765624105930328,
      "reward_std": 0.3406350612640381,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.2538151443004608,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1100.0,
      "completions/max_terminated_length": 1100.0,
      "completions/mean_length": 327.9375,
      "completions/mean_terminated_length": 327.9375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 19.802650451660156,
      "learning_rate": 9.826044551386742e-07,
      "loss": 0.0,
      "num_tokens": 1663176.0,
      "reward": 0.30781248211860657,
      "reward_std": 0.3470980226993561,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.20351573824882507,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1263.0,
      "completions/mean_length": 416.265625,
      "completions/mean_terminated_length": 390.3651123046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.966890335083008,
      "learning_rate": 9.780178907671788e-07,
      "loss": -0.0,
      "num_tokens": 1757385.0,
      "reward": 0.3671875,
      "reward_std": 0.37327155470848083,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.3299681544303894,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1576.0,
      "completions/mean_length": 447.671875,
      "completions/mean_terminated_length": 422.2698669433594,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 10.119935989379883,
      "learning_rate": 9.729086208503173e-07,
      "loss": -0.0,
      "num_tokens": 1892308.0,
      "reward": 0.5914062261581421,
      "reward_std": 0.31337296962738037,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1743.0,
      "completions/max_terminated_length": 1743.0,
      "completions/mean_length": 421.59375,
      "completions/mean_terminated_length": 421.59375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0425,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 10.146256446838379,
      "learning_rate": 9.672822322997304e-07,
      "loss": 0.0,
      "num_tokens": 2011562.0,
      "reward": 0.29140621423721313,
      "reward_std": 0.3035487234592438,
      "rewards/format_reward/mean": 0.4140625,
      "rewards/format_reward/std": 0.209963858127594,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1499.0,
      "completions/max_terminated_length": 1499.0,
      "completions/mean_length": 368.15625,
      "completions/mean_terminated_length": 368.15625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 28.832923889160156,
      "learning_rate": 9.611448774886923e-07,
      "loss": 0.0,
      "num_tokens": 2107684.0,
      "reward": 0.31328123807907104,
      "reward_std": 0.32075032591819763,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.22589658200740814,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1508.0,
      "completions/max_terminated_length": 1508.0,
      "completions/mean_length": 311.4375,
      "completions/mean_terminated_length": 311.4375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0475,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.548686981201172,
      "learning_rate": 9.545032675245813e-07,
      "loss": -0.0,
      "num_tokens": 2220144.0,
      "reward": 0.5015624761581421,
      "reward_std": 0.41764065623283386,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 172.484375,
      "completions/mean_terminated_length": 142.71429443359375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 16.76396942138672,
      "learning_rate": 9.473646649103817e-07,
      "loss": 0.0,
      "num_tokens": 2318791.0,
      "reward": 0.5539062023162842,
      "reward_std": 0.17947588860988617,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.16194961965084076,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1462.0,
      "completions/max_terminated_length": 1462.0,
      "completions/mean_length": 254.03125,
      "completions/mean_terminated_length": 254.03125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0525,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 12.80144214630127,
      "learning_rate": 9.397368756032444e-07,
      "loss": -0.0,
      "num_tokens": 2429865.0,
      "reward": 0.25312498211860657,
      "reward_std": 0.33632034063339233,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.1259881556034088,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 954.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 104.65625,
      "completions/mean_terminated_length": 104.65625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.055,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.90204906463623,
      "learning_rate": 9.316282404787869e-07,
      "loss": -0.0,
      "num_tokens": 2523307.0,
      "reward": 0.4093749523162842,
      "reward_std": 0.17358146607875824,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1322.0,
      "completions/max_terminated_length": 1322.0,
      "completions/mean_length": 377.9375,
      "completions/mean_terminated_length": 377.9375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0575,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 11.154988288879395,
      "learning_rate": 9.230476262104676e-07,
      "loss": -0.0,
      "num_tokens": 2626679.0,
      "reward": 0.22343748807907104,
      "reward_std": 0.28182199597358704,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1534975916147232,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 11.924612045288086,
      "learning_rate": 9.1400441557401e-07,
      "loss": 0.0,
      "num_tokens": 2733791.0,
      "reward": 0.4718749523162842,
      "reward_std": 0.24831001460552216,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 979.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 17.104806900024414,
      "learning_rate": 9.045084971874737e-07,
      "loss": -0.0,
      "num_tokens": 2827215.0,
      "reward": 0.51953125,
      "reward_std": 0.22175219655036926,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.1406387835741043,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1075.0,
      "completions/max_terminated_length": 1075.0,
      "completions/mean_length": 161.796875,
      "completions/mean_terminated_length": 161.796875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.065,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 24.93585968017578,
      "learning_rate": 8.945702546981968e-07,
      "loss": 0.0,
      "num_tokens": 2902970.0,
      "reward": 0.40859371423721313,
      "reward_std": 0.22210699319839478,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 928.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 235.765625,
      "completions/mean_terminated_length": 235.765625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0675,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 16.02617645263672,
      "learning_rate": 8.842005554284295e-07,
      "loss": -0.0,
      "num_tokens": 3005379.0,
      "reward": 0.29999998211860657,
      "reward_std": 0.2879316806793213,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.08908708393573761,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 935.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 141.0,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 29.071575164794922,
      "learning_rate": 8.734107384920769e-07,
      "loss": -0.0,
      "num_tokens": 3102595.0,
      "reward": 0.4562499523162842,
      "reward_std": 0.3846532702445984,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.08908708393573761,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 961.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 109.046875,
      "completions/mean_terminated_length": 109.046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0725,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.398436546325684,
      "learning_rate": 8.622126023955445e-07,
      "loss": -0.0,
      "num_tokens": 3190334.0,
      "reward": 0.6898437142372131,
      "reward_std": 0.19096830487251282,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2034.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 313.390625,
      "completions/mean_terminated_length": 313.390625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.075,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 23.27433967590332,
      "learning_rate": 8.506183921362442e-07,
      "loss": 0.0,
      "num_tokens": 3301191.0,
      "reward": 0.6148437261581421,
      "reward_std": 0.3767889142036438,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1409.0,
      "completions/max_terminated_length": 1409.0,
      "completions/mean_length": 302.03125,
      "completions/mean_terminated_length": 302.03125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0775,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 9.451353073120117,
      "learning_rate": 8.386407858128706e-07,
      "loss": -0.0,
      "num_tokens": 3431513.0,
      "reward": 0.3273437023162842,
      "reward_std": 0.24055621027946472,
      "rewards/format_reward/mean": 0.4609375,
      "rewards/format_reward/std": 0.16194961965084076,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1716.0,
      "completions/max_terminated_length": 1716.0,
      "completions/mean_length": 140.75,
      "completions/mean_terminated_length": 140.75,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 19.545513153076172,
      "learning_rate": 8.262928807620843e-07,
      "loss": -0.0,
      "num_tokens": 3516601.0,
      "reward": 0.42890626192092896,
      "reward_std": 0.0956839770078659,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.2236899733543396,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 26.1875,
      "completions/mean_terminated_length": 26.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0825,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.879921913146973,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.0,
      "num_tokens": 3610013.0,
      "reward": 0.7226561903953552,
      "reward_std": 0.19158241152763367,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1395.0,
      "completions/max_terminated_length": 1395.0,
      "completions/mean_length": 130.375,
      "completions/mean_terminated_length": 130.375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.085,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 22.375093460083008,
      "learning_rate": 8.005405736415125e-07,
      "loss": 0.0,
      "num_tokens": 3706909.0,
      "reward": 0.4156249761581421,
      "reward_std": 0.34389790892601013,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.18898223340511322,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1169.0,
      "completions/mean_length": 228.734375,
      "completions/mean_terminated_length": 170.0483856201172,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0875,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 17.84543228149414,
      "learning_rate": 7.871643313414718e-07,
      "loss": -0.0,
      "num_tokens": 3815884.0,
      "reward": 0.7546874284744263,
      "reward_std": 0.28800931572914124,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1534975916147232,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 27.609375,
      "completions/mean_terminated_length": 27.609375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 31.98973274230957,
      "learning_rate": 7.734740790612136e-07,
      "loss": -0.0,
      "num_tokens": 3941163.0,
      "reward": 0.4898437261581421,
      "reward_std": 0.2993735373020172,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1835.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 61.625,
      "completions/mean_terminated_length": 61.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0925,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.227835655212402,
      "learning_rate": 7.594847868906076e-07,
      "loss": 0.0,
      "num_tokens": 4027779.0,
      "reward": 0.528124988079071,
      "reward_std": 0.1930253505706787,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1583.0,
      "completions/max_terminated_length": 1583.0,
      "completions/mean_length": 239.546875,
      "completions/mean_terminated_length": 239.546875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.095,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 20.220304489135742,
      "learning_rate": 7.452117519152541e-07,
      "loss": 0.0,
      "num_tokens": 4120606.0,
      "reward": 0.39531248807907104,
      "reward_std": 0.3611350357532501,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.2847827076911926,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 72.34375,
      "completions/mean_terminated_length": 72.34375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0975,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 21.585006713867188,
      "learning_rate": 7.306705814893439e-07,
      "loss": -0.0,
      "num_tokens": 4210956.0,
      "reward": 0.37812501192092896,
      "reward_std": 0.23370197415351868,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.233588308095932,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1230.0,
      "completions/max_terminated_length": 1230.0,
      "completions/mean_length": 107.265625,
      "completions/mean_terminated_length": 107.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 19.986061096191406,
      "learning_rate": 7.158771761692464e-07,
      "loss": 0.0,
      "num_tokens": 4302349.0,
      "reward": 0.28593748807907104,
      "reward_std": 0.20377102494239807,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 784.0,
      "completions/max_terminated_length": 784.0,
      "completions/mean_length": 170.578125,
      "completions/mean_terminated_length": 170.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1025,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 23.84389305114746,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.0,
      "num_tokens": 4402810.0,
      "reward": 0.6890624761581421,
      "reward_std": 0.3438800573348999,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.24346621334552765,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1099.0,
      "completions/mean_length": 197.78125,
      "completions/mean_terminated_length": 106.78688049316406,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.105,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 30.521644592285156,
      "learning_rate": 6.855986244591103e-07,
      "loss": -0.0,
      "num_tokens": 4512004.0,
      "reward": 0.5382812023162842,
      "reward_std": 0.25127214193344116,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.31644338369369507,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1161.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 104.31746673583984,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1075,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 28.4834041595459,
      "learning_rate": 6.701465872208216e-07,
      "loss": 0.0,
      "num_tokens": 4623384.0,
      "reward": 0.2992187440395355,
      "reward_std": 0.23480820655822754,
      "rewards/format_reward/mean": 0.8046875,
      "rewards/format_reward/std": 0.29028159379959106,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 715.0,
      "completions/max_terminated_length": 715.0,
      "completions/mean_length": 76.203125,
      "completions/mean_terminated_length": 76.203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.11,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.868675231933594,
      "learning_rate": 6.545084971874736e-07,
      "loss": 0.0,
      "num_tokens": 4706005.0,
      "reward": 0.32734376192092896,
      "reward_std": 0.24423527717590332,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.1751912236213684,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 668.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 61.44261932373047,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 15.19470500946045,
      "learning_rate": 6.387014543809223e-07,
      "loss": 0.0,
      "num_tokens": 4798985.0,
      "reward": 0.5421874523162842,
      "reward_std": 0.3319449722766876,
      "rewards/format_reward/mean": 0.734375,
      "rewards/format_reward/std": 0.30820462107658386,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 72.453125,
      "completions/mean_terminated_length": 41.09524154663086,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.115,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.99255084991455,
      "learning_rate": 6.227427435703995e-07,
      "loss": -0.0,
      "num_tokens": 4918510.0,
      "reward": 0.31406253576278687,
      "reward_std": 0.0530330091714859,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.3642643094062805,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 608.0,
      "completions/mean_length": 79.3125,
      "completions/mean_terminated_length": 48.06349563598633,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1175,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 18.731386184692383,
      "learning_rate": 6.066498153718734e-07,
      "loss": 0.0,
      "num_tokens": 5018826.0,
      "reward": 0.6578124761581421,
      "reward_std": 0.29044055938720703,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.3177144229412079,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 855.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 66.71875,
      "completions/mean_terminated_length": 66.71875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 18.806800842285156,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.0,
      "num_tokens": 5152376.0,
      "reward": 0.3304687738418579,
      "reward_std": 0.30621567368507385,
      "rewards/format_reward/mean": 0.8046875,
      "rewards/format_reward/std": 0.37392371892929077,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 774.0,
      "completions/mean_length": 102.953125,
      "completions/mean_terminated_length": 72.0793685913086,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1225,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 34.07569885253906,
      "learning_rate": 5.741318238559209e-07,
      "loss": 0.0,
      "num_tokens": 5259269.0,
      "reward": 0.45234376192092896,
      "reward_std": 0.2820115089416504,
      "rewards/format_reward/mean": 0.7734375,
      "rewards/format_reward/std": 0.2807259261608124,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 127.9375,
      "completions/mean_terminated_length": 66.0,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 27.46225929260254,
      "learning_rate": 5.577423184847931e-07,
      "loss": 0.0,
      "num_tokens": 5354257.0,
      "reward": 0.592968761920929,
      "reward_std": 0.22307650744915009,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.23345555365085602,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1015.0,
      "completions/mean_length": 120.328125,
      "completions/mean_terminated_length": 89.73016357421875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1275,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 18.311309814453125,
      "learning_rate": 5.412896727361662e-07,
      "loss": 0.0,
      "num_tokens": 5452862.0,
      "reward": 0.510937511920929,
      "reward_std": 0.3758324980735779,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2592533528804779,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1425.0,
      "completions/mean_length": 101.140625,
      "completions/mean_terminated_length": 70.23809814453125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.13,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 15.757895469665527,
      "learning_rate": 5.247918773366111e-07,
      "loss": -0.0,
      "num_tokens": 5554471.0,
      "reward": 0.43046876788139343,
      "reward_std": 0.20688433945178986,
      "rewards/format_reward/mean": 0.8671875,
      "rewards/format_reward/std": 0.28510910272598267,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1296.0,
      "completions/max_terminated_length": 1296.0,
      "completions/mean_length": 67.234375,
      "completions/mean_terminated_length": 67.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1325,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.062511444091797,
      "learning_rate": 5.082669723831793e-07,
      "loss": -0.0,
      "num_tokens": 5651566.0,
      "reward": 0.42109376192092896,
      "reward_std": 0.17108294367790222,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.1751912236213684,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 447.0,
      "completions/mean_length": 153.5,
      "completions/mean_terminated_length": 27.200000762939453,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.135,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 16.75969123840332,
      "learning_rate": 4.917330276168208e-07,
      "loss": 0.0,
      "num_tokens": 5753262.0,
      "reward": 0.6187499761581421,
      "reward_std": 0.307129442691803,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.28171807527542114,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 462.0,
      "completions/max_terminated_length": 462.0,
      "completions/mean_length": 48.203125,
      "completions/mean_terminated_length": 48.203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.472973823547363,
      "learning_rate": 4.752081226633888e-07,
      "loss": -0.0,
      "num_tokens": 5829763.0,
      "reward": 0.45390626788139343,
      "reward_std": 0.1621313989162445,
      "rewards/format_reward/mean": 0.9453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 488.0,
      "completions/mean_length": 70.390625,
      "completions/mean_terminated_length": 39.000003814697266,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.14,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 9.199593544006348,
      "learning_rate": 4.5871032726383385e-07,
      "loss": 0.0,
      "num_tokens": 5927572.0,
      "reward": 0.18906250596046448,
      "reward_std": 0.11857090145349503,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "rewards/mcq_exact_match_reward/mean": 0.09375,
      "rewards/mcq_exact_match_reward/std": 0.29378482699394226,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 872.0,
      "completions/mean_length": 107.125,
      "completions/mean_terminated_length": 44.51612854003906,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1425,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.753751754760742,
      "learning_rate": 4.4225768151520694e-07,
      "loss": 0.0,
      "num_tokens": 6032756.0,
      "reward": 0.45000001788139343,
      "reward_std": 0.28659987449645996,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.233588308095932,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 553.0,
      "completions/max_terminated_length": 553.0,
      "completions/mean_length": 36.59375,
      "completions/mean_terminated_length": 36.59375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.145,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 18.922006607055664,
      "learning_rate": 4.258681761440789e-07,
      "loss": 0.0,
      "num_tokens": 6120570.0,
      "reward": 0.37968751788139343,
      "reward_std": 0.24809977412223816,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 667.0,
      "completions/mean_length": 143.96875,
      "completions/mean_terminated_length": 50.32786560058594,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1475,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 6.868103504180908,
      "learning_rate": 4.095597328339452e-07,
      "loss": -0.0,
      "num_tokens": 6236432.0,
      "reward": 0.5914062261581421,
      "reward_std": 0.20316563546657562,
      "rewards/format_reward/mean": 0.9140625,
      "rewards/format_reward/std": 0.27537402510643005,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 407.0,
      "completions/max_terminated_length": 407.0,
      "completions/mean_length": 21.84375,
      "completions/mean_terminated_length": 21.84375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.15,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.219839096069336,
      "learning_rate": 3.9335018462812664e-07,
      "loss": -0.0,
      "num_tokens": 6327390.0,
      "reward": 0.7085937857627869,
      "reward_std": 0.2691788673400879,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 164.0,
      "completions/mean_length": 80.140625,
      "completions/mean_terminated_length": 16.66128921508789,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1525,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 10.40705394744873,
      "learning_rate": 3.772572564296004e-07,
      "loss": 0.0,
      "num_tokens": 6440111.0,
      "reward": 0.5804687738418579,
      "reward_std": 0.24087271094322205,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 366.0,
      "completions/mean_length": 58.875,
      "completions/mean_terminated_length": 27.30158805847168,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.155,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.986087799072266,
      "learning_rate": 3.612985456190778e-07,
      "loss": -0.0,
      "num_tokens": 6518671.0,
      "reward": 0.7054687738418579,
      "reward_std": 0.26613086462020874,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 460.0,
      "completions/mean_length": 122.21875,
      "completions/mean_terminated_length": 27.508195877075195,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1575,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.218698501586914,
      "learning_rate": 3.454915028125263e-07,
      "loss": -0.0,
      "num_tokens": 6611805.0,
      "reward": 0.561718761920929,
      "reward_std": 0.14248578250408173,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.2498759627342224,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 876.0,
      "completions/max_terminated_length": 876.0,
      "completions/mean_length": 53.1875,
      "completions/mean_terminated_length": 53.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.511017799377441,
      "learning_rate": 3.2985341277917846e-07,
      "loss": 0.0,
      "num_tokens": 6698345.0,
      "reward": 0.32890626788139343,
      "reward_std": 0.04640388861298561,
      "rewards/format_reward/mean": 0.9453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 13.28125,
      "completions/mean_terminated_length": 13.28125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1625,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 4.530981063842773,
      "learning_rate": 3.1440137554088953e-07,
      "loss": 0.0,
      "num_tokens": 6789451.0,
      "reward": 0.3968750238418579,
      "reward_std": 0.0646936446428299,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1223.0,
      "completions/mean_length": 134.359375,
      "completions/mean_terminated_length": 72.6290283203125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.165,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 16.526830673217773,
      "learning_rate": 2.9915228767351535e-07,
      "loss": 0.0,
      "num_tokens": 6907946.0,
      "reward": 0.48593753576278687,
      "reward_std": 0.18089531362056732,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.19352105259895325,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 481.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 35.515625,
      "completions/mean_terminated_length": 35.515625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1675,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 9.258106231689453,
      "learning_rate": 2.841228238307536e-07,
      "loss": -0.0,
      "num_tokens": 6978851.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.19044628739356995,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 618.0,
      "completions/mean_length": 63.296875,
      "completions/mean_terminated_length": 31.79365348815918,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.17,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 18.417194366455078,
      "learning_rate": 2.6932941851065615e-07,
      "loss": 0.0,
      "num_tokens": 7080566.0,
      "reward": 0.6421874761581421,
      "reward_std": 0.29052332043647766,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 550.0,
      "completions/max_terminated_length": 550.0,
      "completions/mean_length": 33.65625,
      "completions/mean_terminated_length": 33.65625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1725,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 28.758838653564453,
      "learning_rate": 2.547882480847461e-07,
      "loss": 0.0,
      "num_tokens": 7180000.0,
      "reward": 0.534375011920929,
      "reward_std": 0.33078908920288086,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.12198751419782639,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 480.0,
      "completions/max_terminated_length": 480.0,
      "completions/mean_length": 20.453125,
      "completions/mean_terminated_length": 20.453125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.175,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.625492095947266,
      "learning_rate": 2.4051521310939254e-07,
      "loss": 0.0,
      "num_tokens": 7277581.0,
      "reward": 0.5843750238418579,
      "reward_std": 0.15992169082164764,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 192.0,
      "completions/max_terminated_length": 192.0,
      "completions/mean_length": 18.796875,
      "completions/mean_terminated_length": 18.796875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1775,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 8.98841667175293,
      "learning_rate": 2.2652592093878665e-07,
      "loss": 0.0,
      "num_tokens": 7354840.0,
      "reward": 0.41093751788139343,
      "reward_std": 0.11330723762512207,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 336.0,
      "completions/max_terminated_length": 336.0,
      "completions/mean_length": 20.78125,
      "completions/mean_terminated_length": 20.78125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.18,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.35882568359375,
      "learning_rate": 2.128356686585282e-07,
      "loss": 0.0,
      "num_tokens": 7438378.0,
      "reward": 0.45859378576278687,
      "reward_std": 0.17108294367790222,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 15.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 13.03125,
      "completions/mean_terminated_length": 13.03125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1825,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.875134468078613,
      "learning_rate": 1.9945942635848745e-07,
      "loss": -0.0,
      "num_tokens": 7510852.0,
      "reward": 0.7390625476837158,
      "reward_std": 0.17329266667366028,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 370.0,
      "completions/max_terminated_length": 370.0,
      "completions/mean_length": 19.203125,
      "completions/mean_terminated_length": 19.203125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.185,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 13.027864456176758,
      "learning_rate": 1.8641182076323148e-07,
      "loss": 0.0,
      "num_tokens": 7608273.0,
      "reward": 0.7390625476837158,
      "reward_std": 0.18484057486057281,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1156.0,
      "completions/max_terminated_length": 1156.0,
      "completions/mean_length": 50.65625,
      "completions/mean_terminated_length": 50.65625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1875,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 8.088542938232422,
      "learning_rate": 1.7370711923791564e-07,
      "loss": 0.0,
      "num_tokens": 7691619.0,
      "reward": 0.5843750238418579,
      "reward_std": 0.16887325048446655,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1471.0,
      "completions/max_terminated_length": 1471.0,
      "completions/mean_length": 43.625,
      "completions/mean_terminated_length": 43.625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.19,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.211929321289062,
      "learning_rate": 1.6135921418712955e-07,
      "loss": -0.0,
      "num_tokens": 7776755.0,
      "reward": 0.8812500238418579,
      "reward_std": 0.1552036553621292,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.78125,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 249.0,
      "completions/max_terminated_length": 249.0,
      "completions/mean_length": 16.90625,
      "completions/mean_terminated_length": 16.90625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.1925,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 12.14816665649414,
      "learning_rate": 1.493816078637557e-07,
      "loss": -0.0,
      "num_tokens": 7856829.0,
      "reward": 0.4437500238418579,
      "reward_std": 0.1552036553621292,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 17.484375,
      "completions/mean_terminated_length": 17.484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.195,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 8.064270973205566,
      "learning_rate": 1.3778739760445552e-07,
      "loss": -0.0,
      "num_tokens": 7957444.0,
      "reward": 0.6195312738418579,
      "reward_std": 0.06007346510887146,
      "rewards/format_reward/mean": 0.8828125,
      "rewards/format_reward/std": 0.21347814798355103,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 862.0,
      "completions/max_terminated_length": 862.0,
      "completions/mean_length": 28.59375,
      "completions/mean_terminated_length": 28.59375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1975,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.456476211547852,
      "learning_rate": 1.2658926150792322e-07,
      "loss": 0.0,
      "num_tokens": 8047770.0,
      "reward": 0.753125011920929,
      "reward_std": 0.23930205404758453,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.1510545015335083,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 413.0,
      "completions/max_terminated_length": 413.0,
      "completions/mean_length": 23.6875,
      "completions/mean_terminated_length": 23.6875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.343238353729248,
      "learning_rate": 1.1579944457157059e-07,
      "loss": -0.0,
      "num_tokens": 8158614.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.0883883461356163,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 19.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 12.328125,
      "completions/mean_terminated_length": 12.328125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2025,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 19.15117073059082,
      "learning_rate": 1.0542974530180327e-07,
      "loss": -0.0,
      "num_tokens": 8261035.0,
      "reward": 0.5007812976837158,
      "reward_std": 0.1689612865447998,
      "rewards/format_reward/mean": 0.9453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 118.0,
      "completions/max_terminated_length": 118.0,
      "completions/mean_length": 16.375,
      "completions/mean_terminated_length": 16.375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.205,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 12.861918449401855,
      "learning_rate": 9.549150281252632e-08,
      "loss": 0.0,
      "num_tokens": 8359787.0,
      "reward": 0.20937500894069672,
      "reward_std": 0.16887325048446655,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.109375,
      "rewards/mcq_exact_match_reward/std": 0.3145764470100403,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 119.0,
      "completions/max_terminated_length": 119.0,
      "completions/mean_length": 15.03125,
      "completions/mean_terminated_length": 15.03125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2075,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.756561279296875,
      "learning_rate": 8.599558442598998e-08,
      "loss": 0.0,
      "num_tokens": 8441949.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.1315089464187622,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 310.0,
      "completions/max_terminated_length": 310.0,
      "completions/mean_length": 19.09375,
      "completions/mean_terminated_length": 19.09375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.21,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 10.06808090209961,
      "learning_rate": 7.695237378953224e-08,
      "loss": 0.0,
      "num_tokens": 8543755.0,
      "reward": 0.8500000238418579,
      "reward_std": 0.19727616012096405,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 18.0,
      "completions/mean_length": 75.015625,
      "completions/mean_terminated_length": 11.370967864990234,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.676437377929688,
      "learning_rate": 6.837175952121304e-08,
      "loss": 0.0,
      "num_tokens": 8640956.0,
      "reward": 0.4742187559604645,
      "reward_std": 0.34538891911506653,
      "rewards/format_reward/mean": 0.8359375,
      "rewards/format_reward/std": 0.2824873626232147,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 22.0,
      "completions/max_terminated_length": 22.0,
      "completions/mean_length": 13.515625,
      "completions/mean_terminated_length": 13.515625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.215,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.535361289978027,
      "learning_rate": 6.026312439675551e-08,
      "loss": 0.0,
      "num_tokens": 8729309.0,
      "reward": 0.55078125,
      "reward_std": 0.1658112108707428,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 101.0,
      "completions/mean_length": 46.234375,
      "completions/mean_terminated_length": 14.460318565368652,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2175,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 18.44989776611328,
      "learning_rate": 5.263533508961826e-08,
      "loss": 0.0,
      "num_tokens": 8833324.0,
      "reward": 0.81640625,
      "reward_std": 0.30399811267852783,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 347.0,
      "completions/max_terminated_length": 347.0,
      "completions/mean_length": 22.5625,
      "completions/mean_terminated_length": 22.5625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.22,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.351069450378418,
      "learning_rate": 4.549673247541874e-08,
      "loss": 0.0,
      "num_tokens": 8932336.0,
      "reward": 0.6781250238418579,
      "reward_std": 0.2519446909427643,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 388.0,
      "completions/max_terminated_length": 388.0,
      "completions/mean_length": 27.359375,
      "completions/mean_terminated_length": 27.359375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.2225,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.434761047363281,
      "learning_rate": 3.8855122511307626e-08,
      "loss": 0.0,
      "num_tokens": 9044767.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.31300368905067444,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1093.0,
      "completions/max_terminated_length": 1093.0,
      "completions/mean_length": 30.03125,
      "completions/mean_terminated_length": 30.03125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.225,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.73192310333252,
      "learning_rate": 3.271776770026963e-08,
      "loss": 0.0,
      "num_tokens": 9157089.0,
      "reward": 0.5367187857627869,
      "reward_std": 0.1834089457988739,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 499.0,
      "completions/max_terminated_length": 499.0,
      "completions/mean_length": 26.140625,
      "completions/mean_terminated_length": 26.140625,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2275,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.007508277893066,
      "learning_rate": 2.7091379149682682e-08,
      "loss": 0.0,
      "num_tokens": 9224186.0,
      "reward": 0.7718750238418579,
      "reward_std": 0.15992169082164764,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 305.0,
      "completions/max_terminated_length": 305.0,
      "completions/mean_length": 19.234375,
      "completions/mean_terminated_length": 19.234375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.23,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 7.761956691741943,
      "learning_rate": 2.1982109232821176e-08,
      "loss": -0.0,
      "num_tokens": 9321929.0,
      "reward": 0.5218750238418579,
      "reward_std": 0.1530819982290268,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 45.453125,
      "completions/mean_terminated_length": 13.666666984558105,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2325,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 12.21143627166748,
      "learning_rate": 1.7395544861325718e-08,
      "loss": -0.0,
      "num_tokens": 9410454.0,
      "reward": 0.64453125,
      "reward_std": 0.07132276892662048,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 321.0,
      "completions/mean_length": 87.828125,
      "completions/mean_terminated_length": 24.596773147583008,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.235,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.356138229370117,
      "learning_rate": 1.3336701375997127e-08,
      "loss": -0.0,
      "num_tokens": 9519363.0,
      "reward": 0.7523437738418579,
      "reward_std": 0.23261141777038574,
      "rewards/format_reward/mean": 0.9609375,
      "rewards/format_reward/std": 0.18483558297157288,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 400.0,
      "completions/max_terminated_length": 400.0,
      "completions/mean_length": 19.109375,
      "completions/mean_terminated_length": 19.109375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2375,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.05694055557251,
      "learning_rate": 9.810017062595321e-09,
      "loss": 0.0,
      "num_tokens": 9619178.0,
      "reward": 0.5835937261581421,
      "reward_std": 0.12194531410932541,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 326.0,
      "completions/max_terminated_length": 326.0,
      "completions/mean_length": 23.65625,
      "completions/mean_terminated_length": 23.65625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.24,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 13.755544662475586,
      "learning_rate": 6.819348298638839e-09,
      "loss": 0.0,
      "num_tokens": 9730556.0,
      "reward": 0.3476562798023224,
      "reward_std": 0.2659573554992676,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 278.0,
      "completions/mean_length": 52.09375,
      "completions/mean_terminated_length": 20.41269874572754,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.2425,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 16.00373077392578,
      "learning_rate": 4.367965336512403e-09,
      "loss": 0.0,
      "num_tokens": 9832362.0,
      "reward": 0.6765625476837158,
      "reward_std": 0.303753525018692,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 43.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 13.59375,
      "completions/mean_terminated_length": 13.59375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.245,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 11.894449234008789,
      "learning_rate": 2.458548727494292e-09,
      "loss": -0.0,
      "num_tokens": 9954448.0,
      "reward": 0.37890625,
      "reward_std": 0.09131823480129242,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 584.0,
      "completions/max_terminated_length": 584.0,
      "completions/mean_length": 38.921875,
      "completions/mean_terminated_length": 38.921875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.2475,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 21.100183486938477,
      "learning_rate": 1.0931863906127325e-09,
      "loss": 0.0,
      "num_tokens": 10048931.0,
      "reward": 0.4414062798023224,
      "reward_std": 0.2259259968996048,
      "rewards/format_reward/mean": 0.9765625,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 18.0,
      "completions/max_terminated_length": 18.0,
      "completions/mean_length": 13.078125,
      "completions/mean_terminated_length": 13.078125,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.25,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.943309783935547,
      "learning_rate": 2.733713295369755e-10,
      "loss": 0.0,
      "num_tokens": 10141176.0,
      "reward": 0.7718750238418579,
      "reward_std": 0.0646936446428299,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 10141176,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}