goldengoose-gumbel_tau1.00-…/checkpoint-50/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.125,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1464.0,
      "completions/mean_length": 477.234375,
      "completions/mean_terminated_length": 426.56451416015625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.820798873901367,
      "learning_rate": 0.0,
      "loss": 0.0,
      "num_tokens": 128463.0,
      "reward": 0.30078125,
      "reward_std": 0.2949070334434509,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.3294980227947235,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1324.0,
      "completions/max_terminated_length": 1324.0,
      "completions/mean_length": 549.921875,
      "completions/mean_terminated_length": 549.921875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.992785453796387,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 224890.0,
      "reward": 0.2890625,
      "reward_std": 0.39714252948760986,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.301698237657547,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1812.0,
      "completions/mean_length": 556.71875,
      "completions/mean_terminated_length": 533.0476684570312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0075,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.250213146209717,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 361720.0,
      "reward": 0.1874999850988388,
      "reward_std": 0.3287465274333954,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.2597312331199646,
      "rewards/mcq_exact_match_reward/mean": 0.15625,
      "rewards/mcq_exact_match_reward/std": 0.36596253514289856,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2010.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 388.546875,
      "completions/mean_terminated_length": 388.546875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.673406600952148,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 469803.0,
      "reward": 0.41718748211860657,
      "reward_std": 0.42704811692237854,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1047.0,
      "completions/max_terminated_length": 1047.0,
      "completions/mean_length": 299.203125,
      "completions/mean_terminated_length": 299.203125,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.0125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.532109260559082,
      "learning_rate": 8e-07,
      "loss": 0.0,
      "num_tokens": 581328.0,
      "reward": 0.32656246423721313,
      "reward_std": 0.25986582040786743,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.3177144229412079,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1405.0,
      "completions/max_terminated_length": 1405.0,
      "completions/mean_length": 549.515625,
      "completions/mean_terminated_length": 549.515625,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.551382064819336,
      "learning_rate": 1e-06,
      "loss": 0.0,
      "num_tokens": 695497.0,
      "reward": 0.25078123807907104,
      "reward_std": 0.3316580057144165,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.28770697116851807,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1751.0,
      "completions/mean_length": 496.078125,
      "completions/mean_terminated_length": 446.01611328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.0175,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 14.735747337341309,
      "learning_rate": 9.99726628670463e-07,
      "loss": 0.0,
      "num_tokens": 819054.0,
      "reward": 0.41015625,
      "reward_std": 0.42701074481010437,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.24688033759593964,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1369.0,
      "completions/max_terminated_length": 1369.0,
      "completions/mean_length": 461.125,
      "completions/mean_terminated_length": 461.125,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.02,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 2.3624932765960693,
      "learning_rate": 9.989068136093872e-07,
      "loss": 0.0,
      "num_tokens": 918694.0,
      "reward": 0.42656248807907104,
      "reward_std": 0.3252020478248596,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2741328477859497,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1707.0,
      "completions/mean_length": 575.140625,
      "completions/mean_terminated_length": 502.70489501953125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.942022800445557,
      "learning_rate": 9.975414512725056e-07,
      "loss": -0.0,
      "num_tokens": 1059111.0,
      "reward": 0.32343748211860657,
      "reward_std": 0.38642236590385437,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.18298126757144928,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1615.0,
      "completions/mean_length": 552.0625,
      "completions/mean_terminated_length": 528.3175048828125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.24199390411377,
      "learning_rate": 9.956320346634875e-07,
      "loss": -0.0,
      "num_tokens": 1199859.0,
      "reward": 0.29140621423721313,
      "reward_std": 0.28808674216270447,
      "rewards/format_reward/mean": 0.4140625,
      "rewards/format_reward/std": 0.209963858127594,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 971.0,
      "completions/mean_length": 323.359375,
      "completions/mean_terminated_length": 295.984130859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0275,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 11.983031272888184,
      "learning_rate": 9.931806517013612e-07,
      "loss": -0.0,
      "num_tokens": 1308242.0,
      "reward": 0.24374999105930328,
      "reward_std": 0.26579102873802185,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.19669894874095917,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1475.0,
      "completions/max_terminated_length": 1475.0,
      "completions/mean_length": 494.640625,
      "completions/mean_terminated_length": 494.640625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 13.534098625183105,
      "learning_rate": 9.901899829374047e-07,
      "loss": -0.0,
      "num_tokens": 1427139.0,
      "reward": 0.453125,
      "reward_std": 0.37656593322753906,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.21593283116817474,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1420.0,
      "completions/max_terminated_length": 1420.0,
      "completions/mean_length": 591.890625,
      "completions/mean_terminated_length": 591.890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0325,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 3.6356875896453857,
      "learning_rate": 9.866632986240029e-07,
      "loss": 0.0,
      "num_tokens": 1555604.0,
      "reward": 0.24765624105930328,
      "reward_std": 0.3406350612640381,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.2538151443004608,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1100.0,
      "completions/max_terminated_length": 1100.0,
      "completions/mean_length": 327.9375,
      "completions/mean_terminated_length": 327.9375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.035,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 19.802650451660156,
      "learning_rate": 9.826044551386742e-07,
      "loss": 0.0,
      "num_tokens": 1663176.0,
      "reward": 0.30781248211860657,
      "reward_std": 0.3470980226993561,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.20351573824882507,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1263.0,
      "completions/mean_length": 416.265625,
      "completions/mean_terminated_length": 390.3651123046875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.966890335083008,
      "learning_rate": 9.780178907671788e-07,
      "loss": -0.0,
      "num_tokens": 1757385.0,
      "reward": 0.3671875,
      "reward_std": 0.37327155470848083,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.3299681544303894,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1576.0,
      "completions/mean_length": 447.671875,
      "completions/mean_terminated_length": 422.2698669433594,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 10.119935989379883,
      "learning_rate": 9.729086208503173e-07,
      "loss": -0.0,
      "num_tokens": 1892308.0,
      "reward": 0.5914062261581421,
      "reward_std": 0.31337296962738037,
      "rewards/format_reward/mean": 0.4453125,
      "rewards/format_reward/std": 0.15728822350502014,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1743.0,
      "completions/max_terminated_length": 1743.0,
      "completions/mean_length": 421.59375,
      "completions/mean_terminated_length": 421.59375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0425,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 10.146256446838379,
      "learning_rate": 9.672822322997304e-07,
      "loss": 0.0,
      "num_tokens": 2011562.0,
      "reward": 0.29140621423721313,
      "reward_std": 0.3035487234592438,
      "rewards/format_reward/mean": 0.4140625,
      "rewards/format_reward/std": 0.209963858127594,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1499.0,
      "completions/max_terminated_length": 1499.0,
      "completions/mean_length": 368.15625,
      "completions/mean_terminated_length": 368.15625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.045,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 28.832923889160156,
      "learning_rate": 9.611448774886923e-07,
      "loss": 0.0,
      "num_tokens": 2107684.0,
      "reward": 0.31328123807907104,
      "reward_std": 0.32075032591819763,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.22589658200740814,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1508.0,
      "completions/max_terminated_length": 1508.0,
      "completions/mean_length": 311.4375,
      "completions/mean_terminated_length": 311.4375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0475,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 17.548686981201172,
      "learning_rate": 9.545032675245813e-07,
      "loss": -0.0,
      "num_tokens": 2220144.0,
      "reward": 0.5015624761581421,
      "reward_std": 0.41764065623283386,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1022.0,
      "completions/mean_length": 172.484375,
      "completions/mean_terminated_length": 142.71429443359375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 16.76396942138672,
      "learning_rate": 9.473646649103817e-07,
      "loss": 0.0,
      "num_tokens": 2318791.0,
      "reward": 0.5539062023162842,
      "reward_std": 0.17947588860988617,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.16194961965084076,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1462.0,
      "completions/max_terminated_length": 1462.0,
      "completions/mean_length": 254.03125,
      "completions/mean_terminated_length": 254.03125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0525,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 12.80144214630127,
      "learning_rate": 9.397368756032444e-07,
      "loss": -0.0,
      "num_tokens": 2429865.0,
      "reward": 0.25312498211860657,
      "reward_std": 0.33632034063339233,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.1259881556034088,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 954.0,
      "completions/max_terminated_length": 954.0,
      "completions/mean_length": 104.65625,
      "completions/mean_terminated_length": 104.65625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.055,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.90204906463623,
      "learning_rate": 9.316282404787869e-07,
      "loss": -0.0,
      "num_tokens": 2523307.0,
      "reward": 0.4093749523162842,
      "reward_std": 0.17358146607875824,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1322.0,
      "completions/max_terminated_length": 1322.0,
      "completions/mean_length": 377.9375,
      "completions/mean_terminated_length": 377.9375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0575,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 11.154988288879395,
      "learning_rate": 9.230476262104676e-07,
      "loss": -0.0,
      "num_tokens": 2626679.0,
      "reward": 0.22343748807907104,
      "reward_std": 0.28182199597358704,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1534975916147232,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 398.0,
      "completions/max_terminated_length": 398.0,
      "completions/mean_length": 12.125,
      "completions/mean_terminated_length": 12.125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 11.924612045288086,
      "learning_rate": 9.1400441557401e-07,
      "loss": 0.0,
      "num_tokens": 2733791.0,
      "reward": 0.4718749523162842,
      "reward_std": 0.24831001460552216,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.421875,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 979.0,
      "completions/max_terminated_length": 979.0,
      "completions/mean_length": 173.625,
      "completions/mean_terminated_length": 173.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 17.104806900024414,
      "learning_rate": 9.045084971874737e-07,
      "loss": -0.0,
      "num_tokens": 2827215.0,
      "reward": 0.51953125,
      "reward_std": 0.22175219655036926,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.1406387835741043,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1075.0,
      "completions/max_terminated_length": 1075.0,
      "completions/mean_length": 161.796875,
      "completions/mean_terminated_length": 161.796875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.065,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 24.93585968017578,
      "learning_rate": 8.945702546981968e-07,
      "loss": 0.0,
      "num_tokens": 2902970.0,
      "reward": 0.40859371423721313,
      "reward_std": 0.22210699319839478,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 928.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 235.765625,
      "completions/mean_terminated_length": 235.765625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0675,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 16.02617645263672,
      "learning_rate": 8.842005554284295e-07,
      "loss": -0.0,
      "num_tokens": 3005379.0,
      "reward": 0.29999998211860657,
      "reward_std": 0.2879316806793213,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.08908708393573761,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 935.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 141.0,
      "completions/mean_terminated_length": 141.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.07,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 29.071575164794922,
      "learning_rate": 8.734107384920769e-07,
      "loss": -0.0,
      "num_tokens": 3102595.0,
      "reward": 0.4562499523162842,
      "reward_std": 0.3846532702445984,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.08908708393573761,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 961.0,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 109.046875,
      "completions/mean_terminated_length": 109.046875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0725,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 14.398436546325684,
      "learning_rate": 8.622126023955445e-07,
      "loss": -0.0,
      "num_tokens": 3190334.0,
      "reward": 0.6898437142372131,
      "reward_std": 0.19096830487251282,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 2034.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 313.390625,
      "completions/mean_terminated_length": 313.390625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.075,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 23.27433967590332,
      "learning_rate": 8.506183921362442e-07,
      "loss": 0.0,
      "num_tokens": 3301191.0,
      "reward": 0.6148437261581421,
      "reward_std": 0.3767889142036438,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.13886408507823944,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1409.0,
      "completions/max_terminated_length": 1409.0,
      "completions/mean_length": 302.03125,
      "completions/mean_terminated_length": 302.03125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0775,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 9.451353073120117,
      "learning_rate": 8.386407858128706e-07,
      "loss": -0.0,
      "num_tokens": 3431513.0,
      "reward": 0.3273437023162842,
      "reward_std": 0.24055621027946472,
      "rewards/format_reward/mean": 0.4609375,
      "rewards/format_reward/std": 0.16194961965084076,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1716.0,
      "completions/max_terminated_length": 1716.0,
      "completions/mean_length": 140.75,
      "completions/mean_terminated_length": 140.75,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 19.545513153076172,
      "learning_rate": 8.262928807620843e-07,
      "loss": -0.0,
      "num_tokens": 3516601.0,
      "reward": 0.42890626192092896,
      "reward_std": 0.0956839770078659,
      "rewards/format_reward/mean": 0.5390625,
      "rewards/format_reward/std": 0.2236899733543396,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 26.1875,
      "completions/mean_terminated_length": 26.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0825,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.879921913146973,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.0,
      "num_tokens": 3610013.0,
      "reward": 0.7226561903953552,
      "reward_std": 0.19158241152763367,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1395.0,
      "completions/max_terminated_length": 1395.0,
      "completions/mean_length": 130.375,
      "completions/mean_terminated_length": 130.375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.085,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 22.375093460083008,
      "learning_rate": 8.005405736415125e-07,
      "loss": 0.0,
      "num_tokens": 3706909.0,
      "reward": 0.4156249761581421,
      "reward_std": 0.34389790892601013,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.18898223340511322,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1169.0,
      "completions/mean_length": 228.734375,
      "completions/mean_terminated_length": 170.0483856201172,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0875,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 17.84543228149414,
      "learning_rate": 7.871643313414718e-07,
      "loss": -0.0,
      "num_tokens": 3815884.0,
      "reward": 0.7546874284744263,
      "reward_std": 0.28800931572914124,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.1534975916147232,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 311.0,
      "completions/max_terminated_length": 311.0,
      "completions/mean_length": 27.609375,
      "completions/mean_terminated_length": 27.609375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.09,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 31.98973274230957,
      "learning_rate": 7.734740790612136e-07,
      "loss": -0.0,
      "num_tokens": 3941163.0,
      "reward": 0.4898437261581421,
      "reward_std": 0.2993735373020172,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.4375,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1835.0,
      "completions/max_terminated_length": 1835.0,
      "completions/mean_length": 61.625,
      "completions/mean_terminated_length": 61.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0925,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.227835655212402,
      "learning_rate": 7.594847868906076e-07,
      "loss": 0.0,
      "num_tokens": 4027779.0,
      "reward": 0.528124988079071,
      "reward_std": 0.1930253505706787,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1583.0,
      "completions/max_terminated_length": 1583.0,
      "completions/mean_length": 239.546875,
      "completions/mean_terminated_length": 239.546875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.095,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 20.220304489135742,
      "learning_rate": 7.452117519152541e-07,
      "loss": 0.0,
      "num_tokens": 4120606.0,
      "reward": 0.39531248807907104,
      "reward_std": 0.3611350357532501,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.2847827076911926,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 72.34375,
      "completions/mean_terminated_length": 72.34375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0975,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 21.585006713867188,
      "learning_rate": 7.306705814893439e-07,
      "loss": -0.0,
      "num_tokens": 4210956.0,
      "reward": 0.37812501192092896,
      "reward_std": 0.23370197415351868,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.233588308095932,
      "rewards/mcq_exact_match_reward/mean": 0.3125,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1230.0,
      "completions/max_terminated_length": 1230.0,
      "completions/mean_length": 107.265625,
      "completions/mean_terminated_length": 107.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 19.986061096191406,
      "learning_rate": 7.158771761692464e-07,
      "loss": 0.0,
      "num_tokens": 4302349.0,
      "reward": 0.28593748807907104,
      "reward_std": 0.20377102494239807,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 784.0,
      "completions/max_terminated_length": 784.0,
      "completions/mean_length": 170.578125,
      "completions/mean_terminated_length": 170.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1025,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 23.84389305114746,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.0,
      "num_tokens": 4402810.0,
      "reward": 0.6890624761581421,
      "reward_std": 0.3438800573348999,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.24346621334552765,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1099.0,
      "completions/mean_length": 197.78125,
      "completions/mean_terminated_length": 106.78688049316406,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.105,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 30.521644592285156,
      "learning_rate": 6.855986244591103e-07,
      "loss": -0.0,
      "num_tokens": 4512004.0,
      "reward": 0.5382812023162842,
      "reward_std": 0.25127214193344116,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.31644338369369507,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1161.0,
      "completions/mean_length": 134.6875,
      "completions/mean_terminated_length": 104.31746673583984,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1075,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 28.4834041595459,
      "learning_rate": 6.701465872208216e-07,
      "loss": 0.0,
      "num_tokens": 4623384.0,
      "reward": 0.2992187440395355,
      "reward_std": 0.23480820655822754,
      "rewards/format_reward/mean": 0.8046875,
      "rewards/format_reward/std": 0.29028159379959106,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 715.0,
      "completions/max_terminated_length": 715.0,
      "completions/mean_length": 76.203125,
      "completions/mean_terminated_length": 76.203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.11,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 14.868675231933594,
      "learning_rate": 6.545084971874736e-07,
      "loss": 0.0,
      "num_tokens": 4706005.0,
      "reward": 0.32734376192092896,
      "reward_std": 0.24423527717590332,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.1751912236213684,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 668.0,
      "completions/mean_length": 154.5625,
      "completions/mean_terminated_length": 61.44261932373047,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 15.19470500946045,
      "learning_rate": 6.387014543809223e-07,
      "loss": 0.0,
      "num_tokens": 4798985.0,
      "reward": 0.5421874523162842,
      "reward_std": 0.3319449722766876,
      "rewards/format_reward/mean": 0.734375,
      "rewards/format_reward/std": 0.30820462107658386,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 72.453125,
      "completions/mean_terminated_length": 41.09524154663086,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.115,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.99255084991455,
      "learning_rate": 6.227427435703995e-07,
      "loss": -0.0,
      "num_tokens": 4918510.0,
      "reward": 0.31406253576278687,
      "reward_std": 0.0530330091714859,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.3642643094062805,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 608.0,
      "completions/mean_length": 79.3125,
      "completions/mean_terminated_length": 48.06349563598633,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1175,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 18.731386184692383,
      "learning_rate": 6.066498153718734e-07,
      "loss": 0.0,
      "num_tokens": 5018826.0,
      "reward": 0.6578124761581421,
      "reward_std": 0.29044055938720703,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.3177144229412079,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 855.0,
      "completions/max_terminated_length": 855.0,
      "completions/mean_length": 66.71875,
      "completions/mean_terminated_length": 66.71875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.12,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 18.806800842285156,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.0,
      "num_tokens": 5152376.0,
      "reward": 0.3304687738418579,
      "reward_std": 0.30621567368507385,
      "rewards/format_reward/mean": 0.8046875,
      "rewards/format_reward/std": 0.37392371892929077,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 774.0,
      "completions/mean_length": 102.953125,
      "completions/mean_terminated_length": 72.0793685913086,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1225,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 34.07569885253906,
      "learning_rate": 5.741318238559209e-07,
      "loss": 0.0,
      "num_tokens": 5259269.0,
      "reward": 0.45234376192092896,
      "reward_std": 0.2820115089416504,
      "rewards/format_reward/mean": 0.7734375,
      "rewards/format_reward/std": 0.2807259261608124,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 414.0,
      "completions/mean_length": 127.9375,
      "completions/mean_terminated_length": 66.0,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 27.46225929260254,
      "learning_rate": 5.577423184847931e-07,
      "loss": 0.0,
      "num_tokens": 5354257.0,
      "reward": 0.592968761920929,
      "reward_std": 0.22307650744915009,
      "rewards/format_reward/mean": 0.9296875,
      "rewards/format_reward/std": 0.23345555365085602,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 5354257,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}