goldengoose-corr-v4-0.25-200/checkpoint-100/trainer_state.json

{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.125,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1743.0,
      "completions/mean_length": 471.375,
      "completions/mean_terminated_length": 446.3492431640625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.00125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 27.719789505004883,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 107576.0,
      "reward": 0.32109373807907104,
      "reward_std": 0.35813236236572266,
      "rewards/format_reward/mean": 0.3984375,
      "rewards/format_reward/std": 0.31090864539146423,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1343.0,
      "completions/max_terminated_length": 1343.0,
      "completions/mean_length": 420.609375,
      "completions/mean_terminated_length": 420.609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.0025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.845207214355469,
      "learning_rate": 1e-07,
      "loss": -0.0,
      "num_tokens": 217615.0,
      "reward": 0.0898437350988388,
      "reward_std": 0.1637348234653473,
      "rewards/format_reward/mean": 0.2734375,
      "rewards/format_reward/std": 0.2662152051925659,
      "rewards/mcq_exact_match_reward/mean": 0.0625,
      "rewards/mcq_exact_match_reward/std": 0.24397502839565277,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1600.0,
      "completions/mean_length": 604.359375,
      "completions/mean_terminated_length": 581.4444580078125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.00375,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 3.8941690921783447,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 346454.0,
      "reward": 0.2398437261581421,
      "reward_std": 0.34310027956962585,
      "rewards/format_reward/mean": 0.3671875,
      "rewards/format_reward/std": 0.23974503576755524,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1399.0,
      "completions/max_terminated_length": 1399.0,
      "completions/mean_length": 485.09375,
      "completions/mean_terminated_length": 485.09375,
      "completions/min_length": 15.0,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.005,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.741612672805786,
      "learning_rate": 3e-07,
      "loss": 0.0,
      "num_tokens": 453380.0,
      "reward": 0.18437498807907104,
      "reward_std": 0.2578701674938202,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.33923351764678955,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 588.8125,
      "completions/mean_terminated_length": 517.0491333007812,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.00625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.286240339279175,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 582664.0,
      "reward": 0.25390625,
      "reward_std": 0.264077365398407,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.24688033759593964,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1609.0,
      "completions/mean_length": 472.859375,
      "completions/mean_terminated_length": 447.857177734375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0075,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 14.97139835357666,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 689791.0,
      "reward": 0.17031249403953552,
      "reward_std": 0.27876684069633484,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.2916666865348816,
      "rewards/mcq_exact_match_reward/mean": 0.140625,
      "rewards/mcq_exact_match_reward/std": 0.3503824472427368,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1476.0,
      "completions/mean_length": 554.796875,
      "completions/mean_terminated_length": 531.0952758789062,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.00875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.5775206089019775,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 802154.0,
      "reward": 0.26953125,
      "reward_std": 0.3855266869068146,
      "rewards/format_reward/mean": 0.3515625,
      "rewards/format_reward/std": 0.2302463799715042,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1346.0,
      "completions/mean_length": 447.140625,
      "completions/mean_terminated_length": 421.7301940917969,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.01,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.306600570678711,
      "learning_rate": 7e-07,
      "loss": -0.0,
      "num_tokens": 909003.0,
      "reward": 0.3125,
      "reward_std": 0.41604068875312805,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.2745848298072815,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1898.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 663.96875,
      "completions/mean_terminated_length": 663.96875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.932246208190918,
      "learning_rate": 8e-07,
      "loss": 0.0,
      "num_tokens": 1022305.0,
      "reward": 0.1484375,
      "reward_std": 0.2908669710159302,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.2592533528804779,
      "rewards/mcq_exact_match_reward/mean": 0.109375,
      "rewards/mcq_exact_match_reward/std": 0.3145764470100403,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1804.0,
      "completions/max_terminated_length": 1804.0,
      "completions/mean_length": 508.078125,
      "completions/mean_terminated_length": 508.078125,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.0125,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.8813296556472778,
      "learning_rate": 9e-07,
      "loss": 0.0,
      "num_tokens": 1149174.0,
      "reward": 0.39531248807907104,
      "reward_std": 0.25146484375,
      "rewards/format_reward/mean": 0.359375,
      "rewards/format_reward/std": 0.2741328477859497,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1901.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 587.140625,
      "completions/mean_terminated_length": 587.140625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.928985118865967,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 1288087.0,
      "reward": 0.26640623807907104,
      "reward_std": 0.413688063621521,
      "rewards/format_reward/mean": 0.3203125,
      "rewards/format_reward/std": 0.27265870571136475,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1818.0,
      "completions/max_terminated_length": 1818.0,
      "completions/mean_length": 631.4375,
      "completions/mean_terminated_length": 631.4375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.015,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.051449298858643,
      "learning_rate": 9.999316524962345e-07,
      "loss": -0.0,
      "num_tokens": 1432555.0,
      "reward": 0.24062499403953552,
      "reward_std": 0.37458372116088867,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.2357022762298584,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1405.0,
      "completions/max_terminated_length": 1405.0,
      "completions/mean_length": 502.890625,
      "completions/mean_terminated_length": 502.890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 10.412359237670898,
      "learning_rate": 9.99726628670463e-07,
      "loss": 0.0,
      "num_tokens": 1562860.0,
      "reward": 0.3140624761581421,
      "reward_std": 0.4029204845428467,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 911.0,
      "completions/max_terminated_length": 911.0,
      "completions/mean_length": 404.0,
      "completions/mean_terminated_length": 404.0,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0175,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 5.526406288146973,
      "learning_rate": 9.993849845741523e-07,
      "loss": 0.0,
      "num_tokens": 1660916.0,
      "reward": 0.296875,
      "reward_std": 0.35833704471588135,
      "rewards/format_reward/mean": 0.46875,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1440.0,
      "completions/mean_length": 540.46875,
      "completions/mean_terminated_length": 491.83868408203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.01875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.893363952636719,
      "learning_rate": 9.989068136093872e-07,
      "loss": 0.0,
      "num_tokens": 1783450.0,
      "reward": 0.38203123211860657,
      "reward_std": 0.30731916427612305,
      "rewards/format_reward/mean": 0.3828125,
      "rewards/format_reward/std": 0.2313210517168045,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1407.0,
      "completions/max_terminated_length": 1407.0,
      "completions/mean_length": 553.1875,
      "completions/mean_terminated_length": 553.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 1.701499342918396,
      "learning_rate": 9.982922465033348e-07,
      "loss": -0.0,
      "num_tokens": 1904958.0,
      "reward": 0.5335937738418579,
      "reward_std": 0.41488444805145264,
      "rewards/format_reward/mean": 0.4921875,
      "rewards/format_reward/std": 0.1406387835741043,
      "rewards/mcq_exact_match_reward/mean": 0.484375,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1545.0,
      "completions/max_terminated_length": 1545.0,
      "completions/mean_length": 344.5,
      "completions/mean_terminated_length": 344.5,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02125,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 12.08929443359375,
      "learning_rate": 9.975414512725056e-07,
      "loss": -0.0,
      "num_tokens": 2031982.0,
      "reward": 0.500781238079071,
      "reward_std": 0.28046733140945435,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.10652101784944534,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1117.0,
      "completions/mean_length": 473.9375,
      "completions/mean_terminated_length": 448.952392578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0225,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.033034324645996,
      "learning_rate": 9.966546331768192e-07,
      "loss": -0.0,
      "num_tokens": 2146218.0,
      "reward": 0.606249988079071,
      "reward_std": 0.4029581844806671,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.24397502839565277,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1098.0,
      "completions/max_terminated_length": 1098.0,
      "completions/mean_length": 385.65625,
      "completions/mean_terminated_length": 385.65625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 16.02350425720215,
      "learning_rate": 9.956320346634875e-07,
      "loss": -0.0,
      "num_tokens": 2258940.0,
      "reward": 0.36953121423721313,
      "reward_std": 0.33910423517227173,
      "rewards/format_reward/mean": 0.4140625,
      "rewards/format_reward/std": 0.2280818521976471,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1207.0,
      "completions/max_terminated_length": 1207.0,
      "completions/mean_length": 400.625,
      "completions/mean_terminated_length": 400.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.025,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.332343101501465,
      "learning_rate": 9.944739353007341e-07,
      "loss": 0.0,
      "num_tokens": 2363068.0,
      "reward": 0.2867187559604645,
      "reward_std": 0.3396008610725403,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.2735668122768402,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1964.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 399.25,
      "completions/mean_terminated_length": 399.25,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 11.555954933166504,
      "learning_rate": 9.931806517013612e-07,
      "loss": 0.0,
      "num_tokens": 2465620.0,
      "reward": 0.3179687559604645,
      "reward_std": 0.44024717807769775,
      "rewards/format_reward/mean": 0.5234375,
      "rewards/format_reward/std": 0.2735668122768402,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1505.0,
      "completions/max_terminated_length": 1505.0,
      "completions/mean_length": 327.234375,
      "completions/mean_terminated_length": 327.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0275,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 4.289510726928711,
      "learning_rate": 9.917525374361911e-07,
      "loss": -0.0,
      "num_tokens": 2565427.0,
      "reward": 0.28984373807907104,
      "reward_std": 0.3399752378463745,
      "rewards/format_reward/mean": 0.5546875,
      "rewards/format_reward/std": 0.2538151443004608,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 940.0,
      "completions/max_terminated_length": 940.0,
      "completions/mean_length": 297.296875,
      "completions/mean_terminated_length": 297.296875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.02875,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 13.854007720947266,
      "learning_rate": 9.901899829374047e-07,
      "loss": 0.0,
      "num_tokens": 2667206.0,
      "reward": 0.32890623807907104,
      "reward_std": 0.2701229453086853,
      "rewards/format_reward/mean": 0.4765625,
      "rewards/format_reward/std": 0.20758795738220215,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1274.0,
      "completions/max_terminated_length": 1274.0,
      "completions/mean_length": 195.265625,
      "completions/mean_terminated_length": 195.265625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 16.784299850463867,
      "learning_rate": 9.884934153917996e-07,
      "loss": 0.0,
      "num_tokens": 2755663.0,
      "reward": 0.31640625,
      "reward_std": 0.1695163995027542,
      "rewards/format_reward/mean": 0.5078125,
      "rewards/format_reward/std": 0.20877929031848907,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1681.0,
      "completions/max_terminated_length": 1681.0,
      "completions/mean_length": 253.875,
      "completions/mean_terminated_length": 253.875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 25.41985511779785,
      "learning_rate": 9.866632986240029e-07,
      "loss": -0.0,
      "num_tokens": 2837511.0,
      "reward": 0.2749999761581421,
      "reward_std": 0.20281967520713806,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.208927720785141,
      "rewards/mcq_exact_match_reward/mean": 0.21875,
      "rewards/mcq_exact_match_reward/std": 0.4166666865348816,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1067.0,
      "completions/max_terminated_length": 1067.0,
      "completions/mean_length": 199.59375,
      "completions/mean_terminated_length": 199.59375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0325,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 8.237688064575195,
      "learning_rate": 9.847001329696652e-07,
      "loss": 0.0,
      "num_tokens": 2942205.0,
      "reward": 0.38984376192092896,
      "reward_std": 0.24267949163913727,
      "rewards/format_reward/mean": 0.6171875,
      "rewards/format_reward/std": 0.21347814798355103,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 492.0,
      "completions/max_terminated_length": 492.0,
      "completions/mean_length": 92.25,
      "completions/mean_terminated_length": 92.25,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 15.814329147338867,
      "learning_rate": 9.826044551386742e-07,
      "loss": 0.0,
      "num_tokens": 3037685.0,
      "reward": 0.526562511920929,
      "reward_std": 0.17365704476833344,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.23935678601264954,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 677.0,
      "completions/max_terminated_length": 677.0,
      "completions/mean_length": 136.734375,
      "completions/mean_terminated_length": 136.734375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.035,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.19939613342285,
      "learning_rate": 9.803768380684242e-07,
      "loss": -0.0,
      "num_tokens": 3138908.0,
      "reward": 0.4531249701976776,
      "reward_std": 0.23708730936050415,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 543.0,
      "completions/max_terminated_length": 543.0,
      "completions/mean_length": 85.0625,
      "completions/mean_terminated_length": 85.0625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03625,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 20.77176284790039,
      "learning_rate": 9.780178907671788e-07,
      "loss": 0.0,
      "num_tokens": 3233712.0,
      "reward": 0.3414062261581421,
      "reward_std": 0.28602826595306396,
      "rewards/format_reward/mean": 0.6015625,
      "rewards/format_reward/std": 0.2387082874774933,
      "rewards/mcq_exact_match_reward/mean": 0.28125,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 832.0,
      "completions/max_terminated_length": 832.0,
      "completions/mean_length": 79.84375,
      "completions/mean_terminated_length": 79.84375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.60765266418457,
      "learning_rate": 9.755282581475767e-07,
      "loss": 0.0,
      "num_tokens": 3333926.0,
      "reward": 0.16249999403953552,
      "reward_std": 0.20131680369377136,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.1510545015335083,
      "rewards/mcq_exact_match_reward/mean": 0.109375,
      "rewards/mcq_exact_match_reward/std": 0.3145764470100403,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 555.0,
      "completions/max_terminated_length": 555.0,
      "completions/mean_length": 75.1875,
      "completions/mean_terminated_length": 75.1875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.03875,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.49142074584961,
      "learning_rate": 9.729086208503173e-07,
      "loss": -0.0,
      "num_tokens": 3423410.0,
      "reward": 0.6078124642372131,
      "reward_std": 0.270576536655426,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.2083333432674408,
      "rewards/mcq_exact_match_reward/mean": 0.546875,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 817.0,
      "completions/max_terminated_length": 817.0,
      "completions/mean_length": 53.890625,
      "completions/mean_terminated_length": 53.890625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 22.353435516357422,
      "learning_rate": 9.701596950580807e-07,
      "loss": -0.0,
      "num_tokens": 3516563.0,
      "reward": 0.31953126192092896,
      "reward_std": 0.2242286503314972,
      "rewards/format_reward/mean": 0.6953125,
      "rewards/format_reward/std": 0.24587368965148926,
      "rewards/mcq_exact_match_reward/mean": 0.25,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 172.0,
      "completions/max_terminated_length": 172.0,
      "completions/mean_length": 20.453125,
      "completions/mean_terminated_length": 20.453125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04125,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.851003646850586,
      "learning_rate": 9.672822322997304e-07,
      "loss": 0.0,
      "num_tokens": 3591016.0,
      "reward": 0.668749988079071,
      "reward_std": 0.2041158676147461,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.2519763112068176,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 405.0,
      "completions/max_terminated_length": 405.0,
      "completions/mean_length": 58.453125,
      "completions/mean_terminated_length": 58.453125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0425,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 15.381024360656738,
      "learning_rate": 9.642770192448535e-07,
      "loss": 0.0,
      "num_tokens": 3684013.0,
      "reward": 0.5406250357627869,
      "reward_std": 0.3712288737297058,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.25,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 620.0,
      "completions/max_terminated_length": 620.0,
      "completions/mean_length": 61.640625,
      "completions/mean_terminated_length": 61.640625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04375,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 16.52124786376953,
      "learning_rate": 9.611448774886923e-07,
      "loss": -0.0,
      "num_tokens": 3761286.0,
      "reward": 0.33124998211860657,
      "reward_std": 0.25726157426834106,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.12198751419782639,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 844.0,
      "completions/mean_length": 94.890625,
      "completions/mean_terminated_length": 63.888893127441406,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.045,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 19.276613235473633,
      "learning_rate": 9.578866633275286e-07,
      "loss": 0.0,
      "num_tokens": 3846599.0,
      "reward": 0.31640625,
      "reward_std": 0.2186937928199768,
      "rewards/format_reward/mean": 0.8203125,
      "rewards/format_reward/std": 0.2576941251754761,
      "rewards/mcq_exact_match_reward/mean": 0.234375,
      "rewards/mcq_exact_match_reward/std": 0.42695629596710205,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 362.0,
      "completions/max_terminated_length": 362.0,
      "completions/mean_length": 44.015625,
      "completions/mean_terminated_length": 44.015625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04625,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 28.070011138916016,
      "learning_rate": 9.545032675245813e-07,
      "loss": 0.0,
      "num_tokens": 3917384.0,
      "reward": 0.542187511920929,
      "reward_std": 0.21420830488204956,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2083333432674408,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 167.0,
      "completions/mean_length": 70.375,
      "completions/mean_terminated_length": 38.984130859375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0475,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 24.695430755615234,
      "learning_rate": 9.509956150664795e-07,
      "loss": -0.0,
      "num_tokens": 3998968.0,
      "reward": 0.45078128576278687,
      "reward_std": 0.18445391952991486,
      "rewards/format_reward/mean": 0.9140625,
      "rewards/format_reward/std": 0.209963858127594,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 559.0,
      "completions/max_terminated_length": 559.0,
      "completions/mean_length": 44.234375,
      "completions/mean_terminated_length": 44.234375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.04875,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 9.522443771362305,
      "learning_rate": 9.473646649103817e-07,
      "loss": -0.0,
      "num_tokens": 4083311.0,
      "reward": 0.2914062738418579,
      "reward_std": 0.18989473581314087,
      "rewards/format_reward/mean": 0.8828125,
      "rewards/format_reward/std": 0.21347814798355103,
      "rewards/mcq_exact_match_reward/mean": 0.203125,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 225.0,
      "completions/max_terminated_length": 225.0,
      "completions/mean_length": 36.28125,
      "completions/mean_terminated_length": 36.28125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 36.65068054199219,
      "learning_rate": 9.436114097218058e-07,
      "loss": 0.0,
      "num_tokens": 4155497.0,
      "reward": 0.43906253576278687,
      "reward_std": 0.20156370103359222,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14689241349697113,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 297.0,
      "completions/max_terminated_length": 297.0,
      "completions/mean_length": 35.578125,
      "completions/mean_terminated_length": 35.578125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.05125,
      "frac_reward_zero_std": 0.25,
      "grad_norm": 24.931636810302734,
      "learning_rate": 9.397368756032444e-07,
      "loss": -0.0,
      "num_tokens": 4249622.0,
      "reward": 0.5,
      "reward_std": 0.23814013600349426,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 285.0,
      "completions/max_terminated_length": 285.0,
      "completions/mean_length": 29.625,
      "completions/mean_terminated_length": 29.625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0525,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 15.338611602783203,
      "learning_rate": 9.357421218136386e-07,
      "loss": 0.0,
      "num_tokens": 4333710.0,
      "reward": 0.3531250059604645,
      "reward_std": 0.1804211586713791,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.2182178944349289,
      "rewards/mcq_exact_match_reward/mean": 0.265625,
      "rewards/mcq_exact_match_reward/std": 0.44515693187713623,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 710.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 72.84375,
      "completions/mean_terminated_length": 72.84375,
      "completions/min_length": 9.0,
      "completions/min_terminated_length": 9.0,
      "epoch": 0.05375,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 10.507906913757324,
      "learning_rate": 9.316282404787869e-07,
      "loss": 0.0,
      "num_tokens": 4440996.0,
      "reward": 0.4296875,
      "reward_std": 0.21748682856559753,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "rewards/mcq_exact_match_reward/mean": 0.34375,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 101.0,
      "completions/max_terminated_length": 101.0,
      "completions/mean_length": 17.46875,
      "completions/mean_terminated_length": 17.46875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.055,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 16.530738830566406,
      "learning_rate": 9.273963562927694e-07,
      "loss": -0.0,
      "num_tokens": 4527058.0,
      "reward": 0.5625,
      "reward_std": 0.2709311842918396,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1666666716337204,
      "rewards/mcq_exact_match_reward/mean": 0.46875,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 274.0,
      "completions/max_terminated_length": 274.0,
      "completions/mean_length": 38.5625,
      "completions/mean_terminated_length": 38.5625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.05625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 9.306171417236328,
      "learning_rate": 9.230476262104676e-07,
      "loss": 0.0,
      "num_tokens": 4591334.0,
      "reward": 0.6625000238418579,
      "reward_std": 0.23356688022613525,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 664.0,
      "completions/max_terminated_length": 664.0,
      "completions/mean_length": 49.96875,
      "completions/mean_terminated_length": 49.96875,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0575,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 8.561665534973145,
      "learning_rate": 9.185832391312642e-07,
      "loss": 0.0,
      "num_tokens": 4671492.0,
      "reward": 0.3890625238418579,
      "reward_std": 0.1695934236049652,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.18298126757144928,
      "rewards/mcq_exact_match_reward/mean": 0.296875,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1079.0,
      "completions/mean_length": 102.84375,
      "completions/mean_terminated_length": 71.96826171875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.05875,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 5.362051486968994,
      "learning_rate": 9.1400441557401e-07,
      "loss": 0.0,
      "num_tokens": 4742906.0,
      "reward": 0.612500011920929,
      "reward_std": 0.10678248107433319,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.1510545015335083,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 275.0,
      "completions/max_terminated_length": 275.0,
      "completions/mean_length": 43.265625,
      "completions/mean_terminated_length": 43.265625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 9.450302124023438,
      "learning_rate": 9.093124073433462e-07,
      "loss": -0.0,
      "num_tokens": 4808035.0,
      "reward": 0.4906250238418579,
      "reward_std": 0.2109457552433014,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 264.0,
      "completions/max_terminated_length": 264.0,
      "completions/mean_length": 27.03125,
      "completions/mean_terminated_length": 27.03125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06125,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 7.115367889404297,
      "learning_rate": 9.045084971874737e-07,
      "loss": 0.0,
      "num_tokens": 4889933.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.12255740165710449,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 194.0,
      "completions/max_terminated_length": 194.0,
      "completions/mean_length": 28.4375,
      "completions/mean_terminated_length": 28.4375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0625,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.029036521911621,
      "learning_rate": 8.995939984474623e-07,
      "loss": 0.0,
      "num_tokens": 4971121.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 771.0,
      "completions/max_terminated_length": 771.0,
      "completions/mean_length": 80.90625,
      "completions/mean_terminated_length": 80.90625,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.06375,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 30.694612503051758,
      "learning_rate": 8.945702546981968e-07,
      "loss": 0.0,
      "num_tokens": 5048643.0,
      "reward": 0.6203124523162842,
      "reward_std": 0.07164573669433594,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2592533528804779,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 455.0,
      "completions/max_terminated_length": 455.0,
      "completions/mean_length": 37.765625,
      "completions/mean_terminated_length": 37.765625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.065,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 12.923490524291992,
      "learning_rate": 8.894386393810562e-07,
      "loss": 0.0,
      "num_tokens": 5134100.0,
      "reward": 0.5046875476837158,
      "reward_std": 0.11330723762512207,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 473.0,
      "completions/max_terminated_length": 473.0,
      "completions/mean_length": 41.296875,
      "completions/mean_terminated_length": 41.296875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06625,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 12.523974418640137,
      "learning_rate": 8.842005554284295e-07,
      "loss": 0.0,
      "num_tokens": 5252135.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.22461533546447754,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 441.0,
      "completions/max_terminated_length": 441.0,
      "completions/mean_length": 74.453125,
      "completions/mean_terminated_length": 74.453125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0675,
      "frac_reward_zero_std": 0.375,
      "grad_norm": 15.70406723022461,
      "learning_rate": 8.788574348801674e-07,
      "loss": 0.0,
      "num_tokens": 5334148.0,
      "reward": 0.659375011920929,
      "reward_std": 0.25896912813186646,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 671.0,
      "completions/max_terminated_length": 671.0,
      "completions/mean_length": 46.109375,
      "completions/mean_terminated_length": 46.109375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.06875,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 23.37506103515625,
      "learning_rate": 8.734107384920769e-07,
      "loss": 0.0,
      "num_tokens": 5411715.0,
      "reward": 0.5531250238418579,
      "reward_std": 0.17358146607875824,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 628.0,
      "completions/max_terminated_length": 628.0,
      "completions/mean_length": 40.359375,
      "completions/mean_terminated_length": 40.359375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.58426856994629,
      "learning_rate": 8.678619553365658e-07,
      "loss": -0.0,
      "num_tokens": 5495986.0,
      "reward": 0.6312500238418579,
      "reward_std": 0.0883883461356163,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 333.0,
      "completions/max_terminated_length": 333.0,
      "completions/mean_length": 49.171875,
      "completions/mean_terminated_length": 49.171875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 9.968092918395996,
      "learning_rate": 8.622126023955445e-07,
      "loss": 0.0,
      "num_tokens": 5589749.0,
      "reward": 0.6765625476837158,
      "reward_std": 0.06687791645526886,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 330.0,
      "completions/max_terminated_length": 330.0,
      "completions/mean_length": 45.828125,
      "completions/mean_terminated_length": 45.828125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0725,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.7135562896728516,
      "learning_rate": 8.564642241456986e-07,
      "loss": -0.0,
      "num_tokens": 5676570.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.04419417306780815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 458.0,
      "completions/max_terminated_length": 458.0,
      "completions/mean_length": 39.21875,
      "completions/mean_terminated_length": 39.21875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07375,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 17.85677146911621,
      "learning_rate": 8.506183921362442e-07,
      "loss": 0.0,
      "num_tokens": 5756528.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 186.0,
      "completions/max_terminated_length": 186.0,
      "completions/mean_length": 22.78125,
      "completions/mean_terminated_length": 22.78125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.075,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 33.40819549560547,
      "learning_rate": 8.446767045592829e-07,
      "loss": 0.0,
      "num_tokens": 5829058.0,
      "reward": 0.7710937857627869,
      "reward_std": 0.09984822571277618,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 17.703125,
      "completions/mean_terminated_length": 17.703125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07625,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.3193039894104,
      "learning_rate": 8.386407858128706e-07,
      "loss": 0.0,
      "num_tokens": 5903063.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.0578637570142746,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 226.0,
      "completions/max_terminated_length": 226.0,
      "completions/mean_length": 33.96875,
      "completions/mean_terminated_length": 33.96875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0775,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.325122860569241e-07,
      "loss": 0.0,
      "num_tokens": 5984173.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1581.0,
      "completions/max_terminated_length": 1581.0,
      "completions/mean_length": 96.734375,
      "completions/mean_terminated_length": 96.734375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.07875,
      "frac_reward_zero_std": 0.5,
      "grad_norm": 11.058859825134277,
      "learning_rate": 8.262928807620843e-07,
      "loss": -0.0,
      "num_tokens": 6062100.0,
      "reward": 0.6296875476837158,
      "reward_std": 0.15962307155132294,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.08768405020236969,
      "rewards/mcq_exact_match_reward/mean": 0.53125,
      "rewards/mcq_exact_match_reward/std": 0.5029674172401428,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 466.0,
      "completions/max_terminated_length": 466.0,
      "completions/mean_length": 34.109375,
      "completions/mean_terminated_length": 34.109375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 6.3704962730407715,
      "learning_rate": 8.199842702516582e-07,
      "loss": -0.0,
      "num_tokens": 6133635.0,
      "reward": 0.4750000238418579,
      "reward_std": 0.0883883461356163,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.375,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 302.0,
      "completions/max_terminated_length": 302.0,
      "completions/mean_length": 30.375,
      "completions/mean_terminated_length": 30.375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 8.135881792367685e-07,
      "loss": 0.0,
      "num_tokens": 6218427.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 389.0,
      "completions/max_terminated_length": 389.0,
      "completions/mean_length": 44.90625,
      "completions/mean_terminated_length": 44.90625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0825,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.963454723358154,
      "learning_rate": 8.071063563448339e-07,
      "loss": -0.0,
      "num_tokens": 6304541.0,
      "reward": 0.9117187857627869,
      "reward_std": 0.06768143177032471,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.8125,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 115.0,
      "completions/max_terminated_length": 115.0,
      "completions/mean_length": 15.109375,
      "completions/mean_terminated_length": 15.109375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08375,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 10.610746383666992,
      "learning_rate": 8.005405736415125e-07,
      "loss": -0.0,
      "num_tokens": 6382612.0,
      "reward": 0.6929687857627869,
      "reward_std": 0.09059805423021317,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.59375,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 161.0,
      "completions/max_terminated_length": 161.0,
      "completions/mean_length": 21.765625,
      "completions/mean_terminated_length": 21.765625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.085,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.212800979614258,
      "learning_rate": 7.938926261462365e-07,
      "loss": 0.0,
      "num_tokens": 6446757.0,
      "reward": 0.7875000238418579,
      "reward_std": 0.06681530922651291,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.6875,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 312.0,
      "completions/mean_length": 71.140625,
      "completions/mean_terminated_length": 39.761905670166016,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08625,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 9.373642921447754,
      "learning_rate": 7.871643313414718e-07,
      "loss": 0.0,
      "num_tokens": 6524198.0,
      "reward": 1.0046875476837158,
      "reward_std": 0.11330723762512207,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.90625,
      "rewards/mcq_exact_match_reward/std": 0.29378482699394226,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 222.0,
      "completions/max_terminated_length": 222.0,
      "completions/mean_length": 27.703125,
      "completions/mean_terminated_length": 27.703125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0875,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 14.864502906799316,
      "learning_rate": 7.803575286758363e-07,
      "loss": 0.0,
      "num_tokens": 6599307.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.0883883461356163,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 199.0,
      "completions/max_terminated_length": 199.0,
      "completions/mean_length": 32.71875,
      "completions/mean_terminated_length": 32.71875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.08875,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.8891595602035522,
      "learning_rate": 7.734740790612136e-07,
      "loss": 0.0,
      "num_tokens": 6682921.0,
      "reward": 0.7093750238418579,
      "reward_std": 0.04419417306780815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 556.0,
      "completions/max_terminated_length": 556.0,
      "completions/mean_length": 27.71875,
      "completions/mean_terminated_length": 27.71875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 3.046689033508301,
      "learning_rate": 7.665158643639969e-07,
      "loss": -0.0,
      "num_tokens": 6767743.0,
      "reward": 0.8492187857627869,
      "reward_std": 0.00220970856025815,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 260.0,
      "completions/max_terminated_length": 260.0,
      "completions/mean_length": 22.734375,
      "completions/mean_terminated_length": 22.734375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.001441955566406,
      "learning_rate": 7.594847868906076e-07,
      "loss": 0.0,
      "num_tokens": 6864566.0,
      "reward": 0.8968750238418579,
      "reward_std": 0.0646936446428299,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.796875,
      "rewards/mcq_exact_match_reward/std": 0.40550529956817627,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.0625,
      "completions/mean_terminated_length": 12.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0925,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 32.40105056762695,
      "learning_rate": 7.523827688674219e-07,
      "loss": 0.0,
      "num_tokens": 6921450.0,
      "reward": 0.7718750238418579,
      "reward_std": 0.11100947856903076,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.671875,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 179.0,
      "completions/max_terminated_length": 179.0,
      "completions/mean_length": 14.734375,
      "completions/mean_terminated_length": 14.734375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09375,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 17.64237403869629,
      "learning_rate": 7.452117519152541e-07,
      "loss": 0.0,
      "num_tokens": 7008801.0,
      "reward": 0.7093750238418579,
      "reward_std": 0.189372718334198,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.609375,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 148.0,
      "completions/max_terminated_length": 148.0,
      "completions/mean_length": 15.921875,
      "completions/mean_terminated_length": 15.921875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.095,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 17.417652130126953,
      "learning_rate": 7.379736965185368e-07,
      "loss": -0.0,
      "num_tokens": 7089380.0,
      "reward": 0.4281250238418579,
      "reward_std": 0.1530819982290268,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.328125,
      "rewards/mcq_exact_match_reward/std": 0.4732423722743988,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 290.0,
      "completions/max_terminated_length": 290.0,
      "completions/mean_length": 16.765625,
      "completions/mean_terminated_length": 16.765625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09625,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 15.111842155456543,
      "learning_rate": 7.306705814893439e-07,
      "loss": 0.0,
      "num_tokens": 7194789.0,
      "reward": 0.45781251788139343,
      "reward_std": 0.04861358925700188,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.359375,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.03125,
      "completions/mean_terminated_length": 12.03125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.0975,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 30.111845016479492,
      "learning_rate": 7.233044034264033e-07,
      "loss": 0.0,
      "num_tokens": 7284127.0,
      "reward": 0.2718750238418579,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.171875,
      "rewards/mcq_exact_match_reward/std": 0.38025420904159546,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 12.0625,
      "completions/mean_terminated_length": 12.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.09875,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 35.65519714355469,
      "learning_rate": 7.158771761692464e-07,
      "loss": 0.0,
      "num_tokens": 7379515.0,
      "reward": 0.8187500238418579,
      "reward_std": 0.0578637570142746,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.71875,
      "rewards/mcq_exact_match_reward/std": 0.4531635046005249,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 66.0,
      "completions/max_terminated_length": 66.0,
      "completions/mean_length": 12.90625,
      "completions/mean_terminated_length": 12.90625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.51276206970215,
      "learning_rate": 7.083909302476452e-07,
      "loss": 0.0,
      "num_tokens": 7446461.0,
      "reward": 0.6781250238418579,
      "reward_std": 0.0646936446428299,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.09375,
      "completions/mean_terminated_length": 12.09375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 7.008477123264847e-07,
      "loss": 0.0,
      "num_tokens": 7561899.0,
      "reward": 0.6000000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 20.515625,
      "completions/mean_terminated_length": 20.515625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1025,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 6.932495846462261e-07,
      "loss": 0.0,
      "num_tokens": 7644644.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.015625,
      "completions/mean_terminated_length": 12.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10375,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 6.890464782714844,
      "learning_rate": 6.855986244591103e-07,
      "loss": -0.0,
      "num_tokens": 7729413.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.04419417306780815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 12.140625,
      "completions/mean_terminated_length": 12.140625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.105,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 19.9100341796875,
      "learning_rate": 6.778969234612583e-07,
      "loss": -0.0,
      "num_tokens": 7815534.0,
      "reward": 0.7906250357627869,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.0625,
      "completions/mean_terminated_length": 12.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10625,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.460962295532227,
      "learning_rate": 6.701465872208216e-07,
      "loss": 0.0,
      "num_tokens": 7896298.0,
      "reward": 0.7875000238418579,
      "reward_std": 0.06681530922651291,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.6875,
      "rewards/mcq_exact_match_reward/std": 0.467176616191864,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.03125,
      "completions/mean_terminated_length": 12.03125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1075,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 30.060848236083984,
      "learning_rate": 6.623497346023417e-07,
      "loss": 0.0,
      "num_tokens": 7974052.0,
      "reward": 0.8031250238418579,
      "reward_std": 0.1530819982290268,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.703125,
      "rewards/mcq_exact_match_reward/std": 0.4604927599430084,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 101.0,
      "completions/max_terminated_length": 101.0,
      "completions/mean_length": 13.453125,
      "completions/mean_terminated_length": 13.453125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.10875,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 20.207021713256836,
      "learning_rate": 6.545084971874736e-07,
      "loss": 0.0,
      "num_tokens": 8049993.0,
      "reward": 0.7562500238418579,
      "reward_std": 0.0578637570142746,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.65625,
      "rewards/mcq_exact_match_reward/std": 0.4787135720252991,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 54.0,
      "completions/max_terminated_length": 54.0,
      "completions/mean_length": 12.65625,
      "completions/mean_terminated_length": 12.65625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 18.510467529296875,
      "learning_rate": 6.466250186922324e-07,
      "loss": -0.0,
      "num_tokens": 8119723.0,
      "reward": 0.6625000238418579,
      "reward_std": 0.06681530922651291,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.5625,
      "rewards/mcq_exact_match_reward/std": 0.5,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.046875,
      "completions/mean_terminated_length": 12.046875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11125,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 26.590150833129883,
      "learning_rate": 6.387014543809223e-07,
      "loss": 0.0,
      "num_tokens": 8201878.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.13363061845302582,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 444.0,
      "completions/max_terminated_length": 444.0,
      "completions/mean_length": 22.1875,
      "completions/mean_terminated_length": 22.1875,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 1.3266140222549438,
      "learning_rate": 6.307399704769098e-07,
      "loss": -0.0,
      "num_tokens": 8289162.0,
      "reward": 0.8492187261581421,
      "reward_std": 0.00220970856025815,
      "rewards/format_reward/mean": 0.9921875,
      "rewards/format_reward/std": 0.0625,
      "rewards/mcq_exact_match_reward/mean": 0.75,
      "rewards/mcq_exact_match_reward/std": 0.4364357888698578,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.03125,
      "completions/mean_terminated_length": 12.03125,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11375,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 27.705215454101562,
      "learning_rate": 6.227427435703995e-07,
      "loss": -0.0,
      "num_tokens": 8378164.0,
      "reward": 0.4828125238418579,
      "reward_std": 0.12902677059173584,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "rewards/mcq_exact_match_reward/mean": 0.390625,
      "rewards/mcq_exact_match_reward/std": 0.4917473793029785,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.0625,
      "completions/mean_terminated_length": 12.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.115,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 25.083688735961914,
      "learning_rate": 6.147119600233758e-07,
      "loss": -0.0,
      "num_tokens": 8471760.0,
      "reward": 0.9109375476837158,
      "reward_std": 0.11871248483657837,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.8125,
      "rewards/mcq_exact_match_reward/std": 0.39339789748191833,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 13.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 12.015625,
      "completions/mean_terminated_length": 12.015625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11625,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 22.57097053527832,
      "learning_rate": 6.066498153718734e-07,
      "loss": 0.0,
      "num_tokens": 8532233.0,
      "reward": 0.6781250238418579,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.578125,
      "rewards/mcq_exact_match_reward/std": 0.49776285886764526,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1175,
      "frac_reward_zero_std": 0.625,
      "grad_norm": 21.33436393737793,
      "learning_rate": 5.985585137257401e-07,
      "loss": 0.0,
      "num_tokens": 8625297.0,
      "reward": 0.6156250238418579,
      "reward_std": 0.189372718334198,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.515625,
      "rewards/mcq_exact_match_reward/std": 0.5037065148353577,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.11875,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.90440267166055e-07,
      "loss": 0.0,
      "num_tokens": 8693017.0,
      "reward": 0.9750000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.875,
      "rewards/mcq_exact_match_reward/std": 0.3333333432674408,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 17.82723617553711,
      "learning_rate": 5.82297295140367e-07,
      "loss": 0.0,
      "num_tokens": 8762857.0,
      "reward": 0.5062500238418579,
      "reward_std": 0.0578637570142746,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.40625,
      "rewards/mcq_exact_match_reward/std": 0.49501484632492065,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12125,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 12.498894691467285,
      "learning_rate": 5.741318238559209e-07,
      "loss": 0.0,
      "num_tokens": 8837825.0,
      "reward": 0.7406250238418579,
      "reward_std": 0.04419417306780815,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.640625,
      "rewards/mcq_exact_match_reward/std": 0.4836103618144989,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 14.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 12.0625,
      "completions/mean_terminated_length": 12.0625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.1225,
      "frac_reward_zero_std": 0.875,
      "grad_norm": 7.857953071594238,
      "learning_rate": 5.659460856710345e-07,
      "loss": 0.0,
      "num_tokens": 8930893.0,
      "reward": 0.5984375476837158,
      "reward_std": 0.0044194171205163,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "rewards/mcq_exact_match_reward/mean": 0.5,
      "rewards/mcq_exact_match_reward/std": 0.5039526224136353,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 20.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 12.140625,
      "completions/mean_terminated_length": 12.140625,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.12375,
      "frac_reward_zero_std": 0.75,
      "grad_norm": 16.862316131591797,
      "learning_rate": 5.577423184847931e-07,
      "loss": -0.0,
      "num_tokens": 9008550.0,
      "reward": 0.5531250238418579,
      "reward_std": 0.10205793380737305,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.453125,
      "rewards/mcq_exact_match_reward/std": 0.501733124256134,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 12.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 12.0,
      "completions/mean_terminated_length": 12.0,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.125,
      "frac_reward_zero_std": 1.0,
      "grad_norm": 0.0,
      "learning_rate": 5.495227651252315e-07,
      "loss": 0.0,
      "num_tokens": 9088822.0,
      "reward": 0.7250000238418579,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/mcq_exact_match_reward/mean": 0.625,
      "rewards/mcq_exact_match_reward/std": 0.48795005679130554,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 9088822,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}